diff options
author | Martin Matuska <mm@FreeBSD.org> | 2021-07-23 00:50:13 +0000 |
---|---|---|
committer | Martin Matuska <mm@FreeBSD.org> | 2021-07-23 00:50:13 +0000 |
commit | 3f9d360c82e0724bfb61346038236bf15c5d4d84 (patch) | |
tree | f21ca33e877b6bbba3f57f9b32a054f1793b841a /sys/contrib/openzfs | |
parent | 95f0da5be1e3456c930f5f9538cbc099c65f2014 (diff) | |
parent | 14b43fbd9c13d802409ed886bb6b66fd528fb209 (diff) | |
download | src-3f9d360c82e0724bfb61346038236bf15c5d4d84.tar.gz src-3f9d360c82e0724bfb61346038236bf15c5d4d84.zip |
zfs: merge openzfs/zfs@14b43fbd9 (master) into main
Notable upstream pull request merges:
#12271 Tinker with slop space accounting with dedup
#12279 Fix ARC ghost states eviction accounting
#12284 Add Module Parameter Regarding Log Size Limit
#12300 Introduce dsl_dir_diduse_transfer_space()
#12314 Optimize allocation throttling
#12348 Minor ARC optimizations
#12350 Detect HAVE_LARGE_STACKS at compile time
#12356 Use SET_ERROR for more errors in FreeBSD vnops
#12375 FreeBSD: Ignore make_dev_s() errors
#12378 FreeBSD: Switch from MAXPHYS to maxphys on FreeBSD 13+
Obtained from: OpenZFS
OpenZFS commit: 14b43fbd9c13d802409ed886bb6b66fd528fb209
Diffstat (limited to 'sys/contrib/openzfs')
49 files changed, 668 insertions, 406 deletions
diff --git a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml index 8dcd5047a748..33276d269092 100644 --- a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml +++ b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml @@ -32,5 +32,19 @@ jobs: run: | make lint - name: CheckABI + id: CheckABI run: | make checkabi + - name: StoreABI + if: failure() && steps.CheckABI.outcome == 'failure' + run: | + make storeabi + - name: Prepare artifacts + if: failure() && steps.CheckABI.outcome == 'failure' + run: | + find -name *.abi | tar -cf abi_files.tar -T - + - uses: actions/upload-artifact@v2 + if: failure() && steps.CheckABI.outcome == 'failure' + with: + name: New ABI files (use only if you're sure about interface changes) + path: abi_files.tar diff --git a/sys/contrib/openzfs/.github/workflows/zloop.yml b/sys/contrib/openzfs/.github/workflows/zloop.yml index b3679e7f7f20..cf81ad4bcafc 100644 --- a/sys/contrib/openzfs/.github/workflows/zloop.yml +++ b/sys/contrib/openzfs/.github/workflows/zloop.yml @@ -45,7 +45,7 @@ jobs: run: | sudo mkdir -p $TEST_DIR # run for 20 minutes to have a total runner time of 30 minutes - sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1 + sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1 -- -T 120 -P 60 - name: Prepare artifacts if: failure() run: | diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c index 38bfdc91aeaf..a8d68cd03cf8 100644 --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -317,7 +317,7 @@ get_usage(zfs_help_t idx) case HELP_SEND: return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] " "<snapshot>\n" - "\tsend [-nvPLecw] [-i snapshot|bookmark] " + "\tsend [-DnvPLecw] [-i snapshot|bookmark] " "<filesystem|volume|snapshot>\n" "\tsend [-DnPpvLec] [-i bookmark|snapshot] " "--redact <bookmark> <snapshot>\n" diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c index 5dc39afe830f..b60d18ee955c 100644 --- a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c @@ -684,9 +684,8 @@ print_recursive_stats(stat_printer_f func, nvlist_t *nvroot, if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { - (void) strncpy(vdev_name, get_vdev_name(nvroot, parent_name), + (void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name), sizeof (vdev_name)); - vdev_name[sizeof (vdev_name) - 1] = '\0'; for (c = 0; c < children; c++) { print_recursive_stats(func, child[c], pool_name, diff --git a/sys/contrib/openzfs/config/Rules.am b/sys/contrib/openzfs/config/Rules.am index 8fe2fa9ca8d9..be80c1e9c7ce 100644 --- a/sys/contrib/openzfs/config/Rules.am +++ b/sys/contrib/openzfs/config/Rules.am @@ -39,7 +39,6 @@ AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE -AM_CPPFLAGS += -DHAVE_LARGE_STACKS=1 AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" diff --git a/sys/contrib/openzfs/config/kernel-config-defined.m4 b/sys/contrib/openzfs/config/kernel-config-defined.m4 index 9b9468269ca3..c7d18b49b14e 100644 --- a/sys/contrib/openzfs/config/kernel-config-defined.m4 +++ b/sys/contrib/openzfs/config/kernel-config-defined.m4 @@ -19,7 +19,6 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [ ]) ]) - ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE @@ -29,7 +28,6 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [ ZFS_LINUX_TEST_COMPILE_ALL([config]) AC_MSG_RESULT([done]) - ZFS_AC_KERNEL_CONFIG_THREAD_SIZE ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE @@ -37,35 +35,6 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [ ]) dnl # -dnl # Check configured THREAD_SIZE -dnl # -dnl # The stack size will vary by architecture, but as of Linux 3.15 on x86_64 -dnl # the default thread stack size was increased to 16K from 8K. Therefore, -dnl # on newer kernels and some architectures stack usage optimizations can be -dnl # conditionally applied to improve performance without negatively impacting -dnl # stability. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE], [ - ZFS_LINUX_TEST_SRC([config_thread_size], [ - #include <linux/module.h> - ],[ - #if (THREAD_SIZE < 16384) - #error "THREAD_SIZE is less than 16K" - #endif - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_CONFIG_THREAD_SIZE], [ - AC_MSG_CHECKING([whether kernel was built with 16K or larger stacks]) - ZFS_LINUX_TEST_RESULT([config_thread_size], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_LARGE_STACKS, 1, [kernel has large stacks]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) - -dnl # dnl # Check CONFIG_DEBUG_LOCK_ALLOC dnl # dnl # This is typically only set for debug kernels because it comes with diff --git a/sys/contrib/openzfs/config/user-libatomic.m4 b/sys/contrib/openzfs/config/user-libatomic.m4 index 14a60bbea9d0..d15069f9c445 100644 --- a/sys/contrib/openzfs/config/user-libatomic.m4 +++ b/sys/contrib/openzfs/config/user-libatomic.m4 @@ -1,33 +1,27 @@ dnl # -dnl # If -latomic exists, it's needed for __atomic intrinsics. -dnl # -dnl # Some systems (like FreeBSD 13) don't have a libatomic at all because -dnl # their toolchain doesn't ship it – they obviously don't need it. -dnl # -dnl # Others (like sufficiently ancient CentOS) have one, -dnl # but terminally broken or unlinkable (e.g. it's a dangling symlink, -dnl # or a linker script that points to a nonexistent file) – -dnl # most arches affected by this don't actually need -latomic (and if they do, -dnl # then they should have libatomic that actually exists and links, -dnl # so don't fall into this category). -dnl # -dnl # Technically, we could check if the platform *actually* needs -latomic, -dnl # or if it has native support for all the intrinsics we use, -dnl # but it /really/ doesn't matter, and C11 recommends to always link it. +dnl # If -latomic exists and atomic.c doesn't link without it, +dnl # it's needed for __atomic intrinsics. dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_LIBATOMIC], [ - AC_MSG_CHECKING([whether -latomic is present]) + AC_MSG_CHECKING([whether -latomic is required]) saved_libs="$LIBS" LIBS="$LIBS -latomic" + LIBATOMIC_LIBS="" AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])], [ - LIBATOMIC_LIBS="-latomic" + LIBS="$saved_libs" + saved_cflags="$CFLAGS" + CFLAGS="$CFLAGS -isystem lib/libspl/include" + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include "lib/libspl/atomic.c"], [])], [], [LIBATOMIC_LIBS="-latomic"]) + CFLAGS="$saved_cflags" + ]) + + if test -n "$LIBATOMIC_LIBS"; then AC_MSG_RESULT([yes]) - ], [ - LIBATOMIC_LIBS="" + else AC_MSG_RESULT([no]) - ]) + fi LIBS="$saved_libs" AC_SUBST([LIBATOMIC_LIBS]) diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h index 8dbe907d098c..a32eb52c53c1 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -41,6 +41,10 @@ #include <sys/ccompat.h> #include <linux/types.h> +#if KSTACK_PAGES * PAGE_SIZE >= 16384 +#define HAVE_LARGE_STACKS 1 +#endif + #define cond_resched() kern_yield(PRI_USER) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h index de7015b929b6..981a6b8a63e5 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h @@ -25,5 +25,10 @@ #include <linux/dcache_compat.h> #include <linux/utsname_compat.h> +#include <linux/module.h> + +#if THREAD_SIZE >= 16384 +#define HAVE_LARGE_STACKS 1 +#endif #endif diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h index f58fa53b6003..ef07a657f53c 100644 --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -44,7 +44,7 @@ extern "C" { * Used by arc_flush() to inform arc_evict_state() that it should evict * all available buffers from the arc state being passed in. */ -#define ARC_EVICT_ALL -1ULL +#define ARC_EVICT_ALL UINT64_MAX #define HDR_SET_LSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index 1f341ec94faf..747100a22068 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -964,6 +964,13 @@ typedef struct arc_evict_waiter { #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ +#define arc_anon (&ARC_anon) +#define arc_mru (&ARC_mru) +#define arc_mru_ghost (&ARC_mru_ghost) +#define arc_mfu (&ARC_mfu) +#define arc_mfu_ghost (&ARC_mfu_ghost) +#define arc_l2c_only (&ARC_l2c_only) + extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; extern arc_sums_t arc_sums; @@ -974,8 +981,8 @@ extern int arc_no_grow_shift; extern int arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; -extern arc_state_t *arc_mfu; -extern arc_state_t *arc_mru; +extern arc_state_t ARC_mfu; +extern arc_state_t ARC_mru; extern uint_t zfs_arc_pc_percent; extern int arc_lotsfree_percent; extern unsigned long zfs_arc_min; @@ -984,7 +991,6 @@ extern unsigned long zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); -extern boolean_t arc_is_overflowing(void); extern void arc_wait_for_eviction(uint64_t); extern void arc_lowmem_init(void); diff --git a/sys/contrib/openzfs/include/sys/dmu_tx.h b/sys/contrib/openzfs/include/sys/dmu_tx.h index 60e9ed6e26f5..71a9ac7ca7bf 100644 --- a/sys/contrib/openzfs/include/sys/dmu_tx.h +++ b/sys/contrib/openzfs/include/sys/dmu_tx.h @@ -124,6 +124,7 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; + kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/sys/contrib/openzfs/include/sys/dsl_dir.h b/sys/contrib/openzfs/include/sys/dsl_dir.h index 7cf5093c2c30..d635b3140423 100644 --- a/sys/contrib/openzfs/include/sys/dsl_dir.h +++ b/sys/contrib/openzfs/include/sys/dsl_dir.h @@ -174,6 +174,9 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); +void dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, + int64_t compressed, int64_t uncompressed, int64_t tonew, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, diff --git a/sys/contrib/openzfs/include/sys/dsl_pool.h b/sys/contrib/openzfs/include/sys/dsl_pool.h index 8249bb8fc633..44900f8ceb2f 100644 --- a/sys/contrib/openzfs/include/sys/dsl_pool.h +++ b/sys/contrib/openzfs/include/sys/dsl_pool.h @@ -40,6 +40,7 @@ #include <sys/rrwlock.h> #include <sys/dsl_synctask.h> #include <sys/mmp.h> +#include <sys/aggsum.h> #ifdef __cplusplus extern "C" { @@ -58,6 +59,7 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; +extern unsigned long zfs_wrlog_data_max; extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; @@ -119,6 +121,9 @@ typedef struct dsl_pool { uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; + aggsum_t dp_wrlog_pertxg[TXG_SIZE]; + aggsum_t dp_wrlog_total; + /* * Time of most recently scheduled (furthest in the future) * wakeup for delayed transactions. @@ -158,6 +163,8 @@ int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); +void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); +boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index 9924c3ba0eaa..adf4c03a20db 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator { */ uint64_t mca_alloc_max_slots; zfs_refcount_t mca_alloc_slots; -} metaslab_class_allocator_t; +} ____cacheline_aligned metaslab_class_allocator_t; /* * A metaslab class encompasses a category of allocatable top-level vdevs. diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 280f8cf16958..21729e617aca 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -57,6 +57,11 @@ extern "C" { #endif +typedef struct spa_alloc { + kmutex_t spaa_lock; + avl_tree_t spaa_tree; +} ____cacheline_aligned spa_alloc_t; + typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; @@ -250,13 +255,11 @@ struct spa { list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ /* - * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are - * stored in spa_alloc_count. There is one tree and one lock for each - * allocator, to help improve allocation performance in write-heavy - * workloads. + * spa_allocs is an array, whose lengths is stored in spa_alloc_count. + * There is one tree and one lock for each allocator, to help improve + * allocation performance in write-heavy workloads. */ - kmutex_t *spa_alloc_locks; - avl_tree_t *spa_alloc_trees; + spa_alloc_t *spa_allocs; int spa_alloc_count; spa_aux_vdev_t spa_spares; /* hot spares */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index c792cb65b67a..2d34481f6be6 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -572,8 +572,8 @@ extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); -extern void zio_execute(zio_t *zio); -extern void zio_interrupt(zio_t *zio); +extern void zio_execute(void *zio); +extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, char *tag); diff --git a/sys/contrib/openzfs/lib/libspl/include/os/freebsd/sys/zfs_context_os.h b/sys/contrib/openzfs/lib/libspl/include/os/freebsd/sys/zfs_context_os.h index f5a136d22125..b9bf487c2aef 100644 --- a/sys/contrib/openzfs/lib/libspl/include/os/freebsd/sys/zfs_context_os.h +++ b/sys/contrib/openzfs/lib/libspl/include/os/freebsd/sys/zfs_context_os.h @@ -29,6 +29,7 @@ #ifndef ZFS_CONTEXT_OS_H_ #define ZFS_CONTEXT_OS_H_ +#define HAVE_LARGE_STACKS 1 #define ZFS_EXPORTS_PATH "/etc/zfs/exports" #endif diff --git a/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/zfs_context_os.h b/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/zfs_context_os.h index 008e57df4eae..81ced5207749 100644 --- a/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/zfs_context_os.h +++ b/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/zfs_context_os.h @@ -22,4 +22,7 @@ #ifndef ZFS_CONTEXT_OS_H #define ZFS_CONTEXT_OS_H + +#define HAVE_LARGE_STACKS 1 + #endif diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_impl.h b/sys/contrib/openzfs/lib/libzfs/libzfs_impl.h index ce7373582f0e..b1cf4f825f8a 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_impl.h +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_impl.h @@ -243,7 +243,8 @@ extern proto_table_t proto_table[PROTO_END]; extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags); -extern int do_unmount(const char *mntpt, int flags); +extern int do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags); +extern int zfs_mount_delegation_check(void); extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto); extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); extern int unshare_one(libzfs_handle_t *hdl, const char *name, diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c index b074a6e6f371..5729f120e9df 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c @@ -568,11 +568,11 @@ zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags, * Unmount a single filesystem. */ static int -unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) +unmount_one(zfs_handle_t *zhp, const char *mountpoint, int flags) { int error; - error = do_unmount(mountpoint, flags); + error = do_unmount(zhp, mountpoint, flags); if (error != 0) { int libzfs_err; @@ -595,7 +595,7 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) default: libzfs_err = EZFS_UMOUNTFAILED; } - return (zfs_error_fmt(hdl, libzfs_err, + return (zfs_error_fmt(zhp->zfs_hdl, libzfs_err, dgettext(TEXT_DOMAIN, "cannot unmount '%s'"), mountpoint)); } @@ -637,7 +637,7 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) } zfs_commit_all_shares(); - if (unmount_one(hdl, mntpt, flags) != 0) { + if (unmount_one(zhp, mntpt, flags) != 0) { free(mntpt); (void) zfs_shareall(zhp); zfs_commit_all_shares(); @@ -1503,13 +1503,18 @@ out: return (ret); } +struct sets_s { + char *mountpoint; + zfs_handle_t *dataset; +}; + static int mountpoint_compare(const void *a, const void *b) { - const char *mounta = *((char **)a); - const char *mountb = *((char **)b); + const struct sets_s *mounta = (struct sets_s *)a; + const struct sets_s *mountb = (struct sets_s *)b; - return (strcmp(mountb, mounta)); + return (strcmp(mountb->mountpoint, mounta->mountpoint)); } /* @@ -1526,8 +1531,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) FILE *mnttab; struct mnttab entry; size_t namelen; - char **mountpoints = NULL; - zfs_handle_t **datasets = NULL; + struct sets_s *sets = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; int i; int ret = -1; @@ -1562,35 +1566,27 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) */ if (used == alloc) { if (alloc == 0) { - if ((mountpoints = zfs_alloc(hdl, - 8 * sizeof (void *))) == NULL) - goto out; - if ((datasets = zfs_alloc(hdl, - 8 * sizeof (void *))) == NULL) + if ((sets = zfs_alloc(hdl, + 8 * sizeof (struct sets_s))) == NULL) goto out; alloc = 8; } else { void *ptr; - if ((ptr = zfs_realloc(hdl, mountpoints, - alloc * sizeof (void *), - alloc * 2 * sizeof (void *))) == NULL) - goto out; - mountpoints = ptr; - - if ((ptr = zfs_realloc(hdl, datasets, - alloc * sizeof (void *), - alloc * 2 * sizeof (void *))) == NULL) + if ((ptr = zfs_realloc(hdl, sets, + alloc * sizeof (struct sets_s), + alloc * 2 * sizeof (struct sets_s))) + == NULL) goto out; - datasets = ptr; + sets = ptr; alloc *= 2; } } - if ((mountpoints[used] = zfs_strdup(hdl, + if ((sets[used].mountpoint = zfs_strdup(hdl, entry.mnt_mountp)) == NULL) goto out; @@ -1599,7 +1595,8 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) * is only used to determine if we need to remove the underlying * mountpoint, so failure is not fatal. */ - datasets[used] = make_dataset_handle(hdl, entry.mnt_special); + sets[used].dataset = make_dataset_handle(hdl, + entry.mnt_special); used++; } @@ -1608,7 +1605,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) * At this point, we have the entire list of filesystems, so sort it by * mountpoint. */ - qsort(mountpoints, used, sizeof (char *), mountpoint_compare); + qsort(sets, used, sizeof (struct sets_s), mountpoint_compare); /* * Walk through and first unshare everything. @@ -1617,9 +1614,9 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) zfs_share_proto_t *curr_proto; for (curr_proto = share_all_proto; *curr_proto != PROTO_END; curr_proto++) { - if (is_shared(mountpoints[i], *curr_proto) && - unshare_one(hdl, mountpoints[i], - mountpoints[i], *curr_proto) != 0) + if (is_shared(sets[i].mountpoint, *curr_proto) && + unshare_one(hdl, sets[i].mountpoint, + sets[i].mountpoint, *curr_proto) != 0) goto out; } } @@ -1630,25 +1627,25 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) * appropriate. */ for (i = 0; i < used; i++) { - if (unmount_one(hdl, mountpoints[i], flags) != 0) + if (unmount_one(sets[i].dataset, sets[i].mountpoint, + flags) != 0) goto out; } for (i = 0; i < used; i++) { - if (datasets[i]) - remove_mountpoint(datasets[i]); + if (sets[i].dataset) + remove_mountpoint(sets[i].dataset); } ret = 0; out: (void) fclose(mnttab); for (i = 0; i < used; i++) { - if (datasets[i]) - zfs_close(datasets[i]); - free(mountpoints[i]); + if (sets[i].dataset) + zfs_close(sets[i].dataset); + free(sets[i].mountpoint); } - free(datasets); - free(mountpoints); + free(sets); return (ret); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c index 5c57028c4013..36a480d36a05 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c @@ -2391,7 +2391,6 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, int err; libzfs_handle_t *hdl = zhp->zfs_hdl; char *name = zhp->zfs_name; - int orig_fd = fd; pthread_t ptid; progress_arg_t pa = { 0 }; @@ -2523,7 +2522,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, if (flags->props || flags->holds || flags->backup) { /* Write the final end record. */ - err = send_conclusion_record(orig_fd, NULL); + err = send_conclusion_record(fd, NULL); if (err != 0) return (zfs_standard_error(hdl, err, errbuf)); } diff --git a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c index e1febe6a2d91..6bc073cb03b9 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c +++ b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -121,7 +121,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) } int -do_unmount(const char *mntpt, int flags) +do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags) { if (unmount(mntpt, flags) < 0) return (errno); diff --git a/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c b/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c index 547895d7e370..42f300b36c91 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c +++ b/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c @@ -374,7 +374,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) } int -do_unmount(const char *mntpt, int flags) +do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags) { if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { int rv = umount2(mntpt, flags); diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 6da8d42b42bd..9a1dec3e67ca 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -712,20 +712,22 @@ equivalent to the greater of the number of online CPUs and The ARC size is considered to be overflowing if it exceeds the current ARC target size .Pq Sy arc_c -by a threshold determined by this parameter. -The threshold is calculated as a fraction of -.Sy arc_c -using the formula -.Sy arc_c >> zfs_arc_overflow_shift . +by thresholds determined by this parameter. +Exceeding by +.Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5 +starts ARC reclamation process. +If that appears insufficient, exceeding by +.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5 +blocks new buffer allocation until the reclaim thread catches up. +Started reclamation process continues till ARC size returns below the +target size. .Pp The default value of .Sy 8 -causes the ARC to be considered overflowing if it exceeds the target size by -.Em 1/256th Pq Em 0.3% -of the target size. -.Pp -When the ARC is overflowing, new buffer allocations are stalled until -the reclaim thread catches up and the overflow condition no longer exists. +causes the ARC to start reclamation if it exceeds the target size by +.Em 0.2% +of the target size, and block allocations by +.Em 0.6% . . .It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int If nonzero, this will update @@ -1066,6 +1068,18 @@ Start syncing out a transaction group if there's at least this much dirty data This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . +.It Sy zfs_wrlog_data_max Ns = Pq int +The upper limit of write-transaction zil log data size in bytes. +Once it is reached, write operation is blocked, until log data is cleared out +after transaction group sync. Because of some overhead, it should be set +at least 2 times the size of +.Sy zfs_dirty_data_max +.No to prevent harming normal write throughput. +It also should be smaller than the size of the slog device if slog is present. +.Pp +Defaults to +.Sy zfs_dirty_data_max*2 +. .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index a3d08fbf6e2c..688bd033979a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -44,7 +44,7 @@ .Ar snapshot .Nm zfs .Cm send -.Op Fl DLPRcenpsvw +.Op Fl DLPcensvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs @@ -285,7 +285,7 @@ You will be able to receive your streams on future versions of ZFS. .It Xo .Nm zfs .Cm send -.Op Fl DLPRcenpvw +.Op Fl DLPcenvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc @@ -296,7 +296,11 @@ filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Qq --head-- . -.Bl -tag -width "-L" +.Bl -tag -width "-D" +.It Fl D , -dedup +Deduplicated send is no longer supported. +This flag is accepted for backwards compatibility, but a regular, +non-deduplicated stream will be generated. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the diff --git a/sys/contrib/openzfs/module/nvpair/nvpair.c b/sys/contrib/openzfs/module/nvpair/nvpair.c index 990a4482c993..5f427c8cf2e7 100644 --- a/sys/contrib/openzfs/module/nvpair/nvpair.c +++ b/sys/contrib/openzfs/module/nvpair/nvpair.c @@ -3214,6 +3214,56 @@ nvs_xdr_nvl_fini(nvstream_t *nvs) } /* + * xdrproc_t-compatible callbacks for xdr_array() + */ + +#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, void *ptr) \ +{ \ + return (xdr_##type(xdrs, ptr)); \ +} + +#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, ...) \ +{ \ + va_list args; \ + void *ptr; \ + \ + va_start(args, xdrs); \ + ptr = va_arg(args, void *); \ + va_end(args); \ + \ + return (xdr_##type(xdrs, ptr)); \ +} + +#else /* FreeBSD, sunrpc */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) \ +{ \ + return (xdr_##type(xdrs, ptr)); \ +} + +#endif + +/* BEGIN CSTYLED */ +NVS_BUILD_XDRPROC_T(char); +NVS_BUILD_XDRPROC_T(short); +NVS_BUILD_XDRPROC_T(u_short); +NVS_BUILD_XDRPROC_T(int); +NVS_BUILD_XDRPROC_T(u_int); +NVS_BUILD_XDRPROC_T(longlong_t); +NVS_BUILD_XDRPROC_T(u_longlong_t); +/* END CSTYLED */ + +/* * The format of xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data */ @@ -3335,38 +3385,38 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), - (xdrproc_t)xdr_char); + nvs_xdr_nvp_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), - sizeof (int16_t), (xdrproc_t)xdr_short); + sizeof (int16_t), nvs_xdr_nvp_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), - sizeof (uint16_t), (xdrproc_t)xdr_u_short); + sizeof (uint16_t), nvs_xdr_nvp_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), - sizeof (int32_t), (xdrproc_t)xdr_int); + sizeof (int32_t), nvs_xdr_nvp_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), - sizeof (uint32_t), (xdrproc_t)xdr_u_int); + sizeof (uint32_t), nvs_xdr_nvp_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), - sizeof (int64_t), (xdrproc_t)xdr_longlong_t); + sizeof (int64_t), nvs_xdr_nvp_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), - sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); + sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c index 05377bb7ed98..3b8b11cff0c2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c @@ -234,8 +234,6 @@ arc_lowmem(void *arg __unused, int howto __unused) */ if (curproc == pageproc) arc_wait_for_eviction(to_free); - else - arc_wait_for_eviction(0); } void diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c index b1407e4bd61d..6ac37da1c58a 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c @@ -381,7 +381,11 @@ vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, int i, n_bios, j; size_t bios_size; +#if __FreeBSD_version > 1300130 maxio = maxphys - (maxphys % cp->provider->sectorsize); +#else + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); +#endif n_bios = 0; /* How many bios are required for all commands ? */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 46a632b0385c..846b4b60531f 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -5343,7 +5343,7 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) - return (error); + return (SET_ERROR(error)); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); @@ -5374,15 +5374,17 @@ zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname) error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname, &nv_value, &nv_size); - if (error) - return (error); + if (error != 0) + return (SET_ERROR(error)); if (ap->a_size != NULL) *ap->a_size = nv_size; else if (ap->a_uio != NULL) error = uiomove(nv_value, nv_size, ap->a_uio); + if (error != 0) + return (SET_ERROR(error)); - return (error); + return (0); } /* @@ -5405,7 +5407,7 @@ zfs_getextattr(struct vop_getextattr_args *ap) error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) - return (error); + return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); @@ -5456,7 +5458,7 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) vp = nd.ni_vp; if (error != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); - return (error); + return (SET_ERROR(error)); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); @@ -5487,7 +5489,9 @@ zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname) nvl = zp->z_xattr_cached; error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY); - if (error == 0) + if (error != 0) + error = SET_ERROR(error); + else error = zfs_sa_set_xattr(zp); if (error != 0) { zp->z_xattr_cached = NULL; @@ -5516,7 +5520,7 @@ zfs_deleteextattr(struct vop_deleteextattr_args *ap) error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) - return (error); + return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); @@ -5583,7 +5587,7 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) - return (error); + return (SET_ERROR(error)); VATTR_NULL(&va); va.va_size = 0; @@ -5617,13 +5621,18 @@ zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname) return (SET_ERROR(EFBIG)); error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error != 0) - return (error); + return (SET_ERROR(error)); if (sa_size > DXATTR_MAX_SA_SIZE) return (SET_ERROR(EFBIG)); uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP); error = uiomove(buf, entry_size, ap->a_uio); - if (error == 0) + if (error != 0) { + error = SET_ERROR(error); + } else { error = nvlist_add_byte_array(nvl, attrname, buf, entry_size); + if (error != 0) + error = SET_ERROR(error); + } kmem_free(buf, entry_size); if (error == 0) error = zfs_sa_set_xattr(zp); @@ -5654,7 +5663,7 @@ zfs_setextattr(struct vop_setextattr_args *ap) error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) - return (error); + return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); @@ -5733,7 +5742,7 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) - return (error); + return (SET_ERROR(error)); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; @@ -5779,8 +5788,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) char *namep = dp->d_name + plen; error = uiomove(namep, nlen, ap->a_uio); } - if (error != 0) + if (error != 0) { + error = SET_ERROR(error); break; + } } } } while (!eof && error == 0); @@ -5825,8 +5836,10 @@ zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix) char *namep = __DECONST(char *, name) + plen; error = uiomove(namep, nlen, ap->a_uio); } - if (error != 0) + if (error != 0) { + error = SET_ERROR(error); break; + } } } @@ -5856,7 +5869,7 @@ zfs_listextattr(struct vop_listextattr_args *ap) error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) - return (error); + return (SET_ERROR(error)); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof (attrprefix)); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index aecb9f4c7d87..450369192569 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -1241,7 +1241,11 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) args.mda_si_drv2 = zv; if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) == 0) { +#if __FreeBSD_version > 1300130 dev->si_iosize_max = maxphys; +#else + dev->si_iosize_max = MAXPHYS; +#endif zsd->zsd_cdev = dev; } } @@ -1277,9 +1281,10 @@ zvol_free(zvol_state_t *zv) struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; - ASSERT3P(dev->si_drv2, ==, NULL); - - destroy_dev(dev); + if (dev != NULL) { + ASSERT3P(dev->si_drv2, ==, NULL); + destroy_dev(dev); + } } mutex_destroy(&zv->zv_state_lock); @@ -1374,16 +1379,15 @@ zvol_create_minor_impl(const char *name) args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; - error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); - if (error) { - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); - mutex_destroy(&zv->zv_state_lock); - kmem_free(zv, sizeof (*zv)); - dmu_objset_disown(os, B_TRUE, FTAG); - goto out_doi; + if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) + == 0) { +#if __FreeBSD_version > 1300130 + dev->si_iosize_max = maxphys; +#else + dev->si_iosize_max = MAXPHYS; +#endif + zsd->zsd_cdev = dev; } - dev->si_iosize_max = maxphys; - zsd->zsd_cdev = dev; } (void) strlcpy(zv->zv_name, name, MAXPATHLEN); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); @@ -1456,7 +1460,8 @@ zvol_clear_private(zvol_state_t *zv) struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; - dev->si_drv2 = NULL; + if (dev != NULL) + dev->si_drv2 = NULL; } } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 24c016c5fcf1..e0dc6ed95747 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -367,6 +367,12 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len, return (error); } +static void +zfs_rele_async_task(void *arg) +{ + iput(arg); +} + void zfs_zrele_async(znode_t *zp) { @@ -386,7 +392,7 @@ zfs_zrele_async(znode_t *zp) */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), - (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); + zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 524c43dcded4..0319148b983d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -591,8 +591,8 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ -static int -zpl_readpage(struct file *filp, struct page *pp) +static inline int +zpl_readpage_common(struct page *pp) { struct inode *ip; struct page *pl[1]; @@ -620,6 +620,18 @@ zpl_readpage(struct file *filp, struct page *pp) return (error); } +static int +zpl_readpage(struct file *filp, struct page *pp) +{ + return (zpl_readpage_common(pp)); +} + +static int +zpl_readpage_filler(void *data, struct page *pp) +{ + return (zpl_readpage_common(pp)); +} + /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand @@ -630,8 +642,7 @@ static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return (read_cache_pages(mapping, pages, - (filler_t *)zpl_readpage, filp)); + return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } static int diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 394ca1bfe42d..02663e8e2e5d 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -648,13 +648,6 @@ arc_sums_t arc_sums; } while (0) kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu_ghost; -static arc_state_t *arc_l2c_only; - -arc_state_t *arc_mru; -arc_state_t *arc_mfu; /* * There are several ARC variables that are critical to export as kstats -- @@ -826,6 +819,12 @@ typedef enum arc_fill_flags { ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; +typedef enum arc_ovf_level { + ARC_OVF_NONE, /* ARC within target size. */ + ARC_OVF_SOME, /* ARC is slightly overflowed. */ + ARC_OVF_SEVERE /* ARC is severely overflowed. */ +} arc_ovf_level_t; + static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; @@ -2197,7 +2196,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) return; } - ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); @@ -2238,7 +2236,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) return; } - ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); @@ -3861,9 +3858,18 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted + * + * Return total size of evicted data buffers for eviction progress tracking. + * When evicting from ghost states return logical buffer size to make eviction + * progress at the same (or at least comparable) rate as from non-ghost states. + * + * Return *real_evicted for actual ARC size reduction to wake up threads + * waiting for it. For non-ghost states it includes size of evicted data + * buffers (the headers are not freed there). For ghost states it includes + * only the evicted headers size. */ static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; @@ -3873,6 +3879,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); + *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); @@ -3909,9 +3916,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); + *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); + *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } @@ -3935,8 +3944,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ARCSTAT_BUMP(arcstat_mutex_miss); break; } - if (buf->b_data != NULL) + if (buf->b_data != NULL) { bytes_evicted += HDR_GET_LSIZE(hdr); + *real_evicted += HDR_GET_LSIZE(hdr); + } mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } @@ -3972,6 +3983,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); + *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed @@ -4010,23 +4022,21 @@ arc_set_need_free(void) static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, int64_t bytes) + uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; - uint64_t bytes_evicted = 0; + uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - int evict_count = 0; + int evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); - for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { - if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || - (evict_count >= zfs_arc_evict_batch_limit)) + if ((evict_count <= 0) || (bytes_evicted >= bytes)) break; /* @@ -4074,10 +4084,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { - uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + uint64_t revicted; + uint64_t evicted = arc_evict_hdr(hdr, hash_lock, + &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; + real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have @@ -4085,7 +4098,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * evict_count in this case. */ if (evicted != 0) - evict_count++; + evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); @@ -4107,7 +4120,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); - arc_evict_count += bytes_evicted; + arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; @@ -4146,7 +4159,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, +arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; @@ -4154,8 +4167,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, int num_sublists; arc_buf_hdr_t **markers; - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - num_sublists = multilist_get_num_sublists(ml); /* @@ -4187,7 +4198,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ - while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; @@ -4215,9 +4226,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, uint64_t bytes_remaining; uint64_t bytes_evicted; - if (bytes == ARC_EVICT_ALL) - bytes_remaining = ARC_EVICT_ALL; - else if (total_evicted < bytes) + if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; @@ -4312,7 +4321,7 @@ static uint64_t arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { - int64_t delta; + uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), @@ -5121,7 +5130,7 @@ arc_adapt(int bytes, arc_state_t *state) * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ -boolean_t +static arc_ovf_level_t arc_is_overflowing(void) { /* Always allow at least one block of overflow */ @@ -5137,8 +5146,10 @@ arc_is_overflowing(void) * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ - return (aggsum_lower_bound(&arc_sums.arcstat_size) >= - (int64_t)arc_c + overflow); + int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - + arc_c - overflow / 2; + return (over < 0 ? ARC_OVF_NONE : + over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * @@ -5180,58 +5191,73 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) void arc_wait_for_eviction(uint64_t amount) { - mutex_enter(&arc_evict_lock); - if (arc_is_overflowing()) { - arc_evict_needed = B_TRUE; - zthr_wakeup(arc_evict_zthr); - - if (amount != 0) { - arc_evict_waiter_t aw; - list_link_init(&aw.aew_node); - cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); + switch (arc_is_overflowing()) { + case ARC_OVF_NONE: + return; + case ARC_OVF_SOME: + /* + * This is a bit racy without taking arc_evict_lock, but the + * worst that can happen is we either call zthr_wakeup() extra + * time due to race with other thread here, or the set flag + * get cleared by arc_evict_cb(), which is unlikely due to + * big hysteresis, but also not important since at this level + * of overflow the eviction is purely advisory. Same time + * taking the global lock here every time without waiting for + * the actual eviction creates a significant lock contention. + */ + if (!arc_evict_needed) { + arc_evict_needed = B_TRUE; + zthr_wakeup(arc_evict_zthr); + } + return; + case ARC_OVF_SEVERE: + default: + { + arc_evict_waiter_t aw; + list_link_init(&aw.aew_node); + cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); - uint64_t last_count = 0; - if (!list_is_empty(&arc_evict_waiters)) { - arc_evict_waiter_t *last = - list_tail(&arc_evict_waiters); - last_count = last->aew_count; - } - /* - * Note, the last waiter's count may be less than - * arc_evict_count if we are low on memory in which - * case arc_evict_state_impl() may have deferred - * wakeups (but still incremented arc_evict_count). - */ - aw.aew_count = - MAX(last_count, arc_evict_count) + amount; + uint64_t last_count = 0; + mutex_enter(&arc_evict_lock); + if (!list_is_empty(&arc_evict_waiters)) { + arc_evict_waiter_t *last = + list_tail(&arc_evict_waiters); + last_count = last->aew_count; + } else if (!arc_evict_needed) { + arc_evict_needed = B_TRUE; + zthr_wakeup(arc_evict_zthr); + } + /* + * Note, the last waiter's count may be less than + * arc_evict_count if we are low on memory in which + * case arc_evict_state_impl() may have deferred + * wakeups (but still incremented arc_evict_count). + */ + aw.aew_count = MAX(last_count, arc_evict_count) + amount; - list_insert_tail(&arc_evict_waiters, &aw); + list_insert_tail(&arc_evict_waiters, &aw); - arc_set_need_free(); + arc_set_need_free(); - DTRACE_PROBE3(arc__wait__for__eviction, - uint64_t, amount, - uint64_t, arc_evict_count, - uint64_t, aw.aew_count); + DTRACE_PROBE3(arc__wait__for__eviction, + uint64_t, amount, + uint64_t, arc_evict_count, + uint64_t, aw.aew_count); - /* - * We will be woken up either when arc_evict_count - * reaches aew_count, or when the ARC is no longer - * overflowing and eviction completes. - */ + /* + * We will be woken up either when arc_evict_count reaches + * aew_count, or when the ARC is no longer overflowing and + * eviction completes. + * In case of "false" wakeup, we will still be on the list. + */ + do { cv_wait(&aw.aew_cv, &arc_evict_lock); + } while (list_link_active(&aw.aew_node)); + mutex_exit(&arc_evict_lock); - /* - * In case of "false" wakeup, we will still be on the - * list. - */ - if (list_link_active(&aw.aew_node)) - list_remove(&arc_evict_waiters, &aw); - - cv_destroy(&aw.aew_cv); - } + cv_destroy(&aw.aew_cv); + } } - mutex_exit(&arc_evict_lock); } /* @@ -5262,16 +5288,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. - * - * We do the overflowing check without holding the arc_evict_lock to - * reduce lock contention in this hot path. Note that - * arc_wait_for_eviction() will acquire the lock and check again to - * ensure we are truly overflowing before blocking. */ - if (arc_is_overflowing()) { - arc_wait_for_eviction(size * - zfs_arc_eviction_pct / 100); - } + arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -7563,13 +7581,6 @@ arc_tuning_update(boolean_t verbose) static void arc_state_init(void) { - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), @@ -7969,6 +7980,18 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + if (zfs_wrlog_data_max == 0) { + + /* + * dp_wrlog_total is reduced for each txg at the end of + * spa_sync(). However, dp_dirty_total is reduced every time + * a block is written out. Thus under normal operation, + * dp_wrlog_total could grow 2 times as big as + * zfs_dirty_data_max. + */ + zfs_wrlog_data_max = zfs_dirty_data_max * 2; + } } void diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index 7b0b1d896761..479e5a3ad625 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -503,7 +503,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { ddt_histogram_add(ddh, diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index 62c7d01d4bd2..fdbdf7d6e868 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -816,6 +816,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_remove(&end_tree, &redact_nodes[i]); kmem_free(redact_nodes[i].record, sizeof (struct redact_record)); + bqueue_destroy(&thread_args[i].q); } avl_destroy(&start_tree); @@ -1164,6 +1165,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc, TS_RUN, minclsyspri); err = perform_redaction(os, new_rl, rmta); + bqueue_destroy(&rmta->q); kmem_free(rmta, sizeof (struct redact_merge_thread_arg)); out: diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 0beb983f992f..5fa516866668 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -885,6 +886,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } if (!tx->tx_dirty_delayed && + dsl_pool_wrlog_over_max(tx->tx_pool)) { + DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + return (SET_ERROR(ERESTART)); + } + + if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 1c03216ef6d5..f99964511aa6 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -192,9 +192,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) } mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, - compressed, uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, used - delta, + dsl_dir_diduse_transfer_space(ds->ds_dir, delta, + compressed, uncompressed, used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } @@ -291,9 +290,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - delta, -compressed, -uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, -used - delta, + dsl_dir_diduse_transfer_space(ds->ds_dir, + delta, -compressed, -uncompressed, -used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index df2c3d8f0637..84caace4dbab 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -1517,6 +1517,11 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, { int64_t accounted_delta; + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update @@ -1525,36 +1530,28 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, * consistently. */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(type < DD_USED_NUM); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - if (needlock) mutex_enter(&dd->dd_lock); - accounted_delta = - parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); - ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); - ASSERT(compressed >= 0 || - dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); + dsl_dir_phys_t *ddp = dsl_dir_phys(dd); + accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); + ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || - dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); - dsl_dir_phys(dd)->dd_used_bytes += used; - dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; - dsl_dir_phys(dd)->dd_compressed_bytes += compressed; - - if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - ASSERT(used > 0 || - dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); - dsl_dir_phys(dd)->dd_used_breakdown[type] += used; + ddp->dd_uncompressed_bytes >= -uncompressed); + ddp->dd_used_bytes += used; + ddp->dd_uncompressed_bytes += uncompressed; + ddp->dd_compressed_bytes += compressed; + + if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used); + ddp->dd_used_breakdown[type] += used; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) - u += dsl_dir_phys(dd)->dd_used_breakdown[t]; - ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); + u += ddp->dd_used_breakdown[t]; + ASSERT3U(u, ==, ddp->dd_used_bytes); } #endif } @@ -1562,11 +1559,9 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - accounted_delta, compressed, uncompressed, tx); - dsl_dir_transfer_space(dd->dd_parent, - used - accounted_delta, - DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + dsl_dir_diduse_transfer_space(dd->dd_parent, + accounted_delta, compressed, uncompressed, + used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } @@ -1578,21 +1573,72 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); + dsl_dir_phys_t *ddp = dsl_dir_phys(dd); if (delta == 0 || - !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) + !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : - dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); - ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; - dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; + ddp->dd_used_breakdown[oldtype] >= delta : + ddp->dd_used_breakdown[newtype] >= -delta); + ASSERT(ddp->dd_used_bytes >= ABS(delta)); + ddp->dd_used_breakdown[oldtype] -= delta; + ddp->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } +void +dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, + int64_t compressed, int64_t uncompressed, int64_t tonew, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + int64_t accounted_delta; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + dsl_dir_phys_t *ddp = dsl_dir_phys(dd); + accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); + ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + ddp->dd_uncompressed_bytes >= -uncompressed); + ddp->dd_used_bytes += used; + ddp->dd_uncompressed_bytes += uncompressed; + ddp->dd_compressed_bytes += compressed; + + if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(tonew - used <= 0 || + ddp->dd_used_breakdown[oldtype] >= tonew - used); + ASSERT(tonew >= 0 || + ddp->dd_used_breakdown[newtype] >= -tonew); + ddp->dd_used_breakdown[oldtype] -= tonew - used; + ddp->dd_used_breakdown[newtype] += tonew; +#ifdef ZFS_DEBUG + { + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += ddp->dd_used_breakdown[t]; + ASSERT3U(u, ==, ddp->dd_used_bytes); + } +#endif + } + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_transfer_space(dd->dd_parent, + accounted_delta, compressed, uncompressed, + used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + } +} + typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 72f4b86d772e..1350f1329564 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -105,6 +105,14 @@ int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; /* + * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. + * Once it is reached, write operation is blocked, + * until log data is cleared out after txg sync. + * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. + */ +unsigned long zfs_wrlog_data_max = 0; + +/* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. @@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); + aggsum_init(&dp->dp_wrlog_total, 0); + for (int i = 0; i < TXG_SIZE; i++) { + aggsum_init(&dp->dp_wrlog_pertxg[i], 0); + } + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); @@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); + + ASSERT0(aggsum_value(&dp->dp_wrlog_total)); + aggsum_fini(&dp->dp_wrlog_total); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); + aggsum_fini(&dp->dp_wrlog_pertxg[i]); + } + taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) { @@ -592,6 +613,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +void +dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) +{ + ASSERT3S(size, >=, 0); + + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); + aggsum_add(&dp->dp_wrlog_total, size); + + /* Choose a value slightly bigger than min dirty sync bytes */ + uint64_t sync_min = + zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) + txg_kick(dp, txg); +} + +boolean_t +dsl_pool_wrlog_over_max(dsl_pool_t *dp) +{ + return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); +} + +static void +dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) +{ + int64_t delta; + delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); + aggsum_add(&dp->dp_wrlog_total, delta); +} + #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) @@ -816,6 +867,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } + + dsl_pool_wrlog_clear(dp, txg); + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -1405,6 +1459,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, "Determines the dirty space limit"); +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, + "The size limit of write-transaction zil log data"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 23f3e2989ae7..93d409ceb433 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -5611,19 +5611,11 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - uint64_t available_slots = 0; - boolean_t slot_reserved = B_FALSE; uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - - uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots); - if (reserved_slots < max) - available_slots = max - reserved_slots; - - if (slots <= available_slots || GANG_ALLOCATION(flags) || - flags & METASLAB_MUST_RESERVE) { + if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || + zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. @@ -5631,11 +5623,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, for (int d = 0; d < slots; d++) zfs_refcount_add(&mca->mca_alloc_slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; - slot_reserved = B_TRUE; + return (B_TRUE); } - - mutex_exit(&mc->mc_lock); - return (slot_reserved); + return (B_FALSE); } void @@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) zfs_refcount_remove(&mca->mca_alloc_slots, zio); - mutex_exit(&mc->mc_lock); } static int diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index f6dce076d136..2a4db7d562b6 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -9197,9 +9197,9 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); + mutex_enter(&spa->spa_allocs[i].spaa_lock); + VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); + mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* @@ -9309,9 +9309,9 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); + mutex_enter(&spa->spa_allocs[i].spaa_lock); + VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); + mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 157dede93cfc..58039f3d103c 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -700,13 +700,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_root = spa_strdup(altroot); spa->spa_alloc_count = spa_allocators; - spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * - sizeof (kmutex_t), KM_SLEEP); - spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * - sizeof (avl_tree_t), KM_SLEEP); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * + sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); - avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, + NULL); + avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, @@ -799,13 +798,11 @@ spa_remove(spa_t *spa) } for (int i = 0; i < spa->spa_alloc_count; i++) { - avl_destroy(&spa->spa_alloc_trees[i]); - mutex_destroy(&spa->spa_alloc_locks[i]); + avl_destroy(&spa->spa_allocs[i].spaa_tree); + mutex_destroy(&spa->spa_allocs[i].spaa_lock); } - kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * - sizeof (kmutex_t)); - kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * - sizeof (avl_tree_t)); + kmem_free(spa->spa_allocs, spa->spa_alloc_count * + sizeof (spa_alloc_t)); avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); @@ -1786,8 +1783,22 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) uint64_t spa_get_slop_space(spa_t *spa) { - uint64_t space = spa_get_dspace(spa); - uint64_t slop = MIN(space >> spa_slop_shift, spa_max_slop); + uint64_t space = 0; + uint64_t slop = 0; + + /* + * Make sure spa_dedup_dspace has been set. + */ + if (spa->spa_dedup_dspace == ~0ULL) + spa_update_dspace(spa); + + /* + * spa_get_dspace() includes the space only logically "used" by + * deduplicated data, so since it's not useful to reserve more + * space with more deduplicated data, we subtract that out here. + */ + space = spa_get_dspace(spa) - spa->spa_dedup_dspace; + slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c index 4e0fa0d85cbf..c6ade59b9ced 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_synctask.c +++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c @@ -54,6 +54,12 @@ typedef struct zcp_synctask_info { int blocks_modified; } zcp_synctask_info_t; +static void +zcp_synctask_cleanup(void *arg) +{ + fnvlist_free(arg); +} + /* * Generic synctask interface for channel program syncfuncs. * @@ -275,7 +281,7 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); + zcp_synctask_cleanup, ddsa.ddsa_snaps); err = zcp_sync_task(state, dsl_dataset_snapshot_check, dsl_dataset_snapshot_sync, &ddsa, sync, dsname); @@ -363,7 +369,7 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, fnvlist_add_boolean(dpsa->dpsa_props, prop); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props); + zcp_synctask_cleanup, dpsa->dpsa_props); err = zcp_sync_task(state, zcp_synctask_inherit_prop_check, zcp_synctask_inherit_prop_sync, &zipa, sync, dsname); @@ -402,7 +408,7 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) fnvlist_add_string(bmarks, new, source); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, bmarks); + zcp_synctask_cleanup, bmarks); dsl_bookmark_create_arg_t dbca = { .dbca_bmarks = bmarks, @@ -467,8 +473,7 @@ zcp_synctask_wrapper(lua_State *state) * Make sure err_details is properly freed, even if a fatal error is * thrown during the synctask. */ - zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, err_details); + zch = zcp_register_cleanup(state, zcp_synctask_cleanup, err_details); zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c index 30d5c4821ae5..0f330ec933aa 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_log.c +++ b/sys/contrib/openzfs/module/zfs/zfs_log.c @@ -541,6 +541,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; uintptr_t fsync_cnt; uint64_t gen = 0; + ssize_t size = resid; if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { @@ -626,6 +627,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, off += len; resid -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg); + } } /* diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 78d0711cce4e..d8d39f861c75 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1822,12 +1822,13 @@ zil_itx_destroy(itx_t *itx) * so no locks are needed. */ static void -zil_itxg_clean(itxs_t *itxs) +zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; + itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; @@ -2047,7 +2048,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, - (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP); + zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index e33d36dab5f9..76ed4fad4304 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { - if (zio->io_metaslab_class == NULL) - zio->io_metaslab_class = pio->io_metaslab_class; + zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) @@ -1891,8 +1890,8 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, - flags, &zio->io_tqent); + spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, + &zio->io_tqent); } static boolean_t @@ -1923,7 +1922,7 @@ zio_issue_async(zio_t *zio) } void -zio_interrupt(zio_t *zio) +zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } @@ -1981,8 +1980,8 @@ zio_delay_interrupt(zio_t *zio) * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, - (task_func_t *)zio_interrupt, - zio, TQ_NOSLEEP, expire_at_tick); + zio_interrupt, zio, TQ_NOSLEEP, + expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just @@ -2103,7 +2102,7 @@ static zio_pipe_stage_t *zio_pipeline[]; * it is externally visible. */ void -zio_execute(zio_t *zio) +zio_execute(void *zio) { fstrans_cookie_t cookie; @@ -2292,8 +2291,9 @@ zio_nowait(zio_t *zio) */ static void -zio_reexecute(zio_t *pio) +zio_reexecute(void *arg) { + zio_t *pio = arg; zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); @@ -3379,9 +3379,9 @@ zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); + ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); - zio = avl_first(&spa->spa_alloc_trees[allocator]); + zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); @@ -3393,11 +3393,11 @@ zio_io_to_allocate(spa_t *spa, int allocator) */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { + zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_trees[allocator], zio); + avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); @@ -3421,8 +3421,8 @@ zio_dva_throttle(zio_t *zio) return (zio); } + ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); @@ -3434,14 +3434,14 @@ zio_dva_throttle(zio_t *zio) * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ - zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio->io_allocator = allocator; zio->io_metaslab_class = mc; - avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(spa, zio->io_allocator); - mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); + mutex_enter(&spa->spa_allocs[allocator].spaa_lock); + avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); + nio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } @@ -3450,9 +3450,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_locks[allocator]); + mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_alloc_locks[allocator]); + mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; @@ -3642,8 +3642,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * some parallelism. */ int flags = METASLAB_FASTWRITE | METASLAB_ZIL; - int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % - spa->spa_alloc_count; + int allocator = (uint_t)cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); @@ -4788,8 +4788,7 @@ zio_done(zio_t *zio) ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - (task_func_t *)zio_reexecute, zio, 0, - &zio->io_tqent); + zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 23df0e1541a3..b7bc587cf624 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -84,10 +84,8 @@ #include <sys/zfs_rlock.h> #include <sys/spa_impl.h> #include <sys/zvol.h> - #include <sys/zvol_impl.h> - unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; @@ -106,10 +104,8 @@ typedef enum { typedef struct { zvol_async_op_t op; - char pool[MAXNAMELEN]; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; - zprop_source_t source; uint64_t value; } zvol_task_t; @@ -579,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; + uint64_t sz = size; if (zil_replaying(zilog, tx)) return; @@ -630,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, offset += len; size -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); + } } /* @@ -1197,6 +1198,12 @@ zvol_create_minor(const char *name) * Remove minors for specified dataset including children and snapshots. */ +static void +zvol_free_task(void *arg) +{ + ops->zv_free(arg); +} + void zvol_remove_minors_impl(const char *name) { @@ -1245,8 +1252,8 @@ zvol_remove_minors_impl(const char *name) mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ - t = taskq_dispatch(system_taskq, - (task_func_t *)ops->zv_free, zv, TQ_SLEEP); + t = taskq_dispatch(system_taskq, zvol_free_task, zv, + TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { @@ -1435,7 +1442,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; - char *delim; /* Never allow tasks on hidden names. */ if (name1[0] == '$') @@ -1444,8 +1450,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; - delim = strchr(name1, '/'); - strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) diff --git a/sys/contrib/openzfs/scripts/zloop.sh b/sys/contrib/openzfs/scripts/zloop.sh index 546e7001776d..4a572ebab1fc 100755 --- a/sys/contrib/openzfs/scripts/zloop.sh +++ b/sys/contrib/openzfs/scripts/zloop.sh @@ -38,25 +38,30 @@ DEFAULTCOREDIR=/var/tmp/zloop function usage { - echo -e "\n$0 [-t <timeout>] [ -s <vdev size> ] [-c <dump directory>]" \ - "[ -- [extra ztest parameters]]\n" \ - "\n" \ - " This script runs ztest repeatedly with randomized arguments.\n" \ - " If a crash is encountered, the ztest logs, any associated\n" \ - " vdev files, and core file (if one exists) are moved to the\n" \ - " output directory ($DEFAULTCOREDIR by default). Any options\n" \ - " after the -- end-of-options marker will be passed to ztest.\n" \ - "\n" \ - " Options:\n" \ - " -t Total time to loop for, in seconds. If not provided,\n" \ - " zloop runs forever.\n" \ - " -s Size of vdev devices.\n" \ - " -f Specify working directory for ztest vdev files.\n" \ - " -c Specify a core dump directory to use.\n" \ - " -m Max number of core dumps to allow before exiting.\n" \ - " -l Create 'ztest.core.N' symlink to core directory.\n" \ - " -h Print this help message.\n" \ - "" >&2 + cat >&2 <<EOF + +$0 [-hl] [-c <dump directory>] [-f <vdev directory>] + [-m <max core dumps>] [-s <vdev size>] [-t <timeout>] + [-I <max iterations>] [-- [extra ztest parameters]] + + This script runs ztest repeatedly with randomized arguments. + If a crash is encountered, the ztest logs, any associated + vdev files, and core file (if one exists) are moved to the + output directory ($DEFAULTCOREDIR by default). Any options + after the -- end-of-options marker will be passed to ztest. + + Options: + -c Specify a core dump directory to use. + -f Specify working directory for ztest vdev files. + -h Print this help message. + -l Create 'ztest.core.N' symlink to core directory. + -m Max number of core dumps to allow before exiting. + -s Size of vdev devices. + -t Total time to loop for, in seconds. If not provided, + zloop runs forever. + -I Max number of iterations to loop before exiting. + +EOF } function or_die @@ -185,10 +190,12 @@ timeout=0 size="512m" coremax=0 symlink=0 -while getopts ":ht:m:s:c:f:l" opt; do +iterations=0 +while getopts ":ht:m:I:s:c:f:l" opt; do case $opt in t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;; m ) [[ $OPTARG -gt 0 ]] && coremax=$OPTARG ;; + I ) [[ $OPTARG ]] && iterations=$OPTARG ;; s ) [[ $OPTARG ]] && size=$OPTARG ;; c ) [[ $OPTARG ]] && coredir=$OPTARG ;; f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;; @@ -233,9 +240,14 @@ ztrc=0 # ztest return value foundcrashes=0 # number of crashes found so far starttime=$(date +%s) curtime=$starttime +iteration=0 # if no timeout was specified, loop forever. -while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do +while (( timeout == 0 )) || (( curtime <= (starttime + timeout) )); do + if (( iterations > 0 )) && (( iteration++ == iterations )); then + break + fi + zopt="-G -VVVVV" # start each run with an empty directory @@ -284,10 +296,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do raid_type="draid" fi - # run from 30 to 120 seconds - runtime=$(((RANDOM % 90) + 30)) - passtime=$((RANDOM % (runtime / 3 + 1) + 10)) - zopt="$zopt -K $raid_type" zopt="$zopt -m $mirrors" zopt="$zopt -r $raid_children" @@ -297,8 +305,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do zopt="$zopt -v $vdevs" zopt="$zopt -a $align" zopt="$zopt -C $class" - zopt="$zopt -T $runtime" - zopt="$zopt -P $passtime" zopt="$zopt -s $size" zopt="$zopt -f $workdir" |