aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Matuska <mm@FreeBSD.org>2024-05-03 15:56:40 +0000
committerMartin Matuska <mm@FreeBSD.org>2024-05-03 16:05:08 +0000
commitb985c9cafd2aedac5cf92428c0211485ea4ede24 (patch)
treeaec972d5109510ea95cd2074bfcc93a1edb50573
parent04ea5e9f84e2a62e874f7964fc94d10f454c7846 (diff)
parent8f1b7a6fa6762ea4c89198ceb11c521f80b92ddc (diff)
downloadsrc-b985c9cafd2aedac5cf92428c0211485ea4ede24.tar.gz
src-b985c9cafd2aedac5cf92428c0211485ea4ede24.zip
zfs: merge openzfs/zfs@8f1b7a6fa
Notable upstream pull request merges: #15839 c3f2f1aa2 vdev probe to slow disk can stall mmp write checker #15888 5044c4e3f Fast Dedup: ZAP Shrinking #15996 db499e68f Overflowing refreservation is bad #16118 67d13998b Make more taskq parameters writable #16128 21bc066ec Fix updating the zvol_htable when renaming a zvol #16130 645b83307 Improve write issue taskqs utilization #16131 8fd3a5d02 Slightly improve dnode hash #16134 a6edc0adb zio: try to execute TYPE_NULL ZIOs on the current task #16141 b28461b7c Fix arcstats for FreeBSD after zfetch support Obtained from: OpenZFS OpenZFS commit: 8f1b7a6fa6762ea4c89198ceb11c521f80b92ddc
-rwxr-xr-xsys/contrib/openzfs/cmd/arcstat.in10
-rw-r--r--sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c2
-rw-r--r--sys/contrib/openzfs/cmd/zpool/zpool_main.c2
-rw-r--r--sys/contrib/openzfs/config/always-pyzfs.m49
-rw-r--r--sys/contrib/openzfs/config/ax_python_devel.m4341
-rw-r--r--sys/contrib/openzfs/config/find_system_library.m44
-rw-r--r--sys/contrib/openzfs/config/user-backtrace.m414
-rw-r--r--sys/contrib/openzfs/config/user-libunwind.m444
-rw-r--r--sys/contrib/openzfs/config/user.m44
-rw-r--r--sys/contrib/openzfs/contrib/debian/control2
-rw-r--r--sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h27
-rw-r--r--sys/contrib/openzfs/include/sys/spa.h6
-rw-r--r--sys/contrib/openzfs/include/sys/spa_impl.h9
-rw-r--r--sys/contrib/openzfs/include/sys/uberblock_impl.h16
-rw-r--r--sys/contrib/openzfs/include/sys/vdev_impl.h2
-rw-r--r--sys/contrib/openzfs/include/sys/zfs_context.h8
-rw-r--r--sys/contrib/openzfs/include/sys/zio.h3
-rw-r--r--sys/contrib/openzfs/lib/libspl/Makefile.am4
-rw-r--r--sys/contrib/openzfs/lib/libspl/assert.c93
-rw-r--r--sys/contrib/openzfs/lib/libuutil/uu_list.c14
-rw-r--r--sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c15
-rw-r--r--sys/contrib/openzfs/lib/libzpool/kernel.c5
-rw-r--r--sys/contrib/openzfs/lib/libzpool/taskq.c4
-rw-r--r--sys/contrib/openzfs/man/man4/zfs.441
-rw-r--r--sys/contrib/openzfs/man/man8/zpool-clear.87
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c2
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c7
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c2
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_objset.c8
-rw-r--r--sys/contrib/openzfs/module/zfs/mmp.c5
-rw-r--r--sys/contrib/openzfs/module/zfs/spa.c191
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_misc.c22
-rw-r--r--sys/contrib/openzfs/module/zfs/txg.c9
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev.c22
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_label.c4
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz.c5
-rw-r--r--sys/contrib/openzfs/module/zfs/zap.c336
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ioctl.c9
-rw-r--r--sys/contrib/openzfs/module/zfs/zio.c17
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_inject.c6
-rw-r--r--sys/contrib/openzfs/tests/runfiles/common.run4
-rw-r--r--sys/contrib/openzfs/tests/runfiles/linux.run2
-rw-r--r--sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am4
-rwxr-xr-xsys/contrib/openzfs/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh97
-rwxr-xr-xsys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh34
-rwxr-xr-xsys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh35
-rwxr-xr-xsys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh81
-rw-r--r--sys/modules/zfs/zfs_config.h13
-rw-r--r--sys/modules/zfs/zfs_gitrev.h2
49 files changed, 1332 insertions, 271 deletions
diff --git a/sys/contrib/openzfs/cmd/arcstat.in b/sys/contrib/openzfs/cmd/arcstat.in
index 220f343b5b62..c4f10a1d6d3b 100755
--- a/sys/contrib/openzfs/cmd/arcstat.in
+++ b/sys/contrib/openzfs/cmd/arcstat.in
@@ -200,6 +200,8 @@ if sys.platform.startswith('freebsd'):
k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats')
if ctl.type != sysctl.CTLTYPE_NODE]
+ k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats')
+ if ctl.type != sysctl.CTLTYPE_NODE]
if not k:
sys.exit(1)
@@ -211,8 +213,12 @@ if sys.platform.startswith('freebsd'):
continue
name, value = s.name, s.value
- # Trims 'kstat.zfs.misc.arcstats' from the name
- kstat[name[24:]] = int(value)
+
+ if "arcstats" in name:
+ # Trims 'kstat.zfs.misc.arcstats' from the name
+ kstat[name[24:]] = int(value)
+ else:
+ kstat["zfetch_" + name[27:]] = int(value)
elif sys.platform.startswith('linux'):
def kstat_update():
diff --git a/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
index 80627b58211c..f194d28c55a9 100644
--- a/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
+++ b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
@@ -438,7 +438,7 @@ static char *zpool_sysfs_gets(char *path)
return (NULL);
}
- buf = calloc(sizeof (*buf), statbuf.st_size + 1);
+ buf = calloc(statbuf.st_size + 1, sizeof (*buf));
if (buf == NULL) {
close(fd);
return (NULL);
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
index 636eb2a301cd..300b383af4f6 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -9050,7 +9050,7 @@ status_callback(zpool_handle_t *zhp, void *data)
printf_color(ANSI_BOLD, gettext("action: "));
printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
" are connected, then reboot your system and\n\timport the "
- "pool.\n"));
+ "pool or run 'zpool clear' to resume the pool.\n"));
break;
case ZPOOL_STATUS_IO_FAILURE_WAIT:
diff --git a/sys/contrib/openzfs/config/always-pyzfs.m4 b/sys/contrib/openzfs/config/always-pyzfs.m4
index 9b123b1b2db1..98c1cc230205 100644
--- a/sys/contrib/openzfs/config/always-pyzfs.m4
+++ b/sys/contrib/openzfs/config/always-pyzfs.m4
@@ -80,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
[AC_MSG_ERROR("Python $PYTHON_VERSION unknown")]
)
- AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [
- AS_IF([test "x$enable_pyzfs" = xyes], [
- AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed")
- ], [test "x$enable_pyzfs" != xno], [
+ AS_IF([test "x$enable_pyzfs" = xyes], [
+ AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION])
+ ], [
+ AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [true])
+ AS_IF([test "x$ax_python_devel_found" = xno], [
enable_pyzfs=no
])
])
diff --git a/sys/contrib/openzfs/config/ax_python_devel.m4 b/sys/contrib/openzfs/config/ax_python_devel.m4
index f6d4b01444d6..1f480db6d233 100644
--- a/sys/contrib/openzfs/config/ax_python_devel.m4
+++ b/sys/contrib/openzfs/config/ax_python_devel.m4
@@ -4,18 +4,13 @@
#
# SYNOPSIS
#
-# AX_PYTHON_DEVEL([version], [action-if-not-found])
+# AX_PYTHON_DEVEL([version[,optional]])
#
# DESCRIPTION
#
# Note: Defines as a precious variable "PYTHON_VERSION". Don't override it
# in your configure.ac.
#
-# Note: this is a slightly modified version of the original AX_PYTHON_DEVEL
-# macro which accepts an additional [action-if-not-found] argument. This
-# allow to detect if Python development is available without aborting the
-# configure phase with an hard error in case it is not.
-#
# This macro checks for Python and tries to get the include path to
# 'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output
# variables. It also exports $(PYTHON_EXTRA_LIBS) and
@@ -28,6 +23,11 @@
# version number. Don't use "PYTHON_VERSION" for this: that environment
# variable is declared as precious and thus reserved for the end-user.
#
+# By default this will fail if it does not detect a development version of
+# python. If you want it to continue, set optional to true, like
+# AX_PYTHON_DEVEL([], [true]). The ax_python_devel_found variable will be
+# "no" if it fails.
+#
# This macro should work for all versions of Python >= 2.1.0. As an end
# user, you can disable the check for the python version by setting the
# PYTHON_NOVERSIONCHECK environment variable to something else than the
@@ -45,7 +45,6 @@
# Copyright (c) 2009 Matteo Settenvini <matteo@member.fsf.org>
# Copyright (c) 2009 Horst Knorr <hk_classes@knoda.org>
# Copyright (c) 2013 Daniel Mullner <muellner@math.stanford.edu>
-# Copyright (c) 2018 loli10K <ezomori.nozomu@gmail.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
@@ -73,10 +72,18 @@
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
-#serial 21
+#serial 36
AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
AC_DEFUN([AX_PYTHON_DEVEL],[
+ # Get whether it's optional
+ if test -z "$2"; then
+ ax_python_devel_optional=false
+ else
+ ax_python_devel_optional=$2
+ fi
+ ax_python_devel_found=yes
+
#
# Allow the use of a (user set) custom python version
#
@@ -87,23 +94,26 @@ AC_DEFUN([AX_PYTHON_DEVEL],[
AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]])
if test -z "$PYTHON"; then
- m4_ifvaln([$2],[$2],[
- AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path])
- PYTHON_VERSION=""
- ])
+ AC_MSG_WARN([Cannot find python$PYTHON_VERSION in your system path])
+ if ! $ax_python_devel_optional; then
+ AC_MSG_ERROR([Giving up, python development not available])
+ fi
+ ax_python_devel_found=no
+ PYTHON_VERSION=""
fi
- #
- # Check for a version of Python >= 2.1.0
- #
- AC_MSG_CHECKING([for a version of Python >= '2.1.0'])
- ac_supports_python_ver=`$PYTHON -c "import sys; \
+ if test $ax_python_devel_found = yes; then
+ #
+ # Check for a version of Python >= 2.1.0
+ #
+ AC_MSG_CHECKING([for a version of Python >= '2.1.0'])
+ ac_supports_python_ver=`$PYTHON -c "import sys; \
ver = sys.version.split ()[[0]]; \
print (ver >= '2.1.0')"`
- if test "$ac_supports_python_ver" != "True"; then
+ if test "$ac_supports_python_ver" != "True"; then
if test -z "$PYTHON_NOVERSIONCHECK"; then
AC_MSG_RESULT([no])
- AC_MSG_FAILURE([
+ AC_MSG_WARN([
This version of the AC@&t@_PYTHON_DEVEL macro
doesn't work properly with versions of Python before
2.1.0. You may need to re-run configure, setting the
@@ -112,20 +122,27 @@ PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand.
Moreover, to disable this check, set PYTHON_NOVERSIONCHECK
to something else than an empty string.
])
+ if ! $ax_python_devel_optional; then
+ AC_MSG_FAILURE([Giving up])
+ fi
+ ax_python_devel_found=no
+ PYTHON_VERSION=""
else
AC_MSG_RESULT([skip at user request])
fi
- else
+ else
AC_MSG_RESULT([yes])
+ fi
fi
- #
- # If the macro parameter ``version'' is set, honour it.
- # A Python shim class, VPy, is used to implement correct version comparisons via
- # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for
- # Python 2.7.10 (the ".1" being evaluated as less than ".3").
- #
- if test -n "$1"; then
+ if test $ax_python_devel_found = yes; then
+ #
+ # If the macro parameter ``version'' is set, honour it.
+ # A Python shim class, VPy, is used to implement correct version comparisons via
+ # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for
+ # Python 2.7.10 (the ".1" being evaluated as less than ".3").
+ #
+ if test -n "$1"; then
AC_MSG_CHECKING([for a version of Python $1])
cat << EOF > ax_python_devel_vpy.py
class VPy:
@@ -133,7 +150,7 @@ class VPy:
return tuple(map(int, s.strip().replace("rc", ".").split(".")))
def __init__(self):
import sys
- self.vpy = tuple(sys.version_info)
+ self.vpy = tuple(sys.version_info)[[:3]]
def __eq__(self, s):
return self.vpy == self.vtup(s)
def __ne__(self, s):
@@ -155,25 +172,69 @@ EOF
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
- AC_MSG_ERROR([this package requires Python $1.
+ AC_MSG_WARN([this package requires Python $1.
If you have it installed, but it isn't the default Python
interpreter in your system path, please pass the PYTHON_VERSION
variable to configure. See ``configure --help'' for reference.
])
+ if ! $ax_python_devel_optional; then
+ AC_MSG_ERROR([Giving up])
+ fi
+ ax_python_devel_found=no
PYTHON_VERSION=""
fi
+ fi
fi
- #
- # Check for Python include path
- #
- #
- AC_MSG_CHECKING([for Python include path])
- if test -z "$PYTHON_CPPFLAGS"; then
- python_path=`$PYTHON -c "import sysconfig; \
- print (sysconfig.get_path('include'));"`
- plat_python_path=`$PYTHON -c "import sysconfig; \
- print (sysconfig.get_path('platinclude'));"`
+ if test $ax_python_devel_found = yes; then
+ #
+ # Check if you have distutils, else fail
+ #
+ AC_MSG_CHECKING([for the sysconfig Python package])
+ ac_sysconfig_result=`$PYTHON -c "import sysconfig" 2>&1`
+ if test $? -eq 0; then
+ AC_MSG_RESULT([yes])
+ IMPORT_SYSCONFIG="import sysconfig"
+ else
+ AC_MSG_RESULT([no])
+
+ AC_MSG_CHECKING([for the distutils Python package])
+ ac_sysconfig_result=`$PYTHON -c "from distutils import sysconfig" 2>&1`
+ if test $? -eq 0; then
+ AC_MSG_RESULT([yes])
+ IMPORT_SYSCONFIG="from distutils import sysconfig"
+ else
+ AC_MSG_WARN([cannot import Python module "distutils".
+Please check your Python installation. The error was:
+$ac_sysconfig_result])
+ if ! $ax_python_devel_optional; then
+ AC_MSG_ERROR([Giving up])
+ fi
+ ax_python_devel_found=no
+ PYTHON_VERSION=""
+ fi
+ fi
+ fi
+
+ if test $ax_python_devel_found = yes; then
+ #
+ # Check for Python include path
+ #
+ AC_MSG_CHECKING([for Python include path])
+ if test -z "$PYTHON_CPPFLAGS"; then
+ if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then
+ # sysconfig module has different functions
+ python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+ print (sysconfig.get_path ('include'));"`
+ plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+ print (sysconfig.get_path ('platinclude'));"`
+ else
+ # old distutils way
+ python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+ print (sysconfig.get_python_inc ());"`
+ plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+ print (sysconfig.get_python_inc (plat_specific=1));"`
+ fi
if test -n "${python_path}"; then
if test "${plat_python_path}" != "${python_path}"; then
python_path="-I$python_path -I$plat_python_path"
@@ -182,15 +243,15 @@ variable to configure. See ``configure --help'' for reference.
fi
fi
PYTHON_CPPFLAGS=$python_path
- fi
- AC_MSG_RESULT([$PYTHON_CPPFLAGS])
- AC_SUBST([PYTHON_CPPFLAGS])
+ fi
+ AC_MSG_RESULT([$PYTHON_CPPFLAGS])
+ AC_SUBST([PYTHON_CPPFLAGS])
- #
- # Check for Python library path
- #
- AC_MSG_CHECKING([for Python library path])
- if test -z "$PYTHON_LIBS"; then
+ #
+ # Check for Python library path
+ #
+ AC_MSG_CHECKING([for Python library path])
+ if test -z "$PYTHON_LIBS"; then
# (makes two attempts to ensure we've got a version number
# from the interpreter)
ac_python_version=`cat<<EOD | $PYTHON -
@@ -208,7 +269,7 @@ EOD`
ac_python_version=$PYTHON_VERSION
else
ac_python_version=`$PYTHON -c "import sys; \
- print ('.'.join(sys.version.split('.')[[:2]]))"`
+ print ("%d.%d" % sys.version_info[[:2]])"`
fi
fi
@@ -220,7 +281,7 @@ EOD`
ac_python_libdir=`cat<<EOD | $PYTHON -
# There should be only one
-import sysconfig
+$IMPORT_SYSCONFIG
e = sysconfig.get_config_var('LIBDIR')
if e is not None:
print (e)
@@ -229,7 +290,7 @@ EOD`
# Now, for the library:
ac_python_library=`cat<<EOD | $PYTHON -
-import sysconfig
+$IMPORT_SYSCONFIG
c = sysconfig.get_config_vars()
if 'LDVERSION' in c:
print ('python'+c[['LDVERSION']])
@@ -249,88 +310,140 @@ EOD`
else
# old way: use libpython from python_configdir
ac_python_libdir=`$PYTHON -c \
- "import sysconfig; \
+ "from sysconfig import get_python_lib as f; \
import os; \
- print (os.path.join(sysconfig.get_path('platstdlib'), 'config'));"`
+ print (os.path.join(f(plat_specific=1, standard_lib=1), 'config'));"`
PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version"
fi
if test -z "PYTHON_LIBS"; then
- m4_ifvaln([$2],[$2],[
- AC_MSG_ERROR([
+ AC_MSG_WARN([
Cannot determine location of your Python DSO. Please check it was installed with
dynamic libraries enabled, or try setting PYTHON_LIBS by hand.
- ])
])
+ if ! $ax_python_devel_optional; then
+ AC_MSG_ERROR([Giving up])
+ fi
+ ax_python_devel_found=no
+ PYTHON_VERSION=""
fi
+ fi
fi
- AC_MSG_RESULT([$PYTHON_LIBS])
- AC_SUBST([PYTHON_LIBS])
- #
- # Check for site packages
- #
- AC_MSG_CHECKING([for Python site-packages path])
- if test -z "$PYTHON_SITE_PKG"; then
- PYTHON_SITE_PKG=`$PYTHON -c "import distutils.sysconfig; \
- print (distutils.sysconfig.get_python_lib(0,0));" 2>/dev/null || \
- $PYTHON -c "import sysconfig; \
- print (sysconfig.get_path('purelib'));"`
- fi
- AC_MSG_RESULT([$PYTHON_SITE_PKG])
- AC_SUBST([PYTHON_SITE_PKG])
+ if test $ax_python_devel_found = yes; then
+ AC_MSG_RESULT([$PYTHON_LIBS])
+ AC_SUBST([PYTHON_LIBS])
- #
- # libraries which must be linked in when embedding
- #
- AC_MSG_CHECKING(python extra libraries)
- if test -z "$PYTHON_EXTRA_LIBS"; then
- PYTHON_EXTRA_LIBS=`$PYTHON -c "import sysconfig; \
+ #
+ # Check for site packages
+ #
+ AC_MSG_CHECKING([for Python site-packages path])
+ if test -z "$PYTHON_SITE_PKG"; then
+ if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then
+ PYTHON_SITE_PKG=`$PYTHON -c "
+$IMPORT_SYSCONFIG;
+if hasattr(sysconfig, 'get_default_scheme'):
+ scheme = sysconfig.get_default_scheme()
+else:
+ scheme = sysconfig._get_default_scheme()
+if scheme == 'posix_local':
+ # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/
+ scheme = 'posix_prefix'
+prefix = '$prefix'
+if prefix == 'NONE':
+ prefix = '$ac_default_prefix'
+sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix})
+print(sitedir)"`
+ else
+ # distutils.sysconfig way
+ PYTHON_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+ print (sysconfig.get_python_lib(0,0));"`
+ fi
+ fi
+ AC_MSG_RESULT([$PYTHON_SITE_PKG])
+ AC_SUBST([PYTHON_SITE_PKG])
+
+ #
+ # Check for platform-specific site packages
+ #
+ AC_MSG_CHECKING([for Python platform specific site-packages path])
+ if test -z "$PYTHON_PLATFORM_SITE_PKG"; then
+ if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then
+ PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "
+$IMPORT_SYSCONFIG;
+if hasattr(sysconfig, 'get_default_scheme'):
+ scheme = sysconfig.get_default_scheme()
+else:
+ scheme = sysconfig._get_default_scheme()
+if scheme == 'posix_local':
+ # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/
+ scheme = 'posix_prefix'
+prefix = '$prefix'
+if prefix == 'NONE':
+ prefix = '$ac_default_prefix'
+sitedir = sysconfig.get_path('platlib', scheme, vars={'platbase': prefix})
+print(sitedir)"`
+ else
+ # distutils.sysconfig way
+ PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+ print (sysconfig.get_python_lib(1,0));"`
+ fi
+ fi
+ AC_MSG_RESULT([$PYTHON_PLATFORM_SITE_PKG])
+ AC_SUBST([PYTHON_PLATFORM_SITE_PKG])
+
+ #
+ # libraries which must be linked in when embedding
+ #
+ AC_MSG_CHECKING(python extra libraries)
+ if test -z "$PYTHON_EXTRA_LIBS"; then
+ PYTHON_EXTRA_LIBS=`$PYTHON -c "$IMPORT_SYSCONFIG; \
conf = sysconfig.get_config_var; \
print (conf('LIBS') + ' ' + conf('SYSLIBS'))"`
- fi
- AC_MSG_RESULT([$PYTHON_EXTRA_LIBS])
- AC_SUBST(PYTHON_EXTRA_LIBS)
+ fi
+ AC_MSG_RESULT([$PYTHON_EXTRA_LIBS])
+ AC_SUBST(PYTHON_EXTRA_LIBS)
- #
- # linking flags needed when embedding
- #
- AC_MSG_CHECKING(python extra linking flags)
- if test -z "$PYTHON_EXTRA_LDFLAGS"; then
- PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "import sysconfig; \
+ #
+ # linking flags needed when embedding
+ #
+ AC_MSG_CHECKING(python extra linking flags)
+ if test -z "$PYTHON_EXTRA_LDFLAGS"; then
+ PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "$IMPORT_SYSCONFIG; \
conf = sysconfig.get_config_var; \
print (conf('LINKFORSHARED'))"`
- fi
- AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS])
- AC_SUBST(PYTHON_EXTRA_LDFLAGS)
+ # Hack for macos, it sticks this in here.
+ PYTHON_EXTRA_LDFLAGS=`echo $PYTHON_EXTRA_LDFLAGS | sed 's/CoreFoundation.*$/CoreFoundation/'`
+ fi
+ AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS])
+ AC_SUBST(PYTHON_EXTRA_LDFLAGS)
- #
- # final check to see if everything compiles alright
- #
- AC_MSG_CHECKING([consistency of all components of python development environment])
- # save current global flags
- ac_save_LIBS="$LIBS"
- ac_save_LDFLAGS="$LDFLAGS"
- ac_save_CPPFLAGS="$CPPFLAGS"
- LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS $PYTHON_EXTRA_LIBS"
- LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS"
- CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS"
- AC_LANG_PUSH([C])
- AC_LINK_IFELSE([
+ #
+ # final check to see if everything compiles alright
+ #
+ AC_MSG_CHECKING([consistency of all components of python development environment])
+ # save current global flags
+ ac_save_LIBS="$LIBS"
+ ac_save_LDFLAGS="$LDFLAGS"
+ ac_save_CPPFLAGS="$CPPFLAGS"
+ LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS"
+ LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS"
+ CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS"
+ AC_LANG_PUSH([C])
+ AC_LINK_IFELSE([
AC_LANG_PROGRAM([[#include <Python.h>]],
[[Py_Initialize();]])
],[pythonexists=yes],[pythonexists=no])
- AC_LANG_POP([C])
- # turn back to default flags
- CPPFLAGS="$ac_save_CPPFLAGS"
- LIBS="$ac_save_LIBS"
- LDFLAGS="$ac_save_LDFLAGS"
+ AC_LANG_POP([C])
+ # turn back to default flags
+ CPPFLAGS="$ac_save_CPPFLAGS"
+ LIBS="$ac_save_LIBS"
+ LDFLAGS="$ac_save_LDFLAGS"
- AC_MSG_RESULT([$pythonexists])
+ AC_MSG_RESULT([$pythonexists])
- if test ! "x$pythonexists" = "xyes"; then
- m4_ifvaln([$2],[$2],[
- AC_MSG_FAILURE([
+ if test ! "x$pythonexists" = "xyes"; then
+ AC_MSG_WARN([
Could not link test program to Python. Maybe the main Python library has been
installed in some non-standard library path. If so, pass it to configure,
via the LIBS environment variable.
@@ -340,9 +453,13 @@ EOD`
You probably have to install the development version of the Python package
for your distribution. The exact name of this package varies among them.
============================================================================
- ])
- PYTHON_VERSION=""
- ])
+ ])
+ if ! $ax_python_devel_optional; then
+ AC_MSG_ERROR([Giving up])
+ fi
+ ax_python_devel_found=no
+ PYTHON_VERSION=""
+ fi
fi
#
diff --git a/sys/contrib/openzfs/config/find_system_library.m4 b/sys/contrib/openzfs/config/find_system_library.m4
index 310b44112aea..8b98bd67d2ee 100644
--- a/sys/contrib/openzfs/config/find_system_library.m4
+++ b/sys/contrib/openzfs/config/find_system_library.m4
@@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [
AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]])
$7
],[dnl ELSE
- AC_SUBST([$1]_CFLAGS, [])
- AC_SUBST([$1]_LIBS, [])
+ AC_SUBST([$1]_CFLAGS, [""])
+ AC_SUBST([$1]_LIBS, [""])
AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations])
$8
])
diff --git a/sys/contrib/openzfs/config/user-backtrace.m4 b/sys/contrib/openzfs/config/user-backtrace.m4
new file mode 100644
index 000000000000..25706767cdc3
--- /dev/null
+++ b/sys/contrib/openzfs/config/user-backtrace.m4
@@ -0,0 +1,14 @@
+dnl
+dnl backtrace(), for userspace assertions. glibc has this directly in libc.
+dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed
+dnl that this will also get the companion function backtrace_symbols().
+dnl
+AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [
+ AX_SAVE_FLAGS
+ LIBS=""
+ AC_SEARCH_LIBS([backtrace], [execinfo], [
+ AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available])
+ AC_SUBST([BACKTRACE_LIBS], ["$LIBS"])
+ ])
+ AX_RESTORE_FLAGS
+])
diff --git a/sys/contrib/openzfs/config/user-libunwind.m4 b/sys/contrib/openzfs/config/user-libunwind.m4
new file mode 100644
index 000000000000..99ba3dcf452d
--- /dev/null
+++ b/sys/contrib/openzfs/config/user-libunwind.m4
@@ -0,0 +1,44 @@
+dnl
+dnl Checks for libunwind, which usually does a better job than backtrace() when
+dnl resolving symbols in the stack backtrace. Newer versions have support for
+dnl getting info about the object file the function came from, so we look for
+dnl that too and use it if found.
+dnl
+AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [
+ AC_ARG_WITH([libunwind],
+ AS_HELP_STRING([--with-libunwind],
+ [use libunwind for backtraces in userspace assertions]),
+ [],
+ [with_libunwind=auto])
+
+ AS_IF([test "x$with_libunwind" != "xno"], [
+ ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [
+ dnl unw_get_elf_filename() is sometimes a macro, other
+ dnl times a proper symbol, so we can't just do a link
+ dnl check; we need to include the header properly.
+ AX_SAVE_FLAGS
+ CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS"
+ LIBS="$LIBS $LIBUNWIND_LIBS"
+ AC_MSG_CHECKING([for unw_get_elf_filename in libunwind])
+ AC_LINK_IFELSE([
+ AC_LANG_PROGRAM([
+ #define UNW_LOCAL_ONLY
+ #include <libunwind.h>
+ ], [
+ unw_get_elf_filename(0, 0, 0, 0);
+ ])
+ ], [
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_LIBUNWIND_ELF, 1,
+ [libunwind has unw_get_elf_filename])
+ ], [
+ AC_MSG_RESULT([no])
+ ])
+ AX_RESTORE_FLAGS
+ ], [
+ AS_IF([test "x$with_libunwind" = "xyes"], [
+ AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel])
+ ])
+ ])
+ ])
+])
diff --git a/sys/contrib/openzfs/config/user.m4 b/sys/contrib/openzfs/config/user.m4
index 87df8c7ccabd..badd920d2b8a 100644
--- a/sys/contrib/openzfs/config/user.m4
+++ b/sys/contrib/openzfs/config/user.m4
@@ -26,12 +26,14 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
ZFS_AC_CONFIG_USER_AIO_H
ZFS_AC_CONFIG_USER_CLOCK_GETTIME
ZFS_AC_CONFIG_USER_PAM
+ ZFS_AC_CONFIG_USER_BACKTRACE
+ ZFS_AC_CONFIG_USER_LIBUNWIND
ZFS_AC_CONFIG_USER_RUNSTATEDIR
ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS
ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
ZFS_AC_CONFIG_USER_ZFSEXEC
- AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy])
+ AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid])
AC_SUBST(RM)
])
diff --git a/sys/contrib/openzfs/contrib/debian/control b/sys/contrib/openzfs/contrib/debian/control
index 98beb900d0fa..e56fbf0f1c93 100644
--- a/sys/contrib/openzfs/contrib/debian/control
+++ b/sys/contrib/openzfs/contrib/debian/control
@@ -189,7 +189,7 @@ Depends: dkms (>> 2.1.1.2-5),
file,
libc6-dev | libc-dev,
lsb-release,
- python3-distutils | libpython3-stdlib (<< 3.6.4),
+ python3 (>> 3.12) | python3-distutils | libpython3-stdlib (<< 3.6.4),
${misc:Depends},
${perl:Depends}
Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends}
diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
index b0f398354e4f..658f546213de 100644
--- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
@@ -94,6 +94,33 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
#endif
}
+/*
+ * Detect if a device has a write cache. Used to set the intial value for the
+ * vdev nowritecache flag.
+ *
+ * 4.10: QUEUE_FLAG_WC added. Initialised by the driver, but can be changed
+ * later by the operator. If not set, kernel will return flush requests
+ * immediately without doing anything.
+ * 6.6: QUEUE_FLAG_HW_WC added. Initialised by the driver, can't be changed.
+ * Only controls if the operator is allowed to change _WC. Initial version
+ * buggy; aliased to QUEUE_FLAG_FUA, so unuseable.
+ * 6.6.10, 6.7: QUEUE_FLAG_HW_WC fixed.
+ *
+ * Older than 4.10 we just assume write cache, and let the normal flush fail
+ * detection apply.
+ */
+static inline boolean_t
+zfs_bdev_has_write_cache(struct block_device *bdev)
+{
+#if defined(QUEUE_FLAG_HW_WC) && QUEUE_FLAG_HW_WC != QUEUE_FLAG_FUA
+ return (test_bit(QUEUE_FLAG_HW_WC, &bdev_get_queue(bdev)->queue_flags));
+#elif defined(QUEUE_FLAG_WC)
+ return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags));
+#else
+ return (B_TRUE);
+#endif
+}
+
static inline void
blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
{
diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h
index ca15025ba33c..3073c4d1b937 100644
--- a/sys/contrib/openzfs/include/sys/spa.h
+++ b/sys/contrib/openzfs/include/sys/spa.h
@@ -770,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
-#define SPA_ASYNC_PROBE 0x04
+#define SPA_ASYNC_FAULT_VDEV 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
@@ -829,6 +829,8 @@ extern uint_t zfs_sync_pass_deferred_free;
/* spa sync taskqueues */
taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
void spa_sync_tq_destroy(spa_t *spa);
+uint_t spa_acq_allocator(spa_t *spa);
+void spa_rel_allocator(spa_t *spa, uint_t allocator);
void spa_select_allocator(zio_t *zio);
/* spa namespace global mutex */
@@ -1123,6 +1125,8 @@ extern uint32_t spa_get_hostid(spa_t *spa);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);
+extern boolean_t spa_mmp_remote_host_activity(spa_t *spa);
+
extern spa_mode_t spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h
index d7da085ab313..a40914ec5fcb 100644
--- a/sys/contrib/openzfs/include/sys/spa_impl.h
+++ b/sys/contrib/openzfs/include/sys/spa_impl.h
@@ -63,6 +63,12 @@ typedef struct spa_alloc {
avl_tree_t spaa_tree;
} ____cacheline_aligned spa_alloc_t;
+typedef struct spa_allocs_use {
+ kmutex_t sau_lock;
+ uint_t sau_rotor;
+ boolean_t sau_inuse[];
+} spa_allocs_use_t;
+
typedef struct spa_error_entry {
zbookmark_phys_t se_bookmark;
char *se_name;
@@ -192,7 +198,7 @@ typedef struct spa_taskqs {
/* one for each thread in the spa sync taskq */
typedef struct spa_syncthread_info {
kthread_t *sti_thread;
- taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */
+ uint_t sti_allocator;
} spa_syncthread_info_t;
typedef enum spa_all_vdev_zap_action {
@@ -270,6 +276,7 @@ struct spa {
* allocation performance in write-heavy workloads.
*/
spa_alloc_t *spa_allocs;
+ spa_allocs_use_t *spa_allocs_use;
int spa_alloc_count;
int spa_active_allocator; /* selectable allocator */
diff --git a/sys/contrib/openzfs/include/sys/uberblock_impl.h b/sys/contrib/openzfs/include/sys/uberblock_impl.h
index 1736b32cd3c6..e480a4bac0b9 100644
--- a/sys/contrib/openzfs/include/sys/uberblock_impl.h
+++ b/sys/contrib/openzfs/include/sys/uberblock_impl.h
@@ -50,20 +50,20 @@ extern "C" {
#define MMP_SEQ_VALID_BIT 0x02
#define MMP_FAIL_INT_VALID_BIT 0x04
-#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
- ubp->ub_mmp_magic == MMP_MAGIC)
-#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
+ (ubp)->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_INTERVAL_VALID_BIT))
-#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_SEQ_VALID_BIT))
-#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_FAIL_INT_VALID_BIT))
-#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
>> 8)
-#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
>> 32)
-#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
>> 48)
#define MMP_INTERVAL_SET(write) \
diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h
index 95164c4546bb..57ff31e89eb9 100644
--- a/sys/contrib/openzfs/include/sys/vdev_impl.h
+++ b/sys/contrib/openzfs/include/sys/vdev_impl.h
@@ -273,7 +273,7 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
- boolean_t vdev_probe_wanted; /* async probe wanted? */
+ boolean_t vdev_fault_wanted; /* async faulted wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h
index 9ec2f73b366c..8f264b50e995 100644
--- a/sys/contrib/openzfs/include/sys/zfs_context.h
+++ b/sys/contrib/openzfs/include/sys/zfs_context.h
@@ -228,9 +228,9 @@ typedef pthread_t kthread_t;
#define thread_create_named(name, stk, stksize, func, arg, len, \
pp, state, pri) \
- zk_thread_create(func, arg, stksize, state)
+ zk_thread_create(name, func, arg, stksize, state)
#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
- zk_thread_create(func, arg, stksize, state)
+ zk_thread_create(#func, func, arg, stksize, state)
#define thread_exit() pthread_exit(NULL)
#define thread_join(t) pthread_join((pthread_t)(t), NULL)
@@ -246,8 +246,8 @@ extern struct proc p0;
#define PS_NONE -1
-extern kthread_t *zk_thread_create(void (*func)(void *), void *arg,
- size_t stksize, int state);
+extern kthread_t *zk_thread_create(const char *name, void (*func)(void *),
+ void *arg, size_t stksize, int state);
#define issig(why) (FALSE)
#define ISSIG(thr, why) (FALSE)
diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h
index 4037b429982b..77c70b9b481c 100644
--- a/sys/contrib/openzfs/include/sys/zio.h
+++ b/sys/contrib/openzfs/include/sys/zio.h
@@ -528,9 +528,6 @@ struct zio {
/* Taskq dispatching state */
taskq_ent_t io_tqent;
-
- /* write issue taskq selection, based upon sync thread */
- taskq_t *io_wr_iss_tq;
};
enum blk_verify_flag {
diff --git a/sys/contrib/openzfs/lib/libspl/Makefile.am b/sys/contrib/openzfs/lib/libspl/Makefile.am
index 822bef7e7a8d..eb2377305aca 100644
--- a/sys/contrib/openzfs/lib/libspl/Makefile.am
+++ b/sys/contrib/openzfs/lib/libspl/Makefile.am
@@ -1,6 +1,6 @@
include $(srcdir)/%D%/include/Makefile.am
-libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS)
+libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS)
libspl_la_CFLAGS = $(libspl_assert_la_CFLAGS)
noinst_LTLIBRARIES += libspl_assert.la libspl.la
@@ -43,3 +43,5 @@ libspl_la_LIBADD = \
libspl_assert.la
libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
+
+libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
diff --git a/sys/contrib/openzfs/lib/libspl/assert.c b/sys/contrib/openzfs/lib/libspl/assert.c
index 9d44740d4e3c..e6e3008f0aa6 100644
--- a/sys/contrib/openzfs/lib/libspl/assert.c
+++ b/sys/contrib/openzfs/lib/libspl/assert.c
@@ -22,8 +22,81 @@
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ */
#include <assert.h>
+#include <pthread.h>
+
+#if defined(__linux__)
+#include <errno.h>
+#include <sys/prctl.h>
+#ifdef HAVE_GETTID
+#define libspl_gettid() gettid()
+#else
+#include <sys/syscall.h>
+#define libspl_gettid() ((pid_t)syscall(__NR_gettid))
+#endif
+#define libspl_getprogname() (program_invocation_short_name)
+#define libspl_getthreadname(buf, len) \
+ prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0)
+#elif defined(__FreeBSD__)
+#include <pthread_np.h>
+#define libspl_gettid() pthread_getthreadid_np()
+#define libspl_getprogname() getprogname()
+#define libspl_getthreadname(buf, len) \
+ pthread_getname_np(pthread_self(), buf, len);
+#endif
+
+#if defined(HAVE_LIBUNWIND)
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+
+static inline void
+libspl_dump_backtrace(void)
+{
+ unw_context_t uc;
+ unw_cursor_t cp;
+ unw_word_t ip, off;
+ char funcname[128];
+#ifdef HAVE_LIBUNWIND_ELF
+ char objname[128];
+ unw_word_t objoff;
+#endif
+
+ fprintf(stderr, "Call trace:\n");
+ unw_getcontext(&uc);
+ unw_init_local(&cp, &uc);
+ while (unw_step(&cp) > 0) {
+ unw_get_reg(&cp, UNW_REG_IP, &ip);
+ unw_get_proc_name(&cp, funcname, sizeof (funcname), &off);
+#ifdef HAVE_LIBUNWIND_ELF
+ unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff);
+ fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n",
+ ip, funcname, off, objname, objoff);
+#else
+ fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off);
+#endif
+ }
+}
+#elif defined(HAVE_BACKTRACE)
+#include <execinfo.h>
+
+static inline void
+libspl_dump_backtrace(void)
+{
+ void *btptrs[100];
+ size_t nptrs = backtrace(btptrs, 100);
+ char **bt = backtrace_symbols(btptrs, nptrs);
+ fprintf(stderr, "Call trace:\n");
+ for (size_t i = 0; i < nptrs; i++)
+ fprintf(stderr, " %s\n", bt[i]);
+ free(bt);
+}
+#else
+#define libspl_dump_backtrace()
+#endif
static boolean_t libspl_assert_ok = B_FALSE;
@@ -33,21 +106,37 @@ libspl_set_assert_ok(boolean_t val)
libspl_assert_ok = val;
}
+static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER;
+
/* printf version of libspl_assert */
void
libspl_assertf(const char *file, const char *func, int line,
const char *format, ...)
{
+ pthread_mutex_lock(&assert_lock);
+
va_list args;
+ char tname[64];
+
+ libspl_getthreadname(tname, sizeof (tname));
+
+ fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func);
va_start(args, format);
vfprintf(stderr, format, args);
- fprintf(stderr, "\n");
- fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
va_end(args);
+ fprintf(stderr, "\n"
+ " PID: %-8u COMM: %s\n"
+ " TID: %-8u NAME: %s\n",
+ getpid(), libspl_getprogname(),
+ libspl_gettid(), tname);
+
+ libspl_dump_backtrace();
+
#if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__)
if (libspl_assert_ok) {
+ pthread_mutex_unlock(&assert_lock);
return;
}
#endif
diff --git a/sys/contrib/openzfs/lib/libuutil/uu_list.c b/sys/contrib/openzfs/lib/libuutil/uu_list.c
index 0ca6f05205e9..aa8b129cc22a 100644
--- a/sys/contrib/openzfs/lib/libuutil/uu_list.c
+++ b/sys/contrib/openzfs/lib/libuutil/uu_list.c
@@ -505,14 +505,20 @@ uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags)
}
if (lp->ul_debug || robust) {
- uu_list_walk_t my_walk;
+ uu_list_walk_t *my_walk;
void *e;
- list_walk_init(&my_walk, lp, flags);
+ my_walk = uu_zalloc(sizeof (*my_walk));
+ if (my_walk == NULL)
+ return (-1);
+
+ list_walk_init(my_walk, lp, flags);
while (status == UU_WALK_NEXT &&
- (e = uu_list_walk_next(&my_walk)) != NULL)
+ (e = uu_list_walk_next(my_walk)) != NULL)
status = (*func)(e, private);
- list_walk_fini(&my_walk);
+ list_walk_fini(my_walk);
+
+ uu_free(my_walk);
} else {
if (!reverse) {
for (np = lp->ul_null_node.uln_next;
diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c
index 6f8773aed425..231bbbd92dbf 100644
--- a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c
@@ -5565,8 +5565,21 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
/*
* Scale this size down as a ratio of 128k / tsize.
* See theory statement above.
+ *
+ * Bitshift is to avoid the case of nblocks * asize < tsize
+ * producing a size of 0.
+ */
+ volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT);
+ /*
+ * If we would blow UINT64_MAX with this next multiplication,
+ * don't.
*/
- volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
+ if (volsize >
+ (UINT64_MAX / (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT)))
+ volsize = UINT64_MAX;
+ else
+ volsize *= (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
if (volsize > ret) {
ret = volsize;
}
diff --git a/sys/contrib/openzfs/lib/libzpool/kernel.c b/sys/contrib/openzfs/lib/libzpool/kernel.c
index ffad7fc02bc9..a3930ee07f73 100644
--- a/sys/contrib/openzfs/lib/libzpool/kernel.c
+++ b/sys/contrib/openzfs/lib/libzpool/kernel.c
@@ -92,7 +92,8 @@ zk_thread_wrapper(void *arg)
}
kthread_t *
-zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state)
+zk_thread_create(const char *name, void (*func)(void *), void *arg,
+ size_t stksize, int state)
{
pthread_attr_t attr;
pthread_t tid;
@@ -140,6 +141,8 @@ zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state)
VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw));
VERIFY0(pthread_attr_destroy(&attr));
+ pthread_setname_np(tid, name);
+
return ((void *)(uintptr_t)tid);
}
diff --git a/sys/contrib/openzfs/lib/libzpool/taskq.c b/sys/contrib/openzfs/lib/libzpool/taskq.c
index 99a181ec3c93..5fb2283cf0b1 100644
--- a/sys/contrib/openzfs/lib/libzpool/taskq.c
+++ b/sys/contrib/openzfs/lib/libzpool/taskq.c
@@ -295,8 +295,8 @@ taskq_create(const char *name, int nthreads, pri_t pri,
}
for (t = 0; t < nthreads; t++)
- VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0,
- taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL);
+ VERIFY((tq->tq_threadlist[t] = thread_create_named(tq->tq_name,
+ NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL);
return (tq);
}
diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4
index 6088ebc7ef35..5edd80659e08 100644
--- a/sys/contrib/openzfs/man/man4/zfs.4
+++ b/sys/contrib/openzfs/man/man4/zfs.4
@@ -16,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
-.Dd January 9, 2024
+.Dd February 14, 2024
.Dt ZFS 4
.Os
.
@@ -525,10 +525,17 @@ most ZPL operations (e.g. write, create) will return
.
.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
Determines the number of block alloctators to use per spa instance.
-Capped by the number of actual CPUs in the system.
+Capped by the number of actual CPUs in the system via
+.Sy spa_cpus_per_allocator .
.Pp
Note that setting this value too high could result in performance
degredation and/or excess fragmentation.
+Set value only applies to pools imported/created after that.
+.
+.It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int
+Determines the minimum number of CPUs in a system for block alloctator
+per spa instance.
+Set value only applies to pools imported/created after that.
.
.It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
Limits the number of on-disk error log entries that will be converted to the
@@ -564,9 +571,8 @@ However, this is limited by
Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
.
-.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
-Log2 fraction of holes in speculative prefetch stream allowed for it to
-proceed.
+.It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
+If set, adjacent empty ZAP blocks will be collapsed, reducing disk space.
.
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
Min bytes to prefetch per stream.
@@ -2327,8 +2333,8 @@ Prioritize requeued I/O.
.
.It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint
Percentage of online CPUs which will run a worker thread for I/O.
-These workers are responsible for I/O work such as compression and
-checksum calculations.
+These workers are responsible for I/O work such as compression, encryption,
+checksum and parity calculations.
Fractional number of CPUs will be rounded down.
.Pp
The default value of
@@ -2336,33 +2342,36 @@ The default value of
was chosen to avoid using all CPUs which can result in
latency issues and inconsistent application performance,
especially when slower compression and/or checksumming is enabled.
+Set value only applies to pools imported/created after that.
.
.It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
Number of worker threads per taskq.
-Lower values improve I/O ordering and CPU utilization,
-while higher reduces lock contention.
+Higher values improve I/O ordering and CPU utilization,
+while lower reduce lock contention.
+Set value only applies to pools imported/created after that.
.Pp
If
.Sy 0 ,
generate a system-dependent value close to 6 threads per taskq.
+Set value only applies to pools imported/created after that.
.
-.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
-Determines the number of CPUs to run write issue taskqs.
-.Pp
-When 0 (the default), the value to use is computed internally
-as the number of actual CPUs in the system divided by the
-.Sy spa_num_allocators
-value.
+.It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint
+Determines the minumum number of threads per write issue taskq.
+Higher values improve CPU utilization on high throughput,
+while lower reduce taskq locks contention on high IOPS.
+Set value only applies to pools imported/created after that.
.
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
Set the queue and thread configuration for the IO read queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
+Set values only apply to pools imported/created after that.
.
.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp
Set the queue and thread configuration for the IO write queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
+Set values only apply to pools imported/created after that.
.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
diff --git a/sys/contrib/openzfs/man/man8/zpool-clear.8 b/sys/contrib/openzfs/man/man8/zpool-clear.8
index c61ecae483ac..3e448be87fc2 100644
--- a/sys/contrib/openzfs/man/man8/zpool-clear.8
+++ b/sys/contrib/openzfs/man/man8/zpool-clear.8
@@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the
devices can be accessed.
Pools with
.Sy multihost
-enabled which have been suspended cannot be resumed.
-While the pool was suspended, it may have been imported on
-another host, and resuming I/O could result in pool damage.
+enabled which have been suspended cannot be resumed when there is evidence
+that the pool was imported by another host.
+The same checks performed during an import will be applied before the clear
+proceeds.
.Bl -tag -width Ds
.It Fl -power
Power on the devices's slot in the storage enclosure and wait for the device
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 6a7c2d2811b1..712ff1b837d7 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -1259,7 +1259,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
/* Move to a new hashtable entry. */
- zv->zv_hash = zvol_name_hash(zv->zv_name);
+ zv->zv_hash = zvol_name_hash(newname);
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 2cea61a6294c..463c5f705102 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -429,8 +429,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Determine the logical block size */
int logical_block_size = bdev_logical_block_size(bdev);
- /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
- v->vdev_nowritecache = B_FALSE;
+ /*
+ * If the device has a write cache, clear the nowritecache flag,
+ * so that we start issuing flush requests again.
+ */
+ v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);
/* Set when device reports it supports TRIM. */
v->vdev_has_trim = bdev_discard_supported(bdev);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 4b960daf89ee..2a036dc5136b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1571,7 +1571,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
/* move to new hashtable entry */
- zv->zv_hash = zvol_name_hash(zv->zv_name);
+ zv->zv_hash = zvol_name_hash(newname);
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index 2ba26f68e398..f1818ae155bd 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj)
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
/*
- * The low 6 bits of the pointer don't have much entropy, because
- * the objset_t is larger than 2^6 bytes long.
+ * The lower 11 bits of the pointer don't have much entropy, because
+ * the objset_t is more than 1KB long and so likely aligned to 2KB.
*/
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
sync_objset_arg_t *soa = sda->sda_soa;
objset_t *os = soa->soa_os;
+ uint_t allocator = spa_acq_allocator(os->os_spa);
multilist_sublist_t *ms =
multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
dmu_objset_sync_dnodes(ms, soa->soa_tx);
multilist_sublist_unlock(ms);
+ spa_rel_allocator(os->os_spa, allocator);
kmem_free(sda, sizeof (*sda));
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 66bc0ae60b10..71122542758d 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -664,12 +664,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
- "mmp_fail_intervals %llu mmp_fail_ns %llu",
+ "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
- (u_longlong_t)mmp_fail_ns);
+ (u_longlong_t)mmp_fail_ns,
+ (u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 96daf51b696a..ec2b674fb7ee 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif
-static uint_t zio_taskq_wr_iss_ncpus = 0;
+static uint_t zio_taskq_write_tpq = 16;
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
case ZTI_MODE_SYNC:
/*
- * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
- * not to exceed the number of spa allocators.
+ * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+ * not to exceed the number of spa allocators, and align to it.
*/
- if (zio_taskq_wr_iss_ncpus == 0) {
- count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
- } else {
- count = MAX(1,
- boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
- }
+ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+ count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
count = MIN(count, spa->spa_alloc_count);
+ while (spa->spa_alloc_count % count != 0 &&
+ spa->spa_alloc_count < count * 2)
+ count--;
/*
* zio_taskq_batch_pct is unbounded and may exceed 100%, but no
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
- if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
- (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
- /* dispatch to assigned write issue taskq */
- tq = zio->io_wr_iss_tq;
- return (tq);
- }
-
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
+ } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+ (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+ tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
@@ -3594,11 +3589,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
- * Perform the import activity check. If the user canceled the import or
- * we detected activity then fail.
+ * Remote host activity check.
+ *
+ * error results:
+ * 0 - no activity detected
+ * EREMOTEIO - remote activity detected
+ * EINTR - user canceled the operation
*/
static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
+ boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
@@ -3643,19 +3643,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
- spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
- "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ if (importing) {
+ spa_import_progress_set_notes(spa, "Checking MMP activity, "
+ "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ }
- int interations = 0;
+ int iterations = 0;
while ((now = gethrtime()) < import_expire) {
- if (interations++ % 30 == 0) {
+ if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
- (void) spa_import_progress_set_mmp_check(spa_guid(spa),
- NSEC2SEC(import_expire - gethrtime()));
+ if (importing) {
+ (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+ NSEC2SEC(import_expire - gethrtime()));
+ }
vdev_uberblock_load(rvd, ub, &mmp_label);
@@ -3737,6 +3741,61 @@ out:
return (error);
}
+/*
+ * Called from zfs_ioc_clear for a pool that was suspended
+ * after failing mmp write checks.
+ */
+boolean_t
+spa_mmp_remote_host_activity(spa_t *spa)
+{
+ ASSERT(spa_multihost(spa) && spa_suspended(spa));
+
+ nvlist_t *best_label;
+ uberblock_t best_ub;
+
+ /*
+ * Locate the best uberblock on disk
+ */
+ vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
+ if (best_label) {
+ /*
+ * confirm that the best hostid matches our hostid
+ */
+ if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
+ spa_get_hostid(spa) !=
+ fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
+ nvlist_free(best_label);
+ return (B_TRUE);
+ }
+ nvlist_free(best_label);
+ } else {
+ return (B_TRUE);
+ }
+
+ if (!MMP_VALID(&best_ub) ||
+ !MMP_FAIL_INT_VALID(&best_ub) ||
+ MMP_FAIL_INT(&best_ub) == 0) {
+ return (B_TRUE);
+ }
+
+ if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
+ best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
+ zfs_dbgmsg("txg mismatch detected during pool clear "
+ "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
+ (u_longlong_t)spa->spa_uberblock.ub_txg,
+ (u_longlong_t)best_ub.ub_txg,
+ (u_longlong_t)spa->spa_uberblock.ub_timestamp,
+ (u_longlong_t)best_ub.ub_timestamp);
+ return (B_TRUE);
+ }
+
+ /*
+ * Perform an activity check looking for any remote writer
+ */
+ return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
+ B_FALSE) != 0);
+}
+
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
@@ -4063,7 +4122,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
- int error = spa_activity_check(spa, ub, spa->spa_config);
+ int error =
+ spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@@ -8771,15 +8831,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
- if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = B_FALSE;
- vdev_reopen(vd); /* vdev_open() does the actual probe */
+ if (vd->vdev_fault_wanted) {
+ vd->vdev_fault_wanted = B_FALSE;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
- spa_async_probe(spa, vd->vdev_child[c]);
+ spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@@ -8867,11 +8928,11 @@ spa_async_thread(void *arg)
}
/*
- * See if any devices need to be probed.
+ * See if any devices need to be marked faulted.
*/
- if (tasks & SPA_ASYNC_PROBE) {
+ if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_probe(spa, spa->spa_root_vdev);
+ spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -10167,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
VERIFY(spa->spa_sync_tq != NULL);
VERIFY(kthreads != NULL);
- spa_taskqs_t *tqs =
- &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
spa_syncthread_info_t *ti = spa->spa_syncthreads;
- for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+ for (int i = 0; i < nthreads; i++, ti++) {
ti->sti_thread = kthreads[i];
- if (w == tqs->stqs_count) {
- w = 0;
- }
- ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+ ti->sti_allocator = i;
}
kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10195,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
spa->spa_sync_tq = NULL;
}
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+ int i;
+
+ if (spa->spa_alloc_count == 1)
+ return (0);
+
+ mutex_enter(&spa->spa_allocs_use->sau_lock);
+ uint_t r = spa->spa_allocs_use->sau_rotor;
+ do {
+ if (++r == spa->spa_alloc_count)
+ r = 0;
+ } while (spa->spa_allocs_use->sau_inuse[r]);
+ spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+ spa->spa_allocs_use->sau_rotor = r;
+ mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ ti->sti_allocator = r;
+ break;
+ }
+ }
+ ASSERT3S(i, <, spa->spa_alloc_count);
+ return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+ if (spa->spa_alloc_count > 1)
+ spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
void
spa_select_allocator(zio_t *zio)
{
@@ -10222,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
spa_syncthread_info_t *ti = spa->spa_syncthreads;
for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
if (ti->sti_thread == curthread) {
- zio->io_allocator = i;
- zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+ zio->io_allocator = ti->sti_allocator;
return;
}
}
@@ -10240,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
bm->zb_blkid >> 20);
zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
- zio->io_wr_iss_tq = NULL;
}
/*
@@ -10811,10 +10900,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
"Print vdev tree to zfs_dbgmsg during pool import");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
"Percentage of CPUs to run an IO worker thread");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
"Number of threads per IO worker taskqueue");
/* BEGIN CSTYLED */
@@ -10845,13 +10934,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
- spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+ spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
- spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+ spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
- "Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+ "Number of CPUs per write issue taskq");
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 5fb7847b5d8b..e6d4a9bdb29c 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
* Number of allocators to use, per spa instance
*/
static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
/*
* Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
if (altroot)
spa->spa_root = spa_strdup(altroot);
- /* Do not allow more allocators than CPUs. */
- spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+ /* Do not allow more allocators than fraction of CPUs. */
+ spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+ boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
+ if (spa->spa_alloc_count > 1) {
+ spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+ mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ }
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
}
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));
+ if (spa->spa_alloc_count > 1) {
+ mutex_destroy(&spa->spa_allocs_use->sau_lock);
+ kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]));
+ }
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
param_get_uint, ZMOD_RW, "Reserved free space in pool");
ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
- "Number of allocators per spa, capped by ncpus");
+ "Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+ "Minimum number of CPUs per allocators");
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
index a67c043446f5..5ce6be69be14 100644
--- a/sys/contrib/openzfs/module/zfs/txg.c
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -551,6 +551,15 @@ txg_sync_thread(void *arg)
}
/*
+ * When we're suspended, nothing should be changing and for
+ * MMP we don't want to bump anything that would make it
+ * harder to detect if another host is changing it when
+ * resuming after a MMP suspend.
+ */
+ if (spa_suspended(spa))
+ continue;
+
+ /*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index d97d0a8100c2..c5551eb6cf6e 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
+ boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;
@@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
+
+ /*
+ * If this probe was initiated from zio pipeline, then
+ * change the state in a spa_async_request. Probes that
+ * were initiated from a vdev_open can change the state
+ * as part of the open call.
+ */
+ if (vps->vps_zio_done_probe) {
+ vd->vdev_fault_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
+ }
}
mutex_enter(&vd->vdev_probe_lock);
@@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
+ vps->vps_zio_done_probe = (zio != NULL);
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
-
- /*
- * We can't change the vdev state in this context, so we
- * kick off an async task to do it on our behalf.
- */
- if (zio != NULL) {
- vd->vdev_probe_wanted = B_TRUE;
- spa_async_request(spa, SPA_ASYNC_PROBE);
- }
}
if (zio != NULL)
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index c31f48028bbc..ed592514fded 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -2027,6 +2027,7 @@ retry:
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
+ * and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
@@ -2034,7 +2035,8 @@ retry:
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);
- if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+ if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
+ !spa_multihost(spa))
return (0);
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index b03331ec69c6..de7d0fa79478 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -1891,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
- int n, i, c, t, tt;
- int nmissing_rows;
+ int i, c, t, tt;
+ unsigned int n;
+ unsigned int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
uint8_t *p, *pp;
diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c
index da86defb445c..1b6b16fc6662 100644
--- a/sys/contrib/openzfs/module/zfs/zap.c
+++ b/sys/contrib/openzfs/module/zfs/zap.c
@@ -22,6 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
+ * Copyright (c) 2023, Klara Inc.
*/
/*
@@ -41,6 +43,7 @@
#include <sys/spa.h>
#include <sys/dmu.h>
+#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <sys/fs/zfs.h>
@@ -78,9 +81,16 @@
*/
static int zap_iterate_prefetch = B_TRUE;
+/*
+ * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
+ * collapsed into a single block.
+ */
+int zap_shrink_enabled = B_TRUE;
+
int fzap_default_block_shift = 14; /* 16k blocksize */
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
void
fzap_byteswap(void *vbuf, size_t size)
@@ -587,6 +597,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
}
static int
+zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int epb = bs >> 3; /* entries per block */
+ int err = 0;
+
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ /*
+ * Check for i/o errors
+ */
+ for (int i = 0; i < nptrs; i += epb) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, idx + i, &blk);
+ if (err != 0) {
+ return (err);
+ }
+ }
+
+ for (int i = 0; i < nptrs; i++) {
+ err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ if (err != 0)
+ break;
+ }
+
+ return (err);
+}
+
+#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len)))
+
+/*
+ * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl.
+ * If two leaves are siblings, their ranges are adjecent and contain the same
+ * number of entries. In order to find out if a leaf has a sibling, we need to
+ * check the range corresponding to the sibling leaf. There is no need to check
+ * all entries in the range, we only need to check the frist and the last one.
+ */
+static uint64_t
+check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
+ uint64_t nptrs = (1 << pref_diff);
+ uint64_t first;
+ uint64_t last;
+
+ ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+
+ if (zap_idx_to_blk(zap, idx, &first) != 0)
+ return (0);
+
+ if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
+ return (0);
+
+ if (first != last)
+ return (0);
+ return (first);
+}
+
+static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
uint64_t blk;
@@ -958,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
if (err == 0) {
zap_entry_remove(&zeh);
zap_increment_num_entries(zn->zn_zap, -1, tx);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
+ zap_shrink_enabled)
+ return (zap_shrink(zn, l, tx));
}
zap_put_leaf(l);
return (err);
@@ -1222,13 +1302,19 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
ZIO_PRIORITY_ASYNC_READ);
}
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
+
+ /*
+ * The leaf was either shrunk or split.
+ */
+ if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
}
again:
@@ -1237,8 +1323,6 @@ again:
&zc->zc_leaf);
if (err != 0)
return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
}
l = zc->zc_leaf;
@@ -1367,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
}
}
+/*
+ * Find last allocated block and update freeblk.
+ */
+static void
+zap_trunc(zap_t *zap)
+{
+ uint64_t nentries;
+ uint64_t lastblk;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
+ /* External ptrtbl */
+ nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+ } else {
+ /* Embedded ptrtbl */
+ nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ lastblk = 0;
+ }
+
+ for (uint64_t idx = 0; idx < nentries; idx++) {
+ uint64_t blk;
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ if (blk > lastblk)
+ lastblk = blk;
+ }
+
+ ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
+
+ zap_f_phys(zap)->zap_freeblk = lastblk + 1;
+}
+
+/*
+ * ZAP shrinking algorithm.
+ *
+ * We shrink ZAP recuresively removing empty leaves. We can remove an empty leaf
+ * only if it has a sibling. Sibling leaves have the same prefix length and
+ * their prefixes differ only by the least significant (sibling) bit. We require
+ * both siblings to be empty. This eliminates a need to rehash the non-empty
+ * remaining leaf. When we have removed one of two empty sibling, we set ptrtbl
+ * entries of the removed leaf to point out to the remaining leaf. Prefix length
+ * of the remaining leaf is decremented. As a result, it has a new prefix and it
+ * might have a new sibling. So, we repeat the process.
+ *
+ * Steps:
+ * 1. Check if a sibling leaf (sl) exists and it is empty.
+ * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
+ * 3. Release the sibling (sl) to derefer it again with WRITER lock.
+ * 4. Upgrade zapdir lock to WRITER (once).
+ * 5. Derefer released leaves again.
+ * 6. If it is needed, recheck whether both leaves are still siblings and empty.
+ * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of
+ * the remaining leaf (slbit 0).
+ * 8. Free disk block of the removed leaf (dmu_free_range).
+ * 9. Decrement prefix_len of the remaining leaf.
+ * 10. Repeat the steps.
+ */
+static int
+zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ uint64_t hash = zn->zn_hash;
+ uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ boolean_t trunc = B_FALSE;
+ int err = 0;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+ ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
+
+ boolean_t writer = B_FALSE;
+
+ /*
+ * To avoid deadlock always deref leaves in the same order -
+ * sibling 0 first, then sibling 1.
+ */
+ while (prefix_len) {
+ zap_leaf_t *sl;
+ int64_t prefix_diff = zt_shift - prefix_len;
+ uint64_t sl_prefix = prefix ^ 1;
+ uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
+ int slbit = prefix & 1;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+
+ /*
+ * Check if there is a sibling by reading ptrtbl ptrs.
+ */
+ if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
+ break;
+
+ /*
+ * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
+ */
+ if (slbit == 1) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Dereference sibling leaf and check if it is empty.
+ */
+ if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
+ &sl)) != 0)
+ break;
+
+ ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
+
+ /*
+ * Check if we have a sibling and it is empty.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
+ zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ zap_put_leaf(sl);
+
+ /*
+ * If there two empty sibling, we have work to do, so
+ * we need to lock ZAP ptrtbl as WRITER.
+ */
+ if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+ /* We failed to upgrade */
+ if (l != NULL) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Usually, the right way to upgrade from a READER lock
+ * to a WRITER lock is to call zap_unlockdir() and
+ * zap_lockdir(), but we do not have a tag. Instead,
+ * we do it in more sophisticated way.
+ */
+ rw_exit(&zap->zap_rwlock);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+ zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ writer = B_TRUE;
+ }
+
+ /*
+ * Here we have WRITER lock for ptrtbl.
+ * Now, we need a WRITER lock for both siblings leaves.
+ * Also, we have to recheck if the leaves are still siblings
+ * and still empty.
+ */
+ if (l == NULL) {
+ /* sibling 0 */
+ if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
+ tx, RW_WRITER, &l)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
+ break;
+ }
+
+ /* sibling 1 */
+ if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
+ RW_WRITER, &sl)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /* If we have gotten here, we have a leaf to collapse */
+ uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
+ uint64_t nptrs = (1ULL << prefix_diff);
+ uint64_t sl_blkid = sl->l_blkid;
+
+ /*
+ * Set ptrtbl entries to point out to the slibling 0 blkid
+ */
+ if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
+ tx)) != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /*
+ * Free sibling 1 disk block.
+ */
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
+ trunc = B_TRUE;
+
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ sl_blkid << bs, 1 << bs, tx);
+ zap_put_leaf(sl);
+
+ zap_f_phys(zap)->zap_num_leafs--;
+
+ /*
+ * Update prefix and prefix_len.
+ */
+ zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
+
+ prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ }
+
+ if (trunc)
+ zap_trunc(zap);
+
+ if (l != NULL)
+ zap_put_leaf(l);
+
+ return (err);
+}
+
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
"When iterating ZAP object, prefetch it");
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
+ "Enable ZAP shrinking");
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index 2ac1e34dccec..908b9efc1813 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* If multihost is enabled, resuming I/O is unsafe as another
- * host may have imported the pool.
+ * host may have imported the pool. Check for remote activity.
*/
- if (spa_multihost(spa) && spa_suspended(spa))
- return (SET_ERROR(EINVAL));
+ if (spa_multihost(spa) && spa_suspended(spa) &&
+ spa_mmp_remote_host_activity(spa)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EREMOTEIO));
+ }
spa_vdev_state_enter(spa, SCL_NONE);
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 1ba99f4d4624..870343bf4fa3 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -803,9 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
/*
* If we can tell the caller to execute this parent next, do
- * so. We only do this if the parent's zio type matches the
- * child's type. Otherwise dispatch the parent zio in its
- * own taskq.
+ * so. We do this if the parent's zio type matches the child's
+ * type, or if it's a zio_null() with no done callback, and so
+ * has no actual work to do. Otherwise dispatch the parent zio
+ * in its own taskq.
*
* Having the caller execute the parent when possible reduces
* locking on the zio taskq's, reduces context switch
@@ -825,7 +826,8 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
* of writes for spa_sync(), and the chain of ZIL blocks.
*/
if (next_to_executep != NULL && *next_to_executep == NULL &&
- pio->io_type == zio->io_type) {
+ (pio->io_type == zio->io_type ||
+ (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) {
*next_to_executep = pio;
} else {
zio_taskq_dispatch(pio, type, B_FALSE);
@@ -2532,8 +2534,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
- cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
- "failure and has been suspended.\n", spa_name(spa));
+ if (reason != ZIO_SUSPEND_MMP) {
+ cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
+ "I/O failure and has been suspended.\n", spa_name(spa));
+ }
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
@@ -2921,7 +2925,6 @@ static void
zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
{
cio->io_allocator = pio->io_allocator;
- cio->io_wr_iss_tq = pio->io_wr_iss_tq;
}
static void
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index 3773e400d799..012a0e3c6c17 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
+ /* also match on I/O type (e.g., -T read) */
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
- handler->zi_record.zi_iotype != zio->io_type)
- continue;
+ handler->zi_record.zi_iotype != zio->io_type) {
+ continue;
+ }
/*
* Defensive; should never happen as the array allocation
diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run
index 0586d991b802..5e7fdf359a75 100644
--- a/sys/contrib/openzfs/tests/runfiles/common.run
+++ b/sys/contrib/openzfs/tests/runfiles/common.run
@@ -643,6 +643,10 @@ tags = ['functional', 'compression']
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files']
+[tests/functional/zap_shrink]
+tests = ['zap_shrink_001_pos']
+tags = ['functional', 'zap_shrink']
+
[tests/functional/crtime]
tests = ['crtime_001_pos' ]
tags = ['functional', 'crtime']
diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run
index a0b74ef4a8c6..92ce09ec6fcb 100644
--- a/sys/contrib/openzfs/tests/runfiles/linux.run
+++ b/sys/contrib/openzfs/tests/runfiles/linux.run
@@ -146,7 +146,7 @@ tags = ['functional', 'mmap']
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
- 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
+ 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
tags = ['functional', 'mmp']
[tests/functional/mount:Linux]
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
index dc447e042225..d625c040b819 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
@@ -1593,6 +1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/mmp/mmp_on_zdb.ksh \
functional/mmp/mmp_reset_interval.ksh \
functional/mmp/mmp_write_distribution.ksh \
+ functional/mmp/mmp_write_slow_disk.ksh \
functional/mmp/mmp_write_uberblocks.ksh \
functional/mmp/multihost_history.ksh \
functional/mmp/setup.ksh \
@@ -2074,6 +2075,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/xattr/xattr_012_pos.ksh \
functional/xattr/xattr_013_pos.ksh \
functional/xattr/xattr_compat.ksh \
+ functional/zap_shrink/cleanup.ksh \
+ functional/zap_shrink/zap_shrink_001_pos.ksh \
+ functional/zap_shrink/setup.ksh \
functional/zpool_influxdb/cleanup.ksh \
functional/zpool_influxdb/setup.ksh \
functional/zpool_influxdb/zpool_influxdb.ksh \
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
new file mode 100755
index 000000000000..8b118684aa7f
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc
+#
+
+# DESCRIPTION:
+# Verify that long VDEV probes do not cause MMP checks to suspend pool
+# Note: without PR-15839 fix, this test will suspend the pool.
+#
+# A device that is returning unexpected errors will trigger a vdev_probe.
+# When the device additionally has slow response times, the probe can hold
+# the spa config lock as a writer for a long period of time such that the
+# mmp uberblock updates stall when trying to acquire the spa config lock.
+#
+# STRATEGY:
+# 1. Create a pool with multiple leaf vdevs
+# 2. Enable multihost and multihost_history
+# 3. Delay for MMP writes to occur
+# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+function cleanup
+{
+ log_must zinject -c all
+
+ if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
+ log_must zpool clear $MMP_POOL
+ zpool get state $MMP_POOL $MMP_DIR/file.3
+ zpool events | grep ".fs.zfs." | grep -v "history_event"
+ fi
+
+ poolexists $MMP_POOL && destroy_pool $MMP_POOL
+ log_must rm -r $MMP_DIR
+ log_must mmp_clear_hostid
+}
+
+log_assert "A long VDEV probe doesn't cause a MMP check suspend"
+log_onexit cleanup
+
+MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
+
+# Create a multiple drive pool
+log_must zpool events -c
+log_must mkdir -p $MMP_DIR
+log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
+log_must zpool create -f $MMP_POOL \
+ mirror $MMP_DIR/file.{0,1,2} \
+ mirror $MMP_DIR/file.{3,4,5}
+
+# Enable MMP
+log_must mmp_set_hostid $HOSTID1
+log_must zpool set multihost=on $MMP_POOL
+clear_mmp_history
+
+# Inject vdev write error along with a delay
+log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
+
+log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
+sleep 10
+sync_pool $MMP_POOL
+
+# Confirm mmp writes to the non-slow disks have taken place
+for x in {0,1,2,4}; do
+ write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
+ [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
+done
+
+# Expect that the pool was not suspended
+log_must check_state $MMP_POOL "" "ONLINE"
+health=$(zpool list -H -o health $MMP_POOL)
+log_note "$MMP_POOL health is $health"
+[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
+
+log_pass "A long VDEV probe doesn't cause a MMP check suspend"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
new file mode 100755
index 000000000000..42fe70042d6a
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
new file mode 100755
index 000000000000..b756d4e76c83
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
@@ -0,0 +1,35 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+default_setup $DISK
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh
new file mode 100755
index 000000000000..4dbf579b8ac7
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh
@@ -0,0 +1,81 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create a large number of files in a directory. Then remove all files and
+# check that the directory zap was shrunk. Use zdb to check that the zap object
+# contains only one leaf block using zdb.
+#
+
+verify_runnable "global"
+
+DIR=largedir
+
+NR_FILES=100000
+BATCH=1000
+CWD=$PWD
+
+log_assert "Create a large number of files ($NR_FILES) in a directory. " \
+ "Make sure that the directory ZAP object was shrunk."
+
+log_must mkdir $TESTDIR/$DIR
+
+cd $TESTDIR/$DIR
+# In order to prevent arguments overflowing, create NR_FILES in BATCH at once.
+for i in $(seq $(($NR_FILES/$BATCH))); do
+ touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)));
+done
+cd $CWD
+
+log_must test $NR_FILES -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# remove all files in $DIR directory
+cd $TESTDIR/$DIR
+for i in $(seq $(($NR_FILES/$BATCH))); do
+ rm $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)))
+done
+cd $CWD
+sync_pool $TESTPOOL
+
+log_must test 0 -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# check whether zap_shrink works
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+# check whether zap_shrink works
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_pass
diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
index 6e1f5de7cf9e..4d6786e92d22 100644
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -418,6 +418,9 @@
/* Define if the GNU gettext() function is already present or preinstalled. */
/* #undef HAVE_GETTEXT */
+/* Define to 1 if you have the 'gettid' function. */
+/* #undef HAVE_GETTID */
+
/* iops->get_acl() exists */
/* #undef HAVE_GET_ACL */
@@ -655,6 +658,12 @@
/* Define if you have [udev] */
/* #undef HAVE_LIBUDEV */
+/* Define if you have [unwind] */
+/* #undef HAVE_LIBUNWIND */
+
+/* libunwind has unw_get_elf_filename */
+/* #undef HAVE_LIBUNWIND_ELF */
+
/* Define if you have [uuid] */
/* #undef HAVE_LIBUUID */
@@ -1179,7 +1188,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.99-456-FreeBSD_g1f940de07"
+#define ZFS_META_ALIAS "zfs-2.2.99-474-FreeBSD_g8f1b7a6fa"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@@ -1209,7 +1218,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
-#define ZFS_META_RELEASE "456-FreeBSD_g1f940de07"
+#define ZFS_META_RELEASE "474-FreeBSD_g8f1b7a6fa"
/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index e0d3575a6230..7bdd4299c8fa 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.99-456-g1f940de07"
+#define ZFS_META_GITREV "zfs-2.2.99-474-g8f1b7a6fa"