Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/acpica/acpi_wakeup.c | 4
-rw-r--r--  sys/amd64/include/param.h | 11
-rw-r--r--  sys/arm64/include/armreg.h | 5
-rw-r--r--  sys/arm64/include/hypervisor.h | 96
-rw-r--r--  sys/arm64/vmm/arm64.h | 7
-rw-r--r--  sys/arm64/vmm/vmm.c | 34
-rw-r--r--  sys/arm64/vmm/vmm_hyp.c | 19
-rw-r--r--  sys/arm64/vmm/vmm_reset.c | 7
-rw-r--r--  sys/cam/ata/ata_da.c | 9
-rw-r--r--  sys/cam/nvme/nvme_da.c | 5
-rw-r--r--  sys/compat/linuxkpi/common/src/linux_acpi.c | 33
-rw-r--r--  sys/conf/kern.post.mk | 15
-rw-r--r--  sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md | 2
-rw-r--r--  sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md | 5
-rwxr-xr-x  sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh | 2
-rwxr-xr-x  sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh | 11
-rwxr-xr-x  sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh | 25
-rwxr-xr-x  sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh | 2
-rw-r--r--  sys/contrib/openzfs/META | 2
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb.c | 20
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_main.c | 38
-rw-r--r--  sys/contrib/openzfs/cmd/zinject/zinject.c | 81
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_iter.c | 118
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_main.c | 55
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_util.h | 3
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_vdev.c | 26
-rw-r--r--  sys/contrib/openzfs/contrib/intel_qat/readme.md | 2
-rw-r--r--  sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py | 2
-rw-r--r--  sys/contrib/openzfs/etc/init.d/README.md | 2
-rw-r--r--  sys/contrib/openzfs/include/libzfs.h | 2
-rw-r--r--  sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h | 18
-rw-r--r--  sys/contrib/openzfs/include/sys/fs/zfs.h | 2
-rw-r--r--  sys/contrib/openzfs/include/sys/range_tree.h | 5
-rw-r--r--  sys/contrib/openzfs/include/sys/spa.h | 3
-rw-r--r--  sys/contrib/openzfs/include/sys/spa_impl.h | 1
-rw-r--r--  sys/contrib/openzfs/include/sys/zfs_ioctl.h | 1
-rw-r--r--  sys/contrib/openzfs/include/sys/zio.h | 1
-rw-r--r--  sys/contrib/openzfs/lib/libuutil/libuutil.abi | 105
-rw-r--r--  sys/contrib/openzfs/lib/libzfs/libzfs.abi | 172
-rw-r--r--  sys/contrib/openzfs/lib/libzfs/libzfs_config.c | 17
-rw-r--r--  sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi | 105
-rw-r--r--  sys/contrib/openzfs/man/man8/zinject.8 | 14
-rw-r--r--  sys/contrib/openzfs/man/man8/zpool-upgrade.8 | 4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 23
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfs_prop.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c | 1
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode.c | 65
-rw-r--r--  sys/contrib/openzfs/module/zfs/mmp.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/range_tree.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_config.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c | 23
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c | 15
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c | 35
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_inject.c | 38
-rw-r--r--  sys/contrib/openzfs/module/zstd/zfs_zstd.c | 58
-rwxr-xr-x  sys/contrib/openzfs/scripts/zfs-tests.sh | 9
-rw-r--r--  sys/contrib/openzfs/tests/runfiles/common.run | 20
-rw-r--r--  sys/contrib/openzfs/tests/runfiles/sanity.run | 2
-rwxr-xr-x  sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in | 61
-rw-r--r--  sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am | 11
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh | 75
-rw-r--r--  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib | 42
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh (renamed from sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh) | 101
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh | 204
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh | 126
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh | 30
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh | 32
-rw-r--r--  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib | 235
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh | 90
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh | 80
-rw-r--r--  sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib | 6
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh | 2
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh | 14
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh | 53
-rwxr-xr-x  sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh | 40
-rw-r--r--  sys/dev/acpica/acpi.c | 6
-rw-r--r--  sys/dev/acpica/acpi_apei.c | 2
-rw-r--r--  sys/dev/acpica/acpi_powerres.c | 110
-rw-r--r--  sys/dev/acpica/acpi_timer.c | 11
-rw-r--r--  sys/dev/ahci/ahci_pci.c | 46
-rw-r--r--  sys/dev/ath/if_ath_tx.c | 14
-rw-r--r--  sys/dev/gpio/gpioc.c | 31
-rw-r--r--  sys/dev/iwx/if_iwx.c | 187
-rw-r--r--  sys/dev/iwx/if_iwxreg.h | 4
-rw-r--r--  sys/dev/nvme/nvme_private.h | 6
-rw-r--r--  sys/dev/pci/pci.c | 10
-rw-r--r--  sys/dev/sound/pci/hda/hdaa.c | 53
-rw-r--r--  sys/dev/virtio/network/if_vtnet.c | 2
-rw-r--r--  sys/dev/vmware/vmxnet3/if_vmx.c | 7
-rw-r--r--  sys/dev/vt/vt_core.c | 8
-rw-r--r--  sys/dev/watchdog/watchdog.c | 2
-rw-r--r--  sys/dev/xen/control/control.c | 7
-rw-r--r--  sys/fs/nullfs/null.h | 10
-rw-r--r--  sys/fs/nullfs/null_subr.c | 94
-rw-r--r--  sys/fs/nullfs/null_vnops.c | 173
-rw-r--r--  sys/i386/acpica/acpi_wakeup.c | 4
-rw-r--r--  sys/isa/isa_common.c | 2
-rw-r--r--  sys/kern/kern_exit.c | 46
-rw-r--r--  sys/kern/kern_lock.c | 6
-rw-r--r--  sys/kern/kern_mutex.c | 29
-rw-r--r--  sys/kern/kern_sx.c | 11
-rw-r--r--  sys/kern/link_elf.c | 6
-rw-r--r--  sys/kern/link_elf_obj.c | 8
-rw-r--r--  sys/kern/sys_generic.c | 36
-rw-r--r--  sys/kern/sys_pipe.c | 5
-rw-r--r--  sys/kern/vfs_vnops.c | 22
-rw-r--r--  sys/modules/zfs/zfs_config.h | 6
-rw-r--r--  sys/modules/zfs/zfs_gitrev.h | 2
-rw-r--r--  sys/net/if.c | 6
-rw-r--r--  sys/net/if_var.h | 1
-rw-r--r--  sys/net/iflib.c | 112
-rw-r--r--  sys/net/iflib.h | 2
-rw-r--r--  sys/net80211/ieee80211.c | 28
-rw-r--r--  sys/net80211/ieee80211_crypto.c | 87
-rw-r--r--  sys/netgraph/netflow/netflow.c | 6
-rw-r--r--  sys/netinet/ip_carp.c | 27
-rw-r--r--  sys/netinet/sctp_lock_bsd.h | 6
-rw-r--r--  sys/netinet/tcp_syncache.c | 82
-rw-r--r--  sys/netinet/tcp_syncache.h | 2
-rw-r--r--  sys/netinet6/in6.c | 17
-rw-r--r--  sys/netinet6/in6_ifattach.c | 6
-rw-r--r--  sys/netinet6/in6_proto.c | 4
-rw-r--r--  sys/netinet6/in6_src.c | 54
-rw-r--r--  sys/netinet6/in6_var.h | 2
-rw-r--r--  sys/netinet6/ip6_var.h | 4
-rw-r--r--  sys/netinet6/nd6.h | 4
-rw-r--r--  sys/netinet6/nd6_nbr.c | 129
-rw-r--r--  sys/netinet6/nd6_rtr.c | 9
-rw-r--r--  sys/netlink/netlink_snl.h | 6
-rw-r--r--  sys/netpfil/pf/pf.c | 120
-rw-r--r--  sys/netpfil/pf/pf_lb.c | 4
-rw-r--r--  sys/sys/eventhandler.h | 3
-rw-r--r--  sys/sys/mutex.h | 28
-rw-r--r--  sys/tools/gdb/README.txt | 21
-rw-r--r--  sys/tools/gdb/acttrace.py | 48
-rw-r--r--  sys/tools/gdb/freebsd.py | 75
-rw-r--r--  sys/tools/gdb/pcpu.py | 77
-rw-r--r--  sys/tools/gdb/selftest.py | 31
-rw-r--r--  sys/tools/gdb/selftest.sh | 23
-rw-r--r--  sys/tools/gdb/vnet.py | 100
-rw-r--r--  sys/tools/kernel-gdb.py | 15
-rw-r--r--  sys/ufs/ufs/ufs_vnops.c | 8
-rw-r--r--  sys/vm/uma_core.c | 22
-rw-r--r--  sys/x86/include/mca.h | 25
-rw-r--r--  sys/x86/x86/mca.c | 355
146 files changed, 3554 insertions, 1553 deletions
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 99565fbb69ca..8cada2f4f911 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -74,7 +74,7 @@ extern int acpi_susp_bounce;
extern struct susppcb **susppcbs;
static cpuset_t suspcpus;
-static void acpi_stop_beep(void *);
+static void acpi_stop_beep(void *, enum power_stype);
static int acpi_wakeup_ap(struct acpi_softc *, int);
static void acpi_wakeup_cpus(struct acpi_softc *);
@@ -88,7 +88,7 @@ static void acpi_wakeup_cpus(struct acpi_softc *);
} while (0)
static void
-acpi_stop_beep(void *arg)
+acpi_stop_beep(void *arg, enum power_stype stype)
{
if (acpi_resume_beep != 0)
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 5a9c3162e14c..0654bb9de790 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -150,6 +150,15 @@
(((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
-#define SC_TABLESIZE 1024 /* Must be power of 2. */
+/*
+ * Must be power of 2.
+ *
+ * Perhaps should be autosized on boot based on found ncpus.
+ */
+#if MAXCPU > 256
+#define SC_TABLESIZE 2048
+#else
+#define SC_TABLESIZE 1024
+#endif
#endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h
index da051e8f7c8a..393d6d89da0c 100644
--- a/sys/arm64/include/armreg.h
+++ b/sys/arm64/include/armreg.h
@@ -2180,6 +2180,7 @@
#define OSLAR_EL1_CRn 1
#define OSLAR_EL1_CRm 0
#define OSLAR_EL1_op2 4
+#define OSLAR_OSLK (0x1ul << 0)
/* OSLSR_EL1 */
#define OSLSR_EL1_op0 2
@@ -2187,6 +2188,10 @@
#define OSLSR_EL1_CRn 1
#define OSLSR_EL1_CRm 1
#define OSLSR_EL1_op2 4
+#define OSLSR_OSLM_1 (0x1ul << 3)
+#define OSLSR_nTT (0x1ul << 2)
+#define OSLSR_OSLK (0x1ul << 1)
+#define OSLSR_OSLM_0 (0x1ul << 0)
/* PAR_EL1 - Physical Address Register */
#define PAR_F_SHIFT 0
diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h
index 04e15b55b218..8feabd2b981b 100644
--- a/sys/arm64/include/hypervisor.h
+++ b/sys/arm64/include/hypervisor.h
@@ -247,6 +247,54 @@
#define ICC_SRE_EL2_SRE (1UL << 0)
#define ICC_SRE_EL2_EN (1UL << 3)
+/* MDCR_EL2 - Hyp Debug Control Register */
+#define MDCR_EL2_HPMN_MASK 0x1f
+#define MDCR_EL2_HPMN_SHIFT 0
+#define MDCR_EL2_TPMCR_SHIFT 5
+#define MDCR_EL2_TPMCR (0x1UL << MDCR_EL2_TPMCR_SHIFT)
+#define MDCR_EL2_TPM_SHIFT 6
+#define MDCR_EL2_TPM (0x1UL << MDCR_EL2_TPM_SHIFT)
+#define MDCR_EL2_HPME_SHIFT 7
+#define MDCR_EL2_HPME (0x1UL << MDCR_EL2_HPME_SHIFT)
+#define MDCR_EL2_TDE_SHIFT 8
+#define MDCR_EL2_TDE (0x1UL << MDCR_EL2_TDE_SHIFT)
+#define MDCR_EL2_TDA_SHIFT 9
+#define MDCR_EL2_TDA (0x1UL << MDCR_EL2_TDA_SHIFT)
+#define MDCR_EL2_TDOSA_SHIFT 10
+#define MDCR_EL2_TDOSA (0x1UL << MDCR_EL2_TDOSA_SHIFT)
+#define MDCR_EL2_TDRA_SHIFT 11
+#define MDCR_EL2_TDRA (0x1UL << MDCR_EL2_TDRA_SHIFT)
+#define MDCR_EL2_E2PB_SHIFT 12
+#define MDCR_EL2_E2PB_MASK (0x3UL << MDCR_EL2_E2PB_SHIFT)
+#define MDCR_EL2_TPMS_SHIFT 14
+#define MDCR_EL2_TPMS (0x1UL << MDCR_EL2_TPMS_SHIFT)
+#define MDCR_EL2_EnSPM_SHIFT 15
+#define MDCR_EL2_EnSPM (0x1UL << MDCR_EL2_EnSPM_SHIFT)
+#define MDCR_EL2_HPMD_SHIFT 17
+#define MDCR_EL2_HPMD (0x1UL << MDCR_EL2_HPMD_SHIFT)
+#define MDCR_EL2_TTRF_SHIFT 19
+#define MDCR_EL2_TTRF (0x1UL << MDCR_EL2_TTRF_SHIFT)
+#define MDCR_EL2_HCCD_SHIFT 23
+#define MDCR_EL2_HCCD (0x1UL << MDCR_EL2_HCCD_SHIFT)
+#define MDCR_EL2_E2TB_SHIFT 24
+#define MDCR_EL2_E2TB_MASK (0x3UL << MDCR_EL2_E2TB_SHIFT)
+#define MDCR_EL2_HLP_SHIFT 26
+#define MDCR_EL2_HLP (0x1UL << MDCR_EL2_HLP_SHIFT)
+#define MDCR_EL2_TDCC_SHIFT 27
+#define MDCR_EL2_TDCC (0x1UL << MDCR_EL2_TDCC_SHIFT)
+#define MDCR_EL2_MTPME_SHIFT 28
+#define MDCR_EL2_MTPME (0x1UL << MDCR_EL2_MTPME_SHIFT)
+#define MDCR_EL2_HPMFZO_SHIFT 29
+#define MDCR_EL2_HPMFZO (0x1UL << MDCR_EL2_HPMFZO_SHIFT)
+#define MDCR_EL2_PMSSE_SHIFT 30
+#define MDCR_EL2_PMSSE_MASK (0x3UL << MDCR_EL2_PMSSE_SHIFT)
+#define MDCR_EL2_HPMFZS_SHIFT 36
+#define MDCR_EL2_HPMFZS (0x1UL << MDCR_EL2_HPMFZS_SHIFT)
+#define MDCR_EL2_PMEE_SHIFT 40
+#define MDCR_EL2_PMEE_MASK (0x3UL << MDCR_EL2_PMEE_SHIFT)
+#define MDCR_EL2_EBWE_SHIFT 43
+#define MDCR_EL2_EBWE (0x1UL << MDCR_EL2_EBWE_SHIFT)
+
/* SCTLR_EL2 - System Control Register */
#define SCTLR_EL2_RES1 0x30c50830
#define SCTLR_EL2_M_SHIFT 0
@@ -356,52 +404,4 @@
/* Assumed to be 0 by locore.S */
#define VTTBR_HOST 0x0000000000000000
-/* MDCR_EL2 - Hyp Debug Control Register */
-#define MDCR_EL2_HPMN_MASK 0x1f
-#define MDCR_EL2_HPMN_SHIFT 0
-#define MDCR_EL2_TPMCR_SHIFT 5
-#define MDCR_EL2_TPMCR (0x1UL << MDCR_EL2_TPMCR_SHIFT)
-#define MDCR_EL2_TPM_SHIFT 6
-#define MDCR_EL2_TPM (0x1UL << MDCR_EL2_TPM_SHIFT)
-#define MDCR_EL2_HPME_SHIFT 7
-#define MDCR_EL2_HPME (0x1UL << MDCR_EL2_HPME_SHIFT)
-#define MDCR_EL2_TDE_SHIFT 8
-#define MDCR_EL2_TDE (0x1UL << MDCR_EL2_TDE_SHIFT)
-#define MDCR_EL2_TDA_SHIFT 9
-#define MDCR_EL2_TDA (0x1UL << MDCR_EL2_TDA_SHIFT)
-#define MDCR_EL2_TDOSA_SHIFT 10
-#define MDCR_EL2_TDOSA (0x1UL << MDCR_EL2_TDOSA_SHIFT)
-#define MDCR_EL2_TDRA_SHIFT 11
-#define MDCR_EL2_TDRA (0x1UL << MDCR_EL2_TDRA_SHIFT)
-#define MDCR_E2PB_SHIFT 12
-#define MDCR_E2PB_MASK (0x3UL << MDCR_E2PB_SHIFT)
-#define MDCR_TPMS_SHIFT 14
-#define MDCR_TPMS (0x1UL << MDCR_TPMS_SHIFT)
-#define MDCR_EnSPM_SHIFT 15
-#define MDCR_EnSPM (0x1UL << MDCR_EnSPM_SHIFT)
-#define MDCR_HPMD_SHIFT 17
-#define MDCR_HPMD (0x1UL << MDCR_HPMD_SHIFT)
-#define MDCR_TTRF_SHIFT 19
-#define MDCR_TTRF (0x1UL << MDCR_TTRF_SHIFT)
-#define MDCR_HCCD_SHIFT 23
-#define MDCR_HCCD (0x1UL << MDCR_HCCD_SHIFT)
-#define MDCR_E2TB_SHIFT 24
-#define MDCR_E2TB_MASK (0x3UL << MDCR_E2TB_SHIFT)
-#define MDCR_HLP_SHIFT 26
-#define MDCR_HLP (0x1UL << MDCR_HLP_SHIFT)
-#define MDCR_TDCC_SHIFT 27
-#define MDCR_TDCC (0x1UL << MDCR_TDCC_SHIFT)
-#define MDCR_MTPME_SHIFT 28
-#define MDCR_MTPME (0x1UL << MDCR_MTPME_SHIFT)
-#define MDCR_HPMFZO_SHIFT 29
-#define MDCR_HPMFZO (0x1UL << MDCR_HPMFZO_SHIFT)
-#define MDCR_PMSSE_SHIFT 30
-#define MDCR_PMSSE_MASK (0x3UL << MDCR_PMSSE_SHIFT)
-#define MDCR_HPMFZS_SHIFT 36
-#define MDCR_HPMFZS (0x1UL << MDCR_HPMFZS_SHIFT)
-#define MDCR_PMEE_SHIFT 40
-#define MDCR_PMEE_MASK (0x3UL << MDCR_PMEE_SHIFT)
-#define MDCR_EBWE_SHIFT 43
-#define MDCR_EBWE (0x1UL << MDCR_EBWE_SHIFT)
-
#endif /* !_MACHINE_HYPERVISOR_H_ */
diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h
index 82c4481b8692..f9b74aef7188 100644
--- a/sys/arm64/vmm/arm64.h
+++ b/sys/arm64/vmm/arm64.h
@@ -78,14 +78,16 @@ struct hypctx {
uint64_t pmcr_el0; /* Performance Monitors Control Register */
uint64_t pmccntr_el0;
uint64_t pmccfiltr_el0;
+ uint64_t pmuserenr_el0;
+ uint64_t pmselr_el0;
+ uint64_t pmxevcntr_el0;
uint64_t pmcntenset_el0;
uint64_t pmintenset_el1;
uint64_t pmovsset_el0;
- uint64_t pmselr_el0;
- uint64_t pmuserenr_el0;
uint64_t pmevcntr_el0[31];
uint64_t pmevtyper_el0[31];
+ uint64_t dbgclaimset_el1;
uint64_t dbgbcr_el1[16]; /* Debug Breakpoint Control Registers */
uint64_t dbgbvr_el1[16]; /* Debug Breakpoint Value Registers */
uint64_t dbgwcr_el1[16]; /* Debug Watchpoint Control Registers */
@@ -117,6 +119,7 @@ struct hypctx {
struct vgic_v3_regs vgic_v3_regs;
struct vgic_v3_cpu *vgic_cpu;
bool has_exception;
+ bool dbg_oslock;
};
struct hyp {
diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c
index 1dcefa1489e9..a551a2807183 100644
--- a/sys/arm64/vmm/vmm.c
+++ b/sys/arm64/vmm/vmm.c
@@ -651,6 +651,33 @@ vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
return (0);
}
+static int
+vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg)
+{
+ struct hypctx *hypctx;
+
+ hypctx = vcpu_get_cookie(vcpu);
+ /* All other fields are RES0 & we don't do anything with this */
+ /* TODO: Disable access to other debug state when locked */
+ hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK;
+ return (0);
+}
+
+static int
+vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg)
+{
+ struct hypctx *hypctx;
+ uint64_t val;
+
+ hypctx = vcpu_get_cookie(vcpu);
+ val = OSLSR_OSLM_1;
+ if (hypctx->dbg_oslock)
+ val |= OSLSR_OSLK;
+ *rval = val;
+
+ return (0);
+}
+
static const struct vmm_special_reg vmm_special_regs[] = {
#define SPECIAL_REG(_reg, _read, _write) \
{ \
@@ -707,6 +734,13 @@ static const struct vmm_special_reg vmm_special_regs[] = {
SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
vtimer_phys_tval_write),
SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),
+
+ /* Debug registers */
+ SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi),
+ SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi),
+ /* TODO: Exceptions on invalid access */
+ SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1),
+ SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi),
#undef SPECIAL_REG
};
diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c
index 345535318f6e..b8c6d2ab7a9a 100644
--- a/sys/arm64/vmm/vmm_hyp.c
+++ b/sys/arm64/vmm/vmm_hyp.c
@@ -121,6 +121,8 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest,
}
}
+ hypctx->dbgclaimset_el1 = READ_SPECIALREG(dbgclaimset_el1);
+
dfr0 = READ_SPECIALREG(id_aa64dfr0_el1);
switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) {
#define STORE_DBG_BRP(x) \
@@ -180,10 +182,13 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest,
hypctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0);
hypctx->pmccntr_el0 = READ_SPECIALREG(pmccntr_el0);
hypctx->pmccfiltr_el0 = READ_SPECIALREG(pmccfiltr_el0);
+ hypctx->pmuserenr_el0 = READ_SPECIALREG(pmuserenr_el0);
+ hypctx->pmselr_el0 = READ_SPECIALREG(pmselr_el0);
+ hypctx->pmxevcntr_el0 = READ_SPECIALREG(pmxevcntr_el0);
hypctx->pmcntenset_el0 = READ_SPECIALREG(pmcntenset_el0);
hypctx->pmintenset_el1 = READ_SPECIALREG(pmintenset_el1);
hypctx->pmovsset_el0 = READ_SPECIALREG(pmovsset_el0);
- hypctx->pmuserenr_el0 = READ_SPECIALREG(pmuserenr_el0);
+
switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) {
#define STORE_PMU(x) \
case (x + 1): \
@@ -337,12 +342,15 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest,
WRITE_SPECIALREG(pmcr_el0, hypctx->pmcr_el0);
WRITE_SPECIALREG(pmccntr_el0, hypctx->pmccntr_el0);
WRITE_SPECIALREG(pmccfiltr_el0, hypctx->pmccfiltr_el0);
+ WRITE_SPECIALREG(pmuserenr_el0, hypctx->pmuserenr_el0);
+ WRITE_SPECIALREG(pmselr_el0, hypctx->pmselr_el0);
+ WRITE_SPECIALREG(pmxevcntr_el0, hypctx->pmxevcntr_el0);
/* Clear all events/interrupts then enable them */
- WRITE_SPECIALREG(pmcntenclr_el0, 0xfffffffful);
+ WRITE_SPECIALREG(pmcntenclr_el0, ~0ul);
WRITE_SPECIALREG(pmcntenset_el0, hypctx->pmcntenset_el0);
- WRITE_SPECIALREG(pmintenclr_el1, 0xfffffffful);
+ WRITE_SPECIALREG(pmintenclr_el1, ~0ul);
WRITE_SPECIALREG(pmintenset_el1, hypctx->pmintenset_el1);
- WRITE_SPECIALREG(pmovsclr_el0, 0xfffffffful);
+ WRITE_SPECIALREG(pmovsclr_el0, ~0ul);
WRITE_SPECIALREG(pmovsset_el0, hypctx->pmovsset_el0);
switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) {
@@ -388,6 +396,9 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest,
#undef LOAD_PMU
}
+ WRITE_SPECIALREG(dbgclaimclr_el1, ~0ul);
+ WRITE_SPECIALREG(dbgclaimclr_el1, hypctx->dbgclaimset_el1);
+
dfr0 = READ_SPECIALREG(id_aa64dfr0_el1);
switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) {
#define LOAD_DBG_BRP(x) \
diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c
index 79d022cf33e8..1240c3ed16ec 100644
--- a/sys/arm64/vmm/vmm_reset.c
+++ b/sys/arm64/vmm/vmm_reset.c
@@ -100,10 +100,12 @@ reset_vm_el01_regs(void *vcpu)
el2ctx->pmcr_el0 |= PMCR_LC;
set_arch_unknown(el2ctx->pmccntr_el0);
set_arch_unknown(el2ctx->pmccfiltr_el0);
+ set_arch_unknown(el2ctx->pmuserenr_el0);
+ set_arch_unknown(el2ctx->pmselr_el0);
+ set_arch_unknown(el2ctx->pmxevcntr_el0);
set_arch_unknown(el2ctx->pmcntenset_el0);
set_arch_unknown(el2ctx->pmintenset_el1);
set_arch_unknown(el2ctx->pmovsset_el0);
- set_arch_unknown(el2ctx->pmuserenr_el0);
memset(el2ctx->pmevcntr_el0, 0, sizeof(el2ctx->pmevcntr_el0));
memset(el2ctx->pmevtyper_el0, 0, sizeof(el2ctx->pmevtyper_el0));
}
@@ -143,7 +145,8 @@ reset_vm_el2_regs(void *vcpu)
/* Set the Extended Hypervisor Configuration Register */
el2ctx->hcrx_el2 = 0;
/* TODO: Trap all extensions we don't support */
- el2ctx->mdcr_el2 = 0;
+ el2ctx->mdcr_el2 = MDCR_EL2_TDOSA | MDCR_EL2_TDRA | MDCR_EL2_TPMS |
+ MDCR_EL2_TTRF;
/* PMCR_EL0.N is read from MDCR_EL2.HPMN */
el2ctx->mdcr_el2 |= (el2ctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT;
diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c
index 1facab47473c..0d844a6fbf9e 100644
--- a/sys/cam/ata/ata_da.c
+++ b/sys/cam/ata/ata_da.c
@@ -44,6 +44,7 @@
#include <sys/malloc.h>
#include <sys/endian.h>
#include <sys/cons.h>
+#include <sys/power.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
@@ -878,8 +879,8 @@ static int adaerror(union ccb *ccb, uint32_t cam_flags,
uint32_t sense_flags);
static callout_func_t adasendorderedtag;
static void adashutdown(void *arg, int howto);
-static void adasuspend(void *arg);
-static void adaresume(void *arg);
+static void adasuspend(void *arg, enum power_stype stype);
+static void adaresume(void *arg, enum power_stype stype);
#ifndef ADA_DEFAULT_TIMEOUT
#define ADA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */
@@ -3747,7 +3748,7 @@ adashutdown(void *arg, int howto)
}
static void
-adasuspend(void *arg)
+adasuspend(void *arg, enum power_stype stype)
{
adaflush();
@@ -3760,7 +3761,7 @@ adasuspend(void *arg)
}
static void
-adaresume(void *arg)
+adaresume(void *arg, enum power_stype stype)
{
struct cam_periph *periph;
struct ada_softc *softc;
diff --git a/sys/cam/nvme/nvme_da.c b/sys/cam/nvme/nvme_da.c
index 1c0d5e8381d8..9c4707da482c 100644
--- a/sys/cam/nvme/nvme_da.c
+++ b/sys/cam/nvme/nvme_da.c
@@ -43,6 +43,7 @@
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/cons.h>
+#include <sys/power.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
@@ -159,7 +160,7 @@ static void ndadone(struct cam_periph *periph,
static int ndaerror(union ccb *ccb, uint32_t cam_flags,
uint32_t sense_flags);
static void ndashutdown(void *arg, int howto);
-static void ndasuspend(void *arg);
+static void ndasuspend(void *arg, enum power_stype stype);
#ifndef NDA_DEFAULT_SEND_ORDERED
#define NDA_DEFAULT_SEND_ORDERED 1
@@ -1365,7 +1366,7 @@ ndashutdown(void *arg, int howto)
}
static void
-ndasuspend(void *arg)
+ndasuspend(void *arg, enum power_stype stype)
{
ndaflush();
diff --git a/sys/compat/linuxkpi/common/src/linux_acpi.c b/sys/compat/linuxkpi/common/src/linux_acpi.c
index 43783bb8727b..c7d62c745c7e 100644
--- a/sys/compat/linuxkpi/common/src/linux_acpi.c
+++ b/sys/compat/linuxkpi/common/src/linux_acpi.c
@@ -33,6 +33,7 @@
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
+#include <sys/power.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <dev/acpica/acpivar.h>
@@ -118,20 +119,32 @@ acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid,
}
static void
-linux_handle_power_suspend_event(void *arg __unused)
+linux_handle_power_suspend_event(void *arg __unused, enum power_stype stype)
{
- /*
- * Only support S3 for now.
- * acpi_sleep_event isn't always called so we use power_suspend_early
- * instead which means we don't know what state we're switching to.
- * TODO: Make acpi_sleep_event consistent
- */
- linux_acpi_target_sleep_state = ACPI_STATE_S3;
- pm_suspend_target_state = PM_SUSPEND_MEM;
+ switch (stype) {
+ case POWER_STYPE_SUSPEND_TO_IDLE:
+ /*
+ * XXX: obiwac Not 100% sure this is correct, but
+ * acpi_target_sleep_state does seem to be set to
+ * ACPI_STATE_S3 during s2idle on Linux.
+ */
+ linux_acpi_target_sleep_state = ACPI_STATE_S3;
+ pm_suspend_target_state = PM_SUSPEND_TO_IDLE;
+ break;
+ case POWER_STYPE_SUSPEND_TO_MEM:
+ linux_acpi_target_sleep_state = ACPI_STATE_S3;
+ pm_suspend_target_state = PM_SUSPEND_MEM;
+ break;
+ default:
+ printf("%s: sleep type %d not yet supported\n",
+ __func__, stype);
+ break;
+ }
}
static void
-linux_handle_power_resume_event(void *arg __unused)
+linux_handle_power_resume_event(void *arg __unused,
+ enum power_stype stype __unused)
{
linux_acpi_target_sleep_state = ACPI_STATE_S0;
pm_suspend_target_state = PM_SUSPEND_ON;
diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk
index bb3c7af82a4d..7cdfd17778db 100644
--- a/sys/conf/kern.post.mk
+++ b/sys/conf/kern.post.mk
@@ -398,6 +398,14 @@ CFLAGS+= -fdebug-prefix-map=./${_link}=${PREFIX_SYSDIR}/${_link}/include
.endif
.endfor
+# Install GDB plugins that are useful for kernel debugging. See the
+# README in sys/tools/gdb for more information.
+GDB_FILES= acttrace.py \
+ freebsd.py \
+ pcpu.py \
+ selftest.py \
+ vnet.py
+
${_ILINKS}:
@case ${.TARGET} in \
machine) \
@@ -447,6 +455,13 @@ kernel-install: .PHONY
.if defined(DEBUG) && !defined(INSTALL_NODEBUG) && ${MK_KERNEL_SYMBOLS} != "no"
mkdir -p ${DESTDIR}${KERN_DEBUGDIR}${KODIR}
${INSTALL} -p -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} ${KERNEL_KO}.debug ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/
+ ${INSTALL} -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} \
+ $S/tools/kernel-gdb.py ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/${KERNEL_KO}-gdb.py
+ mkdir -p ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/gdb
+.for file in ${GDB_FILES}
+ ${INSTALL} -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} \
+ $S/tools/gdb/${file} ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/gdb/${file}
+.endfor
.endif
.if defined(KERNEL_EXTRA_INSTALL)
${INSTALL} -p -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} ${KERNEL_EXTRA_INSTALL} ${DESTDIR}${KODIR}/
diff --git a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md
index 9b50a4a3d96e..f3d4316f6f67 100644
--- a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md
@@ -14,7 +14,7 @@ Please check our issue tracker before opening a new feature request.
Filling out the following template will help other contributors better understand your proposed feature.
-->
-### Describe the feature would like to see added to OpenZFS
+### Describe the feature you would like to see added to OpenZFS
<!--
Provide a clear and concise description of the feature.
diff --git a/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md b/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md
index 79809179cf13..47edc8174603 100644
--- a/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md
+++ b/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,11 +2,6 @@
<!--- Provide a general summary of your changes in the Title above -->
-<!---
-Documentation on ZFS Buildbot options can be found at
-https://openzfs.github.io/openzfs-docs/Developer%20Resources/Buildbot%20Options.html
--->
-
### Motivation and Context
<!--- Why is this change required? What problem does it solve? -->
<!--- If it fixes an open issue, please link to the issue here. -->
diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh
index 8439942c5a41..1c608348ffcd 100755
--- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh
+++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh
@@ -121,7 +121,7 @@ case "$OS" in
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
;;
freebsd15-0c)
- FreeBSD="15.0-ALPHA2"
+ FreeBSD="15.0-ALPHA3"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd14.0"
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh
index ee058b488088..f67bb2f68e94 100755
--- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh
+++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh
@@ -20,7 +20,7 @@ function archlinux() {
sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \
fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \
parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \
- samba sysstat rng-tools rsync wget xxhash
+ samba strace sysstat rng-tools rsync wget xxhash
echo "##[endgroup]"
}
@@ -43,7 +43,8 @@ function debian() {
lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \
python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \
python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \
- rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev
+ rsync samba strace sysstat uuid-dev watchdog wget xfslibs-dev xxhash \
+ zlib1g-dev
echo "##[endgroup]"
}
@@ -87,8 +88,8 @@ function rhel() {
libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \
parted perf python3 python3-cffi python3-devel python3-packaging \
kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \
- rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \
- zlib-devel
+ rpm-build rsync samba strace sysstat systemd watchdog wget xfsprogs-devel \
+ xxhash zlib-devel
echo "##[endgroup]"
}
@@ -104,7 +105,7 @@ function install_fedora_experimental_kernel {
our_version="$1"
sudo dnf -y copr enable @kernel-vanilla/stable
sudo dnf -y copr enable @kernel-vanilla/mainline
- all="$(sudo dnf list --showduplicates kernel-*)"
+ all="$(sudo dnf list --showduplicates kernel-* python3-perf* perf* bpftool*)"
echo "Available versions:"
echo "$all"
diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh
index 0adcad2a99bc..4869c1003e48 100755
--- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh
+++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh
@@ -108,19 +108,30 @@ echo '*/5 * * * * /root/cronjob.sh' > crontab.txt
sudo crontab crontab.txt
rm crontab.txt
-# check if the machines are okay
-echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)"
-for ((i=1; i<=VMs; i++)); do
- .github/workflows/scripts/qemu-wait-for-vm.sh vm$i
-done
-echo "All $VMs VMs are up now."
-
# Save the VM's serial output (ttyS0) to /var/tmp/console.txt
# - ttyS0 on the VM corresponds to a local /dev/pty/N entry
# - use 'virsh ttyconsole' to lookup the /dev/pty/N entry
for ((i=1; i<=VMs; i++)); do
mkdir -p $RESPATH/vm$i
read "pty" <<< $(sudo virsh ttyconsole vm$i)
+
+ # Create the file so we can tail it, even if there's no output.
+ touch $RESPATH/vm$i/console.txt
+
sudo nohup bash -c "cat $pty > $RESPATH/vm$i/console.txt" &
+
+ # Write all VM boot lines to the console to aid in debugging failed boots.
+ # The boot lines from all the VMs will be munged together, so prepend each
+ # line with the vm hostname (like 'vm1:').
+ (while IFS=$'\n' read -r line; do echo "vm$i: $line" ; done < <(sudo tail -f $RESPATH/vm$i/console.txt)) &
+
done
echo "Console logging for ${VMs}x $OS started."
+
+
+# check if the machines are okay
+echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)"
+for ((i=1; i<=VMs; i++)); do
+ .github/workflows/scripts/qemu-wait-for-vm.sh vm$i
+done
+echo "All $VMs VMs are up now."
diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh
index 5ab822f4f076..ca6ac77f146d 100755
--- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh
+++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh
@@ -111,7 +111,7 @@ fi
sudo dmesg -c > dmesg-prerun.txt
mount > mount.txt
df -h > df-prerun.txt
-$TDIR/zfs-tests.sh -vK -s 3GB -T $TAGS
+$TDIR/zfs-tests.sh -vKO -s 3GB -T $TAGS
RV=$?
df -h > df-postrun.txt
echo $RV > tests-exitcode.txt
diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META
index 5704b5c6de8a..bdb7aee48041 100644
--- a/sys/contrib/openzfs/META
+++ b/sys/contrib/openzfs/META
@@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
-Linux-Maximum: 6.16
+Linux-Maximum: 6.17
Linux-Minimum: 4.18
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
index d655fa715e15..70a4ed46f263 100644
--- a/sys/contrib/openzfs/cmd/zdb/zdb.c
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -3301,6 +3301,7 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
uint64_t keyformat, salt, iters;
int i;
unsigned char c;
+ FILE *f;
VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
@@ -3333,6 +3334,25 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
break;
+ case ZFS_KEYFORMAT_RAW:
+ if ((f = fopen(key_material, "r")) == NULL)
+ return (B_FALSE);
+
+ if (fread(key_out, 1, WRAPPING_KEY_LEN, f) !=
+ WRAPPING_KEY_LEN) {
+ (void) fclose(f);
+ return (B_FALSE);
+ }
+
+ /* Check the key length */
+ if (fgetc(f) != EOF) {
+ (void) fclose(f);
+ return (B_FALSE);
+ }
+
+ (void) fclose(f);
+ break;
+
default:
fatal("no support for key format %u\n",
(unsigned int) keyformat);
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
index 484986bde719..ccdd5ffef8e6 100644
--- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@@ -914,7 +914,11 @@ zfs_do_clone(int argc, char **argv)
log_history = B_FALSE;
}
- ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
+ /*
+ * Dataset cloned successfully, mount/share failures are
+ * non-fatal.
+ */
+ (void) zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
}
zfs_close(zhp);
@@ -930,19 +934,15 @@ usage:
}
/*
- * Return a default volblocksize for the pool which always uses more than
- * half of the data sectors. This primarily applies to dRAID which always
- * writes full stripe widths.
+ * Calculate the minimum allocation size based on the top-level vdevs.
*/
static uint64_t
-default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+calculate_volblocksize(nvlist_t *config)
{
- uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+ uint64_t asize = SPA_MINBLOCKSIZE;
nvlist_t *tree, **vdevs;
uint_t nvdevs;
- nvlist_t *config = zpool_get_config(zhp, NULL);
-
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
&vdevs, &nvdevs) != 0) {
@@ -973,6 +973,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
}
}
+ return (asize);
+}
+
+/*
+ * Return a default volblocksize for the pool which always uses more than
+ * half of the data sectors. This primarily applies to dRAID which always
+ * writes full stripe widths.
+ */
+static uint64_t
+default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+{
+ uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0)
+ asize = calculate_volblocksize(config);
+
/*
* Calculate the target volblocksize such that more than half
* of the asize is used. The following table is for 4k sectors.
@@ -1319,7 +1337,9 @@ zfs_do_create(int argc, char **argv)
goto error;
}
- ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
+ /* Dataset created successfully, mount/share failures are non-fatal */
+ ret = 0;
+ (void) zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
error:
nvlist_free(props);
return (ret);
diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c
index 113797c878b9..c2f646f2567d 100644
--- a/sys/contrib/openzfs/cmd/zinject/zinject.c
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.c
@@ -107,6 +107,8 @@
* zinject
* zinject <-a | -u pool>
* zinject -c <id|all>
+ * zinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]
+ * [-T iotype] [-t type object | -b bookmark pool]
* zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
* [-r range] <object>
* zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
@@ -132,14 +134,18 @@
* The '-f' flag controls the frequency of errors injected, expressed as a
* real number percentage between 0.0001 and 100. The default is 100.
*
- * The this form is responsible for actually injecting the handler into the
+ * The <object> form is responsible for actually injecting the handler into the
* framework. It takes the arguments described above, translates them to the
* internal tuple using libzpool, and then issues an ioctl() to register the
* handler.
*
- * The final form can target a specific bookmark, regardless of whether a
+ * The '-b' option can target a specific bookmark, regardless of whether a
* human-readable interface has been designed. It allows developers to specify
* a particular block by number.
+ *
+ * The '-E' option injects pipeline ready stage delays for the given object or
+ * bookmark. The delay is specified in milliseconds, and it supports I/O type
+ * and range filters.
*/
#include <errno.h>
@@ -346,6 +352,13 @@ usage(void)
"\t\tsuch that the operation takes a minimum of supplied seconds\n"
"\t\tto complete.\n"
"\n"
+ "\tzinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]\n"
+ "\t\t[-T iotype] [-t type object | -b bookmark pool]\n"
+ "\n"
+ "\t\tInject pipeline ready stage delays for the given object path\n"
+ "\t\t(data or dnode) or raw bookmark. The delay is specified in\n"
+ "\t\tmilliseconds.\n"
+ "\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n"
"\t\treport errors for a duration. Simulates buggy hardware\n"
@@ -724,12 +737,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
if (quiet) {
(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
} else {
+ boolean_t show_object = B_FALSE;
+ boolean_t show_iotype = B_FALSE;
(void) printf("Added handler %llu with the following "
"properties:\n", (u_longlong_t)zc.zc_guid);
(void) printf(" pool: %s\n", pool);
if (record->zi_guid) {
(void) printf(" vdev: %llx\n",
(u_longlong_t)record->zi_guid);
+ show_iotype = B_TRUE;
} else if (record->zi_func[0] != '\0') {
(void) printf(" panic function: %s\n",
record->zi_func);
@@ -742,7 +758,18 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
} else if (record->zi_timer > 0) {
(void) printf(" timer: %lld ms\n",
(u_longlong_t)NSEC2MSEC(record->zi_timer));
+ if (record->zi_cmd == ZINJECT_DELAY_READY) {
+ show_object = B_TRUE;
+ show_iotype = B_TRUE;
+ }
} else {
+ show_object = B_TRUE;
+ }
+ if (show_iotype) {
+ (void) printf("iotype: %s\n",
+ iotype_to_str(record->zi_iotype));
+ }
+ if (show_object) {
(void) printf("objset: %llu\n",
(u_longlong_t)record->zi_objset);
(void) printf("object: %llu\n",
@@ -910,6 +937,7 @@ main(int argc, char **argv)
int ret;
int flags = 0;
uint32_t dvas = 0;
+ hrtime_t ready_delay = -1;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
@@ -940,7 +968,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
- ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
+ ":aA:b:C:d:D:E:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@@ -1113,6 +1141,18 @@ main(int argc, char **argv)
case 'u':
flags |= ZINJECT_UNLOAD_SPA;
break;
+ case 'E':
+ ready_delay = MSEC2NSEC(strtol(optarg, &end, 10));
+ if (ready_delay <= 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid delay '%s': "
+ "must be a positive duration\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ record.zi_cmd = ZINJECT_DELAY_READY;
+ record.zi_timer = ready_delay;
+ break;
case 'L':
if ((label = name_to_type(optarg)) == TYPE_INVAL &&
!LABEL_TYPE(type)) {
@@ -1150,7 +1190,7 @@ main(int argc, char **argv)
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
- record.zi_freq > 0 || dvas != 0) {
+ record.zi_freq > 0 || dvas != 0 || ready_delay >= 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -1186,7 +1226,7 @@ main(int argc, char **argv)
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
- dvas != 0) {
+ dvas != 0 || ready_delay >= 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -1276,13 +1316,23 @@ main(int argc, char **argv)
return (1);
}
- record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (record.zi_cmd == ZINJECT_UNINITIALIZED) {
+ record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (!error)
+ error = EIO;
+ } else if (error != 0) {
+ (void) fprintf(stderr, "error type -e incompatible "
+ "with delay injection\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ } else {
+ record.zi_iotype = io_type;
+ }
+
if (translate_raw(raw, &record) != 0) {
libzfs_fini(g_zfs);
return (1);
}
- if (!error)
- error = EIO;
} else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0 ||
@@ -1410,6 +1460,13 @@ main(int argc, char **argv)
record.zi_dvas = dvas;
}
+ if (record.zi_cmd != ZINJECT_UNINITIALIZED && error != 0) {
+ (void) fprintf(stderr, "error type -e incompatible "
+ "with delay injection\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
if (error == EACCES) {
if (type != TYPE_DATA) {
(void) fprintf(stderr, "decryption errors "
@@ -1425,8 +1482,12 @@ main(int argc, char **argv)
* not found.
*/
error = ECKSUM;
- } else {
+ } else if (record.zi_cmd == ZINJECT_UNINITIALIZED) {
record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (!error)
+ error = EIO;
+ } else {
+ record.zi_iotype = io_type;
}
if (translate_record(type, argv[0], range, level, &record, pool,
@@ -1434,8 +1495,6 @@ main(int argc, char **argv)
libzfs_fini(g_zfs);
return (1);
}
- if (!error)
- error = EIO;
}
/*
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
index 2eec9a95e24c..fef602736705 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
@@ -26,6 +26,7 @@
/*
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright (c) 2025, Klara, Inc.
*/
#include <libintl.h>
@@ -52,7 +53,7 @@
typedef struct zpool_node {
zpool_handle_t *zn_handle;
uu_avl_node_t zn_avlnode;
- int zn_mark;
+ hrtime_t zn_last_refresh;
} zpool_node_t;
struct zpool_list {
@@ -62,6 +63,7 @@ struct zpool_list {
uu_avl_pool_t *zl_pool;
zprop_list_t **zl_proplist;
zfs_type_t zl_type;
+ hrtime_t zl_last_refresh;
};
static int
@@ -81,26 +83,30 @@ zpool_compare(const void *larg, const void *rarg, void *unused)
* of known pools.
*/
static int
-add_pool(zpool_handle_t *zhp, void *data)
+add_pool(zpool_handle_t *zhp, zpool_list_t *zlp)
{
- zpool_list_t *zlp = data;
- zpool_node_t *node = safe_malloc(sizeof (zpool_node_t));
+ zpool_node_t *node, *new = safe_malloc(sizeof (zpool_node_t));
uu_avl_index_t idx;
- node->zn_handle = zhp;
- uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool);
- if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) {
+ new->zn_handle = zhp;
+ uu_avl_node_init(new, &new->zn_avlnode, zlp->zl_pool);
+
+ node = uu_avl_find(zlp->zl_avl, new, NULL, &idx);
+ if (node == NULL) {
if (zlp->zl_proplist &&
zpool_expand_proplist(zhp, zlp->zl_proplist,
zlp->zl_type, zlp->zl_literal) != 0) {
zpool_close(zhp);
- free(node);
+ free(new);
return (-1);
}
- uu_avl_insert(zlp->zl_avl, node, idx);
+ new->zn_last_refresh = zlp->zl_last_refresh;
+ uu_avl_insert(zlp->zl_avl, new, idx);
} else {
+ zpool_refresh_stats_from_handle(node->zn_handle, zhp);
+ node->zn_last_refresh = zlp->zl_last_refresh;
zpool_close(zhp);
- free(node);
+ free(new);
return (-1);
}
@@ -108,6 +114,18 @@ add_pool(zpool_handle_t *zhp, void *data)
}
/*
+ * add_pool(), but always returns 0. This allows zpool_iter() to continue
+ * even if a pool exists in the tree, or we fail to get the properties for
+ * a new one.
+ */
+static int
+add_pool_cb(zpool_handle_t *zhp, void *data)
+{
+ (void) add_pool(zhp, data);
+ return (0);
+}
+
+/*
* Create a list of pools based on the given arguments. If we're given no
* arguments, then iterate over all pools in the system and add them to the AVL
* tree. Otherwise, add only those pool explicitly specified on the command
@@ -135,9 +153,10 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
zlp->zl_type = type;
zlp->zl_literal = literal;
+ zlp->zl_last_refresh = gethrtime();
if (argc == 0) {
- (void) zpool_iter(g_zfs, add_pool, zlp);
+ (void) zpool_iter(g_zfs, add_pool_cb, zlp);
zlp->zl_findall = B_TRUE;
} else {
int i;
@@ -159,15 +178,61 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
}
/*
- * Search for any new pools, adding them to the list. We only add pools when no
- * options were given on the command line. Otherwise, we keep the list fixed as
- * those that were explicitly specified.
+ * Refresh the state of all pools on the list. Additionally, if no options were
+ * given on the command line, add any new pools and remove any that are no
+ * longer available.
*/
-void
-pool_list_update(zpool_list_t *zlp)
+int
+pool_list_refresh(zpool_list_t *zlp)
{
- if (zlp->zl_findall)
- (void) zpool_iter(g_zfs, add_pool, zlp);
+ zlp->zl_last_refresh = gethrtime();
+
+ if (!zlp->zl_findall) {
+ /*
+ * This list is a fixed list of pools, so we must not add
+ * or remove any. Just walk over them and refresh their
+ * state.
+ */
+ int navail = 0;
+ for (zpool_node_t *node = uu_avl_first(zlp->zl_avl);
+ node != NULL; node = uu_avl_next(zlp->zl_avl, node)) {
+ boolean_t missing;
+ zpool_refresh_stats(node->zn_handle, &missing);
+ navail += !missing;
+ node->zn_last_refresh = zlp->zl_last_refresh;
+ }
+ return (navail);
+ }
+
+ /* Search for any new pools and add them to the list. */
+ (void) zpool_iter(g_zfs, add_pool_cb, zlp);
+
+ /* Walk the list of existing pools, and update or remove them. */
+ zpool_node_t *node, *next;
+ for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next) {
+ next = uu_avl_next(zlp->zl_avl, node);
+
+ /*
+ * Skip any that were refreshed and are online; they were added
+ * by zpool_iter() and are already up to date.
+ */
+ if (node->zn_last_refresh == zlp->zl_last_refresh &&
+ zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL)
+ continue;
+
+ /* Refresh and remove if necessary. */
+ boolean_t missing;
+ zpool_refresh_stats(node->zn_handle, &missing);
+ if (missing) {
+ uu_avl_remove(zlp->zl_avl, node);
+ zpool_close(node->zn_handle);
+ free(node);
+ } else {
+ node->zn_last_refresh = zlp->zl_last_refresh;
+ }
+ }
+
+ return (uu_avl_numnodes(zlp->zl_avl));
}
/*
@@ -191,23 +256,6 @@ pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
}
/*
- * Remove the given pool from the list. When running iostat, we want to remove
- * those pools that no longer exist.
- */
-void
-pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
-{
- zpool_node_t search, *node;
-
- search.zn_handle = zhp;
- if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) {
- uu_avl_remove(zlp->zl_avl, node);
- zpool_close(node->zn_handle);
- free(node);
- }
-}
-
-/*
* Free all the handles associated with this list.
*/
void
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
index 2c46ad0df895..1feec55c0e8b 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -33,7 +33,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
- * Copyright (c) 2021, 2023, Klara Inc.
+ * Copyright (c) 2021, 2023, 2025, Klara, Inc.
* Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP.
*/
@@ -5761,24 +5761,6 @@ children:
return (ret);
}
-static int
-refresh_iostat(zpool_handle_t *zhp, void *data)
-{
- iostat_cbdata_t *cb = data;
- boolean_t missing;
-
- /*
- * If the pool has disappeared, remove it from the list and continue.
- */
- if (zpool_refresh_stats(zhp, &missing) != 0)
- return (-1);
-
- if (missing)
- pool_list_remove(cb->cb_list, zhp);
-
- return (0);
-}
-
/*
* Callback to print out the iostats for the given pool.
*/
@@ -6359,15 +6341,14 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data)
* This command can be tricky because we want to be able to deal with pool
* creation/destruction as well as vdev configuration changes. The bulk of this
* processing is handled by the pool_list_* routines in zpool_iter.c. We rely
- * on pool_list_update() to detect the addition of new pools. Configuration
- * changes are all handled within libzfs.
+ * on pool_list_refresh() to detect the addition and removal of pools.
+ * Configuration changes are all handled within libzfs.
*/
int
zpool_do_iostat(int argc, char **argv)
{
int c;
int ret;
- int npools;
float interval = 0;
unsigned long count = 0;
zpool_list_t *list;
@@ -6618,10 +6599,24 @@ zpool_do_iostat(int argc, char **argv)
return (1);
}
+ int last_npools = 0;
for (;;) {
- if ((npools = pool_list_count(list)) == 0)
+ /*
+ * Refresh all pools in list, adding or removing pools as
+ * necessary.
+ */
+ int npools = pool_list_refresh(list);
+ if (npools == 0) {
(void) fprintf(stderr, gettext("no pools available\n"));
- else {
+ } else {
+ /*
+ * If the list of pools has changed since last time
+ * around, reset the iteration count to force the
+ * header to be redisplayed.
+ */
+ if (last_npools != npools)
+ cb.cb_iteration = 0;
+
/*
* If this is the first iteration and -y was supplied
* we skip any printing.
@@ -6630,15 +6625,6 @@ zpool_do_iostat(int argc, char **argv)
cb.cb_iteration == 0);
/*
- * Refresh all statistics. This is done as an
- * explicit step before calculating the maximum name
- * width, so that any * configuration changes are
- * properly accounted for.
- */
- (void) pool_list_iter(list, B_FALSE, refresh_iostat,
- &cb);
-
- /*
* Iterate over all pools to determine the maximum width
* for the pool / device name column across all pools.
*/
@@ -6691,6 +6677,7 @@ zpool_do_iostat(int argc, char **argv)
if (skip) {
(void) fflush(stdout);
(void) fsleep(interval);
+ last_npools = npools;
continue;
}
@@ -6728,6 +6715,8 @@ zpool_do_iostat(int argc, char **argv)
(void) fflush(stdout);
(void) fsleep(interval);
+
+ last_npools = npools;
}
pool_list_free(list);
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
index 5ab7cb9750f1..3af23c52bd45 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
@@ -76,11 +76,10 @@ typedef struct zpool_list zpool_list_t;
zpool_list_t *pool_list_get(int, char **, zprop_list_t **, zfs_type_t,
boolean_t, int *);
-void pool_list_update(zpool_list_t *);
+int pool_list_refresh(zpool_list_t *);
int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
void pool_list_free(zpool_list_t *);
int pool_list_count(zpool_list_t *);
-void pool_list_remove(zpool_list_t *, zpool_handle_t *);
extern libzfs_handle_t *g_zfs;
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
index 684b46a2d673..088c0108e911 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
@@ -609,22 +609,28 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
ZPOOL_CONFIG_PATH, &path) == 0);
/*
+ * Skip active spares they should never cause
+ * the pool to be evaluated as inconsistent.
+ */
+ if (is_spare(NULL, path))
+ continue;
+
+ /*
* If we have a raidz/mirror that combines disks
- * with files, report it as an error.
+ * with files, only report it as an error when
+ * fatal is set to ensure all the replication
+ * checks aren't skipped in check_replication().
*/
- if (!dontreport && type != NULL &&
+ if (fatal && !dontreport && type != NULL &&
strcmp(type, childtype) != 0) {
if (ret != NULL)
free(ret);
ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication "
- "level: %s contains both "
- "files and devices\n"),
- rep.zprl_type);
- else
- return (NULL);
+ vdev_error(gettext(
+ "mismatched replication "
+ "level: %s contains both "
+ "files and devices\n"),
+ rep.zprl_type);
dontreport = B_TRUE;
}
diff --git a/sys/contrib/openzfs/contrib/intel_qat/readme.md b/sys/contrib/openzfs/contrib/intel_qat/readme.md
index 7e45d395bb80..04c299b6404c 100644
--- a/sys/contrib/openzfs/contrib/intel_qat/readme.md
+++ b/sys/contrib/openzfs/contrib/intel_qat/readme.md
@@ -8,7 +8,7 @@ This contrib contains community compatibility patches to get Intel QAT working o
These patches are based on the following Intel QAT version:
[1.7.l.4.10.0-00014](https://01.org/sites/default/files/downloads/qat1.7.l.4.10.0-00014.tar.gz)
-When using QAT with above kernels versions, the following patches needs to be applied using:
+When using QAT with the above kernel versions, the following patches need to be applied using:
patch -p1 < _$PATCH_
_Where $PATCH refers to the path of the patch in question_
diff --git a/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py
index 971aa1d0d493..bad1af2d1671 100644
--- a/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py
+++ b/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py
@@ -4223,7 +4223,7 @@ class _TempPool(object):
self.getRoot().reset()
return
- # On the Buildbot builders this may fail with "pool is busy"
+ # On the CI builders this may fail with "pool is busy"
# Retry 5 times before raising an error
retry = 0
while True:
diff --git a/sys/contrib/openzfs/etc/init.d/README.md b/sys/contrib/openzfs/etc/init.d/README.md
index da780fdc1222..3852dd9a6b2e 100644
--- a/sys/contrib/openzfs/etc/init.d/README.md
+++ b/sys/contrib/openzfs/etc/init.d/README.md
@@ -1,5 +1,5 @@
DESCRIPTION
- These script were written with the primary intention of being portable and
+ These scripts were written with the primary intention of being portable and
usable on as many systems as possible.
This is, in practice, usually not possible. But the intention is there.
diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h
index 3fcdc176a621..14930fb90622 100644
--- a/sys/contrib/openzfs/include/libzfs.h
+++ b/sys/contrib/openzfs/include/libzfs.h
@@ -479,6 +479,8 @@ _LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, const char **,
_LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
_LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *);
_LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
+_LIBZFS_H void zpool_refresh_stats_from_handle(zpool_handle_t *,
+ zpool_handle_t *);
_LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
_LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *);
diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
index 076dab8ba6dc..214f3ea0e787 100644
--- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
@@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
}
#endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
-/*
- * All the io_*() helper functions below can operate on a bio, or a rq, but
- * not both. The older submit_bio() codepath will pass a bio, and the
- * newer blk-mq codepath will pass a rq.
- */
-static inline int
-io_data_dir(struct bio *bio, struct request *rq)
-{
- if (rq != NULL) {
- if (op_is_write(req_op(rq))) {
- return (WRITE);
- } else {
- return (READ);
- }
- }
- return (bio_data_dir(bio));
-}
-
static inline int
io_is_flush(struct bio *bio, struct request *rq)
{
diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h
index 49ab9d3db795..662fd81c5ee1 100644
--- a/sys/contrib/openzfs/include/sys/fs/zfs.h
+++ b/sys/contrib/openzfs/include/sys/fs/zfs.h
@@ -748,6 +748,8 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
#define ZPOOL_CONFIG_ASHIFT "ashift"
#define ZPOOL_CONFIG_ASIZE "asize"
+#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc"
+#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc"
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
diff --git a/sys/contrib/openzfs/include/sys/range_tree.h b/sys/contrib/openzfs/include/sys/range_tree.h
index 0f6884682459..0f6def36f9f6 100644
--- a/sys/contrib/openzfs/include/sys/range_tree.h
+++ b/sys/contrib/openzfs/include/sys/range_tree.h
@@ -238,8 +238,7 @@ zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end)
}
static inline void
-zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt,
- uint64_t fill)
+zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill)
{
ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
switch (rt->rt_type) {
@@ -277,7 +276,7 @@ static inline void
zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill)
{
ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift));
- zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift);
+ zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift);
}
typedef void zfs_range_tree_func_t(void *arg, uint64_t start, uint64_t size);
diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h
index 66db16b33c51..f172f2af6f07 100644
--- a/sys/contrib/openzfs/include/sys/spa.h
+++ b/sys/contrib/openzfs/include/sys/spa.h
@@ -1030,7 +1030,7 @@ extern void spa_import_progress_set_notes_nolog(spa_t *spa,
extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag,
krw_t rw);
extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw);
-extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag,
+extern void spa_config_enter_priority(spa_t *spa, int locks, const void *tag,
krw_t rw);
extern void spa_config_exit(spa_t *spa, int locks, const void *tag);
extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
@@ -1084,6 +1084,7 @@ extern pool_state_t spa_state(spa_t *spa);
extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
+extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max);
extern uint64_t spa_get_dspace(spa_t *spa);
extern uint64_t spa_get_checkpoint_space(spa_t *spa);
extern uint64_t spa_get_slop_space(spa_t *spa);
diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h
index 07a959db3447..62b062984d36 100644
--- a/sys/contrib/openzfs/include/sys/spa_impl.h
+++ b/sys/contrib/openzfs/include/sys/spa_impl.h
@@ -265,6 +265,7 @@ struct spa {
uint64_t spa_min_ashift; /* of vdevs in normal class */
uint64_t spa_max_ashift; /* of vdevs in normal class */
uint64_t spa_min_alloc; /* of vdevs in normal class */
+ uint64_t spa_max_alloc; /* of vdevs in normal class */
uint64_t spa_gcd_alloc; /* of vdevs in normal class */
uint64_t spa_config_guid; /* config pool guid */
uint64_t spa_load_guid; /* spa_load initialized guid */
diff --git a/sys/contrib/openzfs/include/sys/zfs_ioctl.h b/sys/contrib/openzfs/include/sys/zfs_ioctl.h
index 8174242abdac..cfe11f43bb8e 100644
--- a/sys/contrib/openzfs/include/sys/zfs_ioctl.h
+++ b/sys/contrib/openzfs/include/sys/zfs_ioctl.h
@@ -455,6 +455,7 @@ typedef enum zinject_type {
ZINJECT_DECRYPT_FAULT,
ZINJECT_DELAY_IMPORT,
ZINJECT_DELAY_EXPORT,
+ ZINJECT_DELAY_READY,
} zinject_type_t;
typedef enum zinject_iotype {
diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h
index a8acb83b4c2f..acb0a03a36b2 100644
--- a/sys/contrib/openzfs/include/sys/zio.h
+++ b/sys/contrib/openzfs/include/sys/zio.h
@@ -718,6 +718,7 @@ extern void zio_handle_ignored_writes(zio_t *zio);
extern hrtime_t zio_handle_io_delay(zio_t *zio);
extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed);
extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed);
+extern hrtime_t zio_handle_ready_delay(zio_t *zio);
/*
* Checksum ereport functions
diff --git a/sys/contrib/openzfs/lib/libuutil/libuutil.abi b/sys/contrib/openzfs/lib/libuutil/libuutil.abi
index 6c736c61e4a5..2a740afa07ca 100644
--- a/sys/contrib/openzfs/lib/libuutil/libuutil.abi
+++ b/sys/contrib/openzfs/lib/libuutil/libuutil.abi
@@ -616,6 +616,7 @@
<array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'>
<subrange length='23' type-id='7359adad' id='fdd0f594'/>
</array-type-def>
+ <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
<array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'>
<subrange length='4' type-id='7359adad' id='16fe7105'/>
</array-type-def>
@@ -1020,13 +1021,6 @@
<array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'>
<subrange length='3' type-id='7359adad' id='56f209d2'/>
</array-type-def>
- <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'>
- <subrange length='1' type-id='7359adad' id='52f813b4'/>
- </array-type-def>
- <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'>
- <subrange length='12' type-id='7359adad' id='84827bdc'/>
- </array-type-def>
- <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
<class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='mnt_special' type-id='26a90f95' visibility='default'/>
@@ -1061,93 +1055,6 @@
<var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/>
</data-member>
</class-decl>
- <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/>
- <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/>
- <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/>
- <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/>
- <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/>
- <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'>
- <data-member access='public' layout-offset-in-bits='0'>
- <var-decl name='tv_sec' type-id='49659421' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='64'>
- <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='96'>
- <var-decl name='__reserved' type-id='3158a266' visibility='default'/>
- </data-member>
- </class-decl>
- <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'>
- <data-member access='public' layout-offset-in-bits='0'>
- <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='32'>
- <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='64'>
- <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='128'>
- <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='160'>
- <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='192'>
- <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='224'>
- <var-decl name='stx_mode' type-id='d315442e' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='240'>
- <var-decl name='__spare0' type-id='811205dc' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='256'>
- <var-decl name='stx_ino' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='320'>
- <var-decl name='stx_size' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='384'>
- <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='448'>
- <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='512'>
- <var-decl name='stx_atime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='640'>
- <var-decl name='stx_btime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='768'>
- <var-decl name='stx_ctime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='896'>
- <var-decl name='stx_mtime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1024'>
- <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1056'>
- <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1088'>
- <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1120'>
- <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1152'>
- <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1216'>
- <var-decl name='__spare2' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1280'>
- <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/>
- </data-member>
- </class-decl>
<class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/>
@@ -1237,8 +1144,6 @@
<pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
<pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/>
<qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/>
- <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/>
- <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/>
<function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='e75a27e9'/>
<parameter type-id='3cad23cd'/>
@@ -1254,14 +1159,6 @@
<parameter type-id='95e97e5e'/>
<return type-id='26a90f95'/>
</function-decl>
- <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'>
- <parameter type-id='95e97e5e'/>
- <parameter type-id='9d26089a'/>
- <parameter type-id='95e97e5e'/>
- <parameter type-id='f0981eeb'/>
- <parameter type-id='31d265b7'/>
- <return type-id='95e97e5e'/>
- </function-decl>
<function-decl name='__fprintf_chk' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='e75a27e9'/>
<parameter type-id='95e97e5e'/>
diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi
index 184ea4a55b43..f988d27a286a 100644
--- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi
@@ -571,6 +571,7 @@
<elf-symbol name='zpool_props_refresh' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_read_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_refresh_stats' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='zpool_refresh_stats_from_handle' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_reguid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_reopen_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -641,7 +642,7 @@
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2632' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
- <elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='zfs_deleg_perm_tab' size='544' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_max_dataset_nesting' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_userquota_prop_prefixes' size='96' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -1458,103 +1459,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'>
- <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'>
- <subrange length='1' type-id='7359adad' id='52f813b4'/>
- </array-type-def>
- <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'>
- <subrange length='12' type-id='7359adad' id='84827bdc'/>
- </array-type-def>
- <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/>
- <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/>
- <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/>
- <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/>
- <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/>
- <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'>
- <data-member access='public' layout-offset-in-bits='0'>
- <var-decl name='tv_sec' type-id='49659421' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='64'>
- <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='96'>
- <var-decl name='__reserved' type-id='3158a266' visibility='default'/>
- </data-member>
- </class-decl>
- <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'>
- <data-member access='public' layout-offset-in-bits='0'>
- <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='32'>
- <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='64'>
- <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='128'>
- <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='160'>
- <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='192'>
- <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='224'>
- <var-decl name='stx_mode' type-id='d315442e' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='240'>
- <var-decl name='__spare0' type-id='811205dc' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='256'>
- <var-decl name='stx_ino' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='320'>
- <var-decl name='stx_size' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='384'>
- <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='448'>
- <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='512'>
- <var-decl name='stx_atime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='640'>
- <var-decl name='stx_btime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='768'>
- <var-decl name='stx_ctime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='896'>
- <var-decl name='stx_mtime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1024'>
- <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1056'>
- <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1088'>
- <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1120'>
- <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1152'>
- <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1216'>
- <var-decl name='__spare2' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1280'>
- <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/>
- </data-member>
- </class-decl>
<pointer-type-def type-id='56fe4a37' size-in-bits='64' id='b6b61d2f'/>
<qualified-type-def type-id='b6b61d2f' restrict='yes' id='3cad23cd'/>
- <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/>
- <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/>
<function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='e75a27e9'/>
<parameter type-id='3cad23cd'/>
@@ -1566,14 +1472,6 @@
<parameter type-id='822cd80b'/>
<return type-id='95e97e5e'/>
</function-decl>
- <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'>
- <parameter type-id='95e97e5e'/>
- <parameter type-id='9d26089a'/>
- <parameter type-id='95e97e5e'/>
- <parameter type-id='f0981eeb'/>
- <parameter type-id='31d265b7'/>
- <return type-id='95e97e5e'/>
- </function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libspl/timestamp.c' language='LANG_C99'>
<typedef-decl name='nl_item' type-id='95e97e5e' id='03b79a94'/>
@@ -3194,6 +3092,10 @@
<parameter type-id='dace003f'/>
<return type-id='80f4b756'/>
</function-decl>
+ <function-decl name='fnvlist_dup' visibility='default' binding='global' size-in-bits='64'>
+ <parameter type-id='22cce67b'/>
+ <return type-id='5ce45b60'/>
+ </function-decl>
<function-decl name='fnvpair_value_nvlist' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='3fa542f0'/>
<return type-id='5ce45b60'/>
@@ -3238,6 +3140,11 @@
<parameter type-id='37e3bd22' name='missing'/>
<return type-id='95e97e5e'/>
</function-decl>
+ <function-decl name='zpool_refresh_stats_from_handle' mangled-name='zpool_refresh_stats_from_handle' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_refresh_stats_from_handle'>
+ <parameter type-id='4c81de99' name='dzhp'/>
+ <parameter type-id='4c81de99' name='szhp'/>
+ <return type-id='48b5725f'/>
+ </function-decl>
<function-decl name='zpool_skip_pool' mangled-name='zpool_skip_pool' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_skip_pool'>
<parameter type-id='80f4b756' name='poolname'/>
<return type-id='c19b74c3'/>
@@ -9398,10 +9305,6 @@
<parameter type-id='5ce45b60'/>
<return type-id='48b5725f'/>
</function-decl>
- <function-decl name='fnvlist_dup' visibility='default' binding='global' size-in-bits='64'>
- <parameter type-id='22cce67b'/>
- <return type-id='5ce45b60'/>
- </function-decl>
<function-decl name='spl_pagesize' mangled-name='spl_pagesize' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='spl_pagesize'>
<return type-id='b59d7dce'/>
</function-decl>
@@ -9774,8 +9677,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfs_deleg.c' language='LANG_C99'>
- <array-type-def dimensions='1' type-id='fa1870fd' size-in-bits='4224' id='55e705e7'>
- <subrange length='33' type-id='7359adad' id='6a5934df'/>
+ <array-type-def dimensions='1' type-id='fa1870fd' size-in-bits='4352' id='55f84f08'>
+ <subrange length='34' type-id='7359adad' id='6a6a7e00'/>
</array-type-def>
<array-type-def dimensions='1' type-id='fa1870fd' size-in-bits='infinite' id='7c00e69d'>
<subrange length='infinite' id='031f2035'/>
@@ -9805,30 +9708,31 @@
<enumerator name='ZFS_DELEG_NOTE_PROMOTE' value='5'/>
<enumerator name='ZFS_DELEG_NOTE_RENAME' value='6'/>
<enumerator name='ZFS_DELEG_NOTE_SEND' value='7'/>
- <enumerator name='ZFS_DELEG_NOTE_RECEIVE' value='8'/>
- <enumerator name='ZFS_DELEG_NOTE_ALLOW' value='9'/>
- <enumerator name='ZFS_DELEG_NOTE_USERPROP' value='10'/>
- <enumerator name='ZFS_DELEG_NOTE_MOUNT' value='11'/>
- <enumerator name='ZFS_DELEG_NOTE_SHARE' value='12'/>
- <enumerator name='ZFS_DELEG_NOTE_USERQUOTA' value='13'/>
- <enumerator name='ZFS_DELEG_NOTE_GROUPQUOTA' value='14'/>
- <enumerator name='ZFS_DELEG_NOTE_USERUSED' value='15'/>
- <enumerator name='ZFS_DELEG_NOTE_GROUPUSED' value='16'/>
- <enumerator name='ZFS_DELEG_NOTE_USEROBJQUOTA' value='17'/>
- <enumerator name='ZFS_DELEG_NOTE_GROUPOBJQUOTA' value='18'/>
- <enumerator name='ZFS_DELEG_NOTE_USEROBJUSED' value='19'/>
- <enumerator name='ZFS_DELEG_NOTE_GROUPOBJUSED' value='20'/>
- <enumerator name='ZFS_DELEG_NOTE_HOLD' value='21'/>
- <enumerator name='ZFS_DELEG_NOTE_RELEASE' value='22'/>
- <enumerator name='ZFS_DELEG_NOTE_DIFF' value='23'/>
- <enumerator name='ZFS_DELEG_NOTE_BOOKMARK' value='24'/>
- <enumerator name='ZFS_DELEG_NOTE_LOAD_KEY' value='25'/>
- <enumerator name='ZFS_DELEG_NOTE_CHANGE_KEY' value='26'/>
- <enumerator name='ZFS_DELEG_NOTE_PROJECTUSED' value='27'/>
- <enumerator name='ZFS_DELEG_NOTE_PROJECTQUOTA' value='28'/>
- <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJUSED' value='29'/>
- <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJQUOTA' value='30'/>
- <enumerator name='ZFS_DELEG_NOTE_NONE' value='31'/>
+ <enumerator name='ZFS_DELEG_NOTE_SEND_RAW' value='8'/>
+ <enumerator name='ZFS_DELEG_NOTE_RECEIVE' value='9'/>
+ <enumerator name='ZFS_DELEG_NOTE_ALLOW' value='10'/>
+ <enumerator name='ZFS_DELEG_NOTE_USERPROP' value='11'/>
+ <enumerator name='ZFS_DELEG_NOTE_MOUNT' value='12'/>
+ <enumerator name='ZFS_DELEG_NOTE_SHARE' value='13'/>
+ <enumerator name='ZFS_DELEG_NOTE_USERQUOTA' value='14'/>
+ <enumerator name='ZFS_DELEG_NOTE_GROUPQUOTA' value='15'/>
+ <enumerator name='ZFS_DELEG_NOTE_USERUSED' value='16'/>
+ <enumerator name='ZFS_DELEG_NOTE_GROUPUSED' value='17'/>
+ <enumerator name='ZFS_DELEG_NOTE_USEROBJQUOTA' value='18'/>
+ <enumerator name='ZFS_DELEG_NOTE_GROUPOBJQUOTA' value='19'/>
+ <enumerator name='ZFS_DELEG_NOTE_USEROBJUSED' value='20'/>
+ <enumerator name='ZFS_DELEG_NOTE_GROUPOBJUSED' value='21'/>
+ <enumerator name='ZFS_DELEG_NOTE_HOLD' value='22'/>
+ <enumerator name='ZFS_DELEG_NOTE_RELEASE' value='23'/>
+ <enumerator name='ZFS_DELEG_NOTE_DIFF' value='24'/>
+ <enumerator name='ZFS_DELEG_NOTE_BOOKMARK' value='25'/>
+ <enumerator name='ZFS_DELEG_NOTE_LOAD_KEY' value='26'/>
+ <enumerator name='ZFS_DELEG_NOTE_CHANGE_KEY' value='27'/>
+ <enumerator name='ZFS_DELEG_NOTE_PROJECTUSED' value='28'/>
+ <enumerator name='ZFS_DELEG_NOTE_PROJECTQUOTA' value='29'/>
+ <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJUSED' value='30'/>
+ <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJQUOTA' value='31'/>
+ <enumerator name='ZFS_DELEG_NOTE_NONE' value='32'/>
</enum-decl>
<typedef-decl name='zfs_deleg_note_t' type-id='729d4547' id='4613c173'/>
<class-decl name='zfs_deleg_perm_tab' size-in-bits='128' is-struct='yes' visibility='default' id='5aa05c1f'>
diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_config.c b/sys/contrib/openzfs/lib/libzfs/libzfs_config.c
index 0d2102191389..9d704e4303ff 100644
--- a/sys/contrib/openzfs/lib/libzfs/libzfs_config.c
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs_config.c
@@ -308,6 +308,23 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
}
/*
+ * Copies the pool config and state from szhp to dzhp. szhp and dzhp must
+ * represent the same pool. Used by pool_list_refresh() to avoid another
+ * round-trip into the kernel to get stats already collected earlier in the
+ * function.
+ */
+void
+zpool_refresh_stats_from_handle(zpool_handle_t *dzhp, zpool_handle_t *szhp)
+{
+ VERIFY0(strcmp(dzhp->zpool_name, szhp->zpool_name));
+ nvlist_free(dzhp->zpool_old_config);
+ dzhp->zpool_old_config = dzhp->zpool_config;
+ dzhp->zpool_config = fnvlist_dup(szhp->zpool_config);
+ dzhp->zpool_config_size = szhp->zpool_config_size;
+ dzhp->zpool_state = szhp->zpool_state;
+}
+
+/*
* The following environment variables are undocumented
* and should be used for testing purposes only:
*
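
For context, a minimal userland sketch of how the new zpool_refresh_stats_from_handle() call might be used; the pool name "tank", the double open, and the omitted error handling are illustrative assumptions, not part of this change:

	#include <libzfs.h>

	/*
	 * Hypothetical libzfs consumer: refresh stats for one handle with a
	 * kernel round-trip, then copy them to a second handle of the same
	 * pool without a second ioctl.
	 */
	int
	main(void)
	{
		libzfs_handle_t *hdl = libzfs_init();
		zpool_handle_t *src = zpool_open(hdl, "tank");
		zpool_handle_t *dst = zpool_open(hdl, "tank");
		boolean_t missing;

		if (src != NULL && dst != NULL) {
			(void) zpool_refresh_stats(src, &missing);	/* kernel round-trip */
			zpool_refresh_stats_from_handle(dst, src);	/* in-process copy */
		}
		if (src != NULL)
			zpool_close(src);
		if (dst != NULL)
			zpool_close(dst);
		libzfs_fini(hdl);
		return (0);
	}
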
diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi
index 7464b3adb254..263cad045f7a 100644
--- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi
+++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi
@@ -617,6 +617,7 @@
<array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'>
<subrange length='23' type-id='7359adad' id='fdd0f594'/>
</array-type-def>
+ <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
<array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'>
<subrange length='4' type-id='7359adad' id='16fe7105'/>
</array-type-def>
@@ -988,13 +989,6 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'>
- <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'>
- <subrange length='1' type-id='7359adad' id='52f813b4'/>
- </array-type-def>
- <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'>
- <subrange length='12' type-id='7359adad' id='84827bdc'/>
- </array-type-def>
- <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
<class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='mnt_special' type-id='26a90f95' visibility='default'/>
@@ -1029,93 +1023,6 @@
<var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/>
</data-member>
</class-decl>
- <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/>
- <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/>
- <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/>
- <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/>
- <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/>
- <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'>
- <data-member access='public' layout-offset-in-bits='0'>
- <var-decl name='tv_sec' type-id='49659421' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='64'>
- <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='96'>
- <var-decl name='__reserved' type-id='3158a266' visibility='default'/>
- </data-member>
- </class-decl>
- <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'>
- <data-member access='public' layout-offset-in-bits='0'>
- <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='32'>
- <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='64'>
- <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='128'>
- <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='160'>
- <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='192'>
- <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='224'>
- <var-decl name='stx_mode' type-id='d315442e' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='240'>
- <var-decl name='__spare0' type-id='811205dc' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='256'>
- <var-decl name='stx_ino' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='320'>
- <var-decl name='stx_size' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='384'>
- <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='448'>
- <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='512'>
- <var-decl name='stx_atime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='640'>
- <var-decl name='stx_btime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='768'>
- <var-decl name='stx_ctime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='896'>
- <var-decl name='stx_mtime' type-id='94101016' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1024'>
- <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1056'>
- <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1088'>
- <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1120'>
- <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1152'>
- <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1216'>
- <var-decl name='__spare2' type-id='d3130597' visibility='default'/>
- </data-member>
- <data-member access='public' layout-offset-in-bits='1280'>
- <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/>
- </data-member>
- </class-decl>
<class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/>
@@ -1191,8 +1098,6 @@
<pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
<pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/>
<qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/>
- <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/>
- <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/>
<function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='e75a27e9'/>
<parameter type-id='3cad23cd'/>
@@ -1208,14 +1113,6 @@
<parameter type-id='95e97e5e'/>
<return type-id='26a90f95'/>
</function-decl>
- <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'>
- <parameter type-id='95e97e5e'/>
- <parameter type-id='9d26089a'/>
- <parameter type-id='95e97e5e'/>
- <parameter type-id='f0981eeb'/>
- <parameter type-id='31d265b7'/>
- <return type-id='95e97e5e'/>
- </function-decl>
<function-decl name='stat64' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='9d26089a'/>
<parameter type-id='f1cadedf'/>
diff --git a/sys/contrib/openzfs/man/man8/zinject.8 b/sys/contrib/openzfs/man/man8/zinject.8
index 1d9e43aed5ec..704f6a7accd8 100644
--- a/sys/contrib/openzfs/man/man8/zinject.8
+++ b/sys/contrib/openzfs/man/man8/zinject.8
@@ -138,6 +138,20 @@ This injector is automatically cleared after the import is finished.
.
.It Xo
.Nm zinject
+.Fl E Ar delay
+.Op Fl a
+.Op Fl m
+.Op Fl f Ar freq
+.Op Fl l Ar level
+.Op Fl r Ar range
+.Op Fl T Ar iotype
+.Op Fl t Ar type Ns | Ns Fl b Ar bookmark
+.Xc
+Inject pipeline ready stage delays for the given object or bookmark.
+The delay is specified in milliseconds.
+.
+.It Xo
+.Nm zinject
.Fl I
.Op Fl s Ar seconds Ns | Ns Fl g Ar txgs
.Ar pool
diff --git a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 b/sys/contrib/openzfs/man/man8/zpool-upgrade.8
index cf69060da5ce..adae47f82eb1 100644
--- a/sys/contrib/openzfs/man/man8/zpool-upgrade.8
+++ b/sys/contrib/openzfs/man/man8/zpool-upgrade.8
@@ -65,10 +65,10 @@ property).
.Cm upgrade
.Fl v
.Xc
-Displays legacy ZFS versions supported by the this version of ZFS.
+Displays legacy ZFS versions supported by this version of ZFS.
See
.Xr zpool-features 7
-for a description of feature flags features supported by this version of ZFS.
+for a description of features supported by this version of ZFS.
.It Xo
.Nm zpool
.Cm upgrade
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index bac166fcd89e..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -484,7 +484,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (rq != NULL) {
+ /*
+ * Flush & trim requests go down the zvol_write codepath. Or
+ * more specifically:
+ *
+ * If request is a write, or if it's op_is_sync() and not a
+ * read, or if it's a flush, or if it's a discard, then send the
+ * request down the write path.
+ */
+ if (op_is_write(rq->cmd_flags) ||
+ (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+ req_op(rq) == REQ_OP_FLUSH ||
+ op_is_discard(rq->cmd_flags)) {
+ rw = WRITE;
+ } else {
+ rw = READ;
+ }
+ } else {
+ rw = bio_data_dir(bio);
+ }
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
zvol_end_io(bio, rq, SET_ERROR(ENXIO));
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 864e3898b365..9190ae0362ea 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -364,8 +364,8 @@ zfs_prop_init(void)
static const zprop_index_t xattr_table[] = {
{ "off", ZFS_XATTR_OFF },
- { "on", ZFS_XATTR_SA },
{ "sa", ZFS_XATTR_SA },
+ { "on", ZFS_XATTR_SA },
{ "dir", ZFS_XATTR_DIR },
{ NULL }
};
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index bd6dc8edd8ca..591e2dade59e 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -1392,6 +1392,7 @@ arc_get_complevel(arc_buf_t *buf)
return (buf->b_hdr->b_complevel);
}
+__maybe_unused
static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 6c150d31c669..e88d394b5229 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -2656,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
}
/*
+ * Adjust *offset to the next (or previous) block byte offset at lvl.
+ * Returns FALSE if *offset would overflow or underflow.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
+{
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int span = lvl * epbs + dn->dn_datablkshift;
+ uint64_t blkid, maxblkid;
+
+ if (span >= 8 * sizeof (uint64_t))
+ return (B_FALSE);
+
+ blkid = *offset >> span;
+ maxblkid = 1ULL << (8 * sizeof (*offset) - span);
+ if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
+ *offset = (blkid + 1) << span;
+ else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
+ *offset = (blkid << span) - 1;
+ else
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
* Find the next hole, data, or sparse region at or after *offset.
* The value 'blkfill' tells us how many items we expect to find
* in an L0 data block; this value is 1 for normal objects,
@@ -2682,7 +2708,7 @@ int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
- uint64_t initial_offset = *offset;
+ uint64_t matched = *offset;
int lvl, maxlvl;
int error = 0;
@@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
maxlvl = dn->dn_phys->dn_nlevels;
- for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ for (lvl = minlvl; lvl <= maxlvl; ) {
error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg);
- if (error != ESRCH)
+ if (error == 0 && lvl > minlvl) {
+ --lvl;
+ matched = *offset;
+ } else if (error == ESRCH && lvl < maxlvl &&
+ dnode_next_block(dn, flags, &matched, lvl)) {
+ /*
+ * Continue search at next/prev offset in lvl+1 block.
+ *
+ * Usually we only search upwards at the start of the
+ * search as higher level blocks point at a matching
+ * minlvl block in most cases, but we backtrack if not.
+ *
+ * This can happen for txg > 0 searches if the block
+ * contains only BPs/dnodes freed at that txg. It also
+ * happens if we are still syncing out the tree, and
+ * some BP's at higher levels are not updated yet.
+ *
+ * We must adjust offset to avoid coming back to the
+ * same offset and getting stuck looping forever. This
+ * also deals with the case where offset is already at
+ * the beginning or end of the object.
+ */
+ ++lvl;
+ *offset = matched;
+ } else {
break;
- }
-
- while (error == 0 && --lvl >= minlvl) {
- error = dnode_next_offset_level(dn,
- flags, offset, lvl, blkfill, txg);
+ }
}
/*
@@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
error = 0;
}
- if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
- initial_offset < *offset : initial_offset > *offset))
- error = SET_ERROR(ESRCH);
out:
if (!(flags & DNODE_FIND_HAVELOCK))
rw_exit(&dn->dn_struct_rwlock);
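
The offset arithmetic in dnode_next_block() is easier to see with concrete numbers; here is a small standalone sketch, assuming typical (not required) shifts of 128K data blocks and 128K indirect blocks of 128-byte block pointers:

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Toy model of the offset step in dnode_next_block(): at level lvl a
	 * single block covers 2^(lvl * epbs + datablkshift) bytes, so the
	 * forward search resumes at the start of the following block and the
	 * backward search at the last byte of the preceding one.
	 */
	int
	main(void)
	{
		int epbs = 10;		/* 128K indirect blocks / 128-byte blkptrs */
		int datablkshift = 17;	/* 128K data blocks */
		int lvl = 1;
		int span = lvl * epbs + datablkshift;	/* 27: an L1 block spans 128 MiB */

		uint64_t offset = 200ULL << 20;		/* 200 MiB into the object */
		uint64_t blkid = offset >> span;	/* L1 blkid 1 */
		uint64_t next = (blkid + 1) << span;	/* 256 MiB, start of L1 blkid 2 */
		uint64_t prev = (blkid << span) - 1;	/* last byte covered by blkid 0 */

		printf("blkid=%llu next=%llu prev=%llu\n", (unsigned long long)blkid,
		    (unsigned long long)next, (unsigned long long)prev);
		return (0);
	}
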
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 7db72b9b04b0..fd46127b6068 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa)
uint64_t offset;
hrtime_t lock_acquire_time = gethrtime();
- spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
+ spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index ea2d2c7227c8..d73195f1a21f 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
* the size, since we do not support removing partial segments
* of range trees with gaps.
*/
- zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
+ zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
zfs_rs_get_start_raw(rs, rt));
zfs_range_tree_stat_incr(rt, &rs_tmp);
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index cf28955b0c50..f615591e826b 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc);
if (spa->spa_comment != NULL)
fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
spa->spa_comment);
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 6f7c060f97f8..0bead6d49666 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
static void
spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
- int mmp_flag)
+ int priority_flag)
{
(void) tag;
int wlocks_held = 0;
@@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
mutex_enter(&scl->scl_lock);
if (rw == RW_READER) {
while (scl->scl_writer ||
- (!mmp_flag && scl->scl_write_wanted)) {
+ (!priority_flag && scl->scl_write_wanted)) {
cv_wait(&scl->scl_cv, &scl->scl_lock);
}
} else {
@@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
}
/*
- * The spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * The spa_config_enter_priority() allows the mmp thread to cut in front of
* outstanding write lock requests. This is needed since the mmp updates are
* time sensitive and failure to service them promptly will result in a
* suspended pool. This pool suspension has been seen in practice when there is
@@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
*/
void
-spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw)
{
spa_config_enter_impl(spa, locks, tag, rw, 1);
}
@@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
spa->spa_min_alloc = INT_MAX;
+ spa->spa_max_alloc = 0;
spa->spa_gcd_alloc = INT_MAX;
/* Reset cached value */
@@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
}
/*
+ * Return the range of minimum allocation sizes for the normal allocation
+ * class. This can be used by external consumers of the DMU to estimate
+ * potential wasted capacity when setting the recordsize for an object.
+ * This is mainly for dRAID pools which always pad to a full stripe width.
+ */
+void
+spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc)
+{
+ *min_alloc = spa->spa_min_alloc;
+ *max_alloc = spa->spa_max_alloc;
+}
+
+/*
* Return the amount of slop space in bytes. It is typically 1/32 of the pool
* (3.2%), minus the embedded log space. On very small pools, it may be
* slightly larger than this. On very large pools, it will be capped to
@@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version);
EXPORT_SYMBOL(spa_state);
EXPORT_SYMBOL(spa_load_state);
EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */
EXPORT_SYMBOL(spa_get_dspace);
EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate);
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index fc6d445f9785..654e034de9e1 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
{
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
- if (spa->spa_gcd_alloc == INT_MAX) {
+
+ if (min_alloc > spa->spa_max_alloc)
+ spa->spa_max_alloc = min_alloc;
+
+ if (spa->spa_gcd_alloc == INT_MAX)
spa->spa_gcd_alloc = min_alloc;
- } else {
- spa->spa_gcd_alloc = vdev_gcd(min_alloc,
- spa->spa_gcd_alloc);
- }
+ else
+ spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc);
}
void
@@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd)
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
- uint64_t min_alloc = vdev_get_min_alloc(vd);
- vdev_spa_set_alloc(spa, min_alloc);
+ vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
}
}
}
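
The three aggregates that vdev_spa_set_alloc() now maintains can be illustrated with a small standalone walk over hypothetical per-vdev minimum allocation sizes (the values below are assumptions for the example, not real vdev output):

	#include <stdio.h>
	#include <stdint.h>

	/* Euclid's algorithm, standing in for vdev_gcd(). */
	static uint64_t
	gcd(uint64_t a, uint64_t b)
	{
		while (b != 0) {
			uint64_t t = b;
			b = a % b;
			a = t;
		}
		return (a);
	}

	/*
	 * Track the smallest, largest, and GCD of the per-vdev minimum
	 * allocation sizes, mirroring spa_min_alloc, spa_max_alloc, and
	 * spa_gcd_alloc.
	 */
	int
	main(void)
	{
		uint64_t min_allocs[] = { 4096, 12288, 8192 };	/* hypothetical vdevs */
		uint64_t spa_min = UINT64_MAX, spa_max = 0, spa_gcd = 0;

		for (int i = 0; i < 3; i++) {
			uint64_t m = min_allocs[i];
			if (m < spa_min)
				spa_min = m;
			if (m > spa_max)
				spa_max = m;
			spa_gcd = (spa_gcd == 0) ? m : gcd(m, spa_gcd);
		}
		printf("min=%llu max=%llu gcd=%llu\n", (unsigned long long)spa_min,
		    (unsigned long long)spa_max, (unsigned long long)spa_gcd);
		return (0);
	}
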
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index c44f654b0261..0d4fdaa77ba0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC,
+ vdev_get_min_alloc(vd));
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
if (vd->vdev_noalloc) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 4cf8912d4269..aeea58bedfe4 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio)
ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]);
if (vd == NULL) {
- if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
- spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+ if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) {
+ /*
+ * A deadlock workaround. The ddt_prune_unique_entries()
+ * -> prune_candidates_sync() code path takes the
+ * SCL_ZIO reader lock and may request it again here.
+ * If there is another thread who wants the SCL_ZIO
+ * writer lock, then scl_write_wanted will be set.
+ * Thus, the spa_config_enter_priority() is used to
+ * ignore pending writer requests.
+ *
+ * The locking should be revised to remove the need
+ * for this workaround. If that's not workable then
+ * it should only be applied to the zios involved in
+ * the pruning process. This impacts the read/write
+ * I/O balance while pruning.
+ */
+ if (spa->spa_active_ddt_prune)
+ spa_config_enter_priority(spa, SCL_ZIO, zio,
+ RW_READER);
+ else
+ spa_config_enter(spa, SCL_ZIO, zio,
+ RW_READER);
+ }
/*
* The mirror_ops handle multiple DVAs in a single BP.
@@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio)
return (NULL);
}
+ if (zio_injection_enabled) {
+ hrtime_t target = zio_handle_ready_delay(zio);
+ if (target != 0 && zio->io_target_timestamp == 0) {
+ zio->io_stage >>= 1;
+ zio->io_target_timestamp = target;
+ zio_delay_interrupt(zio);
+ return (NULL);
+ }
+ }
+
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index 981a1be4847c..287577018ed1 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}
+/*
+ * For testing, inject a delay before ready state.
+ */
+hrtime_t
+zio_handle_ready_delay(zio_t *zio)
+{
+ inject_handler_t *handler;
+ hrtime_t now = gethrtime();
+ hrtime_t target = 0;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DELAY_READY)
+ continue;
+
+ /* If this handler matches, inject the delay */
+ if (zio_match_iotype(zio, handler->zi_record.zi_iotype) &&
+ zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ zio_match_dva(zio), &handler->zi_record, zio->io_error)) {
+ target = now + (hrtime_t)handler->zi_record.zi_timer;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+ return (target);
+}
+
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
index 3db196953f74..c403c001086a 100644
--- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c
+++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
@@ -441,64 +441,6 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
}
#ifndef IN_LIBSA
-static size_t
-zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
- int level)
-{
- int16_t zstd_level;
- if (zstd_enum_to_level(level, &zstd_level)) {
- ZSTDSTAT_BUMP(zstd_stat_com_inval);
- return (s_len);
- }
- /*
- * A zstd early abort heuristic.
- *
- * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
- * 128k), don't try any of this, just go.
- * (because experimentally that was a reasonable cutoff for a perf win
- * with tiny ratio change)
- * - First, we try LZ4 compression, and if it doesn't early abort, we
- * jump directly to whatever compression level we intended to try.
- * - Second, we try zstd-1 - if that errors out (usually, but not
- * exclusively, if it would overflow), we give up early.
- *
- * If it works, instead we go on and compress anyway.
- *
- * Why two passes? LZ4 alone gets you a lot of the way, but on highly
- * compressible data, it was losing up to 8.5% of the compressed
- * savings versus no early abort, and all the zstd-fast levels are
- * worse indications on their own than LZ4, and don't improve the LZ4
- * pass noticably if stacked like this.
- */
- size_t actual_abort_size = zstd_abort_size;
- if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
- s_len >= actual_abort_size) {
- int pass_len = 1;
- pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0);
- if (pass_len < d_len) {
- ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
- goto keep_trying;
- }
- ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
-
- pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
- ZIO_ZSTD_LEVEL_1);
- if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
- ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
- return (s_len);
- }
- ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
- } else {
- ZSTDSTAT_BUMP(zstd_stat_passignored);
- if (s_len < actual_abort_size) {
- ZSTDSTAT_BUMP(zstd_stat_passignored_size);
- }
- }
-keep_trying:
- return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
-
-}
-
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
diff --git a/sys/contrib/openzfs/scripts/zfs-tests.sh b/sys/contrib/openzfs/scripts/zfs-tests.sh
index 04f3b6f32cb8..5a0a1a609448 100755
--- a/sys/contrib/openzfs/scripts/zfs-tests.sh
+++ b/sys/contrib/openzfs/scripts/zfs-tests.sh
@@ -38,6 +38,7 @@ DEBUG=""
CLEANUP="yes"
CLEANUPALL="no"
KMSG=""
+TIMEOUT_DEBUG=""
LOOPBACK="yes"
STACK_TRACER="no"
FILESIZE="4G"
@@ -364,6 +365,7 @@ OPTIONS:
-k Disable cleanup after test failure
-K Log test names to /dev/kmsg
-f Use files only, disables block device tests
+ -O Dump debugging info to /dev/kmsg on test timeout
-S Enable stack tracer (negative performance impact)
-c Only create and populate constrained path
-R Automatically rerun failing tests
@@ -402,7 +404,7 @@ $0 -x
EOF
}
-while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
+while getopts 'hvqxkKfScRmOn:d:Ds:r:?t:T:u:I:' OPTION; do
case $OPTION in
h)
usage
@@ -445,6 +447,9 @@ while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
export NFS=1
. "$nfsfile"
;;
+ O)
+ TIMEOUT_DEBUG="yes"
+ ;;
d)
FILEDIR="$OPTARG"
;;
@@ -773,6 +778,7 @@ msg "${TEST_RUNNER}" \
"${DEBUG:+-D}" \
"${KMEMLEAK:+-m}" \
"${KMSG:+-K}" \
+ "${TIMEOUT_DEBUG:+-O}" \
"-c \"${RUNFILES}\"" \
"-T \"${TAGS}\"" \
"-i \"${STF_SUITE}\"" \
@@ -783,6 +789,7 @@ msg "${TEST_RUNNER}" \
${DEBUG:+-D} \
${KMEMLEAK:+-m} \
${KMSG:+-K} \
+ ${TIMEOUT_DEBUG:+-O} \
-c "${RUNFILES}" \
-T "${TAGS}" \
-i "${STF_SUITE}" \
diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run
index 2b002830c82f..9f531411fbe1 100644
--- a/sys/contrib/openzfs/tests/runfiles/common.run
+++ b/sys/contrib/openzfs/tests/runfiles/common.run
@@ -168,10 +168,10 @@ tags = ['functional', 'cli_root', 'zinject']
tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
- 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
- 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
- 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup',
- 'zdb_tunables']
+ 'zdb_display_block', 'zdb_encrypted', 'zdb_encrypted_raw',
+ 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos',
+ 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2',
+ 'zdb_backup', 'zdb_tunables']
pre =
post =
tags = ['functional', 'cli_root', 'zdb']
@@ -395,8 +395,9 @@ tags = ['functional', 'cli_root', 'zpool']
[tests/functional/cli_root/zpool_add]
tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos',
'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
- 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos',
- 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output']
+ 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_warn_create',
+ 'zpool_add_warn_degraded', 'zpool_add_warn_removal', 'add-o_ashift',
+ 'add_prop_ashift', 'zpool_add_dryrun_output']
tags = ['functional', 'cli_root', 'zpool_add']
[tests/functional/cli_root/zpool_attach]
@@ -490,6 +491,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
tags = ['functional', 'cli_root', 'zpool_import']
timeout = 1200
+[tests/functional/cli_root/zpool_iostat]
+tests = ['zpool_iostat_interval_all', 'zpool_iostat_interval_some']
+tags = ['functional', 'cli_root', 'zpool_iostat']
+
[tests/functional/cli_root/zpool_labelclear]
tests = ['zpool_labelclear_active', 'zpool_labelclear_exported',
'zpool_labelclear_removed', 'zpool_labelclear_valid']
@@ -1085,7 +1090,8 @@ tags = ['functional', 'write_dirs']
[tests/functional/xattr]
tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos',
'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg',
- 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat']
+ 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_014_pos',
+ 'xattr_compat']
tags = ['functional', 'xattr']
[tests/functional/zvol/zvol_ENOSPC]
diff --git a/sys/contrib/openzfs/tests/runfiles/sanity.run b/sys/contrib/openzfs/tests/runfiles/sanity.run
index b56ffc3a4a2d..249b415029c4 100644
--- a/sys/contrib/openzfs/tests/runfiles/sanity.run
+++ b/sys/contrib/openzfs/tests/runfiles/sanity.run
@@ -622,7 +622,7 @@ tags = ['functional', 'vdev_zaps']
[tests/functional/xattr]
tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos',
'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg',
- 'xattr_011_pos', 'xattr_013_pos', 'xattr_compat']
+ 'xattr_011_pos', 'xattr_013_pos', 'xattr_014_pos', 'xattr_compat']
tags = ['functional', 'xattr']
[tests/functional/zvol/zvol_ENOSPC]
diff --git a/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in b/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in
index 2158208be6e5..d2c1185e4a94 100755
--- a/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in
+++ b/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in
@@ -34,6 +34,7 @@ from select import select
from subprocess import PIPE
from subprocess import Popen
from subprocess import check_output
+from subprocess import run
from threading import Timer
from time import time, CLOCK_MONOTONIC
from os.path import exists
@@ -187,6 +188,63 @@ User: %s
''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user)
def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False):
+
+ """
+ We're about to kill a command due to a timeout.
+ If we're running with the -O option, then dump debug info about the
+ process with the highest CPU usage to /dev/kmsg (Linux only). This can
+ help debug the timeout.
+
+ Debug info includes:
+ - 30 lines from 'top'
+ - /proc/<PID>/stack output of process with highest CPU usage
+ - Last lines strace-ing process with highest CPU usage
+ """
+ if exists("/dev/kmsg"):
+ c = """
+TOP_OUT="$(COLUMNS=160 top -b -n 1 | head -n 30)"
+read -r PID CMD <<< $(echo "$TOP_OUT" | /usr/bin/awk \
+"/COMMAND/{
+ print_next=1
+ next
+}
+{
+ if (print_next == 1) {
+ print \\$1\\" \\"\\$12
+ exit
+ }
+}")
+echo "##### ZTS timeout debug #####"
+echo "----- top -----"
+echo "$TOP_OUT"
+echo "----- /proc/$PID/stack ($CMD)) -----"
+cat /proc/$PID/stack
+echo "----- strace ($CMD) -----"
+TMPFILE="$(mktemp --suffix=ZTS)"
+/usr/bin/strace -k --stack-traces -p $PID &> "$TMPFILE" &
+sleep 0.1
+killall strace
+tail -n 30 $TMPFILE
+rm "$TMPFILE"
+echo "##### /proc/sysrq-trigger stack #####"
+"""
+ c = "sudo bash -c '" + c + "'"
+ data = run(c, capture_output=True, shell=True, text=True)
+ out = data.stdout
+ try:
+ kp = Popen([SUDO, "sh", "-c",
+ "echo '" + out + "' > /dev/kmsg"])
+ kp.wait()
+
+ """
+ Trigger kernel stack traces
+ """
+ kp = Popen([SUDO, "sh", "-c",
+ "echo l > /proc/sysrq-trigger"])
+ kp.wait()
+ except Exception:
+ pass
+
"""
Kill a running command due to timeout, or ^C from the keyboard. If
sudo is required, this user was verified previously.
@@ -1129,6 +1187,9 @@ def parse_args():
parser.add_option('-o', action='callback', callback=options_cb,
default=BASEDIR, dest='outputdir', type='string',
metavar='outputdir', help='Specify an output directory.')
+ parser.add_option('-O', action='store_true', default=False,
+ dest='timeout_debug',
+ help='Dump debugging info to /dev/kmsg on test timeout')
parser.add_option('-i', action='callback', callback=options_cb,
default=TESTDIR, dest='testdir', type='string',
metavar='testdir', help='Specify a test directory.')
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
index 1517f90e99a5..678c01b58f94 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
@@ -197,6 +197,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \
functional/cli_root/zpool_import/zpool_import.cfg \
functional/cli_root/zpool_import/zpool_import.kshlib \
+ functional/cli_root/zpool_iostat/zpool_iostat.kshlib \
functional/cli_root/zpool_initialize/zpool_initialize.kshlib \
functional/cli_root/zpool_labelclear/labelclear.cfg \
functional/cli_root/zpool_remove/zpool_remove.cfg \
@@ -640,6 +641,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zdb/zdb_decompress_zstd.ksh \
functional/cli_root/zdb/zdb_display_block.ksh \
functional/cli_root/zdb/zdb_encrypted.ksh \
+ functional/cli_root/zdb/zdb_encrypted_raw.ksh \
functional/cli_root/zdb/zdb_label_checksum.ksh \
functional/cli_root/zdb/zdb_object_range_neg.ksh \
functional/cli_root/zdb/zdb_object_range_pos.ksh \
@@ -1027,7 +1029,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_add/zpool_add_007_neg.ksh \
functional/cli_root/zpool_add/zpool_add_008_neg.ksh \
functional/cli_root/zpool_add/zpool_add_009_neg.ksh \
- functional/cli_root/zpool_add/zpool_add_010_pos.ksh \
+ functional/cli_root/zpool_add/zpool_add_warn_create.ksh \
+ functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh \
+ functional/cli_root/zpool_add/zpool_add_warn_removal.ksh \
functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \
functional/cli_root/zpool_attach/attach-o_ashift.ksh \
functional/cli_root/zpool_attach/cleanup.ksh \
@@ -1178,6 +1182,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \
functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \
functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \
+ functional/cli_root/zpool_iostat/setup.ksh \
+ functional/cli_root/zpool_iostat/cleanup.ksh \
+ functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh \
+ functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh \
functional/cli_root/zpool_initialize/cleanup.ksh \
functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \
functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \
@@ -2226,6 +2234,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/xattr/xattr_011_pos.ksh \
functional/xattr/xattr_012_pos.ksh \
functional/xattr/xattr_013_pos.ksh \
+ functional/xattr/xattr_014_pos.ksh \
functional/xattr/xattr_compat.ksh \
functional/zap_shrink/cleanup.ksh \
functional/zap_shrink/zap_shrink_001_pos.ksh \
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh
new file mode 100755
index 000000000000..85d267d5402f
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh
@@ -0,0 +1,75 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
+
+#
+# DESCRIPTION:
+# 'zdb -K ...' should enable reading from a raw-encrypted dataset
+#
+# STRATEGY:
+# 1. Create an encrypted dataset
+# 2. Write some data to a file
+# 3. Run zdb -dddd on the file, confirm it can't be read
+# 4. Run zdb -K ... -dddd on the file, confirm it can be read
+#
+
+verify_runnable "both"
+
+dataset="$TESTPOOL/$TESTFS2"
+file="$TESTDIR2/somefile"
+keyfile="$TEST_BASE_DIR/keyfile"
+
+function cleanup
+{
+ datasetexists "$dataset" && destroy_dataset "$dataset" -f
+ rm -f "$keyfile"
+ default_cleanup_noexit
+}
+
+log_onexit cleanup
+
+log_must default_setup_noexit $DISKS
+
+log_assert "'zdb -K' should enable reading from a raw-encrypted dataset"
+
+# The key must be 32 bytes long.
+echo -n "$RAWKEY" > "$keyfile"
+
+log_must zfs create -o mountpoint="$TESTDIR2" \
+ -o encryption=on -o keyformat=raw -o keylocation="file://$keyfile" \
+ "$dataset"
+
+echo 'my great encrypted text' > "$file"
+
+typeset -i obj=$(ls -i "$file" | cut -d' ' -f1)
+typeset -i size=$(wc -c < "$file")
+
+log_note "test file $file is objid $obj, size $size"
+
+sync_pool "$TESTPOOL" true
+
+log_must eval "zdb -dddd $dataset $obj | grep -q 'object encrypted'"
+
+log_must eval "zdb -K $keyfile -dddd $dataset $obj | grep -q 'size\s$size$'"
+
+log_pass "'zdb -K' enables reading from a raw-encrypted dataset"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib
index 091d65bb4f33..74780bb02141 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib
@@ -27,6 +27,7 @@
#
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright 2025 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
@@ -89,3 +90,44 @@ function save_dump_dev
fi
echo $dumpdev
}
+
+function zpool_create_add_setup
+{
+ typeset -i i=0
+
+ while ((i < 10)); do
+ log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i
+
+ eval vdev$i=$TEST_BASE_DIR/vdev$i
+ ((i += 1))
+ done
+
+ if is_linux; then
+ vdev_lo="$(losetup -f "$vdev4" --show)"
+ elif is_freebsd; then
+ vdev_lo=/dev/"$(mdconfig -a -t vnode -f "$vdev4")"
+ else
+ vdev_lo="$(lofiadm -a "$vdev4")"
+ fi
+}
+
+function zpool_create_add_cleanup
+{
+ datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1
+
+ if [[ -e $vdev_lo ]]; then
+ if is_linux; then
+ log_must losetup -d "$vdev_lo"
+ elif is_freebsd; then
+ log_must mdconfig -d -u "$vdev_lo"
+ else
+ log_must lofiadm -d "$vdev_lo"
+ fi
+ fi
+
+ typeset -i i=0
+ while ((i < 10)); do
+ rm -f $TEST_BASE_DIR/vdev$i
+ ((i += 1))
+ done
+}
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh
index df085a2ec746..661e55998d8d 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh
@@ -23,67 +23,51 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright 2012, 2016 by Delphix. All rights reserved.
+# Copyright 2025 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib
#
# DESCRIPTION:
-# Verify zpool add succeed when adding vdevs with matching redundancy.
+# Verify zpool add succeeds when adding vdevs with matching redundancy
+# and warns with differing redundancy for a healthy pool.
#
# STRATEGY:
# 1. Create several files == $MINVDEVSIZE.
# 2. Verify 'zpool add' succeeds with matching redundancy.
# 3. Verify 'zpool add' warns with differing redundancy.
-# 4. Verify 'zpool add' warns with differing redundancy after removal.
#
verify_runnable "global"
-function cleanup
-{
- datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1
-
- typeset -i i=0
- while ((i < 10)); do
- rm -f $TEST_BASE_DIR/vdev$i
- ((i += 1))
- done
-}
-
+log_assert "Verify 'zpool add' warns for differing redundancy."
+log_onexit zpool_create_add_cleanup
-log_assert "Verify 'zpool add' succeed with keywords combination."
-log_onexit cleanup
+zpool_create_add_setup
-# 1. Create several files == $MINVDEVSIZE.
typeset -i i=0
-while ((i < 10)); do
- log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i
-
- eval vdev$i=$TEST_BASE_DIR/vdev$i
- ((i += 1))
-done
+typeset -i j=0
set -A redundancy0_create_args \
"$vdev0"
set -A redundancy1_create_args \
"mirror $vdev0 $vdev1" \
- "raidz1 $vdev0 $vdev1"
+ "raidz1 $vdev0 $vdev1" \
+ "draid1:1s $vdev0 $vdev1 $vdev9"
set -A redundancy2_create_args \
"mirror $vdev0 $vdev1 $vdev2" \
- "raidz2 $vdev0 $vdev1 $vdev2"
+ "raidz2 $vdev0 $vdev1 $vdev2" \
+ "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9"
set -A redundancy3_create_args \
"mirror $vdev0 $vdev1 $vdev2 $vdev3" \
- "raidz3 $vdev0 $vdev1 $vdev2 $vdev3"
+ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \
+ "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9"
set -A redundancy0_add_args \
"$vdev5" \
@@ -93,21 +77,19 @@ set -A redundancy1_add_args \
"mirror $vdev5 $vdev6" \
"raidz1 $vdev5 $vdev6" \
"raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \
- "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8"
+ "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \
+ "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \
+ "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8"
set -A redundancy2_add_args \
"mirror $vdev5 $vdev6 $vdev7" \
- "raidz2 $vdev5 $vdev6 $vdev7"
+ "raidz2 $vdev5 $vdev6 $vdev7" \
+ "draid2 $vdev5 $vdev6 $vdev7"
set -A redundancy3_add_args \
"mirror $vdev5 $vdev6 $vdev7 $vdev8" \
- "raidz3 $vdev5 $vdev6 $vdev7 $vdev8"
-
-set -A log_args "log" "$vdev4"
-set -A cache_args "cache" "$vdev4"
-set -A spare_args "spare" "$vdev4"
-
-typeset -i j=0
+ "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \
+ "draid3 $vdev5 $vdev6 $vdev7 $vdev8"
function zpool_create_add
{
@@ -148,30 +130,6 @@ function zpool_create_forced_add
done
}
-function zpool_create_rm_add
-{
- typeset -n create_args=$1
- typeset -n add_args=$2
- typeset -n rm_args=$3
-
- i=0
- while ((i < ${#create_args[@]})); do
- j=0
- while ((j < ${#add_args[@]})); do
- log_must zpool create $TESTPOOL1 ${create_args[$i]}
- log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]}
- log_must zpool add $TESTPOOL1 ${add_args[$j]}
- log_must zpool remove $TESTPOOL1 ${rm_args[1]}
- log_mustnot zpool add $TESTPOOL1 ${rm_args[1]}
- log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]}
- log_must zpool destroy -f $TESTPOOL1
-
- ((j += 1))
- done
- ((i += 1))
- done
-}
-
# 2. Verify 'zpool add' succeeds with matching redundancy.
zpool_create_add redundancy0_create_args redundancy0_add_args
zpool_create_add redundancy1_create_args redundancy1_add_args
@@ -195,17 +153,4 @@ zpool_create_forced_add redundancy3_create_args redundancy0_add_args
zpool_create_forced_add redundancy3_create_args redundancy1_add_args
zpool_create_forced_add redundancy3_create_args redundancy2_add_args
-# 4. Verify 'zpool add' warns with differing redundancy after removal.
-zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args
-zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args
-zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args
-
-zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args
-zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args
-zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args
-
-zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args
-zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args
-zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args
-
-log_pass "'zpool add' succeed with keywords combination."
+log_pass "Verify 'zpool add' warns for differing redundancy."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh
new file mode 100755
index 000000000000..313eb3666f27
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh
@@ -0,0 +1,204 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2012, 2016 by Delphix. All rights reserved.
+# Copyright 2025 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib
+
+#
+# DESCRIPTION:
+# Verify zpool add succeeds when adding vdevs with matching redundancy
+# and warns with differing redundancy for a degraded pool.
+#
+# STRATEGY:
+# 1. Create several files == $MINVDEVSIZE.
+# 2. Verify 'zpool add' succeeds with matching redundancy
+# 3. Verify 'zpool add' warns with differing redundancy when
+# a. Degraded pool with replaced mismatch vdev (file vs disk)
+# b. Degraded pool dRAID distributed spare active
+# c. Degraded pool hot spare active
+#
+
+verify_runnable "global"
+
+log_assert "Verify 'zpool add' warns for differing redundancy."
+log_onexit zpool_create_add_cleanup
+
+zpool_create_add_setup
+
+set -A redundancy1_create_args \
+ "mirror $vdev0 $vdev1" \
+ "raidz1 $vdev0 $vdev1" \
+ "draid1:1s $vdev0 $vdev1 $vdev9"
+
+set -A redundancy2_create_args \
+ "mirror $vdev0 $vdev1 $vdev2" \
+ "raidz2 $vdev0 $vdev1 $vdev2" \
+ "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9"
+
+set -A redundancy3_create_args \
+ "mirror $vdev0 $vdev1 $vdev2 $vdev3" \
+ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \
+ "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9"
+
+set -A redundancy1_add_args \
+ "mirror $vdev5 $vdev6" \
+ "raidz1 $vdev5 $vdev6" \
+ "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \
+ "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \
+ "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \
+ "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8"
+
+set -A redundancy2_add_args \
+ "mirror $vdev5 $vdev6 $vdev7" \
+ "raidz2 $vdev5 $vdev6 $vdev7" \
+ "draid2 $vdev5 $vdev6 $vdev7"
+
+set -A redundancy3_add_args \
+ "mirror $vdev5 $vdev6 $vdev7 $vdev8" \
+ "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \
+ "draid3 $vdev5 $vdev6 $vdev7 $vdev8"
+
+set -A redundancy1_create_draid_args \
+ "draid1:1s $vdev0 $vdev1 $vdev2"
+
+set -A redundancy2_create_draid_args \
+ "draid2:1s $vdev0 $vdev1 $vdev2 $vdev3"
+
+set -A redundancy3_create_draid_args \
+ "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9"
+
+set -A redundancy1_create_spare_args \
+ "mirror $vdev0 $vdev1 spare $vdev_lo" \
+ "raidz1 $vdev0 $vdev1 spare $vdev_lo" \
+ "draid1 $vdev0 $vdev1 spare $vdev_lo"
+
+set -A redundancy2_create_spare_args \
+ "mirror $vdev0 $vdev1 $vdev2 spare $vdev_lo" \
+ "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" \
+ "draid2 $vdev0 $vdev1 $vdev2 spare $vdev_lo"
+
+set -A redundancy3_create_spare_args \
+ "mirror $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \
+ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \
+ "draid3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo"
+
+set -A replace_args "$vdev1" "$vdev_lo"
+set -A draid1_args "$vdev1" "draid1-0-0"
+set -A draid2_args "$vdev1" "draid2-0-0"
+set -A draid3_args "$vdev1" "draid3-0-0"
+
+typeset -i i=0
+typeset -i j=0
+
+function zpool_create_degraded_add
+{
+ typeset -n create_args=$1
+ typeset -n add_args=$2
+ typeset -n rm_args=$3
+
+ i=0
+ while ((i < ${#create_args[@]})); do
+ j=0
+ while ((j < ${#add_args[@]})); do
+ log_must zpool create $TESTPOOL1 ${create_args[$i]}
+ log_must zpool offline -f $TESTPOOL1 ${rm_args[0]}
+ log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]}
+ log_must zpool add $TESTPOOL1 ${add_args[$j]}
+ log_must zpool destroy -f $TESTPOOL1
+ log_must zpool labelclear -f ${rm_args[0]}
+
+ ((j += 1))
+ done
+ ((i += 1))
+ done
+}
+
+function zpool_create_forced_degraded_add
+{
+ typeset -n create_args=$1
+ typeset -n add_args=$2
+ typeset -n rm_args=$3
+
+ i=0
+ while ((i < ${#create_args[@]})); do
+ j=0
+ while ((j < ${#add_args[@]})); do
+ log_must zpool create $TESTPOOL1 ${create_args[$i]}
+ log_must zpool offline -f $TESTPOOL1 ${rm_args[0]}
+ log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]}
+ log_mustnot zpool add $TESTPOOL1 ${add_args[$j]}
+ log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]}
+ log_must zpool destroy -f $TESTPOOL1
+ log_must zpool labelclear -f ${rm_args[0]}
+
+ ((j += 1))
+ done
+ ((i += 1))
+ done
+}
+
+# 2. Verify 'zpool add' succeeds with matching redundancy and a degraded pool.
+zpool_create_degraded_add redundancy1_create_args redundancy1_add_args replace_args
+zpool_create_degraded_add redundancy2_create_args redundancy2_add_args replace_args
+zpool_create_degraded_add redundancy3_create_args redundancy3_add_args replace_args
+
+# 3. Verify 'zpool add' warns with differing redundancy and a degraded pool.
+#
+# a. Degraded pool with replaced mismatch vdev (file vs disk)
+zpool_create_forced_degraded_add redundancy1_create_args redundancy2_add_args replace_args
+zpool_create_forced_degraded_add redundancy1_create_args redundancy3_add_args replace_args
+
+zpool_create_forced_degraded_add redundancy2_create_args redundancy1_add_args replace_args
+zpool_create_forced_degraded_add redundancy2_create_args redundancy3_add_args replace_args
+
+zpool_create_forced_degraded_add redundancy3_create_args redundancy1_add_args replace_args
+zpool_create_forced_degraded_add redundancy3_create_args redundancy2_add_args replace_args
+
+# b. Degraded pool dRAID distributed spare active
+
+zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy2_add_args draid1_args
+zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy3_add_args draid1_args
+
+zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy1_add_args draid2_args
+zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy3_add_args draid2_args
+
+zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy1_add_args draid3_args
+zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy2_add_args draid3_args
+
+# c. Degraded pool hot spare active
+zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy2_add_args replace_args
+zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy3_add_args replace_args
+
+zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy1_add_args replace_args
+zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy3_add_args replace_args
+
+zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy1_add_args replace_args
+zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy2_add_args replace_args
+
+log_pass "Verify 'zpool add' warns for differing redundancy."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh
new file mode 100755
index 000000000000..782858e301ac
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh
@@ -0,0 +1,126 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2012, 2016 by Delphix. All rights reserved.
+# Copyright 2025 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib
+
+#
+# DESCRIPTION:
+# Verify zpool add succeeds when adding vdevs with matching redundancy
+# and warns with differing redundancy after removal.
+#
+# STRATEGY:
+# 1. Create several files == $MINVDEVSIZE.
+# 2. Verify 'zpool add' warns with differing redundancy after removal.
+#
+
+verify_runnable "global"
+
+log_assert "Verify 'zpool add' warns for differing redundancy."
+log_onexit zpool_create_add_cleanup
+
+zpool_create_add_setup
+
+typeset -i i=0
+typeset -i j=0
+
+set -A redundancy1_create_args \
+ "mirror $vdev0 $vdev1" \
+ "raidz1 $vdev0 $vdev1" \
+ "draid1:1s $vdev0 $vdev1 $vdev9"
+
+set -A redundancy2_create_args \
+ "mirror $vdev0 $vdev1 $vdev2" \
+ "raidz2 $vdev0 $vdev1 $vdev2" \
+ "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9"
+
+set -A redundancy3_create_args \
+ "mirror $vdev0 $vdev1 $vdev2 $vdev3" \
+ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \
+ "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9"
+
+set -A redundancy1_add_args \
+ "mirror $vdev5 $vdev6" \
+ "raidz1 $vdev5 $vdev6" \
+ "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \
+ "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \
+ "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \
+ "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8"
+
+set -A redundancy2_add_args \
+ "mirror $vdev5 $vdev6 $vdev7" \
+ "raidz2 $vdev5 $vdev6 $vdev7" \
+ "draid2 $vdev5 $vdev6 $vdev7"
+
+set -A redundancy3_add_args \
+ "mirror $vdev5 $vdev6 $vdev7 $vdev8" \
+ "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \
+ "draid3 $vdev5 $vdev6 $vdev7 $vdev8"
+
+set -A log_args "log" "$vdev_lo"
+set -A cache_args "cache" "$vdev_lo"
+set -A spare_args "spare" "$vdev_lo"
+
+
+function zpool_create_rm_add
+{
+ typeset -n create_args=$1
+ typeset -n add_args=$2
+ typeset -n rm_args=$3
+
+ i=0
+ while ((i < ${#create_args[@]})); do
+ j=0
+ while ((j < ${#add_args[@]})); do
+ log_must zpool create $TESTPOOL1 ${create_args[$i]}
+ log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]}
+ log_must zpool add $TESTPOOL1 ${add_args[$j]}
+ log_must zpool remove $TESTPOOL1 ${rm_args[1]}
+ log_mustnot zpool add $TESTPOOL1 ${rm_args[1]}
+ log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]}
+ log_must zpool destroy -f $TESTPOOL1
+
+ ((j += 1))
+ done
+ ((i += 1))
+ done
+}
+
+# 2. Verify 'zpool add' warns with differing redundancy after removal.
+zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args
+zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args
+zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args
+
+zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args
+zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args
+zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args
+
+zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args
+zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args
+zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh
new file mode 100755
index 000000000000..099b5426031d
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+#
+. $STF_SUITE/include/libtest.shlib
+
+log_pass
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh
new file mode 100755
index 000000000000..3529a0ccc015
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh
@@ -0,0 +1,32 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+#
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_pass
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib
new file mode 100644
index 000000000000..ea4b0bd2756d
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib
@@ -0,0 +1,235 @@
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+# Since we want to make sure that iostat responds correctly as pools appear and
+# disappear, we run it in the background and capture its output to a file.
+# Once we're done, we parse the output and ensure it matches what we'd expect
+# from the operations we performed.
+#
+# Because iostat is producing output every interval, it may produce the "same"
+# output for each step of the change; in fact, we want that to make sure we
+# don't miss anything. So, we describe what we expect as a series of "chunks".
+# Each chunk is a particular kind of output, which may repeat. Current known
+# chunk types are:
+#
+# NOPOOL: the text "no pools available"
+# HEADER: three lines, starting with "capacity", "pool" and "----" respectively.
+# (the rough shape of the normal iostat header).
+# POOL1: a line starting with "pool1" (stats line for a pool of that name)
+# POOL2: a line starting with "pool2"
+# POOLBOTH: three lines, starting with "pool1", "pool2" (either order) and
+# "-----" respectively. (the pool stat output for multiple pools)
+#
+# (the parser may produce other chunks in a failed parse to assist with
+# debugging, but they should never be part of the "wanted" output See the
+# parser commentary below).
+#
+# To help recognise the start of a new interval output, we run iostat with the
+# -T u option, which will output a numeric timestamp before each header or
+# second-or-later pool stat after the header.
+#
+# To keep the test run shorter, we use a subsecond interval, but to make sure
+# nothing is missed, we sleep for three intervals after each change.
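+#
+# As an illustrative (hypothetical) example, a run that creates pool1, then
+# creates pool2, then exports both pools would be expected to produce the
+# chunk sequence:
+#
+#   NOPOOL HEADER POOL1 HEADER POOLBOTH NOPOOL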
+
+typeset _iostat_out=$(mktemp)
+typeset _iostat_pid=""
+
+function cleanup_iostat {
+ if [[ -n $_iostat_pid ]] ; then
+ kill -KILL $_iostat_pid || true
+ fi
+ rm -f $_iostat_out
+}
+
+function start_iostat {
+ zpool iostat -T u $@ 0.1 > $_iostat_out 2>&1 &
+ _iostat_pid=$!
+}
+
+function stop_iostat {
+ kill -TERM $_iostat_pid
+ wait $_iostat_pid
+ _iostat_pid=""
+}
+
+function delay_iostat {
+ sleep 0.3
+}
+
+typeset -a _iostat_expect
+function expect_iostat {
+ typeset chunk=$1
+ _iostat_expect+=($chunk)
+}
+
+# Parse the output. The `_iostat_state` var is used to track state across
+# multiple lines. The `_iostat_last` var and the `_got_iostat` function are
+# used to record the completed chunks, and to collapse repetitions.
+typeset -a _iostat_got
+typeset _iostat_last=""
+typeset _iostat_state=""
+
+function _got_iostat {
+ typeset chunk=$1
+ if [[ -n $chunk && $_iostat_last != $chunk ]] ; then
+ _iostat_last=$chunk
+ _iostat_got+=($chunk)
+ fi
+ _iostat_state=""
+}
+
+function verify_iostat {
+
+ cat $_iostat_out | while read line ; do
+
+ # The "no pools available" text has no timestamp or other
+ # header, and should never appear in the middle of multiline
+		# header, and should never appear in the middle of a
+		# multiline chunk, so we can close any in-flight state.
+ _got_iostat $_iostat_state
+ _got_iostat "NOPOOL"
+ continue
+ fi
+
+ # A run of digits alone on the line is a timestamp (the `-T u`
+ # switch to `iostat`). It closes any in-flight state as a
+ # complete chunk, and indicates the start of a new chunk.
+ if [[ -z ${line/#+([0-9])/} ]] ; then
+ _got_iostat $_iostat_state
+ _iostat_state="TIMESTAMP"
+ continue
+ fi
+
+ # For this test, the first word of each line should be unique,
+ # so we extract it and use it for simplicity.
+ typeset first=${line%% *}
+
+ # Header is emitted whenever the pool list changes. It has
+ # three lines:
+ #
+ # capacity operations bandwidth
+ # pool alloc free read write read write
+ # ---------- ----- ----- ----- ----- ----- -----
+ #
+ # Each line moves the state; when we get to a run of dashes, we
+ # commit. Note that we check for one-or-more dashes, because
+ # the width can vary depending on the length of pool name.
+		# the width can vary depending on the length of the pool name.
+ if [[ $_iostat_state = "TIMESTAMP" &&
+ $first = "capacity" ]] ; then
+ _iostat_state="INHEADER1"
+ continue
+ fi
+ if [[ $_iostat_state = "INHEADER1" &&
+ $first = "pool" ]] ; then
+ _iostat_state="INHEADER2"
+ continue
+ fi
+ if [[ $_iostat_state = "INHEADER2" &&
+ -z ${first/#+(-)/} ]] ; then
+ # Headers never repeat, so if the last committed chunk
+ # was a header, we commit this one as EXTRAHEADER so we
+ # can see it in the error output.
+ if [[ $_iostat_last = "HEADER" ]] ; then
+ _got_iostat "EXTRAHEADER"
+ elif [[ $_iostat_last != "EXTRAHEADER" ]] ; then
+ _got_iostat "HEADER"
+ fi
+ _iostat_state="HEADER"
+ continue
+ fi
+
+ # A pool stat line looks like:
+ #
+ # pool1 147K 240M 0 0 0 0
+ #
+ # If there are multiple pools, iostat follows them with a
+ # separator of dashed lines:
+ #
+ # pool1 147K 240M 0 0 0 0
+ # pool2 147K 240M 0 0 0 0
+ # ---------- ----- ----- ----- ----- ----- -----
+ #
+ # Stats rows always start after a timestamp or a header. If the
+ # header was emitted, we won't see a timestamp here (it goes
+ # before the header).
+ #
+ # Because our test exercises both pools on their own and
+ # together, we allow pools in either order. In practice they
+ # are sorted, but that's a side-effect of the implementation
+ # (see zpool_compare()), so we're not going to rely on it here.
+ if [[ $first = "pool1" ]] || [[ $first = "pool2" ]] ; then
+
+ # First line, track which one we saw. If it's a
+ # standalone line, it will be committed by the next
+ # NOPOOL or TIMESTAMP above (or the `_got_iostat` after
+ # the loop if this is the last line).
+ if [[ $_iostat_state == "TIMESTAMP" ||
+ $_iostat_state == "HEADER" ]] ; then
+ if [[ $first = "pool1" ]] ; then
+ _iostat_state="POOL1"
+ elif [[ $first = "pool2" ]] ; then
+ _iostat_state="POOL2"
+ fi
+ continue
+ fi
+
+ # If this is the second pool, we're in a multi-pool
+ # block, and need to look for the separator to close it
+ # out.
+ if [[ $_iostat_state = "POOL1" && $first = "pool2" ]] ||
+ [[ $_iostat_state = "POOL2" && $first = "pool1" ]] ;
+ then
+ _iostat_state="INPOOLBOTH"
+ continue
+ fi
+ fi
+
+ # Separator after the stats block.
+ if [[ $_iostat_state = "INPOOLBOTH" &&
+ -z ${first/#+(-)/} ]] ; then
+ _got_iostat "POOLBOTH"
+ continue
+ fi
+
+ # Anything else will fall through to here. We commit any
+		# in-flight state, then "UNKNOWN", all to help with debugging.
+ if [[ $_iostat_state != "UNKNOWN" ]] ; then
+ _got_iostat $_iostat_state
+ _got_iostat "UNKNOWN"
+ fi
+ done
+
+ # Close out any remaining state.
+ _got_iostat $_iostat_state
+
+ # Compare what we wanted with what we got, and pass/fail the test!
+ if [[ "${_iostat_expect[*]}" != "${_iostat_got[*]}" ]] ; then
+ log_note "expected: ${_iostat_expect[*]}"
+ log_note " got: ${_iostat_got[*]}"
+ log_fail "zpool iostat did not produce expected output"
+ fi
+}
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh
new file mode 100755
index 000000000000..8e040058ec3e
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh
@@ -0,0 +1,90 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+# `zpool iostat <N>` should keep running and update the pools it displays as
+# pools are created/destroyed/imported/exported.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib
+
+typeset vdev1=$(mktemp)
+typeset vdev2=$(mktemp)
+
+function cleanup {
+ cleanup_iostat
+
+ poolexists pool1 && destroy_pool pool1
+ poolexists pool2 && destroy_pool pool2
+ rm -f $vdev1 $vdev2
+}
+
+log_must mkfile $MINVDEVSIZE $vdev1 $vdev2
+
+expect_iostat "NOPOOL"
+
+start_iostat
+
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOL1"
+log_must zpool create pool1 $vdev1
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOLBOTH"
+log_must zpool create pool2 $vdev2
+delay_iostat
+
+expect_iostat "NOPOOL"
+log_must zpool export -a
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOL2"
+log_must zpool import -d $vdev2 pool2
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOLBOTH"
+log_must zpool import -d $vdev1 pool1
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOL2"
+log_must zpool destroy pool1
+delay_iostat
+
+expect_iostat "NOPOOL"
+log_must zpool destroy pool2
+delay_iostat
+
+stop_iostat
+
+verify_iostat
+
+log_pass "zpool iostat in interval mode follows pool updates"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh
new file mode 100755
index 000000000000..ab1f258aa1cd
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh
@@ -0,0 +1,80 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+# `zpool iostat <pools> <N>` should keep running and only show the listed pools.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib
+
+typeset vdev1=$(mktemp)
+typeset vdev2=$(mktemp)
+
+function cleanup {
+ cleanup_iostat
+
+ poolexists pool1 && destroy_pool pool1
+ poolexists pool2 && destroy_pool pool2
+ rm -f $vdev1 $vdev2
+}
+
+log_must mkfile $MINVDEVSIZE $vdev1 $vdev2
+
+log_must zpool create pool1 $vdev1
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOL1"
+start_iostat pool1
+delay_iostat
+
+log_must zpool create pool2 $vdev2
+delay_iostat
+
+expect_iostat "NOPOOL"
+log_must zpool export -a
+delay_iostat
+
+log_must zpool import -d $vdev2 pool2
+delay_iostat
+
+expect_iostat "HEADER"
+expect_iostat "POOL1"
+log_must zpool import -d $vdev1 pool1
+delay_iostat
+
+expect_iostat "NOPOOL"
+log_must zpool destroy pool1
+delay_iostat
+
+log_must zpool destroy pool2
+delay_iostat
+
+stop_iostat
+
+verify_iostat
+
+log_pass "zpool iostat in interval mode with pools follows listed pool updates"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib
index 0a402e71ee68..345239b88680 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib
@@ -1234,10 +1234,10 @@ function verify_fs_aedsx
typeset oldval
set -A modes "on" "off"
oldval=$(get_prop $perm $fs)
- if [[ $oldval == "on" ]]; then
- n=1
- elif [[ $oldval == "off" ]]; then
+ if [[ $oldval == "off" ]]; then
n=0
+ else
+ n=1
fi
log_note "$user zfs set $perm=${modes[$n]} $fs"
user_run $user zfs set $perm=${modes[$n]} $fs
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh
index 26153aafbc02..0e79e9b8b70c 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh
@@ -39,6 +39,6 @@
verify_runnable "global"
# create a pool without any features
-log_must mkfile 128m $TMPDEV
+log_must truncate -s $MINVDEVSIZE $TMPDEV
log_pass
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh
index d6bd69b7e134..e81d07794689 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh
@@ -35,17 +35,19 @@
verify_runnable "global"
-TESTFILE="$TESTDIR/file.bin"
-
log_assert "User accounting upgrade should not be executed on readonly pool"
log_onexit cleanup_upgrade
# 1. Create a pool with the feature@userobj_accounting disabled to simulate
# a legacy pool from a previous ZFS version.
-log_must zpool create -d -m $TESTDIR $TESTPOOL $TMPDEV
+log_must zpool create -d $TESTPOOL $TMPDEV
+log_must zfs create $TESTPOOL/$TESTFS
+
+MNTPNT=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+TESTFILE="$MNTPNT/file.bin"
# 2. Create a file on the "legecy" dataset
-log_must touch $TESTDIR/file.bin
+log_must touch $TESTFILE
# 3. Enable feature@userobj_accounting on the pool and verify it is only
# "enabled" and not "active": upgrading starts when the filesystem is mounted
@@ -54,12 +56,12 @@ log_must test "enabled" == "$(get_pool_prop 'feature@userobj_accounting' $TESTPO
# 4. Export the pool and re-import is readonly, without mounting any filesystem
log_must zpool export $TESTPOOL
-log_must zpool import -o readonly=on -N -d "$(dirname $TMPDEV)" $TESTPOOL
+log_must zpool import -o readonly=on -N -d $TEST_BASE_DIR $TESTPOOL
# 5. Try to mount the root dataset manually without the "ro" option, then verify
# filesystem status and the pool feature status (not "active") to ensure the
# pool "readonly" status is enforced.
-log_must mount -t zfs -o zfsutil $TESTPOOL $TESTDIR
+log_must zfs mount -R $TESTPOOL
log_must stat "$TESTFILE"
log_mustnot touch "$TESTFILE"
log_must test "enabled" == "$(get_pool_prop 'feature@userobj_accounting' $TESTPOOL)"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh
new file mode 100755
index 000000000000..d4c9a0a41816
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh
@@ -0,0 +1,53 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/xattr/xattr_common.kshlib
+
+#
+# DESCRIPTION:
+# The default xattr should be shown as 'sa', not 'on', for clarity.
+#
+# STRATEGY:
+# 1. Create a filesystem.
+# 2. Verify that the xattr property is shown as 'sa'.
+# 3. Manually set the value to 'dir', 'sa', 'on', and 'off'.
+# 4. Verify that it is shown as 'dir', 'sa', 'sa', and 'off'.
+#
+
+log_assert "The default and specific xattr values are displayed correctly."
+
+set -A args "dir" "sa" "on" "off"
+set -A display "dir" "sa" "sa" "off"
+
+log_must eval "[[ 'sa' == '$(zfs get -Hpo value xattr $TESTPOOL)' ]]"
+
+for i in `seq 0 3`; do
+ log_must zfs set xattr="${args[$i]}" $TESTPOOL
+ log_must eval "[[ '${display[$i]}' == '$(zfs get -Hpo value xattr $TESTPOOL)' ]]"
+done
+log_pass "The default and specific xattr values are displayed correctly."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
index 571a698eb63a..502ebada22dc 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
@@ -50,17 +50,53 @@ fi
typeset datafile1="$(mktemp -t zvol_misc_fua1.XXXXXX)"
typeset datafile2="$(mktemp -t zvol_misc_fua2.XXXXXX)"
+typeset datafile3="$(mktemp -t zvol_misc_fua3_log.XXXXXX)"
typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
+typeset DISK1=${DISKS%% *}
function cleanup
{
- rm "$datafile1" "$datafile2"
+ log_must zpool remove $TESTPOOL $datafile3
+	rm "$datafile1" "$datafile2" "$datafile3"
+}
+
+# Prints the total number of sync writes for a vdev
+# $1: vdev
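+# (Sums fields 4 and 5 of every numeric row of 'zpool iostat -r' output,
+# assumed here to be the individual and aggregated sync write counts for
+# each request size.)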
+function get_sync
+{
+ zpool iostat -p -H -v -r $TESTPOOL $1 | \
+ awk '/[0-9]+$/{s+=$4+$5} END{print s}'
}
function do_test {
# Wait for udev to create symlinks to our zvol
block_device_wait $zvolpath
+ # Write using sync (creates FLUSH calls after writes, but not FUA)
+ old_vdev_writes=$(get_sync $DISK1)
+ old_log_writes=$(get_sync $datafile3)
+
+ log_must fio --name=write_iops --size=5M \
+ --ioengine=libaio --verify=0 --bs=4K \
+ --iodepth=1 --rw=randwrite --group_reporting=1 \
+ --filename=$zvolpath --sync=1
+
+ vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes))
+ log_writes=$(( $(get_sync $datafile3) - $old_log_writes))
+
+ # When we're doing sync writes, we should see many more writes go to
+ # the log vs the first vdev. Experiments show anywhere from a 160-320x
+ # ratio of writes to the log vs the first vdev (due to some straggler
+ # writes to the first vdev).
+ #
+ # Check that we have a large ratio (100x) of sync writes going to the
+ # log device
+ ratio=$(($log_writes / $vdev_writes))
+ log_note "Got $log_writes log writes, $vdev_writes vdev writes."
+ if [ $ratio -lt 100 ] ; then
+ log_fail "Expected > 100x more log writes than vdev writes. "
+ fi
+
# Create a data file
log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
@@ -81,6 +117,8 @@ log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)"
log_onexit cleanup
log_must zfs set compression=off $TESTPOOL/$TESTVOL
+log_must truncate -s 100M $datafile3
+log_must zpool add $TESTPOOL log $datafile3
log_note "Testing without blk-mq"
diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c
index 7f9ca6e39df8..3f0a7b40245d 100644
--- a/sys/dev/acpica/acpi.c
+++ b/sys/dev/acpica/acpi.c
@@ -3468,10 +3468,10 @@ acpi_EnterSleepState(struct acpi_softc *sc, enum power_stype stype)
return_ACPI_STATUS (AE_OK);
}
- EVENTHANDLER_INVOKE(power_suspend_early);
+ EVENTHANDLER_INVOKE(power_suspend_early, stype);
stop_all_proc();
suspend_all_fs();
- EVENTHANDLER_INVOKE(power_suspend);
+ EVENTHANDLER_INVOKE(power_suspend, stype);
#ifdef EARLY_AP_STARTUP
MPASS(mp_ncpus == 1 || smp_started);
@@ -3632,7 +3632,7 @@ backout:
resume_all_fs();
resume_all_proc();
- EVENTHANDLER_INVOKE(power_resume);
+ EVENTHANDLER_INVOKE(power_resume, stype);
/* Allow another sleep request after a while. */
callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME);
diff --git a/sys/dev/acpica/acpi_apei.c b/sys/dev/acpica/acpi_apei.c
index 9cfd46c97430..624c81ad1b4f 100644
--- a/sys/dev/acpica/acpi_apei.c
+++ b/sys/dev/acpica/acpi_apei.c
@@ -754,7 +754,7 @@ apei_detach(device_t dev)
apei_nmi = NULL;
apei_nmi_nges = NULL;
if (sc->nges.swi_ih != NULL) {
- swi_remove(&sc->nges.swi_ih);
+ swi_remove(sc->nges.swi_ih);
sc->nges.swi_ih = NULL;
}
if (acpi_get_handle(dev) != NULL) {
diff --git a/sys/dev/acpica/acpi_powerres.c b/sys/dev/acpica/acpi_powerres.c
index 0baa5c595470..29d1690f1bdd 100644
--- a/sys/dev/acpica/acpi_powerres.c
+++ b/sys/dev/acpica/acpi_powerres.c
@@ -76,13 +76,6 @@ struct acpi_powerconsumer {
/* Device which is powered */
ACPI_HANDLE ac_consumer;
int ac_state;
-
- struct {
- bool prx_has;
- size_t prx_count;
- ACPI_HANDLE *prx_deps;
- } ac_prx[ACPI_D_STATE_COUNT];
-
TAILQ_ENTRY(acpi_powerconsumer) ac_link;
TAILQ_HEAD(,acpi_powerreference) ac_references;
};
@@ -103,7 +96,9 @@ static TAILQ_HEAD(acpi_powerconsumer_list, acpi_powerconsumer)
ACPI_SERIAL_DECL(powerres, "ACPI power resources");
static ACPI_STATUS acpi_pwr_register_consumer(ACPI_HANDLE consumer);
+#ifdef notyet
static ACPI_STATUS acpi_pwr_deregister_consumer(ACPI_HANDLE consumer);
+#endif /* notyet */
static ACPI_STATUS acpi_pwr_register_resource(ACPI_HANDLE res);
#ifdef notyet
static ACPI_STATUS acpi_pwr_deregister_resource(ACPI_HANDLE res);
@@ -227,84 +222,6 @@ acpi_pwr_deregister_resource(ACPI_HANDLE res)
#endif /* notyet */
/*
- * Evaluate the _PRx (power resources each D-state depends on). This also
- * populates the acpi_powerresources queue with the power resources discovered
- * during this step.
- *
- * ACPI 7.3.8 - 7.3.11 guarantee that _PRx will return the same data each
- * time they are evaluated.
- *
- * If this function fails, acpi_pwr_deregister_consumer() must be called on the
- * power consumer to free already allocated memory.
- */
-static ACPI_STATUS
-acpi_pwr_get_power_resources(ACPI_HANDLE consumer, struct acpi_powerconsumer *pc)
-{
- ACPI_INTEGER status;
- ACPI_STRING reslist_name;
- ACPI_HANDLE reslist_handle;
- ACPI_STRING reslist_names[] = {"_PR0", "_PR1", "_PR2", "_PR3"};
- ACPI_BUFFER reslist;
- ACPI_OBJECT *reslist_object;
- ACPI_OBJECT *dep;
- ACPI_HANDLE *res;
-
- ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
- ACPI_SERIAL_ASSERT(powerres);
-
- MPASS(consumer != NULL);
-
- for (int state = ACPI_STATE_D0; state <= ACPI_STATE_D3_HOT; state++) {
- pc->ac_prx[state].prx_has = false;
- pc->ac_prx[state].prx_count = 0;
- pc->ac_prx[state].prx_deps = NULL;
-
- reslist_name = reslist_names[state - ACPI_STATE_D0];
- if (ACPI_FAILURE(AcpiGetHandle(consumer, reslist_name, &reslist_handle)))
- continue;
-
- reslist.Pointer = NULL;
- reslist.Length = ACPI_ALLOCATE_BUFFER;
- status = AcpiEvaluateObjectTyped(reslist_handle, NULL, NULL, &reslist,
- ACPI_TYPE_PACKAGE);
- if (ACPI_FAILURE(status) || reslist.Pointer == NULL)
- /*
- * ACPI_ALLOCATE_BUFFER entails everything will be freed on error
- * by AcpiEvaluateObjectTyped.
- */
- continue;
-
- reslist_object = (ACPI_OBJECT *)reslist.Pointer;
- pc->ac_prx[state].prx_has = true;
- pc->ac_prx[state].prx_count = reslist_object->Package.Count;
-
- if (reslist_object->Package.Count == 0) {
- AcpiOsFree(reslist_object);
- continue;
- }
-
- pc->ac_prx[state].prx_deps = mallocarray(pc->ac_prx[state].prx_count,
- sizeof(*pc->ac_prx[state].prx_deps), M_ACPIPWR, M_NOWAIT);
- if (pc->ac_prx[state].prx_deps == NULL) {
- AcpiOsFree(reslist_object);
- return_ACPI_STATUS (AE_NO_MEMORY);
- }
-
- for (size_t i = 0; i < reslist_object->Package.Count; i++) {
- dep = &reslist_object->Package.Elements[i];
- res = dep->Reference.Handle;
- pc->ac_prx[state].prx_deps[i] = res;
-
- /* It's fine to attempt to register the same resource twice. */
- acpi_pwr_register_resource(res);
- }
- AcpiOsFree(reslist_object);
- }
-
- return_ACPI_STATUS (AE_OK);
-}
-
-/*
* Register a power consumer.
*
* It's OK to call this if we already know about the consumer.
@@ -312,7 +229,6 @@ acpi_pwr_get_power_resources(ACPI_HANDLE consumer, struct acpi_powerconsumer *pc
static ACPI_STATUS
acpi_pwr_register_consumer(ACPI_HANDLE consumer)
{
- ACPI_INTEGER status;
struct acpi_powerconsumer *pc;
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
@@ -323,27 +239,12 @@ acpi_pwr_register_consumer(ACPI_HANDLE consumer)
return_ACPI_STATUS (AE_OK);
/* Allocate a new power consumer */
- if ((pc = malloc(sizeof(*pc), M_ACPIPWR, M_NOWAIT | M_ZERO)) == NULL)
+ if ((pc = malloc(sizeof(*pc), M_ACPIPWR, M_NOWAIT)) == NULL)
return_ACPI_STATUS (AE_NO_MEMORY);
TAILQ_INSERT_HEAD(&acpi_powerconsumers, pc, ac_link);
TAILQ_INIT(&pc->ac_references);
pc->ac_consumer = consumer;
- /*
- * Get all its power resource dependencies, if it has _PRx. We do this now
- * as an opportunity to populate the acpi_powerresources queue.
- *
- * If this fails, immediately deregister it.
- */
- status = acpi_pwr_get_power_resources(consumer, pc);
- if (ACPI_FAILURE(status)) {
- ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS,
- "failed to get power resources for %s\n",
- acpi_name(consumer)));
- acpi_pwr_deregister_consumer(consumer);
- return_ACPI_STATUS (status);
- }
-
/* XXX we should try to find its current state */
pc->ac_state = ACPI_STATE_UNKNOWN;
@@ -353,6 +254,7 @@ acpi_pwr_register_consumer(ACPI_HANDLE consumer)
return_ACPI_STATUS (AE_OK);
}
+#ifdef notyet
/*
* Deregister a power consumer.
*
@@ -377,9 +279,6 @@ acpi_pwr_deregister_consumer(ACPI_HANDLE consumer)
/* Pull the consumer off the list and free it */
TAILQ_REMOVE(&acpi_powerconsumers, pc, ac_link);
- for (size_t i = 0; i < sizeof(pc->ac_prx) / sizeof(*pc->ac_prx); i++)
- if (pc->ac_prx[i].prx_deps != NULL)
- free(pc->ac_prx[i].prx_deps, M_ACPIPWR);
free(pc, M_ACPIPWR);
ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "deregistered power consumer %s\n",
@@ -387,6 +286,7 @@ acpi_pwr_deregister_consumer(ACPI_HANDLE consumer)
return_ACPI_STATUS (AE_OK);
}
+#endif /* notyet */
/*
* Set a power consumer to a particular power state.
diff --git a/sys/dev/acpica/acpi_timer.c b/sys/dev/acpica/acpi_timer.c
index 3d51a4211b80..b20912e2f5fb 100644
--- a/sys/dev/acpica/acpi_timer.c
+++ b/sys/dev/acpica/acpi_timer.c
@@ -34,6 +34,7 @@
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/timetc.h>
+#include <sys/power.h>
#include <machine/bus.h>
#include <machine/resource.h>
@@ -69,8 +70,10 @@ bool acpi_timer_disabled = false;
static void acpi_timer_identify(driver_t *driver, device_t parent);
static int acpi_timer_probe(device_t dev);
static int acpi_timer_attach(device_t dev);
-static void acpi_timer_resume_handler(struct timecounter *);
-static void acpi_timer_suspend_handler(struct timecounter *);
+static void acpi_timer_resume_handler(struct timecounter *,
+ enum power_stype);
+static void acpi_timer_suspend_handler(struct timecounter *,
+ enum power_stype);
static u_int acpi_timer_get_timecount(struct timecounter *tc);
static u_int acpi_timer_get_timecount_safe(struct timecounter *tc);
static int acpi_timer_sysctl_freq(SYSCTL_HANDLER_ARGS);
@@ -235,7 +238,7 @@ acpi_timer_attach(device_t dev)
}
static void
-acpi_timer_resume_handler(struct timecounter *newtc)
+acpi_timer_resume_handler(struct timecounter *newtc, enum power_stype stype)
{
struct timecounter *tc;
@@ -251,7 +254,7 @@ acpi_timer_resume_handler(struct timecounter *newtc)
}
static void
-acpi_timer_suspend_handler(struct timecounter *newtc)
+acpi_timer_suspend_handler(struct timecounter *newtc, enum power_stype stype)
{
struct timecounter *tc;
diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c
index 82f56fc0d19e..2b4cb37275a6 100644
--- a/sys/dev/ahci/ahci_pci.c
+++ b/sys/dev/ahci/ahci_pci.c
@@ -467,28 +467,6 @@ ahci_ata_probe(device_t dev)
}
static int
-ahci_pci_read_msix_bars(device_t dev, uint8_t *table_bar, uint8_t *pba_bar)
-{
- int cap_offset = 0, ret;
- uint32_t val;
-
- if ((table_bar == NULL) || (pba_bar == NULL))
- return (EINVAL);
-
- ret = pci_find_cap(dev, PCIY_MSIX, &cap_offset);
- if (ret != 0)
- return (EINVAL);
-
- val = pci_read_config(dev, cap_offset + PCIR_MSIX_TABLE, 4);
- *table_bar = PCIR_BAR(val & PCIM_MSIX_BIR_MASK);
-
- val = pci_read_config(dev, cap_offset + PCIR_MSIX_PBA, 4);
- *pba_bar = PCIR_BAR(val & PCIM_MSIX_BIR_MASK);
-
- return (0);
-}
-
-static int
ahci_pci_attach(device_t dev)
{
struct ahci_controller *ctlr = device_get_softc(dev);
@@ -496,7 +474,6 @@ ahci_pci_attach(device_t dev)
uint32_t devid = pci_get_devid(dev);
uint8_t revid = pci_get_revid(dev);
int msi_count, msix_count;
- uint8_t table_bar = 0, pba_bar = 0;
uint32_t caps, pi;
msi_count = pci_msi_count(dev);
@@ -584,20 +561,11 @@ ahci_pci_attach(device_t dev)
if (ctlr->quirks & AHCI_Q_NOMSIX)
msix_count = 0;
- /* Read MSI-x BAR IDs if supported */
- if (msix_count > 0) {
- error = ahci_pci_read_msix_bars(dev, &table_bar, &pba_bar);
- if (error == 0) {
- ctlr->r_msix_tab_rid = table_bar;
- ctlr->r_msix_pba_rid = pba_bar;
- } else {
- /* Failed to read BARs, disable MSI-x */
- msix_count = 0;
- }
- }
-
/* Allocate resources for MSI-x table and PBA */
if (msix_count > 0) {
+ ctlr->r_msix_tab_rid = pci_msix_table_bar(dev);
+ ctlr->r_msix_pba_rid = pci_msix_pba_bar(dev);
+
/*
* Allocate new MSI-x table only if not
* allocated before.
@@ -608,8 +576,8 @@ ahci_pci_attach(device_t dev)
ctlr->r_msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&ctlr->r_msix_tab_rid, RF_ACTIVE);
if (ctlr->r_msix_table == NULL) {
- ahci_free_mem(dev);
- return (ENXIO);
+ msix_count = 0;
+ goto no_msix;
}
}
@@ -624,12 +592,12 @@ ahci_pci_attach(device_t dev)
ctlr->r_msix_pba = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&ctlr->r_msix_pba_rid, RF_ACTIVE);
if (ctlr->r_msix_pba == NULL) {
- ahci_free_mem(dev);
- return (ENXIO);
+ msix_count = 0;
}
}
}
+no_msix:
pci_enable_busmaster(dev);
/* Reset controller */
if ((error = ahci_pci_ctlr_reset(dev)) != 0) {
diff --git a/sys/dev/ath/if_ath_tx.c b/sys/dev/ath/if_ath_tx.c
index deadd63c3d18..9ac591c14943 100644
--- a/sys/dev/ath/if_ath_tx.c
+++ b/sys/dev/ath/if_ath_tx.c
@@ -971,6 +971,12 @@ ath_legacy_xmit_handoff(struct ath_softc *sc, struct ath_txq *txq,
ath_tx_handoff_hw(sc, txq, bf);
}
+/*
+ * Set up a frame for encryption.
+ *
+ * If this fails, a non-zero error is returned. The mbuf
+ * must be freed by the caller.
+ */
static int
ath_tx_tag_crypto(struct ath_softc *sc, struct ieee80211_node *ni,
struct mbuf *m0, int iswep, int isfrag, int *hdrlen, int *pktlen,
@@ -1547,6 +1553,10 @@ ath_tx_xmit_normal(struct ath_softc *sc, struct ath_txq *txq,
*
* Note that this may cause the mbuf to be reallocated, so
* m0 may not be valid.
+ *
+ * If there's a problem then the mbuf is freed and an error
+ * is returned. The ath_buf then needs to be freed by the
+ * caller.
*/
static int
ath_tx_normal_setup(struct ath_softc *sc, struct ieee80211_node *ni,
@@ -2073,9 +2083,8 @@ ath_tx_start(struct ath_softc *sc, struct ieee80211_node *ni,
/* This also sets up the DMA map; crypto; frame parameters, etc */
r = ath_tx_normal_setup(sc, ni, bf, m0, txq);
-
if (r != 0)
- goto done;
+ return (r);
/* At this point m0 could have changed! */
m0 = bf->bf_m;
@@ -2132,7 +2141,6 @@ ath_tx_start(struct ath_softc *sc, struct ieee80211_node *ni,
ath_tx_leak_count_update(sc, tid, bf);
ath_tx_xmit_normal(sc, txq, bf);
#endif
-done:
return 0;
}
diff --git a/sys/dev/gpio/gpioc.c b/sys/dev/gpio/gpioc.c
index 5a60f939dc78..6c6f79227166 100644
--- a/sys/dev/gpio/gpioc.c
+++ b/sys/dev/gpio/gpioc.c
@@ -704,7 +704,7 @@ gpioc_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
* npins isn't a horrible fifo size for that either.
*/
priv->numevents = priv->sc->sc_npins * 2;
- priv->events = malloc(priv->numevents * sizeof(struct gpio_event_detail),
+ priv->events = malloc(priv->numevents * sizeof(struct gpioc_pin_event),
M_GPIOC, M_WAITOK | M_ZERO);
priv->evidx_head = priv->evidx_tail = 0;
@@ -793,6 +793,7 @@ gpioc_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag,
struct gpio_access_32 *a32;
struct gpio_config_32 *c32;
struct gpio_event_config *evcfg;
+ struct gpioc_pin_event *tmp;
uint32_t caps, intrflags;
switch (cmd) {
@@ -908,27 +909,35 @@ gpioc_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag,
res = devfs_get_cdevpriv((void **)&priv);
if (res != 0)
break;
- /* If any pins have been configured, changes aren't allowed. */
- if (!SLIST_EMPTY(&priv->pins)) {
- res = EINVAL;
- break;
- }
if (evcfg->gp_report_type != GPIO_EVENT_REPORT_DETAIL &&
evcfg->gp_report_type != GPIO_EVENT_REPORT_SUMMARY) {
res = EINVAL;
break;
}
- priv->report_option = evcfg->gp_report_type;
/* Reallocate the events buffer if the user wants it bigger. */
- if (priv->report_option == GPIO_EVENT_REPORT_DETAIL &&
+ tmp = NULL;
+ if (evcfg->gp_report_type == GPIO_EVENT_REPORT_DETAIL &&
priv->numevents < evcfg->gp_fifo_size) {
+ tmp = malloc(evcfg->gp_fifo_size *
+ sizeof(struct gpioc_pin_event), M_GPIOC,
+ M_WAITOK | M_ZERO);
+ }
+ mtx_lock(&priv->mtx);
+ /* If any pins have been configured, changes aren't allowed. */
+ if (!SLIST_EMPTY(&priv->pins)) {
+ mtx_unlock(&priv->mtx);
+ free(tmp, M_GPIOC);
+ res = EINVAL;
+ break;
+ }
+ if (tmp != NULL) {
free(priv->events, M_GPIOC);
+ priv->events = tmp;
priv->numevents = evcfg->gp_fifo_size;
- priv->events = malloc(priv->numevents *
- sizeof(struct gpio_event_detail), M_GPIOC,
- M_WAITOK | M_ZERO);
priv->evidx_head = priv->evidx_tail = 0;
}
+ priv->report_option = evcfg->gp_report_type;
+ mtx_unlock(&priv->mtx);
break;
case FIONBIO:
/*
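The gpioc ioctl hunk above follows the usual allocate-outside-the-lock pattern: the sleeping malloc() happens before the mutex is taken, and the new buffer is either installed or discarded once the checks under the lock are done. A minimal userland sketch of that pattern using pthreads; the names (struct fifo, fifo_resize) are made up for illustration, not the driver's types:

/*
 * Allocate before locking, then install or discard under the lock.
 * Illustrative only; stands in for the gpioc events-buffer resize above.
 */
#include <pthread.h>
#include <stdlib.h>

struct fifo {
	pthread_mutex_t mtx;
	void *events;
	size_t numevents;
	int busy;		/* stand-in for "pins already configured" */
};

static int
fifo_resize(struct fifo *f, size_t n, size_t evsize)
{
	void *tmp = NULL;

	if (n > 0)
		tmp = calloc(n, evsize);	/* may block: done unlocked */

	pthread_mutex_lock(&f->mtx);
	if (f->busy) {
		pthread_mutex_unlock(&f->mtx);
		free(tmp);			/* too late to change: discard */
		return (-1);
	}
	if (tmp != NULL) {
		free(f->events);
		f->events = tmp;
		f->numevents = n;
	}
	pthread_mutex_unlock(&f->mtx);
	return (0);
}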
diff --git a/sys/dev/iwx/if_iwx.c b/sys/dev/iwx/if_iwx.c
index 8422fcb787c3..04ed09f04604 100644
--- a/sys/dev/iwx/if_iwx.c
+++ b/sys/dev/iwx/if_iwx.c
@@ -4805,6 +4805,8 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_packet *pkt,
static void
iwx_clear_oactive(struct iwx_softc *sc, struct iwx_tx_ring *ring)
{
+ IWX_ASSERT_LOCKED(sc);
+
if (ring->queued < iwx_lomark) {
sc->qfullmsk &= ~(1 << ring->qid);
if (sc->qfullmsk == 0 /* && ifq_is_oactive(&ifp->if_snd) */) {
@@ -4890,11 +4892,19 @@ iwx_rx_bmiss(struct iwx_softc *sc, struct iwx_rx_packet *pkt,
bus_dmamap_sync(sc->rxq.data_dmat, data->map,
BUS_DMASYNC_POSTREAD);
+ IWX_DPRINTF(sc, IWX_DEBUG_BEACON,
+ "%s: mac_id=%u, cmslrx=%u, cmb=%u, neb=%d, nrb=%u\n",
+ __func__,
+ le32toh(mbn->mac_id),
+ le32toh(mbn->consec_missed_beacons_since_last_rx),
+ le32toh(mbn->consec_missed_beacons),
+ le32toh(mbn->num_expected_beacons),
+ le32toh(mbn->num_recvd_beacons));
+
missed = le32toh(mbn->consec_missed_beacons_since_last_rx);
if (missed > vap->iv_bmissthreshold) {
ieee80211_beacon_miss(ic);
}
-
}
static int
@@ -5491,6 +5501,9 @@ iwx_tx_fill_cmd(struct iwx_softc *sc, struct iwx_node *in,
/* for non-data, use the lowest supported rate */
ridx = min_ridx;
*flags |= IWX_TX_FLAGS_CMD_RATE;
+ } else if (ni->ni_flags & IEEE80211_NODE_VHT) {
+ /* TODO: VHT - the ridx / rate array doesn't have VHT rates yet */
+ ridx = iwx_min_basic_rate(ic);
} else if (ni->ni_flags & IEEE80211_NODE_HT) {
ridx = iwx_mcs2ridx[ieee80211_node_get_txrate_dot11rate(ni)
& ~IEEE80211_RATE_MCS];
@@ -5622,6 +5635,8 @@ iwx_tx(struct iwx_softc *sc, struct mbuf *m, struct ieee80211_node *ni)
struct mbuf *m1;
size_t txcmd_size;
+ IWX_ASSERT_LOCKED(sc);
+
wh = mtod(m, struct ieee80211_frame *);
type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
@@ -7308,97 +7323,107 @@ iwx_rs_init(struct iwx_softc *sc, struct iwx_node *in)
return iwx_rs_init_v3(sc, in);
}
-static void
-iwx_rs_update(struct iwx_softc *sc, struct iwx_tlc_update_notif *notif)
+
+/**
+ * @brief Turn the given TX rate control notification into an ieee80211_node_txrate
+ *
+ * This populates the given txrate node with the TX rate control notification.
+ *
+ * @param sc driver softc
+ * @param notif firmware notification
+ * @param ni ieee80211_node to update
+ * @returns true if updated, false if not
+ */
+static bool
+iwx_rs_update_node_txrate(struct iwx_softc *sc,
+ const struct iwx_tlc_update_notif *notif, struct ieee80211_node *ni)
{
struct ieee80211com *ic = &sc->sc_ic;
- struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps);
- struct ieee80211_node *ni = (void *)vap->iv_bss;
+ /* XXX TODO: create an inline function in if_iwxreg.h? */
+ static int cck_idx_to_rate[] = { 2, 4, 11, 22, 2, 2, 2, 2 };
+ static int ofdm_idx_to_rate[] = { 12, 18, 24, 36, 48, 72, 96, 108 };
- struct ieee80211_rateset *rs = &ni->ni_rates;
uint32_t rate_n_flags;
- uint8_t plcp, rval;
- int i, cmd_ver, rate_n_flags_ver2 = 0;
-
- if (notif->sta_id != IWX_STATION_ID ||
- (le32toh(notif->flags) & IWX_TLC_NOTIF_FLAG_RATE) == 0)
- return;
+ uint32_t type;
+ /* Extract the rate and command version */
rate_n_flags = le32toh(notif->rate);
+ if (sc->sc_rate_n_flags_version != 2) {
+ net80211_ic_printf(ic,
+ "%s: unsupported rate_n_flags version (%d)\n",
+ __func__,
+ sc->sc_rate_n_flags_version);
+ return (false);
+ }
+
if (sc->sc_debug & IWX_DEBUG_TXRATE)
print_ratenflags(__func__, __LINE__,
rate_n_flags, sc->sc_rate_n_flags_version);
- cmd_ver = iwx_lookup_notif_ver(sc, IWX_DATA_PATH_GROUP,
- IWX_TLC_MNG_UPDATE_NOTIF);
- if (cmd_ver != IWX_FW_CMD_VER_UNKNOWN && cmd_ver >= 3)
- rate_n_flags_ver2 = 1;
-
- if (rate_n_flags_ver2) {
- uint32_t mod_type = (rate_n_flags & IWX_RATE_MCS_MOD_TYPE_MSK);
- if (mod_type == IWX_RATE_MCS_HT_MSK) {
-
- ieee80211_node_set_txrate_dot11rate(ni,
- IWX_RATE_HT_MCS_INDEX(rate_n_flags) |
- IEEE80211_RATE_MCS);
- IWX_DPRINTF(sc, IWX_DEBUG_TXRATE,
- "%s:%d new MCS: %d rate_n_flags: %x\n",
- __func__, __LINE__,
- ieee80211_node_get_txrate_dot11rate(ni) & ~IEEE80211_RATE_MCS,
- rate_n_flags);
- return;
- }
- } else {
- if (rate_n_flags & IWX_RATE_MCS_HT_MSK_V1) {
- ieee80211_node_set_txrate_dot11rate(ni,
- rate_n_flags & (IWX_RATE_HT_MCS_RATE_CODE_MSK_V1 |
- IWX_RATE_HT_MCS_NSS_MSK_V1));
-
- IWX_DPRINTF(sc, IWX_DEBUG_TXRATE,
- "%s:%d new MCS idx: %d rate_n_flags: %x\n",
- __func__, __LINE__,
- ieee80211_node_get_txrate_dot11rate(ni), rate_n_flags);
- return;
- }
+ type = (rate_n_flags & IWX_RATE_MCS_MOD_TYPE_MSK);
+ switch (type) {
+ case IWX_RATE_MCS_CCK_MSK:
+ ieee80211_node_set_txrate_dot11rate(ni,
+ cck_idx_to_rate[rate_n_flags & IWX_RATE_LEGACY_RATE_MSK]);
+ return (true);
+ case IWX_RATE_MCS_LEGACY_OFDM_MSK:
+ ieee80211_node_set_txrate_dot11rate(ni,
+ ofdm_idx_to_rate[rate_n_flags & IWX_RATE_LEGACY_RATE_MSK]);
+ return (true);
+ case IWX_RATE_MCS_HT_MSK:
+ /*
+ * TODO: the current API doesn't include channel width
+ * and other flags, so we can't accurately store them yet!
+ *
+ * channel width: (flags & IWX_RATE_MCS_CHAN_WIDTH_MSK)
+ * >> IWX_RATE_MCS_CHAN_WIDTH_POS)
+ * LDPC: (flags & (1 << 16))
+ */
+ ieee80211_node_set_txrate_ht_mcsrate(ni,
+ IWX_RATE_HT_MCS_INDEX(rate_n_flags));
+ return (true);
+ case IWX_RATE_MCS_VHT_MSK:
+ /* TODO: same comment on channel width, etc above */
+ ieee80211_node_set_txrate_vht_rate(ni,
+ IWX_RATE_VHT_MCS_CODE(rate_n_flags),
+ IWX_RATE_VHT_MCS_NSS(rate_n_flags));
+ return (true);
+ default:
+ net80211_ic_printf(ic,
+ "%s: unsupported chosen rate type in "
+ "IWX_RATE_MCS_MOD_TYPE (%d)\n", __func__,
+ type >> IWX_RATE_MCS_MOD_TYPE_POS);
+ return (false);
}
- if (rate_n_flags_ver2) {
- const struct ieee80211_rateset *rs;
- uint32_t ridx = (rate_n_flags & IWX_RATE_LEGACY_RATE_MSK);
- if (rate_n_flags & IWX_RATE_MCS_LEGACY_OFDM_MSK)
- rs = &ieee80211_std_rateset_11a;
- else
- rs = &ieee80211_std_rateset_11b;
- if (ridx < rs->rs_nrates)
- rval = (rs->rs_rates[ridx] & IEEE80211_RATE_VAL);
- else
- rval = 0;
- } else {
- plcp = (rate_n_flags & IWX_RATE_LEGACY_RATE_MSK_V1);
+ /* Default: if we get here, we didn't successfully update anything */
+ return (false);
+}
- rval = 0;
- for (i = IWX_RATE_1M_INDEX; i < nitems(iwx_rates); i++) {
- if (iwx_rates[i].plcp == plcp) {
- rval = iwx_rates[i].rate;
- break;
- }
- }
- }
+/**
+ * @brief Process a firmware rate control update and update net80211.
+ *
+ * Since firmware is doing rate control, this just needs to update
+ * the txrate in the ieee80211_node entry.
+ */
+static void
+iwx_rs_update(struct iwx_softc *sc, struct iwx_tlc_update_notif *notif)
+{
+ struct ieee80211com *ic = &sc->sc_ic;
+ struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps);
+ /* XXX TODO: get a node ref! */
+ struct ieee80211_node *ni = (void *)vap->iv_bss;
- if (rval) {
- uint8_t rv;
- for (i = 0; i < rs->rs_nrates; i++) {
- rv = rs->rs_rates[i] & IEEE80211_RATE_VAL;
- if (rv == rval) {
- ieee80211_node_set_txrate_dot11rate(ni, i);
- break;
- }
- }
- IWX_DPRINTF(sc, IWX_DEBUG_TXRATE,
- "%s:%d new rate %d\n", __func__, __LINE__,
- ieee80211_node_get_txrate_dot11rate(ni));
- }
+ /*
+ * For now the iwx driver only supports a single vdev with a single
+ * node; it doesn't yet support ibss/hostap/multiple vdevs.
+ */
+ if (notif->sta_id != IWX_STATION_ID ||
+ (le32toh(notif->flags) & IWX_TLC_NOTIF_FLAG_RATE) == 0)
+ return;
+
+ iwx_rs_update_node_txrate(sc, notif, ni);
}
static int
@@ -8526,6 +8551,8 @@ iwx_start(struct iwx_softc *sc)
struct ieee80211_node *ni;
struct mbuf *m;
+ IWX_ASSERT_LOCKED(sc);
+
while (sc->qfullmsk == 0 && (m = mbufq_dequeue(&sc->sc_snd)) != NULL) {
ni = (struct ieee80211_node *)m->m_pkthdr.rcvif;
if (iwx_tx(sc, m, ni) != 0) {
@@ -8985,10 +9012,10 @@ iwx_rx_pkt(struct iwx_softc *sc, struct iwx_rx_data *data, struct mbuf *ml)
break;
case IWX_MISSED_BEACONS_NOTIFICATION:
+ IWX_DPRINTF(sc, IWX_DEBUG_BEACON,
+ "%s: IWX_MISSED_BEACONS_NOTIFICATION\n",
+ __func__);
iwx_rx_bmiss(sc, pkt, data);
- DPRINTF(("%s: IWX_MISSED_BEACONS_NOTIFICATION\n",
- __func__));
- ieee80211_beacon_miss(ic);
break;
case IWX_MFUART_LOAD_NOTIFICATION:
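The new iwx_rs_update_node_txrate() above maps the firmware's legacy rate index through small CCK/OFDM lookup tables whose entries are dot11 rates in 500 kb/s units. A standalone sketch of that decode, reusing the same tables; the mask value and the sample rate_n_flags word here are placeholders, not the firmware's real bit layout:

/*
 * Decode a legacy OFDM rate index into a dot11 rate (500 kb/s units),
 * as done for the IWX_RATE_MCS_LEGACY_OFDM_MSK case above.
 */
#include <stdio.h>
#include <stdint.h>

#define LEGACY_RATE_MSK	0x7	/* placeholder mask, not the real field */

static const int cck_idx_to_rate[]  = { 2, 4, 11, 22, 2, 2, 2, 2 };
static const int ofdm_idx_to_rate[] = { 12, 18, 24, 36, 48, 72, 96, 108 };

int
main(void)
{
	uint32_t rate_n_flags = 0x3;	/* pretend firmware reported index 3 */
	int dot11rate = ofdm_idx_to_rate[rate_n_flags & LEGACY_RATE_MSK];

	/* index 3 -> 36 -> 18.0 Mb/s */
	printf("OFDM index %u -> %d (%d.%d Mb/s)\n",
	    (unsigned)(rate_n_flags & LEGACY_RATE_MSK), dot11rate,
	    dot11rate / 2, (dot11rate & 1) ? 5 : 0);
	return (0);
}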
diff --git a/sys/dev/iwx/if_iwxreg.h b/sys/dev/iwx/if_iwxreg.h
index 6755b93fa0ba..f3d1f078b48e 100644
--- a/sys/dev/iwx/if_iwxreg.h
+++ b/sys/dev/iwx/if_iwxreg.h
@@ -5176,6 +5176,10 @@ enum {
#define IWX_RATE_HT_MCS_INDEX(r) ((((r) & IWX_RATE_MCS_NSS_MSK) >> 1) | \
((r) & IWX_RATE_HT_MCS_CODE_MSK))
+#define IWX_RATE_VHT_MCS_CODE(r) ((r) & IWX_RATE_HT_MCS_CODE_MSK)
+#define IWX_RATE_VHT_MCS_NSS(r) \
+ ((((r) & IWX_RATE_MCS_NSS_MSK) == 0) >> IWX_RATE_MCS_NSS_POS)
+
/* Bits 7-5: reserved */
/*
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 52f9e12f8f9a..52e9fcbbebcd 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -463,13 +463,13 @@ static __inline void
nvme_completion_poll(struct nvme_completion_poll_status *status)
{
int timeout = ticks + 10 * hz;
- sbintime_t delta_t = SBT_1US;
+ sbintime_t delta = SBT_1US;
while (!atomic_load_acq_int(&status->done)) {
if (timeout - ticks < 0)
panic("NVME polled command failed to complete within 10s.");
- pause_sbt("nvme", delta_t, 0, C_PREL(1));
- delta_t = min(SBT_1MS, delta_t * 3 / 2);
+ pause_sbt("nvme", delta, 0, C_PREL(1));
+ delta = min(SBT_1MS, delta + delta / 2);
}
}
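The nvme_completion_poll() change above keeps the 1.5x growth of the polling interval but expresses it as "delta + delta / 2", starting at 1 us and capped at 1 ms. A small standalone C illustration of how the pause interval grows under that rule; SBT_1US/SBT_1MS here are stand-in microsecond units, not the kernel's sbintime_t values:

/* Illustration only: growth of the poll interval, capped at 1 ms. */
#include <stdio.h>
#include <stdint.h>

#define SBT_1US	1ULL			/* stand-in: 1 == 1 microsecond */
#define SBT_1MS	(1000ULL * SBT_1US)
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t delta = SBT_1US;
	uint64_t total = 0;
	int i;

	for (i = 0; i < 25 && total < 10 * SBT_1MS; i++) {
		total += delta;
		printf("poll %2d: sleep %4llu us (total %llu us)\n", i,
		    (unsigned long long)delta, (unsigned long long)total);
		delta = MIN(SBT_1MS, delta + delta / 2);
	}
	return (0);
}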
diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index 9e43a4c1909f..cde98cb62cef 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -240,6 +240,7 @@ struct pci_quirk {
#define PCI_QUIRK_DISABLE_MSIX 5 /* MSI-X doesn't work */
#define PCI_QUIRK_MSI_INTX_BUG 6 /* PCIM_CMD_INTxDIS disables MSI */
#define PCI_QUIRK_REALLOC_BAR 7 /* Can't allocate memory at the default address */
+#define PCI_QUIRK_DISABLE_FLR 8 /* Function-Level Reset (FLR) not working. */
int arg1;
int arg2;
};
@@ -319,6 +320,13 @@ static const struct pci_quirk pci_quirks[] = {
* expected place.
*/
{ 0x98741002, PCI_QUIRK_REALLOC_BAR, 0, 0 },
+
+ /*
+ * FLR does not work on some MediaTek mt76 WiFi devices despite being advertised.
+ */
+ { 0x061614c3, PCI_QUIRK_DISABLE_FLR, 0, 0 }, /* mt76 7922 */
+
+ /* end of table */
{ 0 }
};
@@ -6740,6 +6748,8 @@ pcie_flr(device_t dev, u_int max_delay, bool force)
if (!(pci_read_config(dev, cap + PCIER_DEVICE_CAP, 4) & PCIEM_CAP_FLR))
return (false);
+ if (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_FLR))
+ return (false);
/*
* Disable busmastering to prevent generation of new
diff --git a/sys/dev/sound/pci/hda/hdaa.c b/sys/dev/sound/pci/hda/hdaa.c
index 1e486b01b168..5dbb5c4f4453 100644
--- a/sys/dev/sound/pci/hda/hdaa.c
+++ b/sys/dev/sound/pci/hda/hdaa.c
@@ -532,9 +532,11 @@ static void
hdaa_presence_handler(struct hdaa_widget *w)
{
struct hdaa_devinfo *devinfo = w->devinfo;
- struct hdaa_audio_as *as;
+ struct hdaa_audio_as *as, *asp;
+ char buf[32];
uint32_t res;
- int connected, old;
+ int connected, old, i;
+ bool active;
if (w->enable == 0 || w->type !=
HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX)
@@ -552,13 +554,6 @@ hdaa_presence_handler(struct hdaa_widget *w)
if (connected == old)
return;
w->wclass.pin.connected = connected;
- HDA_BOOTVERBOSE(
- if (connected || old != 2) {
- device_printf(devinfo->dev,
- "Pin sense: nid=%d sense=0x%08x (%sconnected)\n",
- w->nid, res, !connected ? "dis" : "");
- }
- );
as = &devinfo->as[w->bindas];
if (as->hpredir >= 0 && as->pins[15] == w->nid)
@@ -567,6 +562,38 @@ hdaa_presence_handler(struct hdaa_widget *w)
hdaa_autorecsrc_handler(as, w);
if (old != 2)
hdaa_channels_handler(as);
+
+ if (connected || old != 2) {
+ HDA_BOOTVERBOSE(
+ device_printf(devinfo->dev,
+ "Pin sense: nid=%d sense=0x%08x (%sconnected)\n",
+ w->nid, res, !connected ? "dis" : "");
+ );
+ if (as->hpredir >= 0)
+ return;
+ for (i = 0, active = false; i < devinfo->num_devs; i++) {
+ if (device_get_unit(devinfo->devs[i].dev) == snd_unit) {
+ active = true;
+ break;
+ }
+ }
+ /* Proceed only if we are currently using this codec. */
+ if (!active)
+ return;
+ for (i = 0; i < devinfo->ascnt; i++) {
+ asp = &devinfo->as[i];
+ if (!asp->enable)
+ continue;
+ if ((connected && asp->index == as->index) ||
+ (!connected && asp->dir == as->dir)) {
+ snprintf(buf, sizeof(buf), "cdev=dsp%d",
+ device_get_unit(asp->pdevinfo->dev));
+ devctl_notify("SND", "CONN",
+ asp->dir == HDAA_CTL_IN ? "IN" : "OUT", buf);
+ break;
+ }
+ }
+ }
}
/*
@@ -6194,15 +6221,15 @@ hdaa_configure(device_t dev)
);
hdaa_patch_direct(devinfo);
HDA_BOOTHVERBOSE(
- device_printf(dev, "Pin sense init...\n");
- );
- hdaa_sense_init(devinfo);
- HDA_BOOTHVERBOSE(
device_printf(dev, "Creating PCM devices...\n");
);
hdaa_unlock(devinfo);
hdaa_create_pcms(devinfo);
hdaa_lock(devinfo);
+ HDA_BOOTHVERBOSE(
+ device_printf(dev, "Pin sense init...\n");
+ );
+ hdaa_sense_init(devinfo);
HDA_BOOTVERBOSE(
if (devinfo->quirks != 0) {
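The presence-handler hunk above emits a devctl notification (system SND, subsystem CONN, type IN/OUT, with a cdev=dspN payload) when a jack changes state on the codec that currently owns the default sound unit. One rough way to observe such events from userland is to read /dev/devctl directly, which is what devd(8) normally does; this is only a sketch (devd usually holds the device, and the exact message formatting shown in the comment is assumed, not quoted from the driver):

/* Sketch: dump raw devctl notifications; run with devd stopped. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[1024];
	ssize_t n;
	int fd;

	fd = open("/dev/devctl", O_RDONLY);
	if (fd == -1) {
		perror("open /dev/devctl");
		return (1);
	}
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		/* expected shape: "!system=SND subsystem=CONN type=OUT cdev=dsp0" */
		fputs(buf, stdout);
		fputc('\n', stdout);
	}
	close(fd);
	return (0);
}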
diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c
index 634ba0de2d55..471c6b3714b2 100644
--- a/sys/dev/virtio/network/if_vtnet.c
+++ b/sys/dev/virtio/network/if_vtnet.c
@@ -281,7 +281,7 @@ static int vtnet_tso_disable = 0;
SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_disable, CTLFLAG_RDTUN,
&vtnet_tso_disable, 0, "Disables TSO");
-static int vtnet_lro_disable = 0;
+static int vtnet_lro_disable = 1;
SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_disable, CTLFLAG_RDTUN,
&vtnet_lro_disable, 0, "Disables hardware LRO");
diff --git a/sys/dev/vmware/vmxnet3/if_vmx.c b/sys/dev/vmware/vmxnet3/if_vmx.c
index 62b5f313a137..1a314ca6660e 100644
--- a/sys/dev/vmware/vmxnet3/if_vmx.c
+++ b/sys/dev/vmware/vmxnet3/if_vmx.c
@@ -2056,7 +2056,12 @@ vmxnet3_update_admin_status(if_ctx_t ctx)
struct vmxnet3_softc *sc;
sc = iflib_get_softc(ctx);
- if (sc->vmx_ds->event != 0)
+ /*
+ * iflib may invoke this routine before vmxnet3_attach_post() has
+ * run, which is before the top level shared data area is
+ * initialized and the device made aware of it.
+ */
+ if (sc->vmx_ds != NULL && sc->vmx_ds->event != 0)
vmxnet3_evintr(sc);
vmxnet3_refresh_host_stats(sc);
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index b51ef6766de4..bcf67ddc9689 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -195,8 +195,8 @@ static void vt_update_static(void *);
#ifndef SC_NO_CUTPASTE
static void vt_mouse_paste(void);
#endif
-static void vt_suspend_handler(void *priv);
-static void vt_resume_handler(void *priv);
+static void vt_suspend_handler(void *priv, enum power_stype stype);
+static void vt_resume_handler(void *priv, enum power_stype stype);
SET_DECLARE(vt_drv_set, struct vt_driver);
@@ -3330,7 +3330,7 @@ vt_replace_backend(const struct vt_driver *drv, void *softc)
}
static void
-vt_suspend_handler(void *priv)
+vt_suspend_handler(void *priv, enum power_stype stype)
{
struct vt_device *vd;
@@ -3341,7 +3341,7 @@ vt_suspend_handler(void *priv)
}
static void
-vt_resume_handler(void *priv)
+vt_resume_handler(void *priv, enum power_stype stype)
{
struct vt_device *vd;
diff --git a/sys/dev/watchdog/watchdog.c b/sys/dev/watchdog/watchdog.c
index e1b2e08c3f10..c599db56bf95 100644
--- a/sys/dev/watchdog/watchdog.c
+++ b/sys/dev/watchdog/watchdog.c
@@ -204,6 +204,7 @@ wd_valid_act(int act)
return true;
}
+#ifdef COMPAT_FREEBSD14
static int
wd_ioctl_patpat(caddr_t data)
{
@@ -223,6 +224,7 @@ wd_ioctl_patpat(caddr_t data)
return (wdog_kern_pat(u));
}
+#endif
static int
wd_get_time_left(struct thread *td, time_t *remainp)
diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c
index 123df4992894..2c61b48c0451 100644
--- a/sys/dev/xen/control/control.c
+++ b/sys/dev/xen/control/control.c
@@ -91,6 +91,7 @@
#include <sys/smp.h>
#include <sys/eventhandler.h>
#include <sys/timetc.h>
+#include <sys/power.h>
#include <geom/geom.h>
@@ -175,12 +176,12 @@ xctrl_suspend(void)
cpuset_t cpu_suspend_map;
#endif
- EVENTHANDLER_INVOKE(power_suspend_early);
+ EVENTHANDLER_INVOKE(power_suspend_early, POWER_STYPE_SUSPEND_TO_MEM);
xs_lock();
stop_all_proc();
xs_unlock();
suspend_all_fs();
- EVENTHANDLER_INVOKE(power_suspend);
+ EVENTHANDLER_INVOKE(power_suspend, POWER_STYPE_SUSPEND_TO_MEM);
#ifdef EARLY_AP_STARTUP
MPASS(mp_ncpus == 1 || smp_started);
@@ -297,7 +298,7 @@ xctrl_suspend(void)
resume_all_fs();
resume_all_proc();
- EVENTHANDLER_INVOKE(power_resume);
+ EVENTHANDLER_INVOKE(power_resume, POWER_STYPE_SUSPEND_TO_MEM);
if (bootverbose)
printf("System resumed after suspension\n");
diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
index 0a93878c859f..ad3f7779e108 100644
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -37,6 +37,9 @@
#define NULLM_CACHE 0x0001
+#include <sys/ck.h>
+#include <vm/uma.h>
+
struct null_mount {
struct mount *nullm_vfs;
struct vnode *nullm_lowerrootvp; /* Ref to lower root vnode */
@@ -50,7 +53,7 @@ struct null_mount {
* A cache of vnode references
*/
struct null_node {
- LIST_ENTRY(null_node) null_hash; /* Hash list */
+ CK_SLIST_ENTRY(null_node) null_hash; /* Hash list */
struct vnode *null_lowervp; /* VREFed once */
struct vnode *null_vnode; /* Back pointer */
u_int null_flags;
@@ -61,6 +64,7 @@ struct null_node {
#define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
#define VTONULL(vp) ((struct null_node *)(vp)->v_data)
+#define VTONULL_SMR(vp) ((struct null_node *)vn_load_v_data_smr(vp))
#define NULLTOV(xp) ((xp)->null_vnode)
int nullfs_init(struct vfsconf *vfsp);
@@ -79,9 +83,7 @@ struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno);
extern struct vop_vector null_vnodeops;
-#ifdef MALLOC_DECLARE
-MALLOC_DECLARE(M_NULLFSNODE);
-#endif
+extern uma_zone_t null_node_zone;
#ifdef NULLFS_DEBUG
#define NULLFSDEBUG(format, args...) printf(format ,## args)
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index 053614b6910d..d7f847d449d0 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -36,14 +36,19 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/smr.h>
#include <sys/vnode.h>
#include <fs/nullfs/null.h>
+#include <vm/uma.h>
+
+VFS_SMR_DECLARE;
+
/*
* Null layer cache:
* Each cache entry holds a reference to the lower vnode
@@ -54,12 +59,12 @@
#define NULL_NHASH(vp) (&null_node_hashtbl[vfs_hash_index(vp) & null_hash_mask])
-static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl;
+static CK_SLIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl;
static struct rwlock null_hash_lock;
static u_long null_hash_mask;
static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table");
-MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part");
+uma_zone_t __read_mostly null_node_zone;
static void null_hashins(struct mount *, struct null_node *);
@@ -73,6 +78,10 @@ nullfs_init(struct vfsconf *vfsp)
null_node_hashtbl = hashinit(desiredvnodes, M_NULLFSHASH,
&null_hash_mask);
rw_init(&null_hash_lock, "nullhs");
+ null_node_zone = uma_zcreate("nullfs node", sizeof(struct null_node),
+ NULL, NULL, NULL, NULL, 0, UMA_ZONE_ZINIT);
+ VFS_SMR_ZONE_SET(null_node_zone);
+
return (0);
}
@@ -80,6 +89,7 @@ int
nullfs_uninit(struct vfsconf *vfsp)
{
+ uma_zdestroy(null_node_zone);
rw_destroy(&null_hash_lock);
hashdestroy(null_node_hashtbl, M_NULLFSHASH, null_hash_mask);
return (0);
@@ -96,7 +106,7 @@ null_hashget_locked(struct mount *mp, struct vnode *lowervp)
struct null_node *a;
struct vnode *vp;
- ASSERT_VOP_LOCKED(lowervp, "null_hashget");
+ ASSERT_VOP_LOCKED(lowervp, __func__);
rw_assert(&null_hash_lock, RA_LOCKED);
/*
@@ -106,18 +116,21 @@ null_hashget_locked(struct mount *mp, struct vnode *lowervp)
* reference count (but NOT the lower vnode's VREF counter).
*/
hd = NULL_NHASH(lowervp);
- LIST_FOREACH(a, hd, null_hash) {
- if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) {
- /*
- * Since we have the lower node locked the nullfs
- * node can not be in the process of recycling. If
- * it had been recycled before we grabed the lower
- * lock it would not have been found on the hash.
- */
- vp = NULLTOV(a);
- vref(vp);
- return (vp);
- }
+ CK_SLIST_FOREACH(a, hd, null_hash) {
+ if (a->null_lowervp != lowervp)
+ continue;
+ /*
+ * Since we have the lower node locked the nullfs
+ * node can not be in the process of recycling. If
+ * it had been recycled before we grabbed the lower
+ * lock it would not have been found on the hash.
+ */
+ vp = NULLTOV(a);
+ VNPASS(!VN_IS_DOOMED(vp), vp);
+ if (vp->v_mount != mp)
+ continue;
+ vref(vp);
+ return (vp);
}
return (NULL);
}
@@ -126,17 +139,34 @@ struct vnode *
null_hashget(struct mount *mp, struct vnode *lowervp)
{
struct null_node_hashhead *hd;
+ struct null_node *a;
struct vnode *vp;
+ enum vgetstate vs;
- hd = NULL_NHASH(lowervp);
- if (LIST_EMPTY(hd))
- return (NULL);
-
- rw_rlock(&null_hash_lock);
- vp = null_hashget_locked(mp, lowervp);
- rw_runlock(&null_hash_lock);
+ ASSERT_VOP_LOCKED(lowervp, __func__);
+ rw_assert(&null_hash_lock, RA_UNLOCKED);
- return (vp);
+ vfs_smr_enter();
+ hd = NULL_NHASH(lowervp);
+ CK_SLIST_FOREACH(a, hd, null_hash) {
+ if (a->null_lowervp != lowervp)
+ continue;
+ /*
+ * See null_hashget_locked as to why the nullfs vnode can't be
+ * doomed here.
+ */
+ vp = NULLTOV(a);
+ VNPASS(!VN_IS_DOOMED(vp), vp);
+ if (vp->v_mount != mp)
+ continue;
+ vs = vget_prep_smr(vp);
+ vfs_smr_exit();
+ VNPASS(vs != VGET_NONE, vp);
+ vget_finish_ref(vp, vs);
+ return (vp);
+ }
+ vfs_smr_exit();
+ return (NULL);
}
static void
@@ -151,7 +181,7 @@ null_hashins(struct mount *mp, struct null_node *xp)
hd = NULL_NHASH(xp->null_lowervp);
#ifdef INVARIANTS
- LIST_FOREACH(oxp, hd, null_hash) {
+ CK_SLIST_FOREACH(oxp, hd, null_hash) {
if (oxp->null_lowervp == xp->null_lowervp &&
NULLTOV(oxp)->v_mount == mp) {
VNASSERT(0, NULLTOV(oxp),
@@ -159,7 +189,7 @@ null_hashins(struct mount *mp, struct null_node *xp)
}
}
#endif
- LIST_INSERT_HEAD(hd, xp, null_hash);
+ CK_SLIST_INSERT_HEAD(hd, xp, null_hash);
}
static void
@@ -174,7 +204,7 @@ null_destroy_proto(struct vnode *vp, void *xp)
VI_UNLOCK(vp);
vgone(vp);
vput(vp);
- free(xp, M_NULLFSNODE);
+ uma_zfree_smr(null_node_zone, xp);
}
/*
@@ -208,12 +238,12 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
* Note that duplicate can only appear in hash if the lowervp is
* locked LK_SHARED.
*/
- xp = malloc(sizeof(struct null_node), M_NULLFSNODE, M_WAITOK);
+ xp = uma_zalloc_smr(null_node_zone, M_WAITOK);
error = getnewvnode("nullfs", mp, &null_vnodeops, &vp);
if (error) {
vput(lowervp);
- free(xp, M_NULLFSNODE);
+ uma_zfree_smr(null_node_zone, xp);
return (error);
}
@@ -261,8 +291,8 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
return (error);
}
- null_hashins(mp, xp);
vn_set_state(vp, VSTATE_CONSTRUCTED);
+ null_hashins(mp, xp);
rw_wunlock(&null_hash_lock);
*vpp = vp;
@@ -275,9 +305,11 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
void
null_hashrem(struct null_node *xp)
{
+ struct null_node_hashhead *hd;
+ hd = NULL_NHASH(xp->null_lowervp);
rw_wlock(&null_hash_lock);
- LIST_REMOVE(xp, null_hash);
+ CK_SLIST_REMOVE(hd, xp, null_node, null_hash);
rw_wunlock(&null_hash_lock);
}
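The nullfs hash above moves from LIST to ConcurrencyKit's CK_SLIST so the read side (null_hashget()) can walk a bucket under SMR while a single locked writer inserts or removes entries. A minimal userland sketch of the same macros, assuming ConcurrencyKit's <ck_queue.h> is available; the types and names here are illustrative, not nullfs's:

/* CK_SLIST basics: single-writer safe list usable with lockless readers. */
#include <ck_queue.h>
#include <stdio.h>

struct node {
	int key;
	CK_SLIST_ENTRY(node) link;
};

static CK_SLIST_HEAD(bucket, node) head = CK_SLIST_HEAD_INITIALIZER(head);

int
main(void)
{
	struct node a = { .key = 1 }, b = { .key = 2 }, *np;

	CK_SLIST_INSERT_HEAD(&head, &a, link);
	CK_SLIST_INSERT_HEAD(&head, &b, link);

	CK_SLIST_FOREACH(np, &head, link)
		printf("key %d\n", np->key);

	CK_SLIST_REMOVE(&head, &a, node, link);
	return (0);
}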
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index e9d598014a2f..ec8a6b10b13f 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -174,6 +174,8 @@
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/smr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
@@ -185,6 +187,8 @@
#include <vm/vm_object.h>
#include <vm/vnode_pager.h>
+VFS_SMR_DECLARE;
+
static int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */
SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW,
&null_bug_bypass, 0, "");
@@ -768,83 +772,111 @@ null_rmdir(struct vop_rmdir_args *ap)
}
/*
- * We need to process our own vnode lock and then clear the
- * interlock flag as it applies only to our vnode, not the
- * vnodes below us on the stack.
+ * We need to process our own vnode lock and then clear the interlock flag as
+ * it applies only to our vnode, not the vnodes below us on the stack.
+ *
+ * We have to hold the vnode here to solve a potential reclaim race. If we're
+ * forcibly vgone'd while we still have refs, a thread could be sleeping inside
+ * the lowervp's vop_lock routine. When we vgone we will drop our last ref to
+ * the lowervp, which would allow it to be reclaimed. The lowervp could then
+ * be recycled, in which case it is not legal to be sleeping in its VOP. We
+ * prevent it from being recycled by holding the vnode here.
*/
+static struct vnode *
+null_lock_prep_with_smr(struct vop_lock1_args *ap)
+{
+ struct null_node *nn;
+ struct vnode *lvp;
+
+ lvp = NULL;
+
+ vfs_smr_enter();
+
+ nn = VTONULL_SMR(ap->a_vp);
+ if (__predict_true(nn != NULL)) {
+ lvp = nn->null_lowervp;
+ if (lvp != NULL && !vhold_smr(lvp))
+ lvp = NULL;
+ }
+
+ vfs_smr_exit();
+ return (lvp);
+}
+
+static struct vnode *
+null_lock_prep_with_interlock(struct vop_lock1_args *ap)
+{
+ struct null_node *nn;
+ struct vnode *lvp;
+
+ ASSERT_VI_LOCKED(ap->a_vp, __func__);
+
+ ap->a_flags &= ~LK_INTERLOCK;
+
+ lvp = NULL;
+
+ nn = VTONULL(ap->a_vp);
+ if (__predict_true(nn != NULL)) {
+ lvp = nn->null_lowervp;
+ if (lvp != NULL)
+ vholdnz(lvp);
+ }
+ VI_UNLOCK(ap->a_vp);
+ return (lvp);
+}
+
static int
null_lock(struct vop_lock1_args *ap)
{
- struct vnode *vp = ap->a_vp;
- int flags;
- struct null_node *nn;
struct vnode *lvp;
- int error;
+ int error, flags;
- if ((ap->a_flags & LK_INTERLOCK) == 0)
- VI_LOCK(vp);
- else
- ap->a_flags &= ~LK_INTERLOCK;
- flags = ap->a_flags;
- nn = VTONULL(vp);
+ if (__predict_true((ap->a_flags & LK_INTERLOCK) == 0)) {
+ lvp = null_lock_prep_with_smr(ap);
+ if (__predict_false(lvp == NULL)) {
+ VI_LOCK(ap->a_vp);
+ lvp = null_lock_prep_with_interlock(ap);
+ }
+ } else {
+ lvp = null_lock_prep_with_interlock(ap);
+ }
+
+ ASSERT_VI_UNLOCKED(ap->a_vp, __func__);
+
+ if (__predict_false(lvp == NULL))
+ return (vop_stdlock(ap));
+
+ VNPASS(lvp->v_holdcnt > 0, lvp);
+ error = VOP_LOCK(lvp, ap->a_flags);
/*
- * If we're still active we must ask the lower layer to
- * lock as ffs has special lock considerations in its
- * vop lock.
+ * We might have slept to get the lock and someone might have
+ * cleaned our vnode already, switching vnode lock from one in
+ * lowervp to v_lock in our own vnode structure. Handle this
+ * case by reacquiring correct lock in requested mode.
*/
- if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) {
- /*
- * We have to hold the vnode here to solve a potential
- * reclaim race. If we're forcibly vgone'd while we
- * still have refs, a thread could be sleeping inside
- * the lowervp's vop_lock routine. When we vgone we will
- * drop our last ref to the lowervp, which would allow it
- * to be reclaimed. The lowervp could then be recycled,
- * in which case it is not legal to be sleeping in its VOP.
- * We prevent it from being recycled by holding the vnode
- * here.
- */
- vholdnz(lvp);
- VI_UNLOCK(vp);
- error = VOP_LOCK(lvp, flags);
-
- /*
- * We might have slept to get the lock and someone might have
- * clean our vnode already, switching vnode lock from one in
- * lowervp to v_lock in our own vnode structure. Handle this
- * case by reacquiring correct lock in requested mode.
- */
- if (VTONULL(vp) == NULL && error == 0) {
- ap->a_flags &= ~LK_TYPE_MASK;
- switch (flags & LK_TYPE_MASK) {
- case LK_SHARED:
- ap->a_flags |= LK_SHARED;
- break;
- case LK_UPGRADE:
- case LK_EXCLUSIVE:
- ap->a_flags |= LK_EXCLUSIVE;
- break;
- default:
- panic("Unsupported lock request %d\n",
- ap->a_flags);
- }
- VOP_UNLOCK(lvp);
- error = vop_stdlock(ap);
+ if (VTONULL(ap->a_vp) == NULL && error == 0) {
+ VOP_UNLOCK(lvp);
+
+ flags = ap->a_flags;
+ ap->a_flags &= ~LK_TYPE_MASK;
+ switch (flags & LK_TYPE_MASK) {
+ case LK_SHARED:
+ ap->a_flags |= LK_SHARED;
+ break;
+ case LK_UPGRADE:
+ case LK_EXCLUSIVE:
+ ap->a_flags |= LK_EXCLUSIVE;
+ break;
+ default:
+ panic("Unsupported lock request %d\n",
+ flags);
}
- vdrop(lvp);
- } else {
- VI_UNLOCK(vp);
error = vop_stdlock(ap);
}
-
+ vdrop(lvp);
return (error);
}
-/*
- * We need to process our own vnode unlock and then clear the
- * interlock flag as it applies only to our vnode, not the
- * vnodes below us on the stack.
- */
static int
null_unlock(struct vop_unlock_args *ap)
{
@@ -853,11 +885,20 @@ null_unlock(struct vop_unlock_args *ap)
struct vnode *lvp;
int error;
+ /*
+ * Contrary to null_lock, we don't need to hold the vnode around
+ * unlock.
+ *
+ * We hold the lock, which means we can't be racing against vgone.
+ *
+ * At the same time VOP_UNLOCK promises to not touch anything after
+ * it finishes unlock, just like we don't.
+ *
+ * vop_stdunlock for a doomed vnode matches doomed locking in null_lock.
+ */
nn = VTONULL(vp);
if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) {
- vholdnz(lvp);
error = VOP_UNLOCK(lvp);
- vdrop(lvp);
} else {
error = vop_stdunlock(ap);
}
@@ -961,7 +1002,7 @@ null_reclaim(struct vop_reclaim_args *ap)
vunref(lowervp);
else
vput(lowervp);
- free(xp, M_NULLFSNODE);
+ uma_zfree_smr(null_node_zone, xp);
return (0);
}
diff --git a/sys/i386/acpica/acpi_wakeup.c b/sys/i386/acpica/acpi_wakeup.c
index 2d60d5e037a0..96be64de017b 100644
--- a/sys/i386/acpica/acpi_wakeup.c
+++ b/sys/i386/acpica/acpi_wakeup.c
@@ -84,7 +84,7 @@ static cpuset_t suspcpus;
static struct susppcb **susppcbs;
#endif
-static void acpi_stop_beep(void *);
+static void acpi_stop_beep(void *, enum power_stype);
#ifdef SMP
static int acpi_wakeup_ap(struct acpi_softc *, int);
@@ -100,7 +100,7 @@ static void acpi_wakeup_cpus(struct acpi_softc *);
} while (0)
static void
-acpi_stop_beep(void *arg)
+acpi_stop_beep(void *arg, enum power_stype stype)
{
if (acpi_resume_beep != 0)
diff --git a/sys/isa/isa_common.c b/sys/isa/isa_common.c
index 8e4064af1455..1a6df7bf6046 100644
--- a/sys/isa/isa_common.c
+++ b/sys/isa/isa_common.c
@@ -1114,7 +1114,7 @@ isab_attach(device_t dev)
{
device_t child;
- child = device_add_child(dev, "isa", 0);
+ child = device_add_child(dev, "isa", DEVICE_UNIT_ANY);
if (child == NULL)
return (ENXIO);
bus_attach_children(dev);
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index a32b5a1b3354..ab8ed32ad189 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -127,6 +127,27 @@ proc_realparent(struct proc *child)
return (parent);
}
+static void
+reaper_clear(struct proc *p, struct proc *rp)
+{
+ struct proc *p1;
+ bool clear;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ LIST_REMOVE(p, p_reapsibling);
+ if (p->p_reapsubtree == 1)
+ return;
+ clear = true;
+ LIST_FOREACH(p1, &rp->p_reaplist, p_reapsibling) {
+ if (p1->p_reapsubtree == p->p_reapsubtree) {
+ clear = false;
+ break;
+ }
+ }
+ if (clear)
+ proc_id_clear(PROC_ID_REAP, p->p_reapsubtree);
+}
+
void
reaper_abandon_children(struct proc *p, bool exiting)
{
@@ -138,7 +159,7 @@ reaper_abandon_children(struct proc *p, bool exiting)
return;
p1 = p->p_reaper;
LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
- LIST_REMOVE(p2, p_reapsibling);
+ reaper_clear(p2, p);
p2->p_reaper = p1;
p2->p_reapsubtree = p->p_reapsubtree;
LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling);
@@ -152,27 +173,6 @@ reaper_abandon_children(struct proc *p, bool exiting)
p->p_treeflag &= ~P_TREE_REAPER;
}
-static void
-reaper_clear(struct proc *p)
-{
- struct proc *p1;
- bool clear;
-
- sx_assert(&proctree_lock, SX_LOCKED);
- LIST_REMOVE(p, p_reapsibling);
- if (p->p_reapsubtree == 1)
- return;
- clear = true;
- LIST_FOREACH(p1, &p->p_reaper->p_reaplist, p_reapsibling) {
- if (p1->p_reapsubtree == p->p_reapsubtree) {
- clear = false;
- break;
- }
- }
- if (clear)
- proc_id_clear(PROC_ID_REAP, p->p_reapsubtree);
-}
-
void
proc_clear_orphan(struct proc *p)
{
@@ -972,7 +972,7 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options)
sx_xunlock(PIDHASHLOCK(p->p_pid));
LIST_REMOVE(p, p_sibling);
reaper_abandon_children(p, true);
- reaper_clear(p);
+ reaper_clear(p, p->p_reaper);
PROC_LOCK(p);
proc_clear_orphan(p);
PROC_UNLOCK(p);
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 31bff6d2c1aa..76f68677e292 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -1780,9 +1780,11 @@ lockmgr_chain(struct thread *td, struct thread **ownerp)
lk = td->td_wchan;
- if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
+ if (!TD_ON_SLEEPQ(td) || sleepq_type(td->td_wchan) != SLEEPQ_LK ||
+ LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
return (0);
- db_printf("blocked on lockmgr %s", lk->lock_object.lo_name);
+ db_printf("blocked on lock %p (%s) \"%s\" ", &lk->lock_object,
+ lock_class_lockmgr.lc_name, lk->lock_object.lo_name);
if (lk->lk_lock & LK_SHARE)
db_printf("SHARED (count %ju)\n",
(uintmax_t)LK_SHARERS(lk->lk_lock));
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 8b5908f5219a..d67c70984528 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -503,8 +503,8 @@ _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
/*
* __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
*
- * We call this if the lock is either contested (i.e. we need to go to
- * sleep waiting for it), or if we need to recurse on it.
+ * We get here if lock profiling is enabled, the lock is already held by
+ * someone else or we are recursing on it.
*/
#if LOCK_DEBUG > 0
void
@@ -660,13 +660,8 @@ retry_turnstile:
}
#endif
- /*
- * If the mutex isn't already contested and a failure occurs
- * setting the contested bit, the mutex was either released
- * or the state of the MTX_RECURSED bit changed.
- */
- if ((v & MTX_CONTESTED) == 0 &&
- !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_CONTESTED)) {
+ if ((v & MTX_WAITERS) == 0 &&
+ !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_WAITERS)) {
goto retry_turnstile;
}
@@ -869,7 +864,7 @@ _thread_lock(struct thread *td)
WITNESS_LOCK(&m->lock_object, LOP_EXCLUSIVE, file, line);
return;
}
- _mtx_release_lock_quick(m);
+ atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED);
slowpath_unlocked:
spinlock_exit();
slowpath_noirq:
@@ -959,7 +954,7 @@ retry:
}
if (m == td->td_lock)
break;
- _mtx_release_lock_quick(m);
+ atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED);
}
LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
line);
@@ -1029,8 +1024,8 @@ thread_lock_set(struct thread *td, struct mtx *new)
/*
* __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
*
- * We are only called here if the lock is recursed, contested (i.e. we
- * need to wake up a blocked thread) or lockstat probe is active.
+ * We get here if lock profiling is enabled, the lock is already held by
+ * someone else or we are recursing on it.
*/
#if LOCK_DEBUG > 0
void
@@ -1071,7 +1066,7 @@ __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v)
* can be removed from the hash list if it is empty.
*/
turnstile_chain_lock(&m->lock_object);
- _mtx_release_lock_quick(m);
+ atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED);
ts = turnstile_lookup(&m->lock_object);
MPASS(ts != NULL);
if (LOCK_LOG_TEST(&m->lock_object, opts))
@@ -1207,7 +1202,7 @@ _mtx_destroy(volatile uintptr_t *c)
if (!mtx_owned(m))
MPASS(mtx_unowned(m));
else {
- MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+ MPASS((m->mtx_lock & (MTX_RECURSED|MTX_WAITERS)) == 0);
/* Perform the non-mtx related part of mtx_unlock_spin(). */
if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) {
@@ -1359,8 +1354,8 @@ db_show_mtx(const struct lock_object *lock)
db_printf("DESTROYED");
else {
db_printf("OWNED");
- if (m->mtx_lock & MTX_CONTESTED)
- db_printf(", CONTESTED");
+ if (m->mtx_lock & MTX_WAITERS)
+ db_printf(", WAITERS");
if (m->mtx_lock & MTX_RECURSED)
db_printf(", RECURSED");
}
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
index c005e112d3b9..249faf5b1ec4 100644
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@@ -1539,16 +1539,19 @@ sx_chain(struct thread *td, struct thread **ownerp)
/*
* Check to see if this thread is blocked on an sx lock.
- * First, we check the lock class. If that is ok, then we
- * compare the lock name against the wait message.
+ * The thread should be on a sleep queue with type SLEEPQ_SX, the
+ * purported lock should have the lock class index of sx, and the lock
+ * name should match the wait message.
*/
sx = td->td_wchan;
- if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
+ if (!TD_ON_SLEEPQ(td) || sleepq_type(td->td_wchan) != SLEEPQ_SX ||
+ LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
sx->lock_object.lo_name != td->td_wmesg)
return (0);
/* We think we have an sx lock, so output some details. */
- db_printf("blocked on sx \"%s\" ", td->td_wmesg);
+ db_printf("blocked on lock %p (%s) \"%s\" ", &sx->lock_object,
+ lock_class_sx.lc_name, td->td_wmesg);
*ownerp = sx_xholder(sx);
if (sx->sx_lock & SX_LOCK_SHARED)
db_printf("SLOCK (count %ju)\n",
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
index bbebadc4c395..ebd203858b66 100644
--- a/sys/kern/link_elf.c
+++ b/sys/kern/link_elf.c
@@ -518,9 +518,15 @@ link_elf_init(void* arg)
(void)link_elf_link_common_finish(linker_kernel_file);
linker_kernel_file->flags |= LINKER_FILE_LINKED;
TAILQ_INIT(&set_pcpu_list);
+ ef->pcpu_start = DPCPU_START;
+ ef->pcpu_stop = DPCPU_STOP;
+ ef->pcpu_base = DPCPU_START;
#ifdef VIMAGE
TAILQ_INIT(&set_vnet_list);
vnet_save_init((void *)VNET_START, VNET_STOP - VNET_START);
+ ef->vnet_start = VNET_START;
+ ef->vnet_stop = VNET_STOP;
+ ef->vnet_base = VNET_START;
#endif
}
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
index 151aab96f9be..a3a53a39bfd6 100644
--- a/sys/kern/link_elf_obj.c
+++ b/sys/kern/link_elf_obj.c
@@ -70,6 +70,7 @@
typedef struct {
void *addr;
+ void *origaddr; /* Used by debuggers. */
Elf_Off size;
int flags; /* Section flags. */
int sec; /* Original section number. */
@@ -492,7 +493,8 @@ link_elf_link_preload(linker_class_t cls, const char *filename,
case SHT_FINI_ARRAY:
if (shdr[i].sh_addr == 0)
break;
- ef->progtab[pb].addr = (void *)shdr[i].sh_addr;
+ ef->progtab[pb].addr = ef->progtab[pb].origaddr =
+ (void *)shdr[i].sh_addr;
if (shdr[i].sh_type == SHT_PROGBITS)
ef->progtab[pb].name = "<<PROGBITS>>";
#ifdef __amd64__
@@ -1088,6 +1090,8 @@ link_elf_load_file(linker_class_t cls, const char *filename,
ef->progtab[pb].name = "<<NOBITS>>";
if (ef->progtab[pb].name != NULL &&
!strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) {
+ ef->progtab[pb].origaddr =
+ (void *)(uintptr_t)mapbase;
ef->progtab[pb].addr =
dpcpu_alloc(shdr[i].sh_size);
if (ef->progtab[pb].addr == NULL) {
@@ -1101,6 +1105,8 @@ link_elf_load_file(linker_class_t cls, const char *filename,
#ifdef VIMAGE
else if (ef->progtab[pb].name != NULL &&
!strcmp(ef->progtab[pb].name, VNET_SETNAME)) {
+ ef->progtab[pb].origaddr =
+ (void *)(uintptr_t)mapbase;
ef->progtab[pb].addr =
vnet_data_alloc(shdr[i].sh_size);
if (ef->progtab[pb].addr == NULL) {
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 5606b36f772f..7d666da9f88b 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -729,7 +729,7 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
struct file *fp;
struct filedesc *fdp;
- int error, tmp, locked;
+ int error, f_flag, tmp, locked;
AUDIT_ARG_FD(fd);
AUDIT_ARG_CMD(com);
@@ -782,30 +782,36 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
goto out;
}
+ f_flag = 0;
switch (com) {
case FIONCLEX:
fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
- goto out;
+ break;
case FIOCLEX:
fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
- goto out;
- case FIONBIO:
- if ((tmp = *(int *)data))
- atomic_set_int(&fp->f_flag, FNONBLOCK);
- else
- atomic_clear_int(&fp->f_flag, FNONBLOCK);
- data = (void *)&tmp;
break;
+ case FIONBIO:
case FIOASYNC:
- if ((tmp = *(int *)data))
- atomic_set_int(&fp->f_flag, FASYNC);
- else
- atomic_clear_int(&fp->f_flag, FASYNC);
- data = (void *)&tmp;
+ f_flag = com == FIONBIO ? FNONBLOCK : FASYNC;
+ tmp = *(int *)data;
+ fsetfl_lock(fp);
+ if (((fp->f_flag & f_flag) != 0) != (tmp != 0)) {
+ error = fo_ioctl(fp, com, (void *)&tmp, td->td_ucred,
+ td);
+ if (error == 0) {
+ if (tmp != 0)
+ atomic_set_int(&fp->f_flag, f_flag);
+ else
+ atomic_clear_int(&fp->f_flag, f_flag);
+ }
+ }
+ fsetfl_unlock(fp);
+ break;
+ default:
+ error = fo_ioctl(fp, com, data, td->td_ucred, td);
break;
}
- error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
switch (locked) {
case LA_XLOCKED:
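With the kern_ioctl() rework above, FIONBIO and FIOASYNC only reach the file's fo_ioctl method when the FNONBLOCK/FASYNC state actually changes, with the update done under the fsetfl lock. The userland interface is unchanged; a minimal example of toggling non-blocking mode, where setting the same value twice now takes the kernel fast path:

/* Toggle non-blocking mode on a descriptor via FIONBIO. */
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int on = 1;

	if (ioctl(STDIN_FILENO, FIONBIO, &on) == -1)
		perror("ioctl(FIONBIO)");
	/* Same value again: the flag state does not flip, so no fo_ioctl call. */
	if (ioctl(STDIN_FILENO, FIONBIO, &on) == -1)
		perror("ioctl(FIONBIO)");

	on = 0;
	if (ioctl(STDIN_FILENO, FIONBIO, &on) == -1)
		perror("ioctl(FIONBIO)");
	return (0);
}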
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index 30527fdd4fd0..57ebe8dc85f0 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -567,7 +567,7 @@ pipespace_new(struct pipe *cpipe, int size)
static int curfail = 0;
static struct timeval lastfail;
- KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
+ PIPE_LOCK_ASSERT(cpipe, MA_NOTOWNED);
KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
("pipespace: resize of direct writes not allowed"));
retry:
@@ -1679,8 +1679,7 @@ static void
pipe_free_kmem(struct pipe *cpipe)
{
- KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
- ("pipe_free_kmem: pipe mutex locked"));
+ PIPE_LOCK_ASSERT(cpipe, MA_NOTOWNED);
if (cpipe->pipe_buffer.buffer != NULL) {
atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 3d4567b6ab1e..a53df50c06bd 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -806,9 +806,12 @@ file_v_lock(struct file *fp, short lock_bit, short lock_wait_bit)
flagsp = &fp->f_vflags;
state = atomic_load_16(flagsp);
- if ((state & lock_bit) == 0 &&
- atomic_cmpset_acq_16(flagsp, state, state | lock_bit))
- return;
+ for (;;) {
+ if ((state & lock_bit) != 0)
+ break;
+ if (atomic_fcmpset_acq_16(flagsp, &state, state | lock_bit))
+ return;
+ }
sleepq_lock(flagsp);
state = atomic_load_16(flagsp);
@@ -842,9 +845,12 @@ file_v_unlock(struct file *fp, short lock_bit, short lock_wait_bit)
flagsp = &fp->f_vflags;
state = atomic_load_16(flagsp);
- if ((state & lock_wait_bit) == 0 &&
- atomic_cmpset_rel_16(flagsp, state, state & ~lock_bit))
- return;
+ for (;;) {
+ if ((state & lock_wait_bit) != 0)
+ break;
+ if (atomic_fcmpset_rel_16(flagsp, &state, state & ~lock_bit))
+ return;
+ }
sleepq_lock(flagsp);
MPASS((*flagsp & lock_bit) != 0);
@@ -864,10 +870,6 @@ foffset_lock(struct file *fp, int flags)
FILE_V_FOFFSET_LOCK_WAITING);
}
- /*
- * According to McKusick the vn lock was protecting f_offset here.
- * It is now protected by the FOFFSET_LOCKED flag.
- */
return (atomic_load_long(&fp->f_offset));
}
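The file_v_lock()/file_v_unlock() change above replaces a single atomic_cmpset attempt with an atomic_fcmpset loop, so a failed compare-and-set reloads the observed value and retries instead of falling straight through to the sleepqueue path. The same shape expressed with C11 atomics; this is a sketch, not the kernel primitives, and LOCK_BIT/try_lock_fast are made-up names:

/* CAS retry loop: only give up once the lock bit is actually observed set. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define LOCK_BIT	0x0001

static bool
try_lock_fast(_Atomic uint16_t *flagsp)
{
	uint16_t state = atomic_load(flagsp);

	for (;;) {
		if ((state & LOCK_BIT) != 0)
			return (false);		/* locked: caller sleeps */
		if (atomic_compare_exchange_weak(flagsp, &state,
		    state | LOCK_BIT))
			return (true);		/* acquired without sleeping */
		/* state was reloaded by the failed CAS; retry */
	}
}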
diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
index c595030ed4a0..db1b6f33a8ef 100644
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -843,7 +843,7 @@
/* #undef ZFS_DEVICE_MINOR */
/* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.4.99-72-FreeBSD_gb2196fbed"
+#define ZFS_META_ALIAS "zfs-2.4.99-95-FreeBSD_g5605a6d79"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@@ -852,7 +852,7 @@
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
-#define ZFS_META_KVER_MAX "6.16"
+#define ZFS_META_KVER_MAX "6.17"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "4.18"
@@ -873,7 +873,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
-#define ZFS_META_RELEASE "72-FreeBSD_gb2196fbed"
+#define ZFS_META_RELEASE "95-FreeBSD_g5605a6d79"
/* Define the project version. */
#define ZFS_META_VERSION "2.4.99"
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index 9eae1e8573c0..8a1802f5480b 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.4.99-72-gb2196fbed"
+#define ZFS_META_GITREV "zfs-2.4.99-95-g5605a6d79"
diff --git a/sys/net/if.c b/sys/net/if.c
index 6a68d627c07f..b6a798aa0fab 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -5116,12 +5116,6 @@ if_getvnet(if_t ifp)
return (ifp->if_vnet);
}
-struct vnet *
-if_gethomevnet(if_t ifp)
-{
- return (ifp->if_home_vnet);
-}
-
void *
if_getafdata(if_t ifp, int af)
{
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index e71fe798fdec..f2df612b19c1 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -648,7 +648,6 @@ u_int16_t if_getvtag(struct mbuf *m);
int if_vlantrunkinuse(if_t ifp);
char *if_getlladdr(const if_t ifp);
struct vnet *if_getvnet(const if_t ifp);
-struct vnet *if_gethomevnet(const if_t ifp);
void *if_gethandle(u_char);
void if_vlancap(if_t ifp);
int if_transmit(if_t ifp, struct mbuf *m);
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index e2005aa28c5c..d2625da19cd2 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -202,6 +202,8 @@ struct iflib_ctx {
uint16_t ifc_sysctl_extra_msix_vectors;
bool ifc_cpus_are_physical_cores;
bool ifc_sysctl_simple_tx;
+ uint16_t ifc_sysctl_tx_reclaim_thresh;
+ uint16_t ifc_sysctl_tx_reclaim_ticks;
qidx_t ifc_sysctl_ntxds[8];
qidx_t ifc_sysctl_nrxds[8];
@@ -345,7 +347,9 @@ struct iflib_txq {
uint16_t ift_npending;
uint16_t ift_db_pending;
uint16_t ift_rs_pending;
- /* implicit pad */
+ uint32_t ift_last_reclaim;
+ uint16_t ift_reclaim_thresh;
+ uint16_t ift_reclaim_ticks;
uint8_t ift_txd_size[8];
uint64_t ift_processed;
uint64_t ift_cleaned;
@@ -729,7 +733,7 @@ static void iflib_free_intr_mem(if_ctx_t ctx);
#ifndef __NO_STRICT_ALIGNMENT
static struct mbuf *iflib_fixup_rx(struct mbuf *m);
#endif
-static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh);
+static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq);
static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
SLIST_HEAD_INITIALIZER(cpu_offsets);
@@ -3084,8 +3088,6 @@ txq_max_rs_deferred(iflib_txq_t txq)
#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
-/* XXX we should be setting this to something other than zero */
-#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
#define MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
(ctx)->ifc_softc_ctx.isc_tx_nsegments)
@@ -3642,7 +3644,7 @@ defrag:
* cxgb
*/
if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
- (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+ (void)iflib_completed_tx_reclaim(txq);
if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
txq->ift_no_desc_avail++;
bus_dmamap_unload(buf_tag, map);
@@ -3785,14 +3787,21 @@ iflib_tx_desc_free(iflib_txq_t txq, int n)
}
static __inline int
-iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
+iflib_completed_tx_reclaim(iflib_txq_t txq)
{
- int reclaim;
+ int reclaim, thresh;
+ uint32_t now;
if_ctx_t ctx = txq->ift_ctx;
+ thresh = txq->ift_reclaim_thresh;
KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
+ now = ticks;
+ if (now <= (txq->ift_last_reclaim + txq->ift_reclaim_ticks) &&
+ txq->ift_in_use < thresh)
+ return (0);
+ txq->ift_last_reclaim = now;
/*
* Need a rate-limiting check so that this isn't called every time
*/
@@ -3873,7 +3882,7 @@ iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
DBG_COUNTER_INC(txq_drain_notready);
return (0);
}
- reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+ reclaimed = iflib_completed_tx_reclaim(txq);
rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
avail = IDXDIFF(pidx, cidx, r->size);
@@ -3952,7 +3961,7 @@ iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
}
/* deliberate use of bitwise or to avoid gratuitous short-circuit */
- ring = rang ? false : (iflib_min_tx_latency | err);
+ ring = rang ? false : (iflib_min_tx_latency | err | (!!txq->ift_reclaim_thresh));
iflib_txd_db_check(txq, ring);
if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
@@ -4032,7 +4041,7 @@ _task_fn_tx(void *context)
#endif
if (ctx->ifc_sysctl_simple_tx) {
mtx_lock(&txq->ift_mtx);
- (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+ (void)iflib_completed_tx_reclaim(txq);
mtx_unlock(&txq->ift_mtx);
goto skip_ifmp;
}
@@ -5883,6 +5892,7 @@ iflib_queues_alloc(if_ctx_t ctx)
device_printf(dev, "Unable to allocate buf_ring\n");
goto err_tx_desc;
}
+ txq->ift_reclaim_thresh = ctx->ifc_sysctl_tx_reclaim_thresh;
}
for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
@@ -6774,6 +6784,74 @@ mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
return (rc);
}
+static int
+iflib_handle_tx_reclaim_thresh(SYSCTL_HANDLER_ARGS)
+{
+ if_ctx_t ctx = (void *)arg1;
+ iflib_txq_t txq;
+ int i, err;
+ int thresh;
+
+ thresh = ctx->ifc_sysctl_tx_reclaim_thresh;
+ err = sysctl_handle_int(oidp, &thresh, arg2, req);
+ if (err != 0) {
+ return err;
+ }
+
+ if (thresh == ctx->ifc_sysctl_tx_reclaim_thresh)
+ return 0;
+
+ if (thresh > ctx->ifc_softc_ctx.isc_ntxd[0] / 2) {
+ device_printf(ctx->ifc_dev, "TX Reclaim thresh must be <= %d\n",
+ ctx->ifc_softc_ctx.isc_ntxd[0] / 2);
+ return (EINVAL);
+ }
+
+ ctx->ifc_sysctl_tx_reclaim_thresh = thresh;
+ if (ctx->ifc_txqs == NULL)
+ return (err);
+
+ txq = &ctx->ifc_txqs[0];
+ for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
+ txq->ift_reclaim_thresh = thresh;
+ }
+ return (err);
+}
+
+static int
+iflib_handle_tx_reclaim_ticks(SYSCTL_HANDLER_ARGS)
+{
+ if_ctx_t ctx = (void *)arg1;
+ iflib_txq_t txq;
+ int i, err;
+ int ticks;
+
+ ticks = ctx->ifc_sysctl_tx_reclaim_ticks;
+ err = sysctl_handle_int(oidp, &ticks, arg2, req);
+ if (err != 0) {
+ return err;
+ }
+
+ if (ticks == ctx->ifc_sysctl_tx_reclaim_ticks)
+ return 0;
+
+ if (ticks > hz) {
+ device_printf(ctx->ifc_dev,
+ "TX Reclaim ticks must be <= hz (%d)\n", hz);
+ return (EINVAL);
+ }
+
+ ctx->ifc_sysctl_tx_reclaim_ticks = ticks;
+ if (ctx->ifc_txqs == NULL)
+ return (err);
+
+ txq = &ctx->ifc_txqs[0];
+ for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
+ txq->ift_reclaim_ticks = ticks;
+ }
+ return (err);
+}
+
#define NAME_BUFLEN 32
static void
iflib_add_device_sysctl_pre(if_ctx_t ctx)
@@ -6862,6 +6940,16 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
node = ctx->ifc_sysctl_node;
child = SYSCTL_CHILDREN(node);
+ SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "tx_reclaim_thresh",
+ CTLTYPE_INT | CTLFLAG_RWTUN, ctx,
+ 0, iflib_handle_tx_reclaim_thresh, "I",
+ "Number of TX descs outstanding before reclaim is called");
+
+ SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "tx_reclaim_ticks",
+ CTLTYPE_INT | CTLFLAG_RWTUN, ctx,
+ 0, iflib_handle_tx_reclaim_ticks, "I",
+ "Number of ticks before a TX reclaim is forced");
+
if (scctx->isc_ntxqsets > 100)
qfmt = "txq%03d";
else if (scctx->isc_ntxqsets > 10)
@@ -7109,7 +7197,7 @@ iflib_debugnet_poll(if_t ifp, int count)
return (EBUSY);
txq = &ctx->ifc_txqs[0];
- (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+ (void)iflib_completed_tx_reclaim(txq);
NET_EPOCH_ENTER(et);
for (i = 0; i < scctx->isc_nrxqsets; i++)
@@ -7159,7 +7247,7 @@ iflib_simple_transmit(if_t ifp, struct mbuf *m)
else
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
}
- (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+ (void)iflib_completed_tx_reclaim(txq);
mtx_unlock(&txq->ift_mtx);
if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
diff --git a/sys/net/iflib.h b/sys/net/iflib.h
index 3817445228d0..e65c936fc4b4 100644
--- a/sys/net/iflib.h
+++ b/sys/net/iflib.h
@@ -272,7 +272,7 @@ struct if_shared_ctx {
int isc_ntxqs; /* # of tx queues per tx qset - usually 1 */
int isc_nrxqs; /* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
int __spare0__;
- int isc_tx_reclaim_thresh;
+ int __spare1__;
int isc_flags;
};
diff --git a/sys/net80211/ieee80211.c b/sys/net80211/ieee80211.c
index 2b7cf635b9f5..1299f86ebdc7 100644
--- a/sys/net80211/ieee80211.c
+++ b/sys/net80211/ieee80211.c
@@ -2689,13 +2689,18 @@ ieee80211_channel_type_char(const struct ieee80211_channel *c)
return 'f';
}
-/*
- * Determine whether the given key in the given VAP is a global key.
+/**
+ * @brief Determine whether the given key in the given VAP is a global key.
+ *
* (key index 0..3, shared between all stations on a VAP.)
*
* This is either a WEP key or a GROUP key.
*
* Note this will NOT return true if it is a IGTK key.
+ *
+ * @param vap the current VAP
+ * @param key ieee80211_key to use/check
+ * @returns true if it's a global/WEP key, false otherwise
*/
bool
ieee80211_is_key_global(const struct ieee80211vap *vap,
@@ -2705,8 +2710,23 @@ ieee80211_is_key_global(const struct ieee80211vap *vap,
key < &vap->iv_nw_keys[IEEE80211_WEP_NKID]);
}
-/*
- * Determine whether the given key in the given VAP is a unicast key.
+/**
+ * @brief Determine whether the given key in the given VAP is a unicast key.
+ *
+ * This only returns true if it's a unicast key.
+ *
+ * Note: For now net80211 only supports a single unicast key, stored in
+ * an ieee80211_node entry.
+ *
+ * Code should use this to know if it's a unicast key and then call
+ * ieee80211_crypto_get_keyid() to get the 802.11 key ID (0..3 for
+ * unicast/global keys, 4..5 for IGTK keys.) Since the unicast
+ * and global key indexes "overlap", callers will need to check
+ * both the type and id.
+ *
+ * @param vap the current VAP
+ * @param key ieee80211_key to use/check
+ * @returns true if the key is a unicast key, false if it is not
*/
bool
ieee80211_is_key_unicast(const struct ieee80211vap *vap,
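The comment above describes the intended caller pattern: classify the key first, then fetch its 802.11 key ID, since unicast and global key indexes overlap. For illustration only, a minimal sketch of such a caller (the helper name is hypothetical and not part of this change):

static void
example_log_key(struct ieee80211vap *vap, struct ieee80211_key *k)
{
	uint8_t keyid = ieee80211_crypto_get_keyid(vap, k);

	if (ieee80211_is_key_unicast(vap, k))
		printf("unicast key, keyid %u\n", keyid);
	else if (ieee80211_is_key_global(vap, k))
		printf("global/WEP key, keyid %u\n", keyid);
}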
diff --git a/sys/net80211/ieee80211_crypto.c b/sys/net80211/ieee80211_crypto.c
index 1e63ca46f28f..566f0b2e0c23 100644
--- a/sys/net80211/ieee80211_crypto.c
+++ b/sys/net80211/ieee80211_crypto.c
@@ -611,11 +611,15 @@ ieee80211_crypto_setkey(struct ieee80211vap *vap, struct ieee80211_key *key)
return dev_key_set(vap, key);
}
-/*
- * Return index if the key is a WEP key (0..3); -1 otherwise.
+/**
+ * @brief Return index if the key is a WEP key (0..3); -1 otherwise.
*
* This is different to "get_keyid" which defaults to returning
* 0 for unicast keys; it assumes that it won't be used for WEP.
+ *
+ * @param vap the current VAP
+ * @param k ieee80211_key to check
+ * @returns 0..3 if it's a global/WEP key, -1 otherwise.
*/
int
ieee80211_crypto_get_key_wepidx(const struct ieee80211vap *vap,
@@ -628,8 +632,18 @@ ieee80211_crypto_get_key_wepidx(const struct ieee80211vap *vap,
return (-1);
}
-/*
- * Note: only supports a single unicast key (0).
+/**
+ * @brief Return the index of a unicast, global or IGTK key.
+ *
+ * Return the index of a key. For unicast keys the index is 0..1.
+ * For global/WEP keys it's 0..3. For IGTK keys it's 4..5.
+ *
+ * TODO: support >1 unicast key
+ * TODO: support IGTK keys
+ *
+ * @param vap the current VAP
+ * @param k ieee80211_key to check
+ * @returns 0..3 for a WEP/global key, 0..1 for unicast key, 4..5 for IGTK key
*/
uint8_t
ieee80211_crypto_get_keyid(struct ieee80211vap *vap, struct ieee80211_key *k)
@@ -641,6 +655,19 @@ ieee80211_crypto_get_keyid(struct ieee80211vap *vap, struct ieee80211_key *k)
return (0);
}
+/**
+ * @brief Return the key to use for encrypting an mbuf frame to a node.
+ *
+ * This routine chooses a suitable key to encrypt the given frame with.
+ * It doesn't do the encryption; it only chooses the key. If a key is not
+ * available then the routine will return NULL.
+ *
+ * It's up to the caller to enforce whether a key is absolutely required or not.
+ *
+ * @param ni The ieee80211_node to send the frame to
+ * @param m the mbuf to encrypt
+ * @returns the ieee80211_key to encrypt with, or NULL if there's no suitable key
+ */
struct ieee80211_key *
ieee80211_crypto_get_txkey(struct ieee80211_node *ni, struct mbuf *m)
{
@@ -676,8 +703,28 @@ ieee80211_crypto_get_txkey(struct ieee80211_node *ni, struct mbuf *m)
return &ni->ni_ucastkey;
}
-/*
- * Add privacy headers appropriate for the specified key.
+/**
+ * @brief Privacy encapsulate and encrypt the given mbuf.
+ *
+ * This routine handles the mechanics of encryption - expanding the
+ * mbuf to add privacy headers, IV, ICV, MIC, MMIC, and then encrypts
+ * the given mbuf if required.
+ *
+ * This should be called by the driver in its TX path as part of
+ * encapsulation before passing frames to the hardware/firmware
+ * queues.
+ *
+ * Drivers/hardware which implement an entirely offloaded encryption
+ * path should still call this for completeness - it indicates to the
+ * driver that the frame itself should be encrypted.
+ *
+ * The driver should have set capability bits in the attach /
+ * key allocation path to disable various encapsulation/encryption
+ * features.
+ *
+ * @param ni ieee80211_node for this frame
+ * @param m mbuf to modify
+ * @returns the key used if the frame is to be encrypted, NULL otherwise
*/
struct ieee80211_key *
ieee80211_crypto_encap(struct ieee80211_node *ni, struct mbuf *m)
@@ -693,9 +740,31 @@ ieee80211_crypto_encap(struct ieee80211_node *ni, struct mbuf *m)
return NULL;
}
-/*
- * Validate and strip privacy headers (and trailer) for a
- * received frame that has the WEP/Privacy bit set.
+/**
+ * @brief Decapsulate and validate an encrypted frame.
+ *
+ * This handles an encrypted frame (one with the privacy bit set.)
+ * It also obeys the key / config / receive packet flags for how
+ * the driver says its already been processed.
+ *
+ * Unlike ieee80211_crypto_encap(), this isn't called by the driver.
+ * Instead, drivers pass up the potentially decrypted frame - fully,
+ * partially, or not at all - and net80211 will call this as appropriate.
+ *
+ * This handles NICs (like ath(4)) which leave a variable-sized gap
+ * between the 802.11 header and 802.11 payload due to DMA alignment /
+ * encryption engine concerns.
+ *
+ * If the frame was decrypted and validated successfully then 1 is returned
+ * and the mbuf can be treated as an 802.11 frame. If it is not decrypted
+ * successfully or it was decrypted but failed validation/checks, then
+ * 0 is returned.
+ *
+ * @param ni ieee80211_node for received frame
+ * @param m mbuf frame to receive
+ * @param hdrlen length of the 802.11 header, including trailing null bytes
+ * @param key pointer to ieee80211_key that will be set if appropriate
+ * @returns 0 if the frame wasn't decrypted/validated, 1 if decrypted/validated.
*/
int
ieee80211_crypto_decap(struct ieee80211_node *ni, struct mbuf *m, int hdrlen,
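On the encap side, the comments above imply that a driver TX path should ask net80211 to encapsulate/encrypt the frame before queueing it to hardware, and drop the frame when no key can be found. A minimal sketch under those assumptions (hypothetical driver helper, not part of this change):

static int
example_tx_encap(struct ieee80211_node *ni, struct mbuf *m)
{
	const struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
	struct ieee80211_key *k;

	if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) {
		k = ieee80211_crypto_encap(ni, m);
		if (k == NULL) {
			/* No suitable key; the frame cannot be sent. */
			m_freem(m);
			return (ENOBUFS);
		}
	}
	/* ... hand the (possibly encrypted) frame to the hardware queue ... */
	return (0);
}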
diff --git a/sys/netgraph/netflow/netflow.c b/sys/netgraph/netflow/netflow.c
index 978d6fd0b54d..05c6062463be 100644
--- a/sys/netgraph/netflow/netflow.c
+++ b/sys/netgraph/netflow/netflow.c
@@ -960,7 +960,7 @@ struct ngnf_show_header *resp)
list_id = 0;
TAILQ_FOREACH(fle, &hsh->head, fle_hash) {
- if (hsh->mtx.mtx_lock & MTX_CONTESTED) {
+ if (hsh->mtx.mtx_lock & MTX_WAITERS) {
resp->hash_id = i;
resp->list_id = list_id;
mtx_unlock(&hsh->mtx);
@@ -1111,7 +1111,7 @@ ng_netflow_expire(void *arg)
* Interrupt thread wants this entry!
* Quick! Quick! Bail out!
*/
- if (hsh->mtx.mtx_lock & MTX_CONTESTED)
+ if (hsh->mtx.mtx_lock & MTX_WAITERS)
break;
/*
@@ -1150,7 +1150,7 @@ ng_netflow_expire(void *arg)
* Interrupt thread wants this entry!
* Quick! Quick! Bail out!
*/
- if (hsh->mtx.mtx_lock & MTX_CONTESTED)
+ if (hsh->mtx.mtx_lock & MTX_WAITERS)
break;
/*
diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c
index d3d7957cf087..4f553b9aac5e 100644
--- a/sys/netinet/ip_carp.c
+++ b/sys/netinet/ip_carp.c
@@ -1640,18 +1640,31 @@ carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
static void
carp_send_na(struct carp_softc *sc)
{
- static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
struct ifaddr *ifa;
- struct in6_addr *in6;
+ int flags;
+ /*
+ * Sending Unsolicited Neighbor Advertisements
+ *
+ * If the node is a router, we MUST set the Router flag to one.
+ * We set the Override flag to one and include the target link-layer
+ * address option so that neighboring nodes will install the new
+ * link-layer address.
+ */
+ flags = ND_NA_FLAG_OVERRIDE;
+ if (V_ip6_forwarding)
+ flags |= ND_NA_FLAG_ROUTER;
CARP_FOREACH_IFA(sc, ifa) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
-
- in6 = IFA_IN6(ifa);
- nd6_na_output(sc->sc_carpdev, &mcast, in6,
- ND_NA_FLAG_OVERRIDE, 1, NULL);
- DELAY(1000); /* XXX */
+ /*
+ * We use the unspecified address as the destination here to avoid
+ * scope initialization for each call.
+ * nd6_na_output() will use the all-nodes multicast address if the
+ * destination address is unspecified.
+ */
+ nd6_na_output(sc->sc_carpdev, &in6addr_any, IFA_IN6(ifa),
+ flags, ND6_NA_OPT_LLA | ND6_NA_CARP_MASTER, NULL);
+ DELAY(1000); /* RetransTimer */
}
}
diff --git a/sys/netinet/sctp_lock_bsd.h b/sys/netinet/sctp_lock_bsd.h
index ec66be0cf371..a60983cb30e3 100644
--- a/sys/netinet/sctp_lock_bsd.h
+++ b/sys/netinet/sctp_lock_bsd.h
@@ -263,10 +263,10 @@
} while (0)
#define SCTP_INP_LOCK_CONTENDED(_inp) \
- ((_inp)->inp_mtx.mtx_lock & MTX_CONTESTED)
+ ((_inp)->inp_mtx.mtx_lock & MTX_WAITERS)
#define SCTP_INP_READ_CONTENDED(_inp) \
- ((_inp)->inp_rdata_mtx.mtx_lock & MTX_CONTESTED)
+ ((_inp)->inp_rdata_mtx.mtx_lock & MTX_WAITERS)
#ifdef SCTP_LOCK_LOGGING
#define SCTP_INP_RLOCK(_inp) do { \
@@ -337,7 +337,7 @@
} while (0)
#define SCTP_ASOC_CREATE_LOCK_CONTENDED(_inp) \
- ((_inp)->inp_create_mtx.mtx_lock & MTX_CONTESTED)
+ ((_inp)->inp_create_mtx.mtx_lock & MTX_WAITERS)
/*
* For the majority of things (once we have found the association) we will
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 1ee6c6e31f33..f842a5678fa1 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -122,6 +122,7 @@ static void syncache_drop(struct syncache *, struct syncache_head *);
static void syncache_free(struct syncache *);
static void syncache_insert(struct syncache *, struct syncache_head *);
static int syncache_respond(struct syncache *, const struct mbuf *, int);
+static void syncache_send_challenge_ack(struct syncache *, struct mbuf *);
static struct socket *syncache_socket(struct syncache *, struct socket *,
struct mbuf *m);
static void syncache_timeout(struct syncache *sc, struct syncache_head *sch,
@@ -694,13 +695,7 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m,
"sending challenge ACK\n",
s, __func__,
th->th_seq, sc->sc_irs + 1, sc->sc_wnd);
- if (syncache_respond(sc, m, TH_ACK) == 0) {
- TCPSTAT_INC(tcps_sndacks);
- TCPSTAT_INC(tcps_sndtotal);
- } else {
- syncache_drop(sc, sch);
- TCPSTAT_INC(tcps_sc_dropped);
- }
+ syncache_send_challenge_ack(sc, m);
}
} else {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
@@ -963,6 +958,10 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
if (sc->sc_rxmits > 1)
tp->snd_cwnd = 1;
+ /* Copy over the challenge ACK state. */
+ tp->t_challenge_ack_end = sc->sc_challenge_ack_end;
+ tp->t_challenge_ack_cnt = sc->sc_challenge_ack_cnt;
+
#ifdef TCP_OFFLOAD
/*
* Allow a TOE driver to install its hooks. Note that we hold the
@@ -1202,7 +1201,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
*/
if (sc->sc_flags & SCF_TIMESTAMP && to->to_flags & TOF_TS &&
TSTMP_LT(to->to_tsval, sc->sc_tsreflect)) {
- SCH_UNLOCK(sch);
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
log(LOG_DEBUG,
"%s; %s: SEG.TSval %u < TS.Recent %u, "
@@ -1210,6 +1208,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
to->to_tsval, sc->sc_tsreflect);
free(s, M_TCPLOG);
}
+ SCH_UNLOCK(sch);
return (-1); /* Do not send RST */
}
@@ -1258,6 +1257,38 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
return (-1); /* Do not send RST */
}
}
+
+ /*
+ * SEG.SEQ validation:
+ * The SEG.SEQ must be in the window starting at our
+ * initial receive sequence number + 1.
+ */
+ if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+ SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, "
+ "sending challenge ACK\n",
+ s, __func__, th->th_seq, sc->sc_irs + 1);
+ syncache_send_challenge_ack(sc, m);
+ SCH_UNLOCK(sch);
+ free(s, M_TCPLOG);
+ return (-1); /* Do not send RST */
+ }
+
+ /*
+ * SEG.ACK validation:
+ * SEG.ACK must match our initial send sequence number + 1.
+ */
+ if (th->th_ack != sc->sc_iss + 1) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
+ log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, "
+ "segment rejected\n",
+ s, __func__, th->th_ack, sc->sc_iss + 1);
+ SCH_UNLOCK(sch);
+ free(s, M_TCPLOG);
+ return (0); /* Do send RST, do not free sc. */
+ }
+
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
#ifdef TCP_OFFLOAD
@@ -1270,29 +1301,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
SCH_UNLOCK(sch);
}
- /*
- * Segment validation:
- * ACK must match our initial sequence number + 1 (the SYN|ACK).
- */
- if (th->th_ack != sc->sc_iss + 1) {
- if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
- log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
- "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
- goto failed;
- }
-
- /*
- * The SEQ must fall in the window starting at the received
- * initial receive sequence number + 1 (the SYN).
- */
- if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
- SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
- if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
- log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
- "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
- goto failed;
- }
-
*lsop = syncache_socket(sc, *lsop, m);
if (__predict_false(*lsop == NULL)) {
@@ -2053,6 +2061,18 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags)
return (error);
}
+static void
+syncache_send_challenge_ack(struct syncache *sc, struct mbuf *m)
+{
+ if (tcp_challenge_ack_check(&sc->sc_challenge_ack_end,
+ &sc->sc_challenge_ack_cnt)) {
+ if (syncache_respond(sc, m, TH_ACK) == 0) {
+ TCPSTAT_INC(tcps_sndacks);
+ TCPSTAT_INC(tcps_sndtotal);
+ }
+ }
+}
+
/*
* The purpose of syncookies is to handle spoofed SYN flooding DoS attacks
* that exceed the capacity of the syncache by avoiding the storage of any
diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h
index a336571f12c9..37f6ff3d6ca9 100644
--- a/sys/netinet/tcp_syncache.h
+++ b/sys/netinet/tcp_syncache.h
@@ -67,6 +67,8 @@ struct syncache {
u_int8_t sc_requested_s_scale:4,
sc_requested_r_scale:4;
u_int16_t sc_flags;
+ u_int32_t sc_challenge_ack_cnt; /* chall. ACKs sent in epoch */
+ sbintime_t sc_challenge_ack_end; /* End of chall. ack epoch */
#if defined(TCP_OFFLOAD)
struct toedev *sc_tod; /* entry added by this TOE */
void *sc_todctx; /* TOE driver context */
diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c
index 8ef755e2dc0a..b98703bdfbfe 100644
--- a/sys/netinet6/in6.c
+++ b/sys/netinet6/in6.c
@@ -1295,8 +1295,8 @@ in6_addifaddr(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *i
*/
bzero(&pr0, sizeof(pr0));
pr0.ndpr_ifp = ifp;
- pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
- NULL);
+ pr0.ndpr_plen = ia->ia_plen =
+ in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL);
if (pr0.ndpr_plen == 128) {
/* we don't need to install a host route. */
goto aifaddr_out;
@@ -1490,16 +1490,16 @@ in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
* positive reference.
*/
remove_lle = 0;
- if (ia->ia6_ndpr == NULL) {
- nd6log((LOG_NOTICE,
- "in6_unlink_ifa: autoconf'ed address "
- "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia))));
- } else {
+ if (ia->ia6_ndpr != NULL) {
ia->ia6_ndpr->ndpr_addrcnt--;
/* Do not delete lles within prefix if refcont != 0 */
if (ia->ia6_ndpr->ndpr_addrcnt == 0)
remove_lle = 1;
ia->ia6_ndpr = NULL;
+ } else if (ia->ia_plen < 128) {
+ nd6log((LOG_NOTICE,
+ "in6_unlink_ifa: autoconf'ed address "
+ "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia))));
}
nd6_rem_ifa_lle(ia, remove_lle);
@@ -2604,8 +2604,6 @@ in6_domifattach(struct ifnet *ifp)
COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat,
sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK);
- ext->dad_failures = counter_u64_alloc(M_WAITOK);
-
ext->nd_ifinfo = nd6_ifattach(ifp);
ext->scope6_id = scope6_ifattach(ifp);
ext->lltable = in6_lltattach(ifp);
@@ -2641,7 +2639,6 @@ in6_domifdetach(struct ifnet *ifp, void *aux)
COUNTER_ARRAY_FREE(ext->icmp6_ifstat,
sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
free(ext->icmp6_ifstat, M_IFADDR);
- counter_u64_free(ext->dad_failures);
free(ext, M_IFADDR);
}
diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c
index 4fde346fb691..090ba610460b 100644
--- a/sys/netinet6/in6_ifattach.c
+++ b/sys/netinet6/in6_ifattach.c
@@ -44,7 +44,6 @@
#include <sys/rmlock.h>
#include <sys/syslog.h>
#include <sys/md5.h>
-#include <crypto/sha2/sha256.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -72,6 +71,9 @@
#include <netinet6/mld6_var.h>
#include <netinet6/scope6_var.h>
+#include <crypto/sha2/sha256.h>
+#include <machine/atomic.h>
+
#ifdef IP6_AUTO_LINKLOCAL
VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL;
#else
@@ -377,7 +379,7 @@ in6_get_stableifid(struct ifnet *ifp, struct in6_addr *in6, int prefixlen)
}
hostuuid_len = strlen(hostuuid);
- dad_failures = counter_u64_fetch(DAD_FAILURES(ifp));
+ dad_failures = atomic_load_int(&DAD_FAILURES(ifp));
/*
* RFC 7217 section 7
diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c
index 6669a2ba56ce..f567b42b42ca 100644
--- a/sys/netinet6/in6_proto.c
+++ b/sys/netinet6/in6_proto.c
@@ -167,7 +167,7 @@ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix
* walk list every 5 sec. */
VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */
VNET_DEFINE(int, ip6_v6only) = 1;
-VNET_DEFINE(int, ip6_stableaddr_maxretries) = IP6_IDGEN_RETRIES;
+VNET_DEFINE(u_int, ip6_stableaddr_maxretries) = IP6_IDGEN_RETRIES;
#ifdef IPSTEALTH
VNET_DEFINE(int, ip6stealth) = 0;
@@ -317,7 +317,7 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr,
SYSCTL_BOOL(_net_inet6_ip6, IPV6CTL_USESTABLEADDR, use_stableaddr,
CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(ip6_use_stableaddr), 0,
"Create RFC7217 semantically opaque address for autoconfigured addresses (default for new interfaces)");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STABLEADDR_MAXRETRIES, stableaddr_maxretries,
+SYSCTL_UINT(_net_inet6_ip6, IPV6CTL_STABLEADDR_MAXRETRIES, stableaddr_maxretries,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_stableaddr_maxretries), IP6_IDGEN_RETRIES,
"RFC7217 semantically opaque address DAD max retries");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STABLEADDR_NETIFSRC, stableaddr_netifsource,
diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c
index dd6864482b3c..3e55c6e5fc05 100644
--- a/sys/netinet6/in6_src.c
+++ b/sys/netinet6/in6_src.c
@@ -132,8 +132,8 @@ static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct ifnet **,
struct ifnet *, u_int);
static int in6_selectsrc(uint32_t, struct sockaddr_in6 *,
- struct ip6_pktopts *, struct inpcb *, struct ucred *,
- struct ifnet **, struct in6_addr *);
+ struct ip6_pktopts *, struct ip6_moptions *, struct inpcb *,
+ struct ucred *, struct ifnet **, struct in6_addr *);
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
@@ -173,8 +173,8 @@ static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
static int
in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
- struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred,
- struct ifnet **ifpp, struct in6_addr *srcp)
+ struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct inpcb *inp,
+ struct ucred *cred, struct ifnet **ifpp, struct in6_addr *srcp)
{
struct rm_priotracker in6_ifa_tracker;
struct in6_addr dst, tmp;
@@ -186,7 +186,6 @@ in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
u_int32_t odstzone;
int prefer_tempaddr;
int error;
- struct ip6_moptions *mopts;
NET_EPOCH_ASSERT();
KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__));
@@ -205,13 +204,6 @@ in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
*ifpp = NULL;
}
- if (inp != NULL) {
- INP_LOCK_ASSERT(inp);
- mopts = inp->in6p_moptions;
- } else {
- mopts = NULL;
- }
-
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
@@ -552,10 +544,13 @@ in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
uint32_t fibnum;
int error;
+ INP_LOCK_ASSERT(inp);
+
fibnum = inp->inp_inc.inc_fibnum;
retifp = NULL;
- error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp);
+ error = in6_selectsrc(fibnum, dstsock, opts, inp->in6p_moptions,
+ inp, cred, &retifp, srcp);
if (error != 0)
return (error);
@@ -583,7 +578,7 @@ in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
* Stores selected address to @srcp.
* Returns 0 on success.
*
- * Used by non-socket based consumers (ND code mostly)
+ * Used by non-socket based consumers
*/
int
in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst,
@@ -602,13 +597,42 @@ in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst,
dst_sa.sin6_scope_id = scopeid;
sa6_embedscope(&dst_sa, 0);
- error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp);
+ error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL,
+ NULL, NULL, &retifp, srcp);
if (hlim != NULL)
*hlim = in6_selecthlim(NULL, retifp);
return (error);
}
+/*
+ * Select source address based on @fibnum, @dst and @mopts.
+ * Stores selected address to @srcp.
+ * Returns 0 on success.
+ *
+ * Used by non-socket based consumers (ND code mostly)
+ */
+int
+in6_selectsrc_nbr(uint32_t fibnum, const struct in6_addr *dst,
+ struct ip6_moptions *mopts, struct ifnet *ifp, struct in6_addr *srcp)
+{
+ struct sockaddr_in6 dst_sa;
+ struct ifnet *retifp;
+ int error;
+
+ retifp = ifp;
+ bzero(&dst_sa, sizeof(dst_sa));
+ dst_sa.sin6_family = AF_INET6;
+ dst_sa.sin6_len = sizeof(dst_sa);
+ dst_sa.sin6_addr = *dst;
+ dst_sa.sin6_scope_id = ntohs(in6_getscope(dst));
+ sa6_embedscope(&dst_sa, 0);
+
+ error = in6_selectsrc(fibnum, &dst_sa, NULL, mopts,
+ NULL, NULL, &retifp, srcp);
+ return (error);
+}
+
static struct nhop_object *
cache_route(uint32_t fibnum, const struct sockaddr_in6 *dst, struct route_in6 *ro,
uint32_t flowid)
diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h
index e511ead24f08..1414cc71388d 100644
--- a/sys/netinet6/in6_var.h
+++ b/sys/netinet6/in6_var.h
@@ -106,7 +106,7 @@ struct in6_ifextra {
struct scope6_id *scope6_id;
struct lltable *lltable;
struct mld_ifsoftc *mld_ifinfo;
- counter_u64_t dad_failures; /* DAD failures when using RFC 7217 stable addresses */
+ u_int dad_failures; /* DAD failures when using RFC 7217 stable addresses */
};
#define LLTABLE6(ifp) (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable)
diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h
index e1a4e8678ebb..db1631736c4a 100644
--- a/sys/netinet6/ip6_var.h
+++ b/sys/netinet6/ip6_var.h
@@ -344,7 +344,7 @@ VNET_DECLARE(bool, ip6_use_stableaddr); /* Whether to use stable address generat
#define V_ip6_use_stableaddr VNET(ip6_use_stableaddr)
#define IP6_IDGEN_RETRIES 3 /* RFC 7217 section 7 default max retries */
-VNET_DECLARE(int, ip6_stableaddr_maxretries);
+VNET_DECLARE(u_int, ip6_stableaddr_maxretries);
#define V_ip6_stableaddr_maxretries VNET(ip6_stableaddr_maxretries)
#define IP6_STABLEADDR_NETIFSRC_NAME 0
@@ -440,6 +440,8 @@ int in6_selectsrc_socket(struct sockaddr_in6 *, struct ip6_pktopts *,
struct inpcb *, struct ucred *, int, struct in6_addr *, int *);
int in6_selectsrc_addr(uint32_t, const struct in6_addr *,
uint32_t, struct ifnet *, struct in6_addr *, int *);
+int in6_selectsrc_nbr(uint32_t, const struct in6_addr *,
+ struct ip6_moptions *, struct ifnet *, struct in6_addr *);
int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
struct nhop_object **, u_int, uint32_t);
diff --git a/sys/netinet6/nd6.h b/sys/netinet6/nd6.h
index 5fe027ac5e7c..e484c709e29a 100644
--- a/sys/netinet6/nd6.h
+++ b/sys/netinet6/nd6.h
@@ -171,6 +171,10 @@ struct in6_ndifreq {
#define NDPRF_ONLINK 0x1
#define NDPRF_DETACHED 0x2
+/* ND6 NA output flags */
+#define ND6_NA_OPT_LLA 0x01
+#define ND6_NA_CARP_MASTER 0x02
+
/* protocol constants */
#define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */
#define RTR_SOLICITATION_INTERVAL 4 /* 4sec */
diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c
index cc17b4e1a402..29151b29a071 100644
--- a/sys/netinet6/nd6_nbr.c
+++ b/sys/netinet6/nd6_nbr.c
@@ -77,6 +77,8 @@
#include <netinet/ip_carp.h>
#include <netinet6/send.h>
+#include <machine/atomic.h>
+
#define SDL(s) ((struct sockaddr_dl *)s)
struct dadq;
@@ -245,10 +247,9 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len)
* In implementation, we add target link-layer address by default.
* We do not add one in MUST NOT cases.
*/
- if (!IN6_IS_ADDR_MULTICAST(&daddr6))
- tlladdr = 0;
- else
- tlladdr = 1;
+ tlladdr = 0;
+ if (IN6_IS_ADDR_MULTICAST(&daddr6))
+ tlladdr |= ND6_NA_OPT_LLA;
/*
* Target address (taddr6) must be either:
@@ -257,9 +258,11 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len)
* (3) "tentative" address on which DAD is being performed.
*/
/* (1) and (3) check. */
- if (ifp->if_carp)
+ if (ifp->if_carp) {
ifa = (*carp_iamatch6_p)(ifp, &taddr6);
- else
+ if (ifa != NULL)
+ tlladdr |= ND6_NA_CARP_MASTER;
+ } else
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
/* (2) check. */
@@ -323,32 +326,28 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len)
}
/*
+ * If the Target Address is either an anycast address or a unicast
+ * address for which the node is providing proxy service, or the Target
+ * Link-Layer Address option is not included, the Override flag SHOULD
+ * be set to zero. Otherwise, the Override flag SHOULD be set to one.
+ */
+ if (anycast == 0 && proxy == 0 && (tlladdr & ND6_NA_OPT_LLA) != 0)
+ rflag |= ND_NA_FLAG_OVERRIDE;
+ /*
* If the source address is unspecified address, entries must not
* be created or updated.
- * It looks that sender is performing DAD. Output NA toward
- * all-node multicast address, to tell the sender that I'm using
- * the address.
+ * It looks like the sender is performing DAD. nd6_na_output() will
+ * send an NA toward the all-nodes multicast address, to tell the
+ * sender that I'm using the address.
* S bit ("solicited") must be zero.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
- struct in6_addr in6_all;
-
- in6_all = in6addr_linklocal_allnodes;
- if (in6_setscope(&in6_all, ifp, NULL) != 0)
- goto bad;
- nd6_na_output_fib(ifp, &in6_all, &taddr6,
- ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
- rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL,
- M_GETFIB(m));
- goto freeit;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
+ nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
+ ND_NEIGHBOR_SOLICIT, 0);
+ rflag |= ND_NA_FLAG_SOLICITED;
}
- nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
- ND_NEIGHBOR_SOLICIT, 0);
-
- nd6_na_output_fib(ifp, &saddr6, &taddr6,
- ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
- rflag | ND_NA_FLAG_SOLICITED, tlladdr,
+ nd6_na_output_fib(ifp, &saddr6, &taddr6, rflag, tlladdr,
proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m));
freeit:
if (ifa != NULL)
@@ -440,13 +439,6 @@ nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6,
return;
M_SETFIB(m, fibnum);
- if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) {
- m->m_flags |= M_MCAST;
- im6o.im6o_multicast_ifp = ifp;
- im6o.im6o_multicast_hlim = 255;
- im6o.im6o_multicast_loop = 0;
- }
-
icmp6len = sizeof(*nd_ns);
m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len;
m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */
@@ -471,6 +463,12 @@ nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6,
if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
goto bad;
}
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
+ m->m_flags |= M_MCAST;
+ im6o.im6o_multicast_ifp = ifp;
+ im6o.im6o_multicast_hlim = 255;
+ im6o.im6o_multicast_loop = 0;
+ }
if (nonce == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
struct ifaddr *ifa = NULL;
@@ -492,20 +490,16 @@ nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6,
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, saddr6);
if (ifa == NULL) {
int error;
- struct in6_addr dst6, src6;
- uint32_t scopeid;
- in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid);
- error = in6_selectsrc_addr(fibnum, &dst6,
- scopeid, ifp, &src6, NULL);
+ error = in6_selectsrc_nbr(fibnum, &ip6->ip6_dst, &im6o,
+ ifp, &ip6->ip6_src);
if (error) {
nd6log((LOG_DEBUG, "%s: source can't be "
"determined: dst=%s, error=%d\n", __func__,
- ip6_sprintf(ip6buf, &dst6),
+ ip6_sprintf(ip6buf, &ip6->ip6_dst),
error));
goto bad;
}
- ip6->ip6_src = src6;
} else
ip6->ip6_src = *saddr6;
@@ -968,7 +962,9 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len)
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*
- * tlladdr - 1 if include target link-layer address
+ * tlladdr:
+ * - 0x01 to include the target link-layer address
+ * - 0x02 if the target address is the CARP MASTER
* sdl0 - sockaddr_dl (= proxy NA) or NULL
*/
static void
@@ -981,8 +977,7 @@ nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
struct ip6_hdr *ip6;
struct nd_neighbor_advert *nd_na;
struct ip6_moptions im6o;
- struct in6_addr daddr6, dst6, src6;
- uint32_t scopeid;
+ struct in6_addr daddr6;
NET_EPOCH_ASSERT();
@@ -1006,13 +1001,6 @@ nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
return;
M_SETFIB(m, fibnum);
- if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
- m->m_flags |= M_MCAST;
- im6o.im6o_multicast_ifp = ifp;
- im6o.im6o_multicast_hlim = 255;
- im6o.im6o_multicast_loop = 0;
- }
-
icmp6len = sizeof(*nd_na);
m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len;
m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */
@@ -1024,26 +1012,24 @@ nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
+
if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) {
/* reply to DAD */
- daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
- daddr6.s6_addr16[1] = 0;
- daddr6.s6_addr32[1] = 0;
- daddr6.s6_addr32[2] = 0;
- daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE;
+ daddr6 = in6addr_linklocal_allnodes;
if (in6_setscope(&daddr6, ifp, NULL))
goto bad;
flags &= ~ND_NA_FLAG_SOLICITED;
}
- ip6->ip6_dst = daddr6;
+ if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
+ m->m_flags |= M_MCAST;
+ im6o.im6o_multicast_ifp = ifp;
+ im6o.im6o_multicast_hlim = 255;
+ im6o.im6o_multicast_loop = 0;
+ }
- /*
- * Select a source whose scope is the same as that of the dest.
- */
- in6_splitscope(&daddr6, &dst6, &scopeid);
- error = in6_selectsrc_addr(fibnum, &dst6,
- scopeid, ifp, &src6, NULL);
+ ip6->ip6_dst = daddr6;
+ error = in6_selectsrc_nbr(fibnum, &daddr6, &im6o, ifp, &ip6->ip6_src);
if (error) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
@@ -1051,7 +1037,6 @@ nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
ip6_sprintf(ip6buf, &daddr6), error));
goto bad;
}
- ip6->ip6_src = src6;
nd_na = (struct nd_neighbor_advert *)(ip6 + 1);
nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
nd_na->nd_na_code = 0;
@@ -1059,20 +1044,24 @@ nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
in6_clearscope(&nd_na->nd_na_target); /* XXX */
/*
+ * If we respond from a CARP address, we need to prepare the MAC
+ * address for carp_output().
+ */
+ if (ifp->if_carp && (tlladdr & ND6_NA_CARP_MASTER))
+ mac = (*carp_macmatch6_p)(ifp, m, taddr6);
+ /*
* "tlladdr" indicates NS's condition for adding tlladdr or not.
* see nd6_ns_input() for details.
* Basically, if NS packet is sent to unicast/anycast addr,
* target lladdr option SHOULD NOT be included.
*/
- if (tlladdr) {
+ if (tlladdr & ND6_NA_OPT_LLA) {
/*
* sdl0 != NULL indicates proxy NA. If we do proxy, use
* lladdr in sdl0. If we are not proxying (sending NA for
* my address) use lladdr configured for the interface.
*/
if (sdl0 == NULL) {
- if (ifp->if_carp)
- mac = (*carp_macmatch6_p)(ifp, m, taddr6);
if (mac == NULL)
mac = nd6_ifptomac(ifp);
} else if (sdl0->sa_family == AF_LINK) {
@@ -1082,7 +1071,7 @@ nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
mac = LLADDR(sdl);
}
}
- if (tlladdr && mac) {
+ if ((tlladdr & ND6_NA_OPT_LLA) && mac != NULL) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1);
@@ -1473,7 +1462,7 @@ nd6_dad_timer(void *arg)
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0) {
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
if ((ND_IFINFO(ifp)->flags & ND6_IFF_STABLEADDR) && !(ia->ia6_flags & IN6_IFF_TEMPORARY))
- counter_u64_zero(DAD_FAILURES(ifp));
+ atomic_store_int(&DAD_FAILURES(ifp), 0);
}
nd6log((LOG_DEBUG,
@@ -1522,10 +1511,10 @@ nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp)
* More addresses will be generated as long as retries are not exhausted.
*/
if ((ND_IFINFO(ifp)->flags & ND6_IFF_STABLEADDR) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
- uint64_t dad_failures = counter_u64_fetch(DAD_FAILURES(ifp));
+ u_int dad_failures = atomic_load_int(&DAD_FAILURES(ifp));
if (dad_failures <= V_ip6_stableaddr_maxretries) {
- counter_u64_add(DAD_FAILURES(ifp), 1);
+ atomic_add_int(&DAD_FAILURES(ifp), 1);
/* if retries exhausted, output an informative error message */
if (dad_failures == V_ip6_stableaddr_maxretries)
log(LOG_ERR, "%s: manual intervention required, consider disabling \"stableaddr\" on the interface"
diff --git a/sys/netinet6/nd6_rtr.c b/sys/netinet6/nd6_rtr.c
index 78dc55dd292f..10f0342f2bc4 100644
--- a/sys/netinet6/nd6_rtr.c
+++ b/sys/netinet6/nd6_rtr.c
@@ -74,6 +74,8 @@
#include <netinet/icmp6.h>
#include <netinet6/scope6_var.h>
+#include <machine/atomic.h>
+
static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *);
static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *,
struct mbuf *, int);
@@ -1243,9 +1245,8 @@ in6_ifadd(struct nd_prefixctl *pr, int mcast)
/* No suitable LL address, get the ifid directly */
if (ifid_addr == NULL) {
- struct in6_addr taddr;
- ifa = ifa_alloc(sizeof(taddr), M_WAITOK);
- if (ifa) {
+ ifa = ifa_alloc(sizeof(struct in6_ifaddr), M_NOWAIT);
+ if (ifa != NULL) {
ib = (struct in6_ifaddr *)ifa;
ifid_addr = &ib->ia_addr.sin6_addr;
if(in6_get_ifid(ifp, NULL, ifid_addr) != 0) {
@@ -1757,7 +1758,7 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr,
* to fail and no further retries should happen.
*/
if (ND_IFINFO(ifp)->flags & ND6_IFF_STABLEADDR &&
- counter_u64_fetch(DAD_FAILURES(ifp)) <= V_ip6_stableaddr_maxretries &&
+ atomic_load_int(&DAD_FAILURES(ifp)) <= V_ip6_stableaddr_maxretries &&
ifa6->ia6_flags & (IN6_IFF_DUPLICATED | IN6_IFF_TEMPORARY))
continue;
diff --git a/sys/netlink/netlink_snl.h b/sys/netlink/netlink_snl.h
index 6dd8a9cbdb35..57f7e1e29d08 100644
--- a/sys/netlink/netlink_snl.h
+++ b/sys/netlink/netlink_snl.h
@@ -1068,14 +1068,14 @@ snl_init_writer(struct snl_state *ss, struct snl_writer *nw)
{
nw->size = SNL_WRITER_BUFFER_SIZE;
nw->base = (char *)snl_allocz(ss, nw->size);
- if (nw->base == NULL) {
+ if (__predict_false(nw->base == NULL)) {
nw->error = true;
nw->size = 0;
- }
+ } else
+ nw->error = false;
nw->offset = 0;
nw->hdr = NULL;
- nw->error = false;
nw->ss = ss;
}
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 2c6d62078e6a..d6fc24a23fe9 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -3364,7 +3364,7 @@ pf_change_ap(struct pf_pdesc *pd, struct pf_addr *a, u_int16_t *p,
u_int16_t po;
uint8_t u = pd->virtual_proto == IPPROTO_UDP;
- MPASS(pd->pcksum);
+ MPASS(pd->pcksum != NULL);
if (pd->af == AF_INET) {
MPASS(pd->ip_sum);
}
@@ -5965,37 +5965,42 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
ctx.nat_pool = &(ctx.nr->rdr);
}
- ruleset = &pf_main_ruleset;
- rv = pf_match_rule(&ctx, ruleset, match_rules);
- if (rv == PF_TEST_FAIL) {
- /*
- * Reason has been set in pf_match_rule() already.
- */
- goto cleanup;
- }
-
- r = *ctx.rm; /* matching rule */
- ctx.a = *ctx.am; /* rule that defines an anchor containing 'r' */
- ruleset = *ctx.rsm; /* ruleset of the anchor defined by the rule 'a' */
- ctx.aruleset = ctx.arsm; /* ruleset of the 'a' rule itself */
+ if (ctx.nr && ctx.nr->natpass) {
+ r = ctx.nr;
+ ruleset = *ctx.rsm;
+ } else {
+ ruleset = &pf_main_ruleset;
+ rv = pf_match_rule(&ctx, ruleset, match_rules);
+ if (rv == PF_TEST_FAIL) {
+ /*
+ * Reason has been set in pf_match_rule() already.
+ */
+ goto cleanup;
+ }
- REASON_SET(&ctx.reason, PFRES_MATCH);
+ r = *ctx.rm; /* matching rule */
+ ctx.a = *ctx.am; /* rule that defines an anchor containing 'r' */
+ ruleset = *ctx.rsm; /* ruleset of the anchor defined by the rule 'a' */
+ ctx.aruleset = ctx.arsm; /* ruleset of the 'a' rule itself */
- /* apply actions for last matching pass/block rule */
- pf_rule_to_actions(r, &pd->act);
- transerror = pf_rule_apply_nat(&ctx, r);
- switch (transerror) {
- case PFRES_MATCH:
- /* Translation action found in rule and applied successfully */
- case PFRES_MAX:
- /* No translation action found in rule */
- break;
- default:
- /* Translation action found in rule but failed to apply */
- REASON_SET(&ctx.reason, transerror);
- goto cleanup;
+ /* apply actions for last matching pass/block rule */
+ pf_rule_to_actions(r, &pd->act);
+ transerror = pf_rule_apply_nat(&ctx, r);
+ switch (transerror) {
+ case PFRES_MATCH:
+ /* Translation action found in rule and applied successfully */
+ case PFRES_MAX:
+ /* No translation action found in rule */
+ break;
+ default:
+ /* Translation action found in rule but failed to apply */
+ REASON_SET(&ctx.reason, transerror);
+ goto cleanup;
+ }
}
+ REASON_SET(&ctx.reason, PFRES_MATCH);
+
if (r->log) {
if (ctx.rewrite)
m_copyback(pd->m, pd->off, pd->hdrlen, pd->hdr.any);
@@ -7634,6 +7639,7 @@ again:
nj->pd.m = j->pd.m;
nj->op = j->op;
+ MPASS(nj->pd.pcksum);
TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, nj, next);
}
PF_SCTP_ENDPOINTS_UNLOCK();
@@ -7753,6 +7759,7 @@ pf_multihome_scan(int start, int len, struct pf_pdesc *pd, int op)
job->pd.m = pd->m;
job->op = op;
+ MPASS(job->pd.pcksum);
TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, job, next);
break;
}
@@ -7786,6 +7793,7 @@ pf_multihome_scan(int start, int len, struct pf_pdesc *pd, int op)
job->pd.m = pd->m;
job->op = op;
+ MPASS(job->pd.pcksum);
TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, job, next);
break;
}
@@ -10443,28 +10451,28 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
__func__);
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
h = mtod(pd->m, struct ip *);
if (pd->m->m_pkthdr.len < ntohs(h->ip_len)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
if (pf_normalize_ip(reason, pd) != PF_PASS) {
/* We do IP header normalization and packet reassembly here */
*m0 = pd->m;
*action = PF_DROP;
- return (-1);
+ return (PF_DROP);
}
*m0 = pd->m;
h = mtod(pd->m, struct ip *);
if (pf_walk_header(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
- return (-1);
+ return (PF_DROP);
}
pd->src = (struct pf_addr *)&h->ip_src;
@@ -10494,7 +10502,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
", pullup failed", __func__);
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
h = mtod(pd->m, struct ip6_hdr *);
@@ -10502,7 +10510,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
/*
@@ -10511,12 +10519,12 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
*/
if (htons(h->ip6_plen) == 0) {
*action = PF_DROP;
- return (-1);
+ return (PF_DROP);
}
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
- return (-1);
+ return (PF_DROP);
}
h = mtod(pd->m, struct ip6_hdr *);
@@ -10538,13 +10546,13 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
PF_PASS) {
*m0 = pd->m;
*action = PF_DROP;
- return (-1);
+ return (PF_DROP);
}
*m0 = pd->m;
if (pd->m == NULL) {
/* packet sits in reassembly queue, no error */
*action = PF_PASS;
- return (-1);
+ return (PF_DROP);
}
/* Update pointers into the packet. */
@@ -10556,7 +10564,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
- return (-1);
+ return (PF_DROP);
}
if (m_tag_find(pd->m, PACKET_TAG_PF_REASSEMBLED, NULL) != NULL) {
@@ -10586,7 +10594,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
reason, af)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
pd->hdrlen = sizeof(*th);
pd->p_len = pd->tot_len - pd->off - (th->th_off << 2);
@@ -10602,7 +10610,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
reason, af)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
pd->hdrlen = sizeof(*uh);
if (uh->uh_dport == 0 ||
@@ -10610,7 +10618,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
ntohs(uh->uh_ulen) < sizeof(struct udphdr)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
pd->sport = &uh->uh_sport;
pd->dport = &uh->uh_dport;
@@ -10622,7 +10630,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
reason, af)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
pd->hdrlen = sizeof(pd->hdr.sctp);
pd->p_len = pd->tot_len - pd->off;
@@ -10632,19 +10640,23 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
if (pd->hdr.sctp.src_port == 0 || pd->hdr.sctp.dest_port == 0) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
- }
- if (pf_scan_sctp(pd) != PF_PASS) {
- *action = PF_DROP;
- REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
+
/*
* Placeholder. The SCTP checksum is 32-bits, but
* pf_test_state() expects to update a 16-bit checksum.
* Provide a dummy value which we'll subsequently ignore.
+ * Do this before pf_scan_sctp() so any jobs we enqueue
+ * have a pcksum set.
*/
pd->pcksum = &pd->sctp_dummy_sum;
+
+ if (pf_scan_sctp(pd) != PF_PASS) {
+ *action = PF_DROP;
+ REASON_SET(reason, PFRES_SHORT);
+ return (PF_DROP);
+ }
break;
}
case IPPROTO_ICMP: {
@@ -10652,7 +10664,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
reason, af)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
pd->pcksum = &pd->hdr.icmp.icmp_cksum;
pd->hdrlen = ICMP_MINLEN;
@@ -10666,7 +10678,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
reason, af)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
/* ICMP headers we look further into to match state */
switch (pd->hdr.icmp6.icmp6_type) {
@@ -10692,7 +10704,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
reason, af)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
- return (-1);
+ return (PF_DROP);
}
pd->hdrlen = icmp_hlen;
pd->pcksum = &pd->hdr.icmp6.icmp6_cksum;
@@ -10715,7 +10727,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
MPASS(pd->pcksum != NULL);
- return (0);
+ return (PF_PASS);
}
static __inline void
@@ -10977,7 +10989,7 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
PF_RULES_RLOCK();
if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason,
- kif, default_actions) == -1) {
+ kif, default_actions) != PF_PASS) {
if (action != PF_PASS)
pd.act.log |= PF_LOG_FORCE;
goto done;
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index b8b5157c9b15..fb1b121d0bc0 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -73,7 +73,7 @@ VNET_DEFINE_STATIC(int, pf_rdr_srcport_rewrite_tries) = 16;
static uint64_t pf_hash(struct pf_addr *, struct pf_addr *,
struct pf_poolhashkey *, sa_family_t);
-struct pf_krule *pf_match_translation(int, struct pf_test_ctx *);
+static struct pf_krule *pf_match_translation(int, struct pf_test_ctx *);
static enum pf_test_status pf_step_into_translation_anchor(int, struct pf_test_ctx *,
struct pf_krule *);
static int pf_get_sport(struct pf_pdesc *, struct pf_krule *,
@@ -273,7 +273,7 @@ pf_step_into_translation_anchor(int rs_num, struct pf_test_ctx *ctx, struct pf_k
return (rv);
}
-struct pf_krule *
+static struct pf_krule *
pf_match_translation(int rs_num, struct pf_test_ctx *ctx)
{
enum pf_test_status rv;
diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h
index c0d9811dd1b9..29a16b393b52 100644
--- a/sys/sys/eventhandler.h
+++ b/sys/sys/eventhandler.h
@@ -33,6 +33,7 @@
#include <sys/lock.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
+#include <sys/power.h>
#include <sys/queue.h>
#ifdef VIMAGE
@@ -201,7 +202,7 @@ EVENTHANDLER_DECLARE(shutdown_post_sync, shutdown_fn); /* after fs sync */
EVENTHANDLER_DECLARE(shutdown_final, shutdown_fn);
/* Power state change events */
-typedef void (*power_change_fn)(void *);
+typedef void (*power_change_fn)(void *, enum power_stype stype);
EVENTHANDLER_DECLARE(power_resume, power_change_fn);
EVENTHANDLER_DECLARE(power_suspend, power_change_fn);
EVENTHANDLER_DECLARE(power_suspend_early, power_change_fn);
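Since power_change_fn now carries an enum power_stype, consumers of power_suspend/power_resume need their callbacks grown to the two-argument form. A minimal sketch of an adapted consumer (struct example_softc, its fields and the attach routine are hypothetical, shown only to illustrate the registration pattern):

struct example_softc {
	device_t		ex_dev;
	eventhandler_tag	ex_susp_eh;
};

static void
example_power_suspend(void *arg, enum power_stype stype)
{
	struct example_softc *sc = arg;

	/* Quiesce the device; stype identifies the sleep state being entered. */
	device_printf(sc->ex_dev, "suspending (stype %d)\n", (int)stype);
}

static int
example_attach(device_t dev)
{
	struct example_softc *sc = device_get_softc(dev);

	sc->ex_dev = dev;
	sc->ex_susp_eh = EVENTHANDLER_REGISTER(power_suspend,
	    example_power_suspend, sc, EVENTHANDLER_PRI_ANY);
	return (0);
}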
diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h
index 08d4e2d28b33..4f6b45d78a88 100644
--- a/sys/sys/mutex.h
+++ b/sys/sys/mutex.h
@@ -68,9 +68,9 @@
*/
#define MTX_UNOWNED 0x00000000 /* Cookie for free mutex */
#define MTX_RECURSED 0x00000001 /* lock recursed (for MTX_DEF only) */
-#define MTX_CONTESTED 0x00000002 /* lock contested (for MTX_DEF only) */
+#define MTX_WAITERS 0x00000002 /* lock has waiters (for MTX_DEF only) */
#define MTX_DESTROYED 0x00000004 /* lock destroyed */
-#define MTX_FLAGMASK (MTX_RECURSED | MTX_CONTESTED | MTX_DESTROYED)
+#define MTX_FLAGMASK (MTX_RECURSED | MTX_WAITERS | MTX_DESTROYED)
/*
* Prototypes
@@ -217,14 +217,10 @@ void _thread_lock(struct thread *);
#define _mtx_obtain_lock_fetch(mp, vp, tid) \
atomic_fcmpset_acq_ptr(&(mp)->mtx_lock, vp, (tid))
-/* Try to release mtx_lock if it is unrecursed and uncontested. */
+/* Try to release mtx_lock if it is unrecursed and without waiters. */
#define _mtx_release_lock(mp, tid) \
atomic_cmpset_rel_ptr(&(mp)->mtx_lock, (tid), MTX_UNOWNED)
-/* Release mtx_lock quickly, assuming we own it. */
-#define _mtx_release_lock_quick(mp) \
- atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED)
-
#define _mtx_release_lock_fetch(mp, vp) \
atomic_fcmpset_rel_ptr(&(mp)->mtx_lock, (vp), MTX_UNOWNED)
@@ -246,10 +242,10 @@ void _thread_lock(struct thread *);
})
/*
- * Lock a spin mutex. For spinlocks, we handle recursion inline (it
- * turns out that function calls can be significantly expensive on
- * some architectures). Since spin locks are not _too_ common,
- * inlining this code is not too big a deal.
+ * Lock a spin mutex.
+ *
+ * FIXME: spinlock_enter is a function call, defeating the point of
+ * inlining this.
*/
#ifdef SMP
#define __mtx_lock_spin(mp, tid, opts, file, line) __extension__ ({ \
@@ -317,10 +313,10 @@ void _thread_lock(struct thread *);
})
/*
- * Unlock a spin mutex. For spinlocks, we can handle everything
- * inline, as it's pretty simple and a function call would be too
- * expensive (at least on some architectures). Since spin locks are
- * not _too_ common, inlining this code is not too big a deal.
+ * Unlock a spin mutex.
+ *
+ * FIXME: spinlock_exit is a function call, defeating the point of
+ * inlining this.
*
* Since we always perform a spinlock_enter() when attempting to acquire a
* spin lock, we need to always perform a matching spinlock_exit() when
@@ -332,7 +328,7 @@ void _thread_lock(struct thread *);
(mp)->mtx_recurse--; \
else { \
LOCKSTAT_PROFILE_RELEASE_SPIN_LOCK(spin__release, mp); \
- _mtx_release_lock_quick((mp)); \
+ atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED); \
} \
spinlock_exit(); \
})
diff --git a/sys/tools/gdb/README.txt b/sys/tools/gdb/README.txt
new file mode 100644
index 000000000000..8c31565ddc42
--- /dev/null
+++ b/sys/tools/gdb/README.txt
@@ -0,0 +1,21 @@
+This directory contains Python scripts that can be loaded by GDB to help debug
+FreeBSD kernel crashes.
+
+Add new commands and functions in their own files. Functions with general
+utility should be added to freebsd.py. sys/tools/kernel-gdb.py is installed
+into the kernel debug directory (typically /usr/lib/debug/boot/kernel). It will
+be automatically loaded by kgdb when opening a vmcore, so if you add new GDB
+commands or functions, that script should be updated to import them, and you
+should document them here.
+
+To provide some rudimentary testing, selftest.py tries to exercise all of the
+commands and functions defined here. To use it, run selftest.sh to panic the
+system. Then, create a kernel dump or attach to the panicked kernel, and invoke
+the script with "python import selftest" in (k)gdb.
+
+Commands:
+acttrace Display a backtrace for all on-CPU threads
+
+Functions:
+$PCPU(<field>[, <cpuid>]) Display the value of a PCPU/DPCPU field
+$V(<variable>[, <vnet>]) Display the value of a VNET variable
diff --git a/sys/tools/gdb/acttrace.py b/sys/tools/gdb/acttrace.py
new file mode 100644
index 000000000000..147effbbddf1
--- /dev/null
+++ b/sys/tools/gdb/acttrace.py
@@ -0,0 +1,48 @@
+#
+# Copyright (c) 2022 The FreeBSD Foundation
+#
+# This software was developed by Mark Johnston under sponsorship from the
+# FreeBSD Foundation.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+import gdb
+from freebsd import *
+from pcpu import *
+
+class acttrace(gdb.Command):
+ """
+ Register an acttrace command with gdb.
+
+ When run, acttrace prints the stack trace of all threads that were on-CPU
+ at the time of the panic.
+ """
+ def __init__(self):
+ super(acttrace, self).__init__("acttrace", gdb.COMMAND_USER)
+
+ def invoke(self, arg, from_tty):
+ # Save the current thread so that we can switch back after.
+ curthread = gdb.selected_thread()
+
+ for pcpu in pcpu_foreach():
+ td = pcpu['pc_curthread']
+ tid = td['td_tid']
+
+ gdb_thread = tid_to_gdb_thread(tid)
+ if gdb_thread is None:
+ raise gdb.error(f"failed to find GDB thread with TID {tid}")
+ else:
+ gdb_thread.switch()
+
+ p = td['td_proc']
+ print("Tracing command {} pid {} tid {} (CPU {})".format(
+ p['p_comm'], p['p_pid'], td['td_tid'], pcpu['pc_cpuid']))
+ gdb.execute("bt")
+ print()
+
+ curthread.switch()
+
+
+# Instantiating the class registers the command with gdb; the instance is otherwise unused.
+acttrace()
diff --git a/sys/tools/gdb/freebsd.py b/sys/tools/gdb/freebsd.py
new file mode 100644
index 000000000000..81ea60373348
--- /dev/null
+++ b/sys/tools/gdb/freebsd.py
@@ -0,0 +1,75 @@
+#
+# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org>
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+import gdb
+
+def symval(name):
+ sym = gdb.lookup_global_symbol(name)
+ if sym is None:
+ sym = gdb.lookup_static_symbol(name)
+ if sym is None:
+ raise gdb.GdbError(f"Symbol '{name}' not found")
+ return sym.value()
+
+
+def _queue_foreach(head, field, headf, nextf):
+ elm = head[headf]
+ while elm != 0:
+ yield elm
+ elm = elm[field][nextf]
+
+
+def list_foreach(head, field):
+ """sys/queue.h-style iterator."""
+ return _queue_foreach(head, field, "lh_first", "le_next")
+
+
+def tailq_foreach(head, field):
+ """sys/queue.h-style iterator."""
+ return _queue_foreach(head, field, "tqh_first", "tqe_next")
+
+
+def linker_file_foreach():
+ """Iterate over loaded linker files."""
+ return tailq_foreach(symval("linker_files"), "link")
+
+
+def pcpu_foreach():
+ mp_maxid = symval("mp_maxid")
+ cpuid_to_pcpu = symval("cpuid_to_pcpu")
+
+ cpu = 0
+ while cpu <= mp_maxid:
+ pcpu = cpuid_to_pcpu[cpu]
+ if pcpu:
+ yield pcpu
+ cpu = cpu + 1
+
+
+def tid_to_gdb_thread(tid):
+ """Convert a FreeBSD kernel thread ID to a gdb inferior thread."""
+ for thread in gdb.inferiors()[0].threads():
+ if thread.ptid[2] == tid:
+ return thread
+ else:
+ return None
+
+
+def tdfind(tid, pid=-1):
+ """Convert a FreeBSD kernel thread ID to a struct thread pointer."""
+ td = tdfind.cached_threads.get(int(tid))
+ if td:
+ return td
+
+ for p in list_foreach(symval("allproc"), "p_list"):
+ if pid != -1 and pid != p['p_pid']:
+ continue
+ for td in tailq_foreach(p['p_threads'], "td_plist"):
+ ntid = td['td_tid']
+ tdfind.cached_threads[int(ntid)] = td
+ if ntid == tid:
+ return td
+tdfind.cached_threads = dict()
diff --git a/sys/tools/gdb/pcpu.py b/sys/tools/gdb/pcpu.py
new file mode 100644
index 000000000000..aadc4b2d42df
--- /dev/null
+++ b/sys/tools/gdb/pcpu.py
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org>
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+import gdb
+from freebsd import *
+
+class pcpu(gdb.Function):
+ """
+ Register a function to lookup PCPU and DPCPU variables by name.
+
+ To look up the value of the PCPU field foo on CPU n, use
+ $PCPU("foo", n). This works for DPCPU fields too. If the CPU ID is
+ omitted, and the currently selected thread is on-CPU, that CPU is
+ used, otherwise an error is raised.
+ """
+ def __init__(self):
+ super(pcpu, self).__init__("PCPU")
+
+ def invoke(self, field, cpuid=-1):
+ if cpuid == -1:
+ cpuid = tdfind(gdb.selected_thread().ptid[2])['td_oncpu']
+ if cpuid == -1:
+ raise gdb.error("Currently selected thread is off-CPU")
+ if cpuid < 0 or cpuid > symval("mp_maxid"):
+ raise gdb.error(f"Currently selected on invalid CPU {cpuid}")
+ pcpu = symval("cpuid_to_pcpu")[cpuid]
+
+ # Are we dealing with a PCPU or DPCPU field?
+ field = field.string()
+ for f in gdb.lookup_type("struct pcpu").fields():
+ if f.name == "pc_" + field:
+ return pcpu["pc_" + field]
+
+ def uintptr_t(val):
+ return val.cast(gdb.lookup_type("uintptr_t"))
+
+ # We're dealing with a DPCPU field. This is handled similarly
+ # to VNET symbols, see vnet.py for comments.
+ pcpu_base = pcpu['pc_dynamic']
+ pcpu_entry = symval("pcpu_entry_" + field)
+ pcpu_entry_addr = uintptr_t(pcpu_entry.address)
+
+ for lf in linker_file_foreach():
+ block = gdb.block_for_pc(lf['ops']['cls']['methods'][0]['func'])
+ elf_file_t = gdb.lookup_type("elf_file_t", block).target()
+ ef = lf.cast(elf_file_t)
+
+ file_type = lf['ops']['cls']['name'].string()
+ if file_type == "elf64":
+ start = uintptr_t(ef['pcpu_start'])
+ if start == 0:
+ continue
+ end = uintptr_t(ef['pcpu_stop'])
+ base = uintptr_t(ef['pcpu_base'])
+ elif file_type == "elf64_obj":
+ for i in range(ef['nprogtab']):
+ pe = ef['progtab'][i]
+ if pe['name'].string() == "set_pcpu":
+ start = uintptr_t(pe['origaddr'])
+ end = start + uintptr_t(pe['size'])
+ base = uintptr_t(pe['addr'])
+ break
+ else:
+ continue
+ else:
+ path = lf['pathname'].string()
+ raise gdb.error(f"{path} has unexpected linker file type {file_type}")
+
+ if pcpu_entry_addr >= start and pcpu_entry_addr < end:
+ obj = gdb.Value(pcpu_base + pcpu_entry_addr - start + base)
+ return obj.cast(pcpu_entry.type.pointer()).dereference()
+
+# Register with gdb.
+pcpu()
diff --git a/sys/tools/gdb/selftest.py b/sys/tools/gdb/selftest.py
new file mode 100644
index 000000000000..41e9211c4bb3
--- /dev/null
+++ b/sys/tools/gdb/selftest.py
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org>
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+import gdb
+
+cmds = ["acttrace",
+ "p $V(\"tcbinfo\")",
+ "p $V(\"tcbinfo\", vnet0)",
+ "p $V(\"pf_status\")",
+ "p $V(\"pf_status\", \"gdbselftest\")",
+ "p $PCPU(\"curthread\")",
+ "p $PCPU(\"curthread\", 0)",
+ "p/x $PCPU(\"hardclocktime\", 1)",
+ "p $PCPU(\"pqbatch\")[0][0]",
+ "p $PCPU(\"ss\", 1)",
+ ]
+
+for cmd in cmds:
+ try:
+ print(f"Running command: '{cmd}'")
+ gdb.execute(cmd)
+ except gdb.error as e:
+ print(f"Command '{cmd}' failed: {e}")
+ break
+else:
+ # We didn't hit any unexpected errors. This isn't as good as actually
+ # verifying the output, but it's better than nothing.
+ print("Everything seems OK")
diff --git a/sys/tools/gdb/selftest.sh b/sys/tools/gdb/selftest.sh
new file mode 100644
index 000000000000..252fae14af17
--- /dev/null
+++ b/sys/tools/gdb/selftest.sh
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org>
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+set -e
+
+n=$(sysctl -n hw.ncpu)
+if [ $n -lt 2 ]; then
+ echo "This test requires at least 2 CPUs"
+ exit 1
+fi
+
+# Set up some things expected by selftest.py.
+kldload -n pf siftr
+pfctl -e || true
+jail -c name=gdbselftest vnet persist
+
+echo "I'm about to panic your system, ctrl-C now if that's not what you want."
+sleep 10
+sysctl debug.debugger_on_panic=0
+sysctl debug.kdb.panic=1
diff --git a/sys/tools/gdb/vnet.py b/sys/tools/gdb/vnet.py
new file mode 100644
index 000000000000..36b4d512a3eb
--- /dev/null
+++ b/sys/tools/gdb/vnet.py
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org>
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+import gdb
+import traceback
+from freebsd import *
+
+class vnet(gdb.Function):
+ """
+ Register a function to look up VNET variables by name.
+
+ To look at the value of a VNET variable V_foo, print $V("foo"). The
+ currently selected thread's VNET is used by default, but can be optionally
+ specified as a second parameter, e.g., $V("foo", <vnet>), where <vnet> is a
+ pointer to a struct vnet (e.g., vnet0 or allprison.tqh_first->pr_vnet) or a
+ string naming a jail.
+ """
+ def __init__(self):
+ super(vnet, self).__init__("V")
+
+ def invoke(self, sym, vnet=None):
+ sym = sym.string()
+ if sym.startswith("V_"):
+ sym = sym[len("V_"):]
+ if gdb.lookup_symbol("sysctl___kern_features_vimage")[0] is None:
+ return symval(sym)
+
+ # Look up the VNET's base address.
+ if vnet is None:
+ vnet = tdfind(gdb.selected_thread().ptid[2])['td_vnet']
+ if not vnet:
+ # If curthread->td_vnet == NULL, vnet0 is the current vnet.
+ vnet = symval("vnet0")
+ elif vnet.type.is_string_like:
+ vnet = vnet.string()
+ for prison in tailq_foreach(symval("allprison"), "pr_list"):
+ if prison['pr_name'].string() == vnet:
+ vnet = prison['pr_vnet']
+ break
+ else:
+ raise gdb.error(f"No prison named {vnet}")
+
+ def uintptr_t(val):
+ return val.cast(gdb.lookup_type("uintptr_t"))
+
+ # Now the tricky part: compute the address of the symbol relative
+ # to the selected VNET. In the compiled kernel this is done at
+ # load time by applying a magic transformation to relocations
+ # against symbols in the vnet linker set. Here we have to apply
+ # the transformation manually.
+ vnet_data_base = vnet['vnet_data_base']
+ vnet_entry = symval("vnet_entry_" + sym)
+ vnet_entry_addr = uintptr_t(vnet_entry.address)
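+ # The computation below amounts to
+ # vnet_data_base + (vnet_entry_addr - start) + base:
+ # take the symbol's offset within its module's vnet linker set and
+ # rebase it into the selected vnet's data region.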
+
+ # First, which kernel module does the symbol belong to?
+ for lf in linker_file_foreach():
+ # Find the bounds of this linker file's VNET linker set. The
+ # struct containing the bounds depends on the type of the linker
+ # file, and unfortunately both are called elf_file_t. So we use a
+ # PC value from the compilation unit (either link_elf.c or
+ # link_elf_obj.c) to disambiguate.
+ block = gdb.block_for_pc(lf['ops']['cls']['methods'][0]['func'])
+ elf_file_t = gdb.lookup_type("elf_file_t", block).target()
+ ef = lf.cast(elf_file_t)
+
+ file_type = lf['ops']['cls']['name'].string()
+ if file_type == "elf64":
+ start = uintptr_t(ef['vnet_start'])
+ if start == 0:
+ # This linker file doesn't have a VNET linker set.
+ continue
+ end = uintptr_t(ef['vnet_stop'])
+ base = uintptr_t(ef['vnet_base'])
+ elif file_type == "elf64_obj":
+ for i in range(ef['nprogtab']):
+ pe = ef['progtab'][i]
+ if pe['name'].string() == "set_vnet":
+ start = uintptr_t(pe['origaddr'])
+ end = start + uintptr_t(pe['size'])
+ base = uintptr_t(pe['addr'])
+ break
+ else:
+ # This linker file doesn't have a VNET linker set.
+ continue
+ else:
+ path = lf['pathname'].string()
+ raise gdb.error(f"{path} has unexpected linker file type {file_type}")
+
+ if vnet_entry_addr >= start and vnet_entry_addr < end:
+ # The symbol belongs to this linker file, so compute the final
+ # address.
+ obj = gdb.Value(vnet_data_base + vnet_entry_addr - start + base)
+ return obj.cast(vnet_entry.type.pointer()).dereference()
+
+
+# Register with gdb.
+vnet()
diff --git a/sys/tools/kernel-gdb.py b/sys/tools/kernel-gdb.py
new file mode 100644
index 000000000000..8a41ef6efab1
--- /dev/null
+++ b/sys/tools/kernel-gdb.py
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org>
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "gdb"))
+
+# Import FreeBSD kernel debugging commands and modules below.
+import acttrace
+import pcpu
+import vnet
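+
+# gdb auto-loads a script named <objfile>-gdb.py alongside the objfile it
+# debugs, so installing this file next to the kernel makes the commands
+# above available automatically (the directory may need to be trusted via
+# gdb's add-auto-load-safe-path setting).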
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 0921eee92b9d..736c5a66267e 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -2592,8 +2592,12 @@ ufs_print(
printf("\tnlink=%d, effnlink=%d, size=%jd", ip->i_nlink,
ip->i_effnlink, (intmax_t)ip->i_size);
- if (I_IS_UFS2(ip))
- printf(", extsize %d", ip->i_din2->di_extsize);
+ if (I_IS_UFS2(ip)) {
+ if (ip->i_din2 == NULL)
+ printf(", dinode=NULL (fields omitted)");
+ else
+ printf(", extsize=%d", ip->i_din2->di_extsize);
+ }
printf("\n\tgeneration=%jx, uid=%d, gid=%d, flags=0x%b\n",
(uintmax_t)ip->i_gen, ip->i_uid, ip->i_gid,
(uint32_t)ip->i_flags, PRINT_INODE_FLAGS);
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 679b2e20e88b..b80b5cc781f7 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -4009,21 +4009,15 @@ restart:
/*
* Use the keg's policy if upper layers haven't already specified a
* domain (as happens with first-touch zones).
- *
- * To avoid races we run the iterator with the keg lock held, but that
- * means that we cannot allow the vm_domainset layer to sleep. Thus,
- * clear M_WAITOK and handle low memory conditions locally.
*/
rr = rdomain == UMA_ANYDOMAIN;
+ aflags = flags;
if (rr) {
- aflags = (flags & ~M_WAITOK) | M_NOWAIT;
if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
&aflags) != 0)
return (NULL);
- } else {
- aflags = flags;
+ } else
domain = rdomain;
- }
for (;;) {
slab = keg_fetch_free_slab(keg, domain, rr, flags);
@@ -4053,13 +4047,8 @@ restart:
if ((flags & M_WAITOK) == 0)
break;
vm_wait_domain(domain);
- } else if (vm_domainset_iter_policy(&di, &domain) != 0) {
- if ((flags & M_WAITOK) != 0) {
- vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
- goto restart;
- }
+ } else if (vm_domainset_iter_policy(&di, &domain) != 0)
break;
- }
}
/*
@@ -5245,7 +5234,7 @@ uma_prealloc(uma_zone_t zone, int items)
KEG_GET(zone, keg);
slabs = howmany(items, keg->uk_ipers);
while (slabs-- > 0) {
- aflags = M_NOWAIT;
+ aflags = M_WAITOK;
if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
&aflags) != 0)
panic("%s: Domainset is empty", __func__);
@@ -5266,7 +5255,8 @@ uma_prealloc(uma_zone_t zone, int items)
break;
}
if (vm_domainset_iter_policy(&di, &domain) != 0)
- vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
+ panic("%s: Cannot allocate from any domain",
+ __func__);
}
}
}
diff --git a/sys/x86/include/mca.h b/sys/x86/include/mca.h
index 183480625f6d..553b5d765f17 100644
--- a/sys/x86/include/mca.h
+++ b/sys/x86/include/mca.h
@@ -44,6 +44,31 @@ struct mca_record {
int mr_cpu;
};
+enum mca_stat_types {
+ MCA_T_NONE = 0,
+ MCA_T_UNCLASSIFIED,
+ MCA_T_UCODE_ROM_PARITY,
+ MCA_T_EXTERNAL,
+ MCA_T_FRC,
+ MCA_T_INTERNAL_PARITY,
+ MCA_T_SMM_HANDLER,
+ MCA_T_INTERNAL_TIMER,
+ MCA_T_GENERIC_IO,
+ MCA_T_INTERNAL,
+ MCA_T_MEMORY,
+ MCA_T_TLB,
+ MCA_T_MEMCONTROLLER_GEN,
+ MCA_T_MEMCONTROLLER_RD,
+ MCA_T_MEMCONTROLLER_WR,
+ MCA_T_MEMCONTROLLER_AC,
+ MCA_T_MEMCONTROLLER_MS,
+ MCA_T_MEMCONTROLLER_OTHER,
+ MCA_T_CACHE,
+ MCA_T_BUS,
+ MCA_T_UNKNOWN,
+ MCA_T_COUNT /* Must stay last */
+};
+
#ifdef _KERNEL
void cmc_intr(void);
diff --git a/sys/x86/x86/mca.c b/sys/x86/x86/mca.c
index 4ba49469d3a2..735efe307215 100644
--- a/sys/x86/x86/mca.c
+++ b/sys/x86/x86/mca.c
@@ -46,9 +46,11 @@
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
+#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/intr_machdep.h>
@@ -124,6 +126,22 @@ SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
&workaround_erratum383, 0,
"Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
+#ifdef DIAGNOSTIC
+static uint64_t fake_status;
+SYSCTL_U64(_hw_mca, OID_AUTO, fake_status, CTLFLAG_RW,
+ &fake_status, 0,
+ "Insert artificial MCA with given status (testing purpose only)");
+static int fake_bank;
+SYSCTL_INT(_hw_mca, OID_AUTO, fake_bank, CTLFLAG_RW,
+ &fake_bank, 0,
+ "Bank to use for artificial MCAs (testing purpose only)");
+#endif
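+/*
+ * Illustrative use on a DIAGNOSTIC kernel (the status value is made up;
+ * only MC_STATUS_VAL, bit 63, must be set for the scan to consume it):
+ *   # sysctl hw.mca.fake_bank=0
+ *   # sysctl hw.mca.fake_status=0x8000000000000001
+ */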
+
+static bool mca_uselog = false;
+SYSCTL_BOOL(_hw_mca, OID_AUTO, uselog, CTLFLAG_RWTUN, &mca_uselog, 0,
+ "Should the system send non-fatal machine check errors to the log "
+ "(instead of the console)?");
+
static STAILQ_HEAD(, mca_internal) mca_freelist;
static int mca_freecount;
static STAILQ_HEAD(, mca_internal) mca_records;
@@ -131,8 +149,44 @@ static STAILQ_HEAD(, mca_internal) mca_pending;
static int mca_ticks = 300;
static struct taskqueue *mca_tq;
static struct task mca_resize_task;
+static struct task mca_postscan_task;
static struct timeout_task mca_scan_task;
static struct mtx mca_lock;
+static bool mca_startup_done = false;
+
+/* Static buffer to compose messages while in an interrupt context. */
+static char mca_msg_buf[1024];
+static struct mtx mca_msg_buf_lock;
+
+/* Statistics on number of MCA events by type, updated with the mca_lock. */
+static uint64_t mca_stats[MCA_T_COUNT];
+SYSCTL_OPAQUE(_hw_mca, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_SKIP,
+ mca_stats, MCA_T_COUNT * sizeof(mca_stats[0]),
+ "S", "Array of MCA events by type");
+
+/* Variables to track and control message rate limiting. */
+static struct timeval mca_last_log_time;
+static struct timeval mca_log_interval;
+static int mca_log_skipped;
+
+static int
+sysctl_mca_log_interval(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ u_int val;
+
+ val = mca_log_interval.tv_sec;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ mca_log_interval.tv_sec = val;
+ return (0);
+}
+SYSCTL_PROC(_hw_mca, OID_AUTO, log_interval,
+ CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_log_interval, 0,
+ sysctl_mca_log_interval, "IU",
+ "Minimum number of seconds between logging correctable MCAs"
+ " (0 = no limit)");
static unsigned int
mca_ia32_ctl_reg(int bank)
@@ -356,21 +410,27 @@ mca_error_request(uint16_t mca_error)
}
static const char *
-mca_error_mmtype(uint16_t mca_error)
+mca_error_mmtype(uint16_t mca_error, enum mca_stat_types *event_type)
{
switch ((mca_error & 0x70) >> 4) {
case 0x0:
+ *event_type = MCA_T_MEMCONTROLLER_GEN;
return ("GEN");
case 0x1:
+ *event_type = MCA_T_MEMCONTROLLER_RD;
return ("RD");
case 0x2:
+ *event_type = MCA_T_MEMCONTROLLER_WR;
return ("WR");
case 0x3:
+ *event_type = MCA_T_MEMCONTROLLER_AC;
return ("AC");
case 0x4:
+ *event_type = MCA_T_MEMCONTROLLER_MS;
return ("MS");
}
+ *event_type = MCA_T_MEMCONTROLLER_OTHER;
return ("???");
}
@@ -423,87 +483,111 @@ mca_mute(const struct mca_record *rec)
/* Dump details about a single machine check. */
static void
-mca_log(const struct mca_record *rec)
+mca_log(enum scan_mode mode, const struct mca_record *rec, bool fatal)
{
+ int error, numskipped;
uint16_t mca_error;
+ enum mca_stat_types event_type;
+ struct sbuf sb;
+ bool uncor, using_shared_buf;
if (mca_mute(rec))
return;
- if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 &&
- (!tes_supported(rec->mr_mcg_cap) ||
+ uncor = (rec->mr_status & MC_STATUS_UC) != 0;
+
+ if (!log_corrected && !uncor && (!tes_supported(rec->mr_mcg_cap) ||
((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2))
return;
- printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
+ /* Try to use an allocated buffer when not in an interrupt context. */
+ if (mode == POLLED && sbuf_new(&sb, NULL, 512, SBUF_AUTOEXTEND) != NULL)
+ using_shared_buf = false;
+ else {
+ using_shared_buf = true;
+ mtx_lock_spin(&mca_msg_buf_lock);
+ sbuf_new(&sb, mca_msg_buf, sizeof(mca_msg_buf), SBUF_FIXEDLEN);
+ }
+
+ sbuf_printf(&sb, "MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
(long long)rec->mr_status);
- printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
+ sbuf_printf(&sb, "MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
(long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
- printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
- rec->mr_cpu_id, rec->mr_apic_id);
- printf("MCA: CPU %d ", rec->mr_cpu);
+ sbuf_printf(&sb, "MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n",
+ cpu_vendor, rec->mr_cpu_id, rec->mr_apic_id);
+ sbuf_printf(&sb, "MCA: CPU %d ", rec->mr_cpu);
if (rec->mr_status & MC_STATUS_UC)
- printf("UNCOR ");
+ sbuf_printf(&sb, "UNCOR ");
else {
- printf("COR ");
+ sbuf_printf(&sb, "COR ");
if (cmci_supported(rec->mr_mcg_cap))
- printf("(%lld) ", ((long long)rec->mr_status &
+ sbuf_printf(&sb, "(%lld) ", ((long long)rec->mr_status &
MC_STATUS_COR_COUNT) >> 38);
if (tes_supported(rec->mr_mcg_cap)) {
switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) {
case 0x1:
- printf("(Green) ");
+ sbuf_printf(&sb, "(Green) ");
break;
case 0x2:
- printf("(Yellow) ");
+ sbuf_printf(&sb, "(Yellow) ");
break;
}
}
}
if (rec->mr_status & MC_STATUS_EN)
- printf("EN ");
+ sbuf_printf(&sb, "EN ");
if (rec->mr_status & MC_STATUS_PCC)
- printf("PCC ");
+ sbuf_printf(&sb, "PCC ");
if (ser_supported(rec->mr_mcg_cap)) {
if (rec->mr_status & MC_STATUS_S)
- printf("S ");
+ sbuf_printf(&sb, "S ");
if (rec->mr_status & MC_STATUS_AR)
- printf("AR ");
+ sbuf_printf(&sb, "AR ");
}
if (rec->mr_status & MC_STATUS_OVER)
- printf("OVER ");
+ sbuf_printf(&sb, "OVER ");
mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
+ event_type = MCA_T_COUNT;
switch (mca_error) {
/* Simple error codes. */
case 0x0000:
- printf("no error");
+ sbuf_printf(&sb, "no error");
+ event_type = MCA_T_NONE;
break;
case 0x0001:
- printf("unclassified error");
+ sbuf_printf(&sb, "unclassified error");
+ event_type = MCA_T_UNCLASSIFIED;
break;
case 0x0002:
- printf("ucode ROM parity error");
+ sbuf_printf(&sb, "ucode ROM parity error");
+ event_type = MCA_T_UCODE_ROM_PARITY;
break;
case 0x0003:
- printf("external error");
+ sbuf_printf(&sb, "external error");
+ event_type = MCA_T_EXTERNAL;
break;
case 0x0004:
- printf("FRC error");
+ sbuf_printf(&sb, "FRC error");
+ event_type = MCA_T_FRC;
break;
case 0x0005:
- printf("internal parity error");
+ sbuf_printf(&sb, "internal parity error");
+ event_type = MCA_T_INTERNAL_PARITY;
break;
case 0x0006:
- printf("SMM handler code access violation");
+ sbuf_printf(&sb, "SMM handler code access violation");
+ event_type = MCA_T_SMM_HANDLER;
break;
case 0x0400:
- printf("internal timer error");
+ sbuf_printf(&sb, "internal timer error");
+ event_type = MCA_T_INTERNAL_TIMER;
break;
case 0x0e0b:
- printf("generic I/O error");
+ sbuf_printf(&sb, "generic I/O error");
+ event_type = MCA_T_GENERIC_IO;
if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL &&
(rec->mr_status & MC_STATUS_MISCV)) {
- printf(" (pci%d:%d:%d:%d)",
+ sbuf_printf(&sb, " (pci%d:%d:%d:%d)",
(int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32),
(int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24),
(int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19),
@@ -512,7 +596,9 @@ mca_log(const struct mca_record *rec)
break;
default:
if ((mca_error & 0xfc00) == 0x0400) {
- printf("internal error %x", mca_error & 0x03ff);
+ sbuf_printf(&sb, "internal error %x",
+ mca_error & 0x03ff);
+ event_type = MCA_T_INTERNAL;
break;
}
@@ -520,101 +606,168 @@ mca_log(const struct mca_record *rec)
/* Memory hierarchy error. */
if ((mca_error & 0xeffc) == 0x000c) {
- printf("%s memory error", mca_error_level(mca_error));
+ sbuf_printf(&sb, "%s memory error",
+ mca_error_level(mca_error));
+ event_type = MCA_T_MEMORY;
break;
}
/* TLB error. */
if ((mca_error & 0xeff0) == 0x0010) {
- printf("%sTLB %s error", mca_error_ttype(mca_error),
+ sbuf_printf(&sb, "%sTLB %s error",
+ mca_error_ttype(mca_error),
mca_error_level(mca_error));
+ event_type = MCA_T_TLB;
break;
}
/* Memory controller error. */
if ((mca_error & 0xef80) == 0x0080) {
- printf("%s channel ", mca_error_mmtype(mca_error));
+ sbuf_printf(&sb, "%s channel ",
+ mca_error_mmtype(mca_error, &event_type));
if ((mca_error & 0x000f) != 0x000f)
- printf("%d", mca_error & 0x000f);
+ sbuf_printf(&sb, "%d", mca_error & 0x000f);
else
- printf("??");
- printf(" memory error");
+ sbuf_printf(&sb, "??");
+ sbuf_printf(&sb, " memory error");
break;
}
/* Cache error. */
if ((mca_error & 0xef00) == 0x0100) {
- printf("%sCACHE %s %s error",
+ sbuf_printf(&sb, "%sCACHE %s %s error",
mca_error_ttype(mca_error),
mca_error_level(mca_error),
mca_error_request(mca_error));
+ event_type = MCA_T_CACHE;
break;
}
/* Extended memory error. */
if ((mca_error & 0xef80) == 0x0280) {
- printf("%s channel ", mca_error_mmtype(mca_error));
+ sbuf_printf(&sb, "%s channel ",
+ mca_error_mmtype(mca_error, &event_type));
if ((mca_error & 0x000f) != 0x000f)
- printf("%d", mca_error & 0x000f);
+ sbuf_printf(&sb, "%d", mca_error & 0x000f);
else
- printf("??");
- printf(" extended memory error");
+ sbuf_printf(&sb, "??");
+ sbuf_printf(&sb, " extended memory error");
break;
}
/* Bus and/or Interconnect error. */
if ((mca_error & 0xe800) == 0x0800) {
- printf("BUS%s ", mca_error_level(mca_error));
+ sbuf_printf(&sb, "BUS%s ", mca_error_level(mca_error));
+ event_type = MCA_T_BUS;
switch ((mca_error & 0x0600) >> 9) {
case 0:
- printf("Source");
+ sbuf_printf(&sb, "Source");
break;
case 1:
- printf("Responder");
+ sbuf_printf(&sb, "Responder");
break;
case 2:
- printf("Observer");
+ sbuf_printf(&sb, "Observer");
break;
default:
- printf("???");
+ sbuf_printf(&sb, "???");
break;
}
- printf(" %s ", mca_error_request(mca_error));
+ sbuf_printf(&sb, " %s ", mca_error_request(mca_error));
switch ((mca_error & 0x000c) >> 2) {
case 0:
- printf("Memory");
+ sbuf_printf(&sb, "Memory");
break;
case 2:
- printf("I/O");
+ sbuf_printf(&sb, "I/O");
break;
case 3:
- printf("Other");
+ sbuf_printf(&sb, "Other");
break;
default:
- printf("???");
+ sbuf_printf(&sb, "???");
break;
}
if (mca_error & 0x0100)
- printf(" timed out");
+ sbuf_printf(&sb, " timed out");
break;
}
- printf("unknown error %x", mca_error);
+ sbuf_printf(&sb, "unknown error %x", mca_error);
+ event_type = MCA_T_UNKNOWN;
break;
}
- printf("\n");
+ sbuf_printf(&sb, "\n");
if (rec->mr_status & MC_STATUS_ADDRV) {
- printf("MCA: Address 0x%llx", (long long)rec->mr_addr);
+ sbuf_printf(&sb, "MCA: Address 0x%llx",
+ (long long)rec->mr_addr);
if (ser_supported(rec->mr_mcg_cap) &&
(rec->mr_status & MC_STATUS_MISCV)) {
- printf(" (Mode: %s, LSB: %d)",
+ sbuf_printf(&sb, " (Mode: %s, LSB: %d)",
mca_addres_mode(rec->mr_misc),
(int)(rec->mr_misc & MC_MISC_RA_LSB));
}
- printf("\n");
+ sbuf_printf(&sb, "\n");
}
if (rec->mr_status & MC_STATUS_MISCV)
- printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
+ sbuf_printf(&sb, "MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
+
+ if (event_type < 0 || event_type >= MCA_T_COUNT) {
+ KASSERT(0, ("%s: invalid event type (%d)", __func__,
+ event_type));
+ event_type = MCA_T_UNKNOWN;
+ }
+ numskipped = 0;
+ if (!fatal && !uncor) {
+ /*
+ * Update statistics and check the rate limit for
+ * correctable errors. The rate limit is only applied
+ * after the system records a reasonable number of errors
+ * of the same type. The goal is to reduce the impact of
+ * the system seeing and attempting to log a burst of
+ * similar errors, which (especially when printed to the
+ * console) can be expensive.
+ */
+ mtx_lock_spin(&mca_lock);
+ mca_stats[event_type]++;
+ if (mca_log_interval.tv_sec > 0 && mca_stats[event_type] > 50 &&
+ ratecheck(&mca_last_log_time, &mca_log_interval) == 0) {
+ mca_log_skipped++;
+ mtx_unlock_spin(&mca_lock);
+ goto done;
+ }
+ numskipped = mca_log_skipped;
+ mca_log_skipped = 0;
+ mtx_unlock_spin(&mca_lock);
+ }
+
+ error = sbuf_finish(&sb);
+ if (fatal || !mca_uselog) {
+ if (numskipped > 0)
+ printf("MCA: %d events skipped due to rate limit\n",
+ numskipped);
+ if (error)
+ printf("MCA: error logging message (sbuf error %d)\n",
+ error);
+ else
+ sbuf_putbuf(&sb);
+ } else {
+ if (numskipped > 0)
+ log(LOG_ERR,
+ "MCA: %d events skipped due to rate limit\n",
+ numskipped);
+ if (error)
+ log(LOG_ERR,
+ "MCA: error logging message (sbuf error %d)\n",
+ error);
+ else
+ log(uncor ? LOG_CRIT : LOG_ERR, "%s", sbuf_data(&sb));
+ }
+
+done:
+ sbuf_delete(&sb);
+ if (using_shared_buf)
+ mtx_unlock_spin(&mca_msg_buf_lock);
}
static bool
@@ -662,8 +815,24 @@ mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank,
bool mce, recover;
status = rdmsr(mca_msr_ops.status(bank));
- if (!(status & MC_STATUS_VAL))
+ if (!(status & MC_STATUS_VAL)) {
+#ifdef DIAGNOSTIC
+ /*
+ * Check if we have a pending artificial event to generate.
+ * Note that this is potentially racy with the sysctl. The
+ * tradeoff is deemed acceptable given the test nature
+ * of the code.
+ */
+ if (fake_status && bank == fake_bank) {
+ status = fake_status;
+ fake_status = 0;
+ }
+ if (!(status & MC_STATUS_VAL))
+ return (0);
+#else
return (0);
+#endif
+ }
recover = *recoverablep;
mce = mca_is_mce(mcg_cap, status, &recover);
@@ -757,9 +926,9 @@ mca_record_entry(enum scan_mode mode, const struct mca_record *record)
mtx_lock_spin(&mca_lock);
rec = STAILQ_FIRST(&mca_freelist);
if (rec == NULL) {
- printf("MCA: Unable to allocate space for an event.\n");
- mca_log(record);
mtx_unlock_spin(&mca_lock);
+ printf("MCA: Unable to allocate space for an event.\n");
+ mca_log(mode, record, false);
return;
}
STAILQ_REMOVE_HEAD(&mca_freelist, link);
@@ -916,7 +1085,7 @@ mca_scan(enum scan_mode mode, bool *recoverablep)
if (*recoverablep)
mca_record_entry(mode, &rec);
else
- mca_log(&rec);
+ mca_log(mode, &rec, true);
}
#ifdef DEV_APIC
@@ -978,18 +1147,49 @@ static void
mca_process_records(enum scan_mode mode)
{
struct mca_internal *mca;
+ STAILQ_HEAD(, mca_internal) tmplist;
+
+ /*
+ * If in an interrupt context, defer the post-scan activities to a
+ * task queue.
+ */
+ if (mode != POLLED) {
+ if (mca_startup_done)
+ taskqueue_enqueue(mca_tq, &mca_postscan_task);
+ return;
+ }
+
+ /*
+ * Copy the pending list to the stack so we can drop the spin lock
+ * while we are emitting logs.
+ */
+ STAILQ_INIT(&tmplist);
+ mtx_lock_spin(&mca_lock);
+ STAILQ_SWAP(&mca_pending, &tmplist, mca_internal);
+ mtx_unlock_spin(&mca_lock);
+
+ STAILQ_FOREACH(mca, &tmplist, link)
+ mca_log(mode, &mca->rec, false);
mtx_lock_spin(&mca_lock);
- while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) {
- STAILQ_REMOVE_HEAD(&mca_pending, link);
- mca_log(&mca->rec);
+ while ((mca = STAILQ_FIRST(&tmplist)) != NULL) {
+ STAILQ_REMOVE_HEAD(&tmplist, link);
mca_store_record(mca);
}
mtx_unlock_spin(&mca_lock);
- if (mode == POLLED)
- mca_resize_freelist();
- else if (!cold)
- taskqueue_enqueue(mca_tq, &mca_resize_task);
+ mca_resize_freelist();
+}
+
+/*
+ * Emit log entries and resize the free list. This is intended to be called
+ * from a task queue to handle work which does not need to be done (or cannot
+ * be done) in an interrupt context.
+ */
+static void
+mca_postscan(void *context __unused, int pending __unused)
+{
+
+ mca_process_records(POLLED);
}
/*
@@ -1060,7 +1260,7 @@ sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS)
doresize = true;
}
mtx_unlock_spin(&mca_lock);
- if (doresize && !cold)
+ if (doresize && mca_startup_done)
taskqueue_enqueue(mca_tq, &mca_resize_task);
return (error);
}
@@ -1072,12 +1272,16 @@ mca_startup(void *dummy)
if (mca_banks <= 0)
return;
- /* CMCIs during boot may have claimed items from the freelist. */
- mca_resize_freelist();
-
taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
mca_ticks * SBT_1S, 0, C_PREL(1));
+ mca_startup_done = true;
+
+ /*
+ * CMCIs during boot may have recorded entries. Conduct the post-scan
+ * activities now.
+ */
+ mca_postscan(NULL, 0);
}
SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
@@ -1130,6 +1334,7 @@ mca_setup(uint64_t mcg_cap)
mca_banks = mcg_cap & MCG_CAP_COUNT;
mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
+ mtx_init(&mca_msg_buf_lock, "mca_msg_buf", NULL, MTX_SPIN);
STAILQ_INIT(&mca_records);
STAILQ_INIT(&mca_pending);
mca_tq = taskqueue_create_fast("mca", M_WAITOK,
@@ -1137,6 +1342,7 @@ mca_setup(uint64_t mcg_cap)
TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL);
STAILQ_INIT(&mca_freelist);
TASK_INIT(&mca_resize_task, 0, mca_resize, NULL);
+ TASK_INIT(&mca_postscan_task, 0, mca_postscan, NULL);
mca_resize_freelist();
SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
@@ -1540,6 +1746,9 @@ mca_intr(void)
panic("Unrecoverable machine check exception");
}
+ if (count)
+ mca_process_records(MCE);
+
/* Clear MCIP. */
wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
}