Diffstat (limited to 'sys')
381 files changed, 8543 insertions, 3059 deletions
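A theme that recurs across the driver hunks below: the power suspend/resume callbacks gain an enum power_stype argument, so handlers such as acpi_stop_beep(), adasuspend(), adaresume(), and ndasuspend() now see which sleep state is being entered instead of only their registration cookie. A minimal sketch of the new handler shape follows — the mydrv driver, its softc, and mydrv_flush() are hypothetical illustrations; only the two-argument prototype itself comes from this diff:

	#include <sys/types.h>
	#include <sys/power.h>		/* enum power_stype */

	struct mydrv_softc;				/* hypothetical driver state */
	static void mydrv_flush(struct mydrv_softc *);	/* hypothetical helper */

	/*
	 * Suspend hook in the new two-argument form: 'arg' is still the
	 * registration cookie; 'stype' names the target sleep state, so a
	 * handler may act differently per state if it needs to.
	 */
	static void
	mydrv_suspend(void *arg, enum power_stype stype)
	{
		struct mydrv_softc *sc = arg;

		/* Flush pending device state before the system sleeps. */
		mydrv_flush(sc);
	}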
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c index 99565fbb69ca..8cada2f4f911 100644 --- a/sys/amd64/acpica/acpi_wakeup.c +++ b/sys/amd64/acpica/acpi_wakeup.c @@ -74,7 +74,7 @@ extern int acpi_susp_bounce; extern struct susppcb **susppcbs; static cpuset_t suspcpus; -static void acpi_stop_beep(void *); +static void acpi_stop_beep(void *, enum power_stype); static int acpi_wakeup_ap(struct acpi_softc *, int); static void acpi_wakeup_cpus(struct acpi_softc *); @@ -88,7 +88,7 @@ static void acpi_wakeup_cpus(struct acpi_softc *); } while (0) static void -acpi_stop_beep(void *arg) +acpi_stop_beep(void *arg, enum power_stype stype) { if (acpi_resume_beep != 0) diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index 6cc2d58bbbcc..933f1ac0051f 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -179,7 +179,7 @@ freebsd_brand_info_la57_img_compat(const struct image_params *imgp, return (!prefer_uva_la48); } -static Elf64_Brandinfo freebsd_brand_info_la48 = { +static const Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -190,7 +190,7 @@ static Elf64_Brandinfo freebsd_brand_info_la48 = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, }; -static Elf64_Brandinfo freebsd_brand_info_la57 = { +static const Elf64_Brandinfo freebsd_brand_info_la57 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -216,7 +216,7 @@ sysinit_register_elf64_brand_entries(void *arg __unused) SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, sysinit_register_elf64_brand_entries, NULL); -static Elf64_Brandinfo freebsd_brand_oinfo = { +static const Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -226,11 +226,10 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; - -SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo); -static Elf64_Brandinfo kfreebsd_brand_info = { +static const Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -240,8 +239,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; - -SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info); void diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index e35119af8572..ad67510fecf3 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -170,55 +170,63 @@ struct vm_eventinfo { int *iptr; /* reqidle cookie */ }; -typedef int (*vmm_init_func_t)(int ipinum); -typedef int (*vmm_cleanup_func_t)(void); -typedef void (*vmm_suspend_func_t)(void); -typedef void (*vmm_resume_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); -typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, - struct pmap *pmap, struct vm_eventinfo *info); -typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, - int vcpu_id); -typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); -typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); -typedef int (*vmi_set_register_t)(void 
*vcpui, int num, uint64_t val); -typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); -typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); -typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); -typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); -typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); -typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); -typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); -typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); -typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); -typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now); +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + typedef ret_type (*vmmops_##opname##_t) args; \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void, modresume, (void)); +DECLARE_VMMOPS_FUNC(void, modsuspend, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, + struct pmap *pmap, struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getdesc, (void *vcpui, int num, + struct seg_desc *desc)); +DECLARE_VMMOPS_FUNC(int, setdesc, (void *vcpui, int num, + struct seg_desc *desc)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, + (vm_offset_t min, vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); +DECLARE_VMMOPS_FUNC(struct vlapic *, vlapic_init, (void *vcpui)); +DECLARE_VMMOPS_FUNC(void, vlapic_cleanup, (struct vlapic *vlapic)); +DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now)); struct vmm_ops { - vmm_init_func_t modinit; /* module wide initialization */ - vmm_cleanup_func_t modcleanup; - vmm_resume_func_t modsuspend; - vmm_resume_func_t modresume; - - vmi_init_func_t init; /* vm-specific initialization */ - vmi_run_func_t run; - vmi_cleanup_func_t cleanup; - vmi_vcpu_init_func_t vcpu_init; - vmi_vcpu_cleanup_func_t vcpu_cleanup; - vmi_get_register_t getreg; - vmi_set_register_t setreg; - vmi_get_desc_t getdesc; - vmi_set_desc_t setdesc; - vmi_get_cap_t getcap; - vmi_set_cap_t setcap; - vmi_vmspace_alloc vmspace_alloc; - vmi_vmspace_free vmspace_free; - vmi_vlapic_init vlapic_init; - vmi_vlapic_cleanup vlapic_cleanup; + vmmops_modinit_t modinit; /* module wide initialization */ + vmmops_modcleanup_t modcleanup; + vmmops_modresume_t modsuspend; + vmmops_modresume_t modresume; + + vmmops_init_t init; /* vm-specific initialization */ + vmmops_run_t run; + vmmops_cleanup_t cleanup; + vmmops_vcpu_init_t vcpu_init; + vmmops_vcpu_cleanup_t vcpu_cleanup; + vmmops_getreg_t getreg; + vmmops_setreg_t setreg; + vmmops_getdesc_t getdesc; + vmmops_setdesc_t setdesc; + vmmops_getcap_t getcap; + vmmops_setcap_t setcap; + vmmops_vmspace_alloc_t vmspace_alloc; + vmmops_vmspace_free_t 
vmspace_free; + vmmops_vlapic_init_t vlapic_init; + vmmops_vlapic_cleanup_t vlapic_cleanup; /* checkpoint operations */ - vmi_snapshot_vcpu_t vcpu_snapshot; - vmi_restore_tsc_t restore_tsc; + vmmops_vcpu_snapshot_t vcpu_snapshot; + vmmops_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; @@ -229,7 +237,7 @@ extern u_int vm_maxcpu; /* maximum virtual cpus */ int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -354,6 +362,7 @@ enum vcpu_state { }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); +int vcpu_set_state_all(struct vm *vm, enum vcpu_state state); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline @@ -375,7 +384,6 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); -struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index c8579c5da4ad..890cf01c46a0 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -857,7 +857,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux64_brandnote = { +static const Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, @@ -866,7 +866,7 @@ static Elf_Brandnote linux64_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf64_Brandinfo linux_glibc2brand = { +static const Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -877,7 +877,7 @@ static Elf64_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf64_Brandinfo linux_glibc2brandshort = { +static const Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -888,7 +888,7 @@ static Elf64_Brandinfo linux_glibc2brandshort = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf64_Brandinfo linux_muslbrand = { +static const Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -900,7 +900,7 @@ static Elf64_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -static Elf64_Brandinfo *linux_brandlist[] = { +static const Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, @@ -910,7 +910,7 @@ static Elf64_Brandinfo *linux_brandlist[] = { static int linux64_elf_modevent(module_t mod, int type, void *data) { - Elf64_Brandinfo **brandinfo; + const Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 8fac626f9053..735ebb151017 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -954,7 +954,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux32_brandnote = { +static const Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, @@ -963,7 
+963,7 @@ static Elf_Brandnote linux32_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf32_Brandinfo linux_brand = { +static const Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -974,7 +974,7 @@ static Elf32_Brandinfo linux_brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_glibc2brand = { +static const Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -985,7 +985,7 @@ static Elf32_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_muslbrand = { +static const Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -997,7 +997,7 @@ static Elf32_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -static Elf32_Brandinfo *linux_brandlist[] = { +static const Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, @@ -1007,7 +1007,7 @@ static Elf32_Brandinfo *linux_brandlist[] = { static int linux_elf_modevent(module_t mod, int type, void *data) { - Elf32_Brandinfo **brandinfo; + const Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c index c7b75767680a..6b2296de049c 100644 --- a/sys/amd64/pt/pt.c +++ b/sys/amd64/pt/pt.c @@ -42,15 +42,15 @@ */ #include <sys/systm.h> +#include <sys/bus.h> #include <sys/hwt.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mutex.h> -#include <sys/sdt.h> #include <sys/smp.h> -#include <sys/taskqueue.h> #include <vm/vm.h> #include <vm/vm_page.h> @@ -94,12 +94,7 @@ MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); -SDT_PROVIDER_DEFINE(pt); -SDT_PROBE_DEFINE(pt, , , topa__intr); - -TASKQUEUE_FAST_DEFINE_THREAD(pt); - -static void pt_send_buffer_record(void *arg, int pending __unused); +static void pt_send_buffer_record(void *arg); static int pt_topa_intr(struct trapframe *tf); /* @@ -122,29 +117,24 @@ struct pt_buffer { size_t size; struct mtx lock; /* Lock for fields below. */ vm_offset_t offset; - uint64_t wrap_count; - int curpage; }; struct pt_ctx { int id; struct pt_buffer buf; /* ToPA buffer metadata */ - struct task task; /* ToPA buffer notification task */ struct hwt_context *hwt_ctx; uint8_t *save_area; /* PT XSAVE area */ }; /* PT tracing contexts used for CPU mode. */ static struct pt_ctx *pt_pcpu_ctx; -enum pt_cpu_state { - PT_DISABLED = 0, - PT_STOPPED, - PT_ACTIVE -}; +enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE }; static struct pt_cpu { struct pt_ctx *ctx; /* active PT tracing context */ enum pt_cpu_state state; /* used as part of trace stop protocol */ + void *swi_cookie; /* Software interrupt handler context */ + int in_pcint_handler; } *pt_pcpu; /* @@ -199,31 +189,28 @@ static __inline void pt_update_buffer(struct pt_buffer *buf) { uint64_t reg; - int curpage; + uint64_t offset; /* Update buffer offset. */ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); - curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; - mtx_lock_spin(&buf->lock); - /* Check if the output wrapped. 
*/ - if (buf->curpage > curpage) - buf->wrap_count++; - buf->curpage = curpage; - buf->offset = reg >> 32; - mtx_unlock_spin(&buf->lock); - - dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, - buf->wrap_count, buf->curpage, buf->offset); + offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE; + offset += (reg >> 32); + + atomic_store_rel_64(&buf->offset, offset); } static __inline void pt_fill_buffer_record(int id, struct pt_buffer *buf, struct hwt_record_entry *rec) { + vm_offset_t offset; + + offset = atomic_load_acq_64(&buf->offset); + rec->record_type = HWT_RECORD_BUFFER; rec->buf_id = id; - rec->curpage = buf->curpage; - rec->offset = buf->offset + (buf->wrap_count * buf->size); + rec->curpage = offset / PAGE_SIZE; + rec->offset = offset & PAGE_MASK; } /* @@ -273,9 +260,9 @@ pt_cpu_start(void *dummy) MPASS(cpu->ctx != NULL); dprintf("%s: curcpu %d\n", __func__, curcpu); + pt_cpu_set_state(curcpu, PT_ACTIVE); load_cr4(rcr4() | CR4_XSAVE); wrmsr(MSR_IA32_RTIT_STATUS, 0); - pt_cpu_set_state(curcpu, PT_ACTIVE); pt_cpu_toggle_local(cpu->ctx->save_area, true); } @@ -291,16 +278,16 @@ pt_cpu_stop(void *dummy) struct pt_cpu *cpu; struct pt_ctx *ctx; - /* Shutdown may occur before PT gets properly configured. */ - if (pt_cpu_get_state(curcpu) == PT_DISABLED) - return; - cpu = &pt_pcpu[curcpu]; ctx = cpu->ctx; - MPASS(ctx != NULL); - dprintf("%s: curcpu %d\n", __func__, curcpu); - pt_cpu_set_state(curcpu, PT_STOPPED); + dprintf("%s: curcpu %d\n", __func__, curcpu); + /* Shutdown may occur before PT gets properly configured. */ + if (ctx == NULL) { + dprintf("%s: missing context on cpu %d; bailing\n", __func__, + curcpu); + return; + } pt_cpu_toggle_local(cpu->ctx->save_area, false); pt_update_buffer(&ctx->buf); } @@ -406,13 +393,11 @@ pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) return (ENOMEM); dprintf("%s: preparing ToPA buffer\n", __func__); if (pt_topa_prepare(pt_ctx, vm) != 0) { - dprintf("%s: failed to prepare ToPA buffer\n", __func__); free(pt_ctx->save_area, M_PT); return (ENOMEM); } pt_ctx->id = ctx_id; - TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); return (0); } @@ -426,7 +411,6 @@ pt_deinit_ctx(struct pt_ctx *pt_ctx) if (pt_ctx->save_area != NULL) free(pt_ctx->save_area, M_PT); memset(pt_ctx, 0, sizeof(*pt_ctx)); - pt_ctx->buf.topa_hw = NULL; } /* @@ -519,7 +503,6 @@ pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) XSTATE_XCOMP_BV_COMPACT; pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; pt_pcpu[cpu_id].ctx = pt_ctx; - pt_cpu_set_state(cpu_id, PT_STOPPED); return (0); } @@ -549,12 +532,19 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) if (ctx->mode == HWT_MODE_CPU) return; - KASSERT(curcpu == cpu_id, ("%s: attempting to disable PT on another cpu", __func__)); + + cpu = &pt_pcpu[cpu_id]; + + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__, + cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + pt_cpu_stop(NULL); CPU_CLR(cpu_id, &ctx->cpu_map); - cpu = &pt_pcpu[cpu_id]; cpu->ctx = NULL; } @@ -564,14 +554,14 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) static int pt_backend_enable_smp(struct hwt_context *ctx) { - dprintf("%s\n", __func__); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); if (ctx->mode == HWT_MODE_CPU && atomic_swap_32(&cpu_mode_ctr, 1) != 0) return (-1); - KASSERT(ctx->mode == HWT_MODE_CPU, - ("%s: should only be used for CPU mode", 
__func__)); smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); return (0); @@ -583,6 +573,7 @@ pt_backend_enable_smp(struct hwt_context *ctx) static int pt_backend_disable_smp(struct hwt_context *ctx) { + struct pt_cpu *cpu; dprintf("%s\n", __func__); if (ctx->mode == HWT_MODE_CPU && @@ -593,6 +584,14 @@ pt_backend_disable_smp(struct hwt_context *ctx) dprintf("%s: empty cpu map\n", __func__); return (-1); } + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + cpu = &pt_pcpu[cpu_id]; + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", + __func__, cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + } smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); return (0); @@ -611,13 +610,13 @@ pt_backend_init(struct hwt_context *ctx) int error; dprintf("%s\n", __func__); - if (ctx->mode == HWT_MODE_CPU) { - TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { - error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], - hwt_cpu->vm, hwt_cpu->cpu_id); - if (error) - return (error); - } + if (ctx->mode != HWT_MODE_CPU) + return (0); + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm, + hwt_cpu->cpu_id); + if (error) + return (error); } return (0); @@ -647,20 +646,16 @@ pt_backend_deinit(struct hwt_context *ctx) pt_deinit_ctx(pt_ctx); } } else { - CPU_FOREACH(cpu_id) { - if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + if (pt_pcpu[cpu_id].ctx == NULL) continue; - if (pt_pcpu[cpu_id].ctx != NULL) { - KASSERT(pt_pcpu[cpu_id].ctx == - &pt_pcpu_ctx[cpu_id], - ("%s: CPU mode tracing with non-cpu mode PT" - "context active", - __func__)); - pt_pcpu[cpu_id].ctx = NULL; - } - pt_ctx = &pt_pcpu_ctx[cpu_id]; - pt_deinit_ctx(pt_ctx); - memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT" + "context active", + __func__)); + pt_deinit_ctx(pt_pcpu[cpu_id].ctx); + pt_pcpu[cpu_id].ctx = NULL; + atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0); } } @@ -675,15 +670,15 @@ pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, uint64_t *data) { struct pt_buffer *buf; + uint64_t offset; if (vm->ctx->mode == HWT_MODE_THREAD) buf = &((struct pt_ctx *)vm->thr->private)->buf; else buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; - mtx_lock_spin(&buf->lock); - *curpage = buf->curpage; - *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); - mtx_unlock_spin(&buf->lock); + offset = atomic_load_acq_64(&buf->offset); + *curpage = offset / PAGE_SIZE; + *curpage_offset = offset & PAGE_MASK; return (0); } @@ -762,15 +757,13 @@ static struct hwt_backend backend = { * Used as a taskqueue routine from the ToPA interrupt handler. */ static void -pt_send_buffer_record(void *arg, int pending __unused) +pt_send_buffer_record(void *arg) { + struct pt_cpu *cpu = (struct pt_cpu *)arg; struct hwt_record_entry record; - struct pt_ctx *ctx = (struct pt_ctx *)arg; - /* Prepare buffer record. 
*/ - mtx_lock_spin(&ctx->buf.lock); + struct pt_ctx *ctx = cpu->ctx; pt_fill_buffer_record(ctx->id, &ctx->buf, &record); - mtx_unlock_spin(&ctx->buf.lock); hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); } static void @@ -795,36 +788,40 @@ static int pt_topa_intr(struct trapframe *tf) { struct pt_buffer *buf; + struct pt_cpu *cpu; struct pt_ctx *ctx; uint64_t reg; - SDT_PROBE0(pt, , , topa__intr); - - if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { - return (0); - } + cpu = &pt_pcpu[curcpu]; reg = rdmsr(MSR_IA_GLOBAL_STATUS); if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { - /* ACK spurious or leftover interrupt. */ pt_topa_status_clear(); + return (0); + } + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { return (1); } + atomic_set_int(&cpu->in_pcint_handler, 1); - ctx = pt_pcpu[curcpu].ctx; + ctx = cpu->ctx; + KASSERT(ctx != NULL, + ("%s: cpu %d: ToPA PMI interrupt without an active context", + __func__, curcpu)); buf = &ctx->buf; KASSERT(buf->topa_hw != NULL, - ("%s: ToPA PMI interrupt with invalid buffer", __func__)); - + ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__, + curcpu)); pt_cpu_toggle_local(ctx->save_area, false); pt_update_buffer(buf); pt_topa_status_clear(); - taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, - TASKQUEUE_FAIL_IF_PENDING); if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + swi_sched(cpu->swi_cookie, SWI_FROMNMI); pt_cpu_toggle_local(ctx->save_area, true); lapic_reenable_pcint(); } + atomic_set_int(&cpu->in_pcint_handler, 0); return (1); } @@ -839,7 +836,7 @@ static int pt_init(void) { u_int cp[4]; - int error; + int error, i; dprintf("pt: Enumerating part 1\n"); cpuid_count(CPUID_PT_LEAF, 0, cp); @@ -869,20 +866,38 @@ pt_init(void) pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, M_ZERO | M_WAITOK); + for (i = 0; i < mp_ncpus; i++) { + error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record, + &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE, + &pt_pcpu[i].swi_cookie); + if (error != 0) { + dprintf( + "%s: failed to add interrupt handler for cpu: %d\n", + __func__, error); + goto err; + } + } + nmi_register_handler(pt_topa_intr); - if (!lapic_enable_pcint()) { - nmi_remove_handler(pt_topa_intr); - hwt_backend_unregister(&backend); - free(pt_pcpu, M_PT); - free(pt_pcpu_ctx, M_PT); - pt_pcpu = NULL; - pt_pcpu_ctx = NULL; + if (lapic_enable_pcint()) { + initialized = true; + return (0); + } else printf("pt: failed to setup interrupt line\n"); - return (error); +err: + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + if (pt_pcpu[i].swi_cookie != 0) + swi_remove(pt_pcpu[i].swi_cookie); } - initialized = true; + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; - return (0); + return (error); } /* @@ -941,14 +956,24 @@ pt_supported(void) static void pt_deinit(void) { + int i; + struct pt_cpu *cpu; + if (!initialized) return; nmi_remove_handler(pt_topa_intr); lapic_disable_pcint(); hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + cpu = &pt_pcpu[i]; + swi_remove(cpu->swi_cookie); + } + free(pt_pcpu, M_PT); free(pt_pcpu_ctx, M_PT); pt_pcpu = NULL; + pt_pcpu_ctx = NULL; initialized = false; } diff --git a/sys/amd64/sgx/sgx_linux.c b/sys/amd64/sgx/sgx_linux.c index 6ecef9207a38..d389edc1b2b0 100644 --- a/sys/amd64/sgx/sgx_linux.c +++ b/sys/amd64/sgx/sgx_linux.c @@ -92,16 +92,7 @@ out: return (error); } -static struct linux_ioctl_handler sgx_linux_handler = { - sgx_linux_ioctl, - SGX_LINUX_IOCTL_MIN, - 
SGX_LINUX_IOCTL_MAX, -}; - -SYSINIT(sgx_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &sgx_linux_handler); -SYSUNINIT(sgx_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &sgx_linux_handler); +LINUX_IOCTL_SET(sgx, SGX_LINUX_IOCTL_MIN, SGX_LINUX_IOCTL_MAX); static int sgx_linux_modevent(module_t mod, int type, void *data) diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c42da02d0bf6..f7c59847140b 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -163,7 +163,6 @@ struct vm { void *rendezvous_arg; /* (x) [r] rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ - struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) [m+v] guest memory */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (o) guest vcpus */ @@ -201,7 +200,7 @@ vmmops_panic(void) } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ + DEFINE_IFUNC(, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ @@ -499,7 +498,7 @@ MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { - vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); @@ -563,9 +562,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -584,7 +583,7 @@ int vm_create(const char *name, struct vm **retvm) { struct vm *vm; - struct vmspace *vmspace; + int error; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -597,14 +596,13 @@ vm_create(const char *name, struct vm **retvm) VM_MAX_NAMELEN + 1) return (EINVAL); - vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); - if (vmspace == NULL) - return (ENOMEM); - vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48); + if (error != 0) { + free(vm, M_VM); + return (error); + } strcpy(vm->name, name); - vm->vmspace = vmspace; - vm_mem_init(&vm->mem); mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | @@ -685,9 +683,6 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_mem_destroy(vm); - vmmops_vmspace_free(vm->vmspace); - vm->vmspace = NULL; - free(vm->vcpu, M_VM); sx_destroy(&vm->vcpus_init_lock); mtx_destroy(&vm->rendezvous_mtx); @@ -731,7 +726,7 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; - if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); @@ -741,19 +736,21 @@ int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - vmm_mmio_free(vm->vmspace, gpa, len); + vmm_mmio_free(vm_vmspace(vm), gpa, len); return (0); } static int vm_iommu_map(struct vm *vm) { + pmap_t pmap; vm_paddr_t gpa, hpa; struct vm_mem_map *mm; int error, i; sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); + pmap = vmspace_pmap(vm_vmspace(vm)); for (i = 0; i < VM_MAX_MEMMAPS; i++) { if (!vm_memseg_sysmem(vm, i)) continue; @@ -767,7 +764,7 @@ vm_iommu_map(struct vm *vm) 
mm->flags |= VM_MEMMAP_F_IOMMU; for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { - hpa = pmap_extract(vmspace_pmap(vm->vmspace), gpa); + hpa = pmap_extract(pmap, gpa); /* * All mappings in the vmm vmspace must be @@ -816,7 +813,7 @@ vm_iommu_unmap(struct vm *vm) for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract( - vmspace_pmap(vm->vmspace), gpa))), + vmspace_pmap(vm_vmspace(vm)), gpa))), ("vm_iommu_unmap: vm %p gpa %jx not wired", vm, (uintmax_t)gpa)); iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); @@ -993,6 +990,54 @@ save_guest_fpustate(struct vcpu *vcpu) static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); +/* + * Invoke the rendezvous function on the specified vcpu if applicable. Return + * true if the rendezvous is finished, false otherwise. + */ +static bool +vm_rendezvous(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + int vcpuid; + + mtx_assert(&vcpu->vm->rendezvous_mtx, MA_OWNED); + KASSERT(vcpu->vm->rendezvous_func != NULL, + ("vm_rendezvous: no rendezvous pending")); + + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, + &vm->active_cpus); + + vcpuid = vcpu->vcpuid; + if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && + !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { + VMM_CTR0(vcpu, "Calling rendezvous func"); + (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); + CPU_SET(vcpuid, &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VMM_CTR0(vcpu, "Rendezvous completed"); + CPU_ZERO(&vm->rendezvous_req_cpus); + vm->rendezvous_func = NULL; + wakeup(&vm->rendezvous_func); + return (true); + } + return (false); +} + +static void +vcpu_wait_idle(struct vcpu *vcpu) +{ + KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle")); + + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VMM_CTR1(vcpu, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +} + static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) @@ -1007,13 +1052,8 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, * ioctl() operating on a vcpu at any point. */ if (from_idle) { - while (vcpu->state != VCPU_IDLE) { - vcpu->reqidle = 1; - vcpu_notify_event_locked(vcpu, false); - VMM_CTR1(vcpu, "vcpu state change from %s to " - "idle requested", vcpu_state2str(vcpu->state)); - msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); - } + while (vcpu->state != VCPU_IDLE) + vcpu_wait_idle(vcpu); } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); @@ -1065,6 +1105,95 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, return (0); } +/* + * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks + * with vm_smp_rendezvous(). + * + * The complexity here suggests that the rendezvous mechanism needs a rethink. 
+ */ +int +vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) +{ + cpuset_t locked; + struct vcpu *vcpu; + int error, i; + uint16_t maxcpus; + + KASSERT(newstate != VCPU_IDLE, + ("vcpu_set_state_all: invalid target state %d", newstate)); + + error = 0; + CPU_ZERO(&locked); + maxcpus = vm->maxcpus; + + mtx_lock(&vm->rendezvous_mtx); +restart: + if (vm->rendezvous_func != NULL) { + /* + * If we have a pending rendezvous, then the initiator may be + * blocked waiting for other vCPUs to execute the callback. The + * current thread may be a vCPU thread so we must not block + * waiting for the initiator, otherwise we get a deadlock. + * Thus, execute the callback on behalf of any idle vCPUs. + */ + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + if (vcpu->state == VCPU_IDLE) { + (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN, + true); + CPU_SET(i, &locked); + } + if (CPU_ISSET(i, &locked)) { + /* + * We can safely execute the callback on this + * vCPU's behalf. + */ + vcpu_unlock(vcpu); + (void)vm_rendezvous(vcpu); + vcpu_lock(vcpu); + } + vcpu_unlock(vcpu); + } + } + + /* + * Now wait for remaining vCPUs to become idle. This may include the + * initiator of a rendezvous that is currently blocked on the rendezvous + * mutex. + */ + CPU_FOREACH_ISCLR(i, &locked) { + if (i >= maxcpus) + break; + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + while (vcpu->state != VCPU_IDLE) { + mtx_unlock(&vm->rendezvous_mtx); + vcpu_wait_idle(vcpu); + vcpu_unlock(vcpu); + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) + goto restart; + vcpu_lock(vcpu); + } + error = vcpu_set_state_locked(vcpu, newstate, true); + vcpu_unlock(vcpu); + if (error != 0) { + /* Roll back state changes. 
*/ + CPU_FOREACH_ISSET(i, &locked) + (void)vcpu_set_state(vcpu, VCPU_IDLE, false); + break; + } + CPU_SET(i, &locked); + } + mtx_unlock(&vm->rendezvous_mtx); + return (error); +} + static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { @@ -1086,36 +1215,23 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) static int vm_handle_rendezvous(struct vcpu *vcpu) { - struct vm *vm = vcpu->vm; + struct vm *vm; struct thread *td; - int error, vcpuid; - error = 0; - vcpuid = vcpu->vcpuid; td = curthread; + vm = vcpu->vm; + mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { - /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ - CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); - - if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && - !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { - VMM_CTR0(vcpu, "Calling rendezvous func"); - (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); - CPU_SET(vcpuid, &vm->rendezvous_done_cpus); - } - if (CPU_CMP(&vm->rendezvous_req_cpus, - &vm->rendezvous_done_cpus) == 0) { - VMM_CTR0(vcpu, "Rendezvous completed"); - CPU_ZERO(&vm->rendezvous_req_cpus); - vm->rendezvous_func = NULL; - wakeup(&vm->rendezvous_func); + if (vm_rendezvous(vcpu)) break; - } + VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { + int error; + mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) @@ -1249,7 +1365,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { - rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm_vmspace(vm)), vme->u.paging.gpa, ftype); if (rv == 0) { VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", @@ -1259,7 +1375,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) } } - map = &vm->vmspace->vm_map; + map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " @@ -1560,7 +1676,7 @@ vm_run(struct vcpu *vcpu) if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_req_cpus; evinfo.sptr = &vm->suspend; @@ -2302,12 +2418,6 @@ vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) vcpu_unlock(vcpu); } -struct vmspace * -vm_vmspace(struct vm *vm) -{ - return (vm->vmspace); -} - struct vm_mem * vm_mem(struct vm *vm) { @@ -2519,7 +2629,7 @@ vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * - vmspace_resident_count(vcpu->vm->vmspace)); + vmspace_resident_count(vm_vmspace(vcpu->vm))); } } @@ -2529,7 +2639,7 @@ vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * - pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); + pmap_wired_count(vmspace_pmap(vm_vmspace(vcpu->vm)))); } } diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c index d8d2b460404c..dfebc9dcadbf 100644 --- a/sys/amd64/vmm/vmm_dev_machdep.c +++ b/sys/amd64/vmm/vmm_dev_machdep.c @@ -48,6 +48,7 @@ #include <x86/apicreg.h> #include <dev/vmm/vmm_dev.h> +#include <dev/vmm/vmm_mem.h> #include <dev/vmm/vmm_stat.h> 
#include "vmm_lapic.h" diff --git a/sys/arm/allwinner/aw_sid.c b/sys/arm/allwinner/aw_sid.c index ba5faca33c5e..932c2f189e51 100644 --- a/sys/arm/allwinner/aw_sid.c +++ b/sys/arm/allwinner/aw_sid.c @@ -297,7 +297,7 @@ aw_sid_attach(device_t dev) /* Register ourself so device can resolve who we are */ OF_device_register_xref(OF_xref_from_node(node), dev); - for (i = 0; i < sc->sid_conf->nfuses ;i++) {\ + for (i = 0; i < sc->sid_conf->nfuses; i++) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, sc->sid_conf->efuses[i].name, diff --git a/sys/arm/arm/elf_machdep.c b/sys/arm/arm/elf_machdep.c index ea6437f320ce..881c4fcff475 100644 --- a/sys/arm/arm/elf_machdep.c +++ b/sys/arm/arm/elf_machdep.c @@ -106,7 +106,7 @@ struct sysentvec elf32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); -static Elf32_Brandinfo freebsd_brand_info = { +static const Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_ARM, .compat_3_brand = "FreeBSD", @@ -118,7 +118,7 @@ static Elf32_Brandinfo freebsd_brand_info = { .header_supported= elf32_arm_abi_supported, }; -SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 78883296c5b7..6a0ece1e4d98 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -1246,7 +1246,7 @@ pmap_bootstrap(vm_offset_t firstaddr) } static void -pmap_init_reserved_pages(void) +pmap_init_reserved_pages(void *dummy __unused) { struct pcpu *pc; vm_offset_t pages; diff --git a/sys/arm/arm/unwind.c b/sys/arm/arm/unwind.c index 7ad91a3e01a5..0d77074fae34 100644 --- a/sys/arm/arm/unwind.c +++ b/sys/arm/arm/unwind.c @@ -278,7 +278,7 @@ unwind_module_unloaded(struct linker_file *lf) * the unwind tables might be stripped, so instead we have to use the * _exidx_start/end symbols created by ldscript.arm. 
*/ -static int +static void module_info_init(void *arg __unused) { struct linker_file thekernel; @@ -291,8 +291,6 @@ module_info_init(void *arg __unused) thekernel.exidx_addr = CADDR(&_exidx_start); thekernel.exidx_size = UADDR(&_exidx_end) - UADDR(&_exidx_start); populate_module_info(create_module_info(), &thekernel); - - return (0); } SYSINIT(unwind_init, SI_SUB_KMEM, SI_ORDER_ANY, module_info_init, NULL); diff --git a/sys/arm/ti/ti_pruss.c b/sys/arm/ti/ti_pruss.c index 4e9f2022240c..bae1de9f2ddf 100644 --- a/sys/arm/ti/ti_pruss.c +++ b/sys/arm/ti/ti_pruss.c @@ -793,6 +793,7 @@ static const struct filterops ti_pruss_kq_read = { .f_isfd = 1, .f_detach = ti_pruss_irq_kqread_detach, .f_event = ti_pruss_irq_kqevent, + .f_copy = knote_triv_copy, }; static void diff --git a/sys/arm64/arm64/cpu_errata.c b/sys/arm64/arm64/cpu_errata.c index 989924bc0567..b876703a2a15 100644 --- a/sys/arm64/arm64/cpu_errata.c +++ b/sys/arm64/arm64/cpu_errata.c @@ -52,56 +52,11 @@ struct cpu_quirks { u_int flags; }; -static enum { - SSBD_FORCE_ON, - SSBD_FORCE_OFF, - SSBD_KERNEL, -} ssbd_method = SSBD_KERNEL; - -static cpu_quirk_install install_psci_bp_hardening; -static cpu_quirk_install install_ssbd_workaround; static cpu_quirk_install install_thunderx_bcast_tlbi_workaround; static struct cpu_quirks cpu_quirks[] = { { .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A57,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A72,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A73,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A75,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = - CPU_ID_RAW(CPU_IMPL_CAVIUM, CPU_PART_THUNDERX2, 0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = 0, - .midr_value = 0, - .quirk_install = install_ssbd_workaround, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, .midr_value = CPU_ID_RAW(CPU_IMPL_CAVIUM, CPU_PART_THUNDERX, 0, 0), .quirk_install = install_thunderx_bcast_tlbi_workaround, @@ -114,57 +69,6 @@ static struct cpu_quirks cpu_quirks[] = { }, }; -static void -install_psci_bp_hardening(void) -{ - /* SMCCC depends on PSCI. If PSCI is missing so is SMCCC */ - if (!psci_present) - return; - - if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_1) != SMCCC_RET_SUCCESS) - return; - - PCPU_SET(bp_harden, smccc_arch_workaround_1); -} - -static void -install_ssbd_workaround(void) -{ - char *env; - - if (PCPU_GET(cpuid) == 0) { - env = kern_getenv("kern.cfg.ssbd"); - if (env != NULL) { - if (strcmp(env, "force-on") == 0) { - ssbd_method = SSBD_FORCE_ON; - } else if (strcmp(env, "force-off") == 0) { - ssbd_method = SSBD_FORCE_OFF; - } - } - } - - /* SMCCC depends on PSCI. 
If PSCI is missing so is SMCCC */ - if (!psci_present) - return; - - /* Enable the workaround on this CPU if it's enabled in the firmware */ - if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_2) != SMCCC_RET_SUCCESS) - return; - - switch(ssbd_method) { - case SSBD_FORCE_ON: - smccc_arch_workaround_2(1); - break; - case SSBD_FORCE_OFF: - smccc_arch_workaround_2(0); - break; - case SSBD_KERNEL: - default: - PCPU_SET(ssbd, smccc_arch_workaround_2); - break; - } -} - /* * Workaround Cavium erratum 27456. * diff --git a/sys/arm64/arm64/elf_machdep.c b/sys/arm64/arm64/elf_machdep.c index 13af5c5065d6..207b37180a26 100644 --- a/sys/arm64/arm64/elf_machdep.c +++ b/sys/arm64/arm64/elf_machdep.c @@ -121,7 +121,7 @@ static struct sysentvec elf64_freebsd_sysvec = { }; INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); -static Elf64_Brandinfo freebsd_brand_info = { +static const Elf64_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_AARCH64, .compat_3_brand = "FreeBSD", @@ -131,8 +131,7 @@ static Elf64_Brandinfo freebsd_brand_info = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; - -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info); static bool @@ -336,7 +335,7 @@ elf_cpu_parse_dynamic(caddr_t loadbase __unused, Elf_Dyn *dynamic __unused) return (0); } -static Elf_Note gnu_property_note = { +static const Elf_Note gnu_property_note = { .n_namesz = sizeof(GNU_ABI_VENDOR), .n_descsz = 16, .n_type = NT_GNU_PROPERTY_TYPE_0, diff --git a/sys/arm64/arm64/spec_workaround.c b/sys/arm64/arm64/spec_workaround.c new file mode 100644 index 000000000000..7f4f86cdb48c --- /dev/null +++ b/sys/arm64/arm64/spec_workaround.c @@ -0,0 +1,166 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Arm Ltd + * Copyright (c) 2018 Andrew Turner + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237 + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/pcpu.h> +#include <sys/systm.h> + +#include <machine/cpu.h> +#include <machine/cpu_feat.h> + +#include <dev/psci/psci.h> +#include <dev/psci/smccc.h> + +static enum { + SSBD_FORCE_ON, + SSBD_FORCE_OFF, + SSBD_KERNEL, +} ssbd_method = SSBD_KERNEL; + +struct psci_bp_hardening_impl { + u_int midr_mask; + u_int midr_value; +}; + +static struct psci_bp_hardening_impl psci_bp_hardening_impl[] = { + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A57,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A72,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A73,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A75,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = + CPU_ID_RAW(CPU_IMPL_CAVIUM, CPU_PART_THUNDERX2, 0,0), + } +}; + +static cpu_feat_en +psci_bp_hardening_check(const struct cpu_feat *feat __unused, u_int midr) +{ + size_t i; + + for (i = 0; i < nitems(psci_bp_hardening_impl); i++) { + if ((midr & psci_bp_hardening_impl[i].midr_mask) == + psci_bp_hardening_impl[i].midr_value) { + /* SMCCC depends on PSCI. If PSCI is missing so is SMCCC */ + if (!psci_present) + return (FEAT_ALWAYS_DISABLE); + + if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_1) != + SMCCC_RET_SUCCESS) + return (FEAT_ALWAYS_DISABLE); + + return (FEAT_DEFAULT_ENABLE); + } + } + + return (FEAT_ALWAYS_DISABLE); +} + +static bool +psci_bp_hardening_enable(const struct cpu_feat *feat __unused, + cpu_feat_errata errata_status __unused, u_int *errata_list __unused, + u_int errata_count __unused) +{ + PCPU_SET(bp_harden, smccc_arch_workaround_1); + + return (true); +} + +CPU_FEAT(feat_csv2_missing, "Branch Predictor Hardening", + psci_bp_hardening_check, NULL, psci_bp_hardening_enable, NULL, + CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU); + +static cpu_feat_en +ssbd_workaround_check(const struct cpu_feat *feat __unused, u_int midr __unused) +{ + char *env; + + if (PCPU_GET(cpuid) == 0) { + env = kern_getenv("kern.cfg.ssbd"); + if (env != NULL) { + if (strcmp(env, "force-on") == 0) { + ssbd_method = SSBD_FORCE_ON; + } else if (strcmp(env, "force-off") == 0) { + ssbd_method = SSBD_FORCE_OFF; + } + } + } + + /* SMCCC depends on PSCI. 
If PSCI is missing so is SMCCC */ + if (!psci_present) + return (FEAT_ALWAYS_DISABLE); + + /* Enable the workaround on this CPU if it's enabled in the firmware */ + if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_2) != SMCCC_RET_SUCCESS) + return (FEAT_ALWAYS_DISABLE); + + return (FEAT_DEFAULT_ENABLE); +} + +static bool +ssbd_workaround_enable(const struct cpu_feat *feat __unused, + cpu_feat_errata errata_status __unused, u_int *errata_list __unused, + u_int errata_count __unused) +{ + switch(ssbd_method) { + case SSBD_FORCE_ON: + smccc_arch_workaround_2(1); + break; + case SSBD_FORCE_OFF: + smccc_arch_workaround_2(0); + break; + case SSBD_KERNEL: + default: + PCPU_SET(ssbd, smccc_arch_workaround_2); + break; + } + + return (true); +} + +CPU_FEAT(feat_ssbs_missing, "Speculator Store Bypass Disable Workaround", + ssbd_workaround_check, NULL, ssbd_workaround_enable, NULL, + CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU); diff --git a/sys/arm64/coresight/coresight.c b/sys/arm64/coresight/coresight.c index 5928c153f4ae..9b9d3c65ecc9 100644 --- a/sys/arm64/coresight/coresight.c +++ b/sys/arm64/coresight/coresight.c @@ -113,7 +113,7 @@ coresight_get_output_device(struct endpoint *endp, struct endpoint **out_endp) } static void -coresight_init(void) +coresight_init(void *dummy __unused) { mtx_init(&cs_mtx, "ARM Coresight", NULL, MTX_DEF); diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h index da051e8f7c8a..393d6d89da0c 100644 --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -2180,6 +2180,7 @@ #define OSLAR_EL1_CRn 1 #define OSLAR_EL1_CRm 0 #define OSLAR_EL1_op2 4 +#define OSLAR_OSLK (0x1ul << 0) /* OSLSR_EL1 */ #define OSLSR_EL1_op0 2 @@ -2187,6 +2188,10 @@ #define OSLSR_EL1_CRn 1 #define OSLSR_EL1_CRm 1 #define OSLSR_EL1_op2 4 +#define OSLSR_OSLM_1 (0x1ul << 3) +#define OSLSR_nTT (0x1ul << 2) +#define OSLSR_OSLK (0x1ul << 1) +#define OSLSR_OSLM_0 (0x1ul << 0) /* PAR_EL1 - Physical Address Register */ #define PAR_F_SHIFT 0 diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index e839b5dd92c9..696a69669a2a 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -143,10 +143,41 @@ struct vm_eventinfo { int *iptr; /* reqidle cookie */ }; +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); +#ifdef notyet +#ifdef BHYVE_SNAPSHOT +DECLARE_VMMOPS_FUNC(int, snapshot, (void *vmi, struct 
vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now)); +#endif +#endif + int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -232,7 +263,6 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); -struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c index 084b7a11b01f..ac05820f89bc 100644 --- a/sys/arm64/linux/linux_sysvec.c +++ b/sys/arm64/linux/linux_sysvec.c @@ -584,7 +584,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux64_brandnote = { +static const Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, @@ -593,7 +593,7 @@ static Elf_Brandnote linux64_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf64_Brandinfo linux_glibc2brand = { +static const Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_AARCH64, .compat_3_brand = "Linux", @@ -604,7 +604,7 @@ static Elf64_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -Elf64_Brandinfo *linux_brandlist[] = { +const Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, NULL }; @@ -612,8 +612,8 @@ Elf64_Brandinfo *linux_brandlist[] = { static int linux64_elf_modevent(module_t mod, int type, void *data) { - Elf64_Brandinfo **brandinfo; - struct linux_ioctl_handler**lihp; + const Elf64_Brandinfo **brandinfo; + struct linux_ioctl_handler **lihp; int error; error = 0; diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h index 334b795832a3..f530dab05331 100644 --- a/sys/arm64/vmm/arm64.h +++ b/sys/arm64/vmm/arm64.h @@ -119,6 +119,7 @@ struct hypctx { struct vgic_v3_regs vgic_v3_regs; struct vgic_v3_cpu *vgic_cpu; bool has_exception; + bool dbg_oslock; }; struct hyp { @@ -135,37 +136,6 @@ struct hyp { struct hypctx *ctx[]; }; -#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - ret_type vmmops_##opname args; - -DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) -DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) -DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) -DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) -DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, - struct vm_eventinfo *info)) -DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) -DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, - int vcpu_id)) -DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) -DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)) -DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) -DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) -DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) -DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) -DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, - vm_offset_t max)) 
-DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) -#ifdef notyet -#ifdef BHYVE_SNAPSHOT -DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) -DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, - struct vm_snapshot_meta *meta)) -DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) -#endif -#endif - uint64_t vmm_call_hyp(uint64_t, ...); #if 0 diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index 1dcefa1489e9..bf52dc0fe916 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -88,7 +88,6 @@ struct vcpu { struct vfpstate *guestfpu; /* (a,i) guest fpu state */ }; -#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) @@ -126,7 +125,6 @@ struct vm { bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ - struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ @@ -274,6 +272,7 @@ vcpu_cleanup(struct vcpu *vcpu, bool destroy) vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); + free(vcpu, M_VMM); } } @@ -407,7 +406,7 @@ vm_init(struct vm *vm, bool create) { int i; - vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); @@ -470,9 +469,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -485,7 +484,7 @@ int vm_create(const char *name, struct vm **retvm) { struct vm *vm; - struct vmspace *vmspace; + int error; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -497,14 +496,13 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); - vmspace = vmmops_vmspace_alloc(0, 1ul << 39); - if (vmspace == NULL) - return (ENOMEM); - vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, 1ul << 39); + if (error != 0) { + free(vm, M_VMM); + return (error); + } strcpy(vm->name, name); - vm->vmspace = vmspace; - vm_mem_init(&vm->mem); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; @@ -558,7 +556,7 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_xlock_memsegs(vm); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); sched_pin(); PCPU_SET(curvmpmap, NULL); sched_unpin(); @@ -582,11 +580,6 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_mem_destroy(vm); - vmmops_vmspace_free(vm->vmspace); - vm->vmspace = NULL; - - for (i = 0; i < vm->maxcpus; i++) - free(vm->vcpu[i], M_VMM); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } @@ -651,6 +644,33 @@ vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg) return (0); } +static int +vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hypctx *hypctx; + + hypctx = vcpu_get_cookie(vcpu); + /* All other fields are RES0 & we don't do anything with this */ + /* TODO: Disable access to other debug state when locked */ + hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK; + 
return (0); +} + +static int +vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hypctx *hypctx; + uint64_t val; + + hypctx = vcpu_get_cookie(vcpu); + val = OSLSR_OSLM_1; + if (hypctx->dbg_oslock) + val |= OSLSR_OSLK; + *rval = val; + + return (0); +} + static const struct vmm_special_reg vmm_special_regs[] = { #define SPECIAL_REG(_reg, _read, _write) \ { \ @@ -707,6 +727,13 @@ static const struct vmm_special_reg vmm_special_regs[] = { SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read, vtimer_phys_tval_write), SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write), + + /* Debug registers */ + SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi), + SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi), + /* TODO: Exceptions on invalid access */ + SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1), + SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi), #undef SPECIAL_REG }; @@ -1056,12 +1083,6 @@ vcpu_notify_event(struct vcpu *vcpu) vcpu_unlock(vcpu); } -struct vmspace * -vm_vmspace(struct vm *vm) -{ - return (vm->vmspace); -} - struct vm_mem * vm_mem(struct vm *vm) { @@ -1382,7 +1403,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) vme = &vcpu->exitinfo; - pmap = vmspace_pmap(vcpu->vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vcpu->vm)); addr = vme->u.paging.gpa; esr = vme->u.paging.esr; @@ -1399,7 +1420,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) panic("%s: Invalid exception (esr = %lx)", __func__, esr); } - map = &vm->vmspace->vm_map; + map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) return (EFAULT); @@ -1473,7 +1494,7 @@ vm_run(struct vcpu *vcpu) if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = NULL; evinfo.sptr = &vm->suspend; diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index 1facab47473c..0d844a6fbf9e 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -44,6 +44,7 @@ #include <sys/malloc.h> #include <sys/endian.h> #include <sys/cons.h> +#include <sys/power.h> #include <sys/proc.h> #include <sys/reboot.h> #include <sys/sbuf.h> @@ -878,8 +879,8 @@ static int adaerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags); static callout_func_t adasendorderedtag; static void adashutdown(void *arg, int howto); -static void adasuspend(void *arg); -static void adaresume(void *arg); +static void adasuspend(void *arg, enum power_stype stype); +static void adaresume(void *arg, enum power_stype stype); #ifndef ADA_DEFAULT_TIMEOUT #define ADA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */ @@ -3747,7 +3748,7 @@ adashutdown(void *arg, int howto) } static void -adasuspend(void *arg) +adasuspend(void *arg, enum power_stype stype) { adaflush(); @@ -3760,7 +3761,7 @@ adasuspend(void *arg) } static void -adaresume(void *arg) +adaresume(void *arg, enum power_stype stype) { struct cam_periph *periph; struct ada_softc *softc; diff --git a/sys/cam/nvme/nvme_da.c b/sys/cam/nvme/nvme_da.c index 1c0d5e8381d8..9c4707da482c 100644 --- a/sys/cam/nvme/nvme_da.c +++ b/sys/cam/nvme/nvme_da.c @@ -43,6 +43,7 @@ #include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/cons.h> +#include <sys/power.h> #include <sys/proc.h> #include <sys/reboot.h> #include <sys/sbuf.h> @@ -159,7 +160,7 @@ static void ndadone(struct cam_periph *periph, static int ndaerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags); static void 
ndashutdown(void *arg, int howto); -static void ndasuspend(void *arg); +static void ndasuspend(void *arg, enum power_stype stype); #ifndef NDA_DEFAULT_SEND_ORDERED #define NDA_DEFAULT_SEND_ORDERED 1 @@ -1365,7 +1366,7 @@ ndashutdown(void *arg, int howto) } static void -ndasuspend(void *arg) +ndasuspend(void *arg, enum power_stype stype) { ndaflush(); diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c index b518f84454ad..fd128e69f1f1 100644 --- a/sys/cam/scsi/scsi_all.c +++ b/sys/cam/scsi/scsi_all.c @@ -112,7 +112,7 @@ static void fetchtableentries(int sense_key, int asc, int ascq, const struct asc_table_entry **); #ifdef _KERNEL -static void init_scsi_delay(void); +static void init_scsi_delay(void *); static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS); static int set_scsi_delay(int delay); #endif @@ -686,7 +686,7 @@ scsi_op_desc(uint16_t opcode, struct scsi_inquiry_data *inq_data) opmask = 1 << pd_type; for (j = 0; j < num_tables; j++) { - for (i = 0;i < num_ops[j] && table[j][i].opcode <= opcode; i++){ + for (i = 0; i < num_ops[j] && table[j][i].opcode <= opcode; i++) { if ((table[j][i].opcode == opcode) && ((table[j][i].opmask & opmask) != 0)) return(table[j][i].desc); @@ -9379,7 +9379,7 @@ scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id) } static void -init_scsi_delay(void) +init_scsi_delay(void *dummy __unused) { int delay; diff --git a/sys/cam/scsi/scsi_enc_ses.c b/sys/cam/scsi/scsi_enc_ses.c index 435874a9874a..3a362eaf11a4 100644 --- a/sys/cam/scsi/scsi_enc_ses.c +++ b/sys/cam/scsi/scsi_enc_ses.c @@ -2302,7 +2302,7 @@ ses_print_addl_data_sas_type0(char *sesname, struct sbuf *sbp, sbuf_putc(sbp, '\n'); if (addl->proto_data.sasdev_phys == NULL) return; - for (i = 0;i < addl->proto_hdr.sas->base_hdr.num_phys;i++) { + for (i = 0; i < addl->proto_hdr.sas->base_hdr.num_phys; i++) { phy = &addl->proto_data.sasdev_phys[i]; sbuf_printf(sbp, "%s: phy %d:", sesname, i); if (ses_elm_sas_dev_phy_sata_dev(phy)) @@ -2349,7 +2349,7 @@ ses_print_addl_data_sas_type1(char *sesname, struct sbuf *sbp, sbuf_printf(sbp, "Expander: %d phys", num_phys); if (addl->proto_data.sasexp_phys == NULL) return; - for (i = 0;i < num_phys;i++) { + for (i = 0; i < num_phys; i++) { exp_phy = &addl->proto_data.sasexp_phys[i]; sbuf_printf(sbp, "%s: phy %d: connector %d other %d\n", sesname, i, exp_phy->connector_index, @@ -2360,7 +2360,7 @@ ses_print_addl_data_sas_type1(char *sesname, struct sbuf *sbp, sbuf_printf(sbp, "Port: %d phys", num_phys); if (addl->proto_data.sasport_phys == NULL) return; - for (i = 0;i < num_phys;i++) { + for (i = 0; i < num_phys; i++) { port_phy = &addl->proto_data.sasport_phys[i]; sbuf_printf(sbp, "%s: phy %d: id %d connector %d other %d\n", diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c index c3587421c176..b44ab866dfe7 100644 --- a/sys/cam/scsi/scsi_pass.c +++ b/sys/cam/scsi/scsi_pass.c @@ -206,7 +206,8 @@ static struct cdevsw pass_cdevsw = { static const struct filterops passread_filtops = { .f_isfd = 1, .f_detach = passreadfiltdetach, - .f_event = passreadfilt + .f_event = passreadfilt, + .f_copy = knote_triv_copy, }; static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers"); diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c index 21c78e35dadc..39ce2bcea8f4 100644 --- a/sys/cam/scsi/scsi_target.c +++ b/sys/cam/scsi/scsi_target.c @@ -108,6 +108,7 @@ static const struct filterops targread_filtops = { .f_isfd = 1, .f_detach = targreadfiltdetach, .f_event = targreadfilt, + .f_copy = knote_triv_copy, }; static 
struct cdevsw targ_cdevsw = { diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris.c b/sys/cddl/compat/opensolaris/kern/opensolaris.c index 10924977c20d..898b2ea49f96 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris.c @@ -67,7 +67,7 @@ opensolaris_load(void *dummy) SYSINIT(opensolaris_register, SI_SUB_OPENSOLARIS, SI_ORDER_FIRST, opensolaris_load, NULL); static void -opensolaris_unload(void) +opensolaris_unload(void *dummy __unused) { mutex_destroy(&cpu_lock); } diff --git a/sys/compat/ia32/ia32_sysvec.c b/sys/compat/ia32/ia32_sysvec.c index 0ea7d072e911..b9dada4eee7b 100644 --- a/sys/compat/ia32/ia32_sysvec.c +++ b/sys/compat/ia32/ia32_sysvec.c @@ -145,7 +145,7 @@ struct sysentvec ia32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf_ia32_sysvec, &ia32_freebsd_sysvec); -static Elf32_Brandinfo ia32_brand_info = { +static const Elf32_Brandinfo ia32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -155,12 +155,10 @@ static Elf32_Brandinfo ia32_brand_info = { .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; +C_SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE, + (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_info); -SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &ia32_brand_info); - -static Elf32_Brandinfo ia32_brand_oinfo = { +static const Elf32_Brandinfo ia32_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -170,12 +168,10 @@ static Elf32_Brandinfo ia32_brand_oinfo = { .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; +C_SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_oinfo); -SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &ia32_brand_oinfo); - -static Elf32_Brandinfo kia32_brand_info = { +static const Elf32_Brandinfo kia32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -184,10 +180,8 @@ static Elf32_Brandinfo kia32_brand_info = { .brand_note = &elf32_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; - -SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &kia32_brand_info); +C_SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t)elf32_insert_brand_entry, &kia32_brand_info); void elf32_dump_thread(struct thread *td, void *dst, size_t *off) diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c index 95b212be1306..7ac48786c77b 100644 --- a/sys/compat/linprocfs/linprocfs.c +++ b/sys/compat/linprocfs/linprocfs.c @@ -2216,6 +2216,67 @@ linprocfs_dosysvipc_shm(PFS_FILL_ARGS) return (0); } +static int +linprocfs_doinotify(const char *sysctl, PFS_FILL_ARGS) +{ + size_t size; + int error, val; + + if (uio->uio_rw == UIO_READ) { + size = sizeof(val); + error = kernel_sysctlbyname(curthread, + __DECONST(void *, sysctl), &val, &size, NULL, 0, 0, 0); + if (error == 0) + sbuf_printf(sb, "%d\n", val); + } else { + char *endp, *newval; + long vall; + + sbuf_trim(sb); + sbuf_finish(sb); + newval = sbuf_data(sb); + vall = strtol(newval, &endp, 10); + if (vall < 0 || vall > INT_MAX || endp == newval || + *endp != '\0') + return (EINVAL); + val = (int)vall; + error = kernel_sysctlbyname(curthread, + __DECONST(void *, sysctl), NULL, NULL, + &val, sizeof(val), 0, 0); + } + return (error); +} + +/* + * Filler function for 
proc/sys/fs/inotify/max_queued_events + */ +static int +linprocfs_doinotify_max_queued_events(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_queued_events", + PFS_FILL_ARGNAMES)); +} + +/* + * Filler function for proc/sys/fs/inotify/max_user_instances + */ +static int +linprocfs_doinotify_max_user_instances(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_user_instances", + PFS_FILL_ARGNAMES)); +} + +/* + * Filler function for proc/sys/fs/inotify/max_user_watches + */ +static int +linprocfs_doinotify_max_user_watches(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_user_watches", + PFS_FILL_ARGNAMES)); +} + /* * Filler function for proc/sys/fs/mqueue/msg_default */ @@ -2313,9 +2374,7 @@ linprocfs_domqueue_queues_max(PFS_FILL_ARGS) static int linprocfs_init(PFS_INIT_ARGS) { - struct pfs_node *root; - struct pfs_node *dir; - struct pfs_node *sys; + struct pfs_node *dir, *fs, *root, *sys; root = pi->pi_root; @@ -2466,10 +2525,18 @@ linprocfs_init(PFS_INIT_ARGS) NULL, PFS_RD); /* /proc/sys/fs/... */ - pfs_create_dir(sys, &dir, "fs", NULL, NULL, NULL, 0); + pfs_create_dir(sys, &fs, "fs", NULL, NULL, NULL, 0); + + pfs_create_dir(fs, &dir, "inotify", NULL, NULL, NULL, 0); + pfs_create_file(dir, NULL, "max_queued_events", + &linprocfs_doinotify_max_queued_events, NULL, NULL, NULL, PFS_RDWR); + pfs_create_file(dir, NULL, "max_user_instances", + &linprocfs_doinotify_max_user_instances, NULL, NULL, NULL, PFS_RDWR); + pfs_create_file(dir, NULL, "max_user_watches", + &linprocfs_doinotify_max_user_watches, NULL, NULL, NULL, PFS_RDWR); /* /proc/sys/fs/mqueue/... */ - pfs_create_dir(dir, &dir, "mqueue", NULL, NULL, NULL, 0); + pfs_create_dir(fs, &dir, "mqueue", NULL, NULL, NULL, 0); pfs_create_file(dir, NULL, "msg_default", &linprocfs_domqueue_msg_default, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, NULL, "msgsize_default", diff --git a/sys/compat/linux/linux.c b/sys/compat/linux/linux.c index 61b207070963..a40f110634f7 100644 --- a/sys/compat/linux/linux.c +++ b/sys/compat/linux/linux.c @@ -578,8 +578,13 @@ bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, return (0); } +/* + * If sap is NULL, then osa points at already copied in linux sockaddr that + * should be edited in place. Otherwise memory is allocated, sockaddr + * copied in and returned in *sap. 
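
The comment added above gives linux_to_bsd_sockaddr() two calling conventions. A usage sketch of both follows; uoptval, optlen, and req are hypothetical stand-ins for caller state, and the in-place call mirrors the MCAST_* handling added later in this patch:

static int
example_sockaddr_usage(void *uoptval, socklen_t optlen,
    struct group_source_req *req)	/* hypothetical caller */
{
	struct sockaddr *sa;
	socklen_t len = optlen;
	int error;

	/* Allocating form: copyin from user space, result in sa. */
	error = linux_to_bsd_sockaddr(PTRIN(uoptval), &sa, &len);
	if (error != 0)
		return (error);
	/* ... use sa ... */
	free(sa, M_SONAME);

	/* In-place form: the sockaddr already sits in kernel memory. */
	len = sizeof(struct sockaddr_storage);
	return (linux_to_bsd_sockaddr(
	    (struct l_sockaddr *)&req->gsr_group, NULL, &len));
}
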
+ */ int -linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, +linux_to_bsd_sockaddr(struct l_sockaddr *osa, struct sockaddr **sap, socklen_t *len) { struct sockaddr *sa; @@ -609,10 +614,12 @@ linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, } #endif - kosa = malloc(salen, M_SONAME, M_WAITOK); - - if ((error = copyin(osa, kosa, *len))) - goto out; + if (sap != NULL) { + kosa = malloc(salen, M_SONAME, M_WAITOK); + if ((error = copyin(osa, kosa, *len))) + goto out; + } else + kosa = osa; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == AF_UNKNOWN) { @@ -686,12 +693,15 @@ linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, sa->sa_family = bdom; sa->sa_len = salen; - *sap = sa; - *len = salen; + if (sap != NULL) { + *sap = sa; + *len = salen; + } return (0); out: - free(kosa, M_SONAME); + if (sap != NULL) + free(kosa, M_SONAME); return (error); } diff --git a/sys/compat/linux/linux_common.h b/sys/compat/linux/linux_common.h index 97f5a259f300..814c183b338a 100644 --- a/sys/compat/linux/linux_common.h +++ b/sys/compat/linux/linux_common.h @@ -43,7 +43,7 @@ sa_family_t bsd_to_linux_domain(sa_family_t domain); #define AF_UNKNOWN UINT8_MAX int bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, socklen_t len); -int linux_to_bsd_sockaddr(const struct l_sockaddr *lsa, +int linux_to_bsd_sockaddr(struct l_sockaddr *lsa, struct sockaddr **sap, socklen_t *len); void linux_to_bsd_poll_events(struct thread *td, int fd, short lev, short *bev); diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c index e88791659f1f..fc3ef7c3e90a 100644 --- a/sys/compat/linux/linux_event.c +++ b/sys/compat/linux/linux_event.c @@ -104,7 +104,7 @@ static int epoll_create_common(struct thread *td, int flags) { - return (kern_kqueue(td, flags, NULL)); + return (kern_kqueue(td, flags, false, NULL)); } #ifdef LINUX_LEGACY_SYSCALLS diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c index 37d0142bae8b..0586eb55a8f3 100644 --- a/sys/compat/linux/linux_futex.c +++ b/sys/compat/linux/linux_futex.c @@ -251,7 +251,7 @@ linux_futex(struct thread *td, struct linux_futex_args *args) * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. 
*/ p = td->td_proc; - Elf_Brandinfo *bi = p->p_elf_brandinfo; + const Elf_Brandinfo *bi = p->p_elf_brandinfo; if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) return (EINVAL); args->val3_compare = false; diff --git a/sys/compat/linux/linux_ioctl.h b/sys/compat/linux/linux_ioctl.h index ccc25bc919ab..8345b7e4b719 100644 --- a/sys/compat/linux/linux_ioctl.h +++ b/sys/compat/linux/linux_ioctl.h @@ -823,4 +823,16 @@ int linux32_ioctl_register_handler(struct linux_ioctl_handler *h); int linux32_ioctl_unregister_handler(struct linux_ioctl_handler *h); #endif +#define LINUX_IOCTL_SET(n, low, high) \ +static linux_ioctl_function_t n##_linux_ioctl; \ +static struct linux_ioctl_handler n##_linux_handler = { \ + n##_linux_ioctl, \ + low, \ + high \ +}; \ +SYSINIT(n##_ioctl_register, SI_SUB_KLD, SI_ORDER_MIDDLE, \ + linux_ioctl_register_handler, &n##_linux_handler); \ +SYSUNINIT(n##_ioctl_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, \ + linux_ioctl_unregister_handler, &n##_linux_handler) + #endif /* !_LINUX_IOCTL_H_ */ diff --git a/sys/compat/linux/linux_socket.c b/sys/compat/linux/linux_socket.c index 0e07b0a60ced..b1a483ce611c 100644 --- a/sys/compat/linux/linux_socket.c +++ b/sys/compat/linux/linux_socket.c @@ -2146,7 +2146,8 @@ linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) return (ENOPROTOOPT); } - if (name == IPV6_NEXTHOP) { + switch (name) { + case IPV6_NEXTHOP: { len = args->optlen; error = linux_to_bsd_sockaddr(PTRIN(args->optval), &sa, &len); if (error != 0) @@ -2155,7 +2156,34 @@ linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) error = kern_setsockopt(td, args->s, level, name, sa, UIO_SYSSPACE, len); free(sa, M_SONAME); - } else { + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: { + struct group_source_req req; + size_t size; + + size = (name == MCAST_JOIN_SOURCE_GROUP || + name == MCAST_LEAVE_SOURCE_GROUP) ? + sizeof(struct group_source_req) : sizeof(struct group_req); + + if ((error = copyin(PTRIN(args->optval), &req, size))) + return (error); + len = sizeof(struct sockaddr_storage); + if ((error = linux_to_bsd_sockaddr( + (struct l_sockaddr *)&req.gsr_group, NULL, &len))) + return (error); + if (size == sizeof(struct group_source_req) && + (error = linux_to_bsd_sockaddr( + (struct l_sockaddr *)&req.gsr_source, NULL, &len))) + return (error); + error = kern_setsockopt(td, args->s, level, name, &req, + UIO_SYSSPACE, size); + break; + } + default: error = kern_setsockopt(td, args->s, level, name, PTRIN(args->optval), UIO_USERSPACE, args->optlen); } diff --git a/sys/compat/linuxkpi/common/src/linux_acpi.c b/sys/compat/linuxkpi/common/src/linux_acpi.c index 43783bb8727b..c7d62c745c7e 100644 --- a/sys/compat/linuxkpi/common/src/linux_acpi.c +++ b/sys/compat/linuxkpi/common/src/linux_acpi.c @@ -33,6 +33,7 @@ #include <sys/bus.h> #include <sys/eventhandler.h> #include <sys/kernel.h> +#include <sys/power.h> #include <contrib/dev/acpica/include/acpi.h> #include <dev/acpica/acpivar.h> @@ -118,20 +119,32 @@ acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid, } static void -linux_handle_power_suspend_event(void *arg __unused) +linux_handle_power_suspend_event(void *arg __unused, enum power_stype stype) { - /* - * Only support S3 for now. - * acpi_sleep_event isn't always called so we use power_suspend_early - * instead which means we don't know what state we're switching to. 
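
The LINUX_IOCTL_SET() macro added above rolls together the handler declaration, the handler table entry, and the SYSINIT/SYSUNINIT registration pair. A hypothetical consumer is sketched below; "foo" and the command range are illustrative only, and since the macro declares foo_linux_ioctl itself, the consumer merely defines it:

LINUX_IOCTL_SET(foo, 0x4600, 0x46ff);

static int
foo_linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
{
	/* Translate args->cmd and forward to the native ioctl path. */
	return (ENOIOCTL);
}
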
- * TODO: Make acpi_sleep_event consistent - */ - linux_acpi_target_sleep_state = ACPI_STATE_S3; - pm_suspend_target_state = PM_SUSPEND_MEM; + switch (stype) { + case POWER_STYPE_SUSPEND_TO_IDLE: + /* + * XXX: obiwac Not 100% sure this is correct, but + * acpi_target_sleep_state does seem to be set to + * ACPI_STATE_S3 during s2idle on Linux. + */ + linux_acpi_target_sleep_state = ACPI_STATE_S3; + pm_suspend_target_state = PM_SUSPEND_TO_IDLE; + break; + case POWER_STYPE_SUSPEND_TO_MEM: + linux_acpi_target_sleep_state = ACPI_STATE_S3; + pm_suspend_target_state = PM_SUSPEND_MEM; + break; + default: + printf("%s: sleep type %d not yet supported\n", + __func__, stype); + break; + } } static void -linux_handle_power_resume_event(void *arg __unused) +linux_handle_power_resume_event(void *arg __unused, + enum power_stype stype __unused) { linux_acpi_target_sleep_state = ACPI_STATE_S0; pm_suspend_target_state = PM_SUSPEND_ON; diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index 458744a9fec6..ff0f477ea8cc 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1171,12 +1171,14 @@ static const struct filterops linux_dev_kqfiltops_read = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops linux_dev_kqfiltops_write = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_write_event, + .f_copy = knote_triv_copy, }; static void diff --git a/sys/conf/NOTES b/sys/conf/NOTES index ea9b2667607e..9944375c3615 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -679,6 +679,7 @@ options TCP_OFFLOAD # TCP offload support. options TCP_RFC7413 # TCP Fast Open options TCPHPTS +#options TCP_HPTS_KTEST # Add KTEST support for HPTS # In order to enable IPSEC you MUST also add device crypto to # your kernel configuration @@ -2800,7 +2801,7 @@ options MAXFILES=999 # Random number generator # Alternative algorithm. -#options RANDOM_FENESTRASX +options RANDOM_FENESTRASX # Allow the CSPRNG algorithm to be loaded as a module. #options RANDOM_LOADABLE # Select this to allow high-rate but potentially expensive diff --git a/sys/conf/dtb.build.mk b/sys/conf/dtb.build.mk index 327d69106244..7eb0db5e8b80 100644 --- a/sys/conf/dtb.build.mk +++ b/sys/conf/dtb.build.mk @@ -1,7 +1,3 @@ - -.include <bsd.init.mk> -# Grab all the options for a kernel build. For backwards compat, we need to -# do this after bsd.own.mk. .include "kern.opts.mk" DTC?= dtc diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 856ea3af1372..2f412fa3cb1b 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -73,6 +73,7 @@ arm64/arm64/pmap.c standard arm64/arm64/ptrace_machdep.c standard arm64/arm64/sdt_machdep.c optional kdtrace_hooks arm64/arm64/sigtramp.S standard +arm64/arm64/spec_workaround.c standard arm64/arm64/stack_machdep.c optional ddb | stack arm64/arm64/strcmp.S standard arm64/arm64/strncmp.S standard diff --git a/sys/conf/kern.opts.mk b/sys/conf/kern.opts.mk index 045e55d1b19a..cef4dd11ba58 100644 --- a/sys/conf/kern.opts.mk +++ b/sys/conf/kern.opts.mk @@ -4,6 +4,7 @@ # parts to omit (eg CDDL or SOURCELESS_HOST). Some of these will cause # config.mk to define symbols in various opt_*.h files. 
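
The CAM and LinuxKPI ACPI hunks above thread an enum power_stype through the suspend/resume eventhandlers, so a subscriber can distinguish suspend-to-idle from suspend-to-RAM instead of assuming S3. A sketch of a subscriber under the new signature; mydrv_* is hypothetical, and the stype constants are the ones used in linux_acpi.c above:

static void
mydrv_suspend(void *arg, enum power_stype stype)
{
	switch (stype) {
	case POWER_STYPE_SUSPEND_TO_IDLE:
		/* Devices stay powered; just arm wakeup sources. */
		break;
	case POWER_STYPE_SUSPEND_TO_MEM:
		/* Full quiesce: flush caches, spin down, save state. */
		break;
	default:
		break;
	}
}

static void
mydrv_init(void *arg __unused)
{
	EVENTHANDLER_REGISTER(power_suspend, mydrv_suspend, NULL,
	    EVENTHANDLER_PRI_ANY);
}
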
+ # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the @@ -13,17 +14,12 @@ # that haven't been converted over. # -# Note: bsd.own.mk must be included before the rest of kern.opts.mk to make -# building on 10.x and earlier work. This should be removed when that's no -# longer supported since it confounds the defaults (since it uses the host's -# notion of defaults rather than what's default in current when building -# within sys/modules). -.include <bsd.own.mk> - # These options are used by the kernel build process (kern.mk and kmod.mk) # They have to be listed here so we can build modules outside of the # src tree. +.include <bsd.init.mk> + KLDXREF_CMD?= kldxref __DEFAULT_YES_OPTIONS = \ diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk index bb3c7af82a4d..7cdfd17778db 100644 --- a/sys/conf/kern.post.mk +++ b/sys/conf/kern.post.mk @@ -398,6 +398,14 @@ CFLAGS+= -fdebug-prefix-map=./${_link}=${PREFIX_SYSDIR}/${_link}/include .endif .endfor +# Install GDB plugins that are useful for kernel debugging. See the +# README in sys/tools/gdb for more information. +GDB_FILES= acttrace.py \ + freebsd.py \ + pcpu.py \ + selftest.py \ + vnet.py + ${_ILINKS}: @case ${.TARGET} in \ machine) \ @@ -447,6 +455,13 @@ kernel-install: .PHONY .if defined(DEBUG) && !defined(INSTALL_NODEBUG) && ${MK_KERNEL_SYMBOLS} != "no" mkdir -p ${DESTDIR}${KERN_DEBUGDIR}${KODIR} ${INSTALL} -p -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} ${KERNEL_KO}.debug ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/ + ${INSTALL} -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} \ + $S/tools/kernel-gdb.py ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/${KERNEL_KO}-gdb.py + mkdir -p ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/gdb +.for file in ${GDB_FILES} + ${INSTALL} -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} \ + $S/tools/gdb/${file} ${DESTDIR}${KERN_DEBUGDIR}${KODIR}/gdb/${file} +.endfor .endif .if defined(KERNEL_EXTRA_INSTALL) ${INSTALL} -p -m ${KMODMODE} -o ${KMODOWN} -g ${KMODGRP} ${KERNEL_EXTRA_INSTALL} ${DESTDIR}${KODIR}/ diff --git a/sys/conf/options b/sys/conf/options index b48ad1cf42cf..0b795a8d28fb 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -231,6 +231,7 @@ SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS +TCP_HPTS_KTEST opt_inet.h TCP_REQUEST_TRK opt_global.h TCP_ACCOUNTING opt_global.h TCP_BBR opt_inet.h diff --git a/sys/conf/std.debug b/sys/conf/std.debug index f5ed5582c78d..0149779b3e5c 100644 --- a/sys/conf/std.debug +++ b/sys/conf/std.debug @@ -16,3 +16,4 @@ options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones options VERBOSE_SYSINIT=0 # Support debug.verbose_sysinit, off by default options ALT_BREAK_TO_DEBUGGER # Enter debugger on keyboard escape sequence options KDTRACE_MIB_SDT # Add SDT probes to network counters +options TCP_HPTS_KTEST # Add KTEST support for HPTS diff --git a/sys/conf/std.nodebug b/sys/conf/std.nodebug index 4035e28d2a62..79676a1d618f 100644 --- a/sys/conf/std.nodebug +++ b/sys/conf/std.nodebug @@ -16,6 +16,7 @@ nooptions KCOV nooptions MALLOC_DEBUG_MAXZONES nooptions QUEUE_MACRO_DEBUG_TRASH nooptions KDTRACE_MIB_SDT +nooptions TCP_HPTS_KTEST # Net80211 debugging nooptions IEEE80211_DEBUG diff --git a/sys/contrib/libnv/bsd_nvpair.c b/sys/contrib/libnv/bsd_nvpair.c index c73bc2189121..b884dd260b84 100644 --- a/sys/contrib/libnv/bsd_nvpair.c +++ b/sys/contrib/libnv/bsd_nvpair.c @@ -985,13 +985,13 @@ nvpair_unpack_string_array(bool isbe __unused, nvpair_t *nvp, size = nvp->nvp_datasize; tmp = (const char 
*)ptr; for (ii = 0; ii < nvp->nvp_nitems; ii++) { - len = strnlen(tmp, size - 1) + 1; - size -= len; - if (tmp[len - 1] != '\0') { + if (size <= 0) { ERRNO_SET(EINVAL); return (NULL); } - if (size < 0) { + len = strnlen(tmp, size - 1) + 1; + size -= len; + if (tmp[len - 1] != '\0') { ERRNO_SET(EINVAL); return (NULL); } diff --git a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md index 9b50a4a3d96e..f3d4316f6f67 100644 --- a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md +++ b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md @@ -14,7 +14,7 @@ Please check our issue tracker before opening a new feature request. Filling out the following template will help other contributors better understand your proposed feature. --> -### Describe the feature would like to see added to OpenZFS +### Describe the feature you would like to see added to OpenZFS <!-- Provide a clear and concise description of the feature. diff --git a/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md b/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md index 79809179cf13..47edc8174603 100644 --- a/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md +++ b/sys/contrib/openzfs/.github/PULL_REQUEST_TEMPLATE.md @@ -2,11 +2,6 @@ <!--- Provide a general summary of your changes in the Title above --> -<!--- -Documentation on ZFS Buildbot options can be found at -https://openzfs.github.io/openzfs-docs/Developer%20Resources/Buildbot%20Options.html ---> - ### Motivation and Context <!--- Why is this change required? What problem does it solve? --> <!--- If it fixes an open issue, please link to the issue here. --> diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh index 8439942c5a41..1c608348ffcd 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh @@ -121,7 +121,7 @@ case "$OS" in KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - FreeBSD="15.0-ALPHA2" + FreeBSD="15.0-ALPHA3" OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh index ee058b488088..f67bb2f68e94 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -20,7 +20,7 @@ function archlinux() { sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \ fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \ parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \ - samba sysstat rng-tools rsync wget xxhash + samba strace sysstat rng-tools rsync wget xxhash echo "##[endgroup]" } @@ -43,7 +43,8 @@ function debian() { lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \ python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \ python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \ - rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev + rsync samba strace sysstat uuid-dev watchdog wget xfslibs-dev xxhash \ + zlib1g-dev echo "##[endgroup]" } @@ -87,8 +88,8 @@ function rhel() { libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \ parted perf 
python3 python3-cffi python3-devel python3-packaging \ kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \ - rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \ - zlib-devel + rpm-build rsync samba strace sysstat systemd watchdog wget xfsprogs-devel \ + xxhash zlib-devel echo "##[endgroup]" } @@ -104,7 +105,7 @@ function install_fedora_experimental_kernel { our_version="$1" sudo dnf -y copr enable @kernel-vanilla/stable sudo dnf -y copr enable @kernel-vanilla/mainline - all="$(sudo dnf list --showduplicates kernel-*)" + all="$(sudo dnf list --showduplicates kernel-* python3-perf* perf* bpftool*)" echo "Available versions:" echo "$all" diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh index 0adcad2a99bc..4869c1003e48 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh @@ -108,19 +108,30 @@ echo '*/5 * * * * /root/cronjob.sh' > crontab.txt sudo crontab crontab.txt rm crontab.txt -# check if the machines are okay -echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)" -for ((i=1; i<=VMs; i++)); do - .github/workflows/scripts/qemu-wait-for-vm.sh vm$i -done -echo "All $VMs VMs are up now." - # Save the VM's serial output (ttyS0) to /var/tmp/console.txt # - ttyS0 on the VM corresponds to a local /dev/pty/N entry # - use 'virsh ttyconsole' to lookup the /dev/pty/N entry for ((i=1; i<=VMs; i++)); do mkdir -p $RESPATH/vm$i read "pty" <<< $(sudo virsh ttyconsole vm$i) + + # Create the file so we can tail it, even if there's no output. + touch $RESPATH/vm$i/console.txt + sudo nohup bash -c "cat $pty > $RESPATH/vm$i/console.txt" & + + # Write all VM boot lines to the console to aid in debugging failed boots. + # The boot lines from all the VMs will be munged together, so prepend each + # line with the vm hostname (like 'vm1:'). + (while IFS=$'\n' read -r line; do echo "vm$i: $line" ; done < <(sudo tail -f $RESPATH/vm$i/console.txt)) & + done echo "Console logging for ${VMs}x $OS started." + + +# check if the machines are okay +echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)" +for ((i=1; i<=VMs; i++)); do + .github/workflows/scripts/qemu-wait-for-vm.sh vm$i +done +echo "All $VMs VMs are up now." diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh index 5ab822f4f076..ca6ac77f146d 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh @@ -111,7 +111,7 @@ fi sudo dmesg -c > dmesg-prerun.txt mount > mount.txt df -h > df-prerun.txt -$TDIR/zfs-tests.sh -vK -s 3GB -T $TAGS +$TDIR/zfs-tests.sh -vKO -s 3GB -T $TAGS RV=$? 
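
The bsd_nvpair.c hunk above moves the "bytes remaining" test ahead of the strnlen() call: the old order computed strnlen(tmp, size - 1) first, so a zero size underflowed the bound before the too-late size check could fire. A self-contained restatement of the corrected loop, with the types simplified to size_t:

#include <errno.h>
#include <string.h>

static int
unpack_string_array(const char *tmp, size_t size, size_t nitems)
{
	size_t ii, len;

	for (ii = 0; ii < nitems; ii++) {
		if (size == 0)		/* check before touching the buffer */
			return (EINVAL);
		len = strnlen(tmp, size - 1) + 1;
		size -= len;		/* len <= size, so no underflow */
		if (tmp[len - 1] != '\0')
			return (EINVAL);
		tmp += len;
	}
	return (0);
}
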
df -h > df-postrun.txt echo $RV > tests-exitcode.txt diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 5704b5c6de8a..bdb7aee48041 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.16 +Linux-Maximum: 6.17 Linux-Minimum: 4.18 diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index d655fa715e15..70a4ed46f263 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -3301,6 +3301,7 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) uint64_t keyformat, salt, iters; int i; unsigned char c; + FILE *f; VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), @@ -3333,6 +3334,25 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) break; + case ZFS_KEYFORMAT_RAW: + if ((f = fopen(key_material, "r")) == NULL) + return (B_FALSE); + + if (fread(key_out, 1, WRAPPING_KEY_LEN, f) != + WRAPPING_KEY_LEN) { + (void) fclose(f); + return (B_FALSE); + } + + /* Check the key length */ + if (fgetc(f) != EOF) { + (void) fclose(f); + return (B_FALSE); + } + + (void) fclose(f); + break; + default: fatal("no support for key format %u\n", (unsigned int) keyformat); diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c index 484986bde719..ccdd5ffef8e6 100644 --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -914,7 +914,11 @@ zfs_do_clone(int argc, char **argv) log_history = B_FALSE; } - ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); + /* + * Dataset cloned successfully, mount/share failures are + * non-fatal. + */ + (void) zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); } zfs_close(zhp); @@ -930,19 +934,15 @@ usage: } /* - * Return a default volblocksize for the pool which always uses more than - * half of the data sectors. This primarily applies to dRAID which always - * writes full stripe widths. + * Calculate the minimum allocation size based on the top-level vdevs. */ static uint64_t -default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +calculate_volblocksize(nvlist_t *config) { - uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + uint64_t asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; - nvlist_t *config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { @@ -973,6 +973,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) } } + return (asize); +} + +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0) + asize = calculate_volblocksize(config); + /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. 
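
The new ZFS_KEYFORMAT_RAW branch in zdb_derive_key() above accepts a raw key file only if it is exactly WRAPPING_KEY_LEN bytes long: a short fread() fails, and an fgetc() returning anything but EOF rejects trailing bytes. The same exact-length idiom in isolation, with keylen standing in for WRAPPING_KEY_LEN:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
read_exact_key(const char *path, uint8_t *key, size_t keylen)
{
	FILE *f;
	bool ok;

	if ((f = fopen(path, "r")) == NULL)
		return (false);
	/* Demand exactly keylen bytes: no fewer, and none left over. */
	ok = (fread(key, 1, keylen, f) == keylen && fgetc(f) == EOF);
	(void) fclose(f);
	return (ok);
}
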
@@ -1319,7 +1337,9 @@ zfs_do_create(int argc, char **argv) goto error; } - ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); + /* Dataset created successfully, mount/share failures are non-fatal */ + ret = 0; + (void) zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); return (ret); diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c index 113797c878b9..c2f646f2567d 100644 --- a/sys/contrib/openzfs/cmd/zinject/zinject.c +++ b/sys/contrib/openzfs/cmd/zinject/zinject.c @@ -107,6 +107,8 @@ * zinject * zinject <-a | -u pool> * zinject -c <id|all> + * zinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range] + * [-T iotype] [-t type object | -b bookmark pool] * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] * [-r range] <object> * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool @@ -132,14 +134,18 @@ * The '-f' flag controls the frequency of errors injected, expressed as a * real number percentage between 0.0001 and 100. The default is 100. * - * The this form is responsible for actually injecting the handler into the + * The <object> form is responsible for actually injecting the handler into the * framework. It takes the arguments described above, translates them to the * internal tuple using libzpool, and then issues an ioctl() to register the * handler. * - * The final form can target a specific bookmark, regardless of whether a + * The '-b' option can target a specific bookmark, regardless of whether a * human-readable interface has been designed. It allows developers to specify * a particular block by number. + * + * The '-E' option injects pipeline ready stage delays for the given object or + * bookmark. The delay is specified in milliseconds, and it supports I/O type + * and range filters. */ #include <errno.h> @@ -346,6 +352,13 @@ usage(void) "\t\tsuch that the operation takes a minimum of supplied seconds\n" "\t\tto complete.\n" "\n" + "\tzinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]\n" + "\t\t[-T iotype] [-t type object | -b bookmark pool]\n" + "\n" + "\t\tInject pipeline ready stage delays for the given object path\n" + "\t\t(data or dnode) or raw bookmark. The delay is specified in\n" + "\t\tmilliseconds.\n" + "\n" "\tzinject -I [-s <seconds> | -g <txgs>] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. 
Simulates buggy hardware\n" @@ -724,12 +737,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record, if (quiet) { (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); } else { + boolean_t show_object = B_FALSE; + boolean_t show_iotype = B_FALSE; (void) printf("Added handler %llu with the following " "properties:\n", (u_longlong_t)zc.zc_guid); (void) printf(" pool: %s\n", pool); if (record->zi_guid) { (void) printf(" vdev: %llx\n", (u_longlong_t)record->zi_guid); + show_iotype = B_TRUE; } else if (record->zi_func[0] != '\0') { (void) printf(" panic function: %s\n", record->zi_func); @@ -742,7 +758,18 @@ register_handler(const char *pool, int flags, zinject_record_t *record, } else if (record->zi_timer > 0) { (void) printf(" timer: %lld ms\n", (u_longlong_t)NSEC2MSEC(record->zi_timer)); + if (record->zi_cmd == ZINJECT_DELAY_READY) { + show_object = B_TRUE; + show_iotype = B_TRUE; + } } else { + show_object = B_TRUE; + } + if (show_iotype) { + (void) printf("iotype: %s\n", + iotype_to_str(record->zi_iotype)); + } + if (show_object) { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); (void) printf("object: %llu\n", @@ -910,6 +937,7 @@ main(int argc, char **argv) int ret; int flags = 0; uint32_t dvas = 0; + hrtime_t ready_delay = -1; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); @@ -940,7 +968,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) { + ":aA:b:C:d:D:E:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -1113,6 +1141,18 @@ main(int argc, char **argv) case 'u': flags |= ZINJECT_UNLOAD_SPA; break; + case 'E': + ready_delay = MSEC2NSEC(strtol(optarg, &end, 10)); + if (ready_delay <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid delay '%s': " + "must be a positive duration\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + record.zi_cmd = ZINJECT_DELAY_READY; + record.zi_timer = ready_delay; + break; case 'L': if ((label = name_to_type(optarg)) == TYPE_INVAL && !LABEL_TYPE(type)) { @@ -1150,7 +1190,7 @@ main(int argc, char **argv) */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || - record.zi_freq > 0 || dvas != 0) { + record.zi_freq > 0 || dvas != 0 || ready_delay >= 0) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); @@ -1186,7 +1226,7 @@ main(int argc, char **argv) */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || - dvas != 0) { + dvas != 0 || ready_delay >= 0) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); @@ -1276,13 +1316,23 @@ main(int argc, char **argv) return (1); } - record.zi_cmd = ZINJECT_DATA_FAULT; + if (record.zi_cmd == ZINJECT_UNINITIALIZED) { + record.zi_cmd = ZINJECT_DATA_FAULT; + if (!error) + error = EIO; + } else if (error != 0) { + (void) fprintf(stderr, "error type -e incompatible " + "with delay injection\n"); + libzfs_fini(g_zfs); + return (1); + } else { + record.zi_iotype = io_type; + } + if (translate_raw(raw, &record) != 0) { libzfs_fini(g_zfs); return (1); } - if (!error) - error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL || record.zi_freq > 0 || @@ -1410,6 +1460,13 @@ main(int argc, char **argv) record.zi_dvas = dvas; } + if 
(record.zi_cmd != ZINJECT_UNINITIALIZED && error != 0) { + (void) fprintf(stderr, "error type -e incompatible " + "with delay injection\n"); + libzfs_fini(g_zfs); + return (1); + } + if (error == EACCES) { if (type != TYPE_DATA) { (void) fprintf(stderr, "decryption errors " @@ -1425,8 +1482,12 @@ main(int argc, char **argv) * not found. */ error = ECKSUM; - } else { + } else if (record.zi_cmd == ZINJECT_UNINITIALIZED) { record.zi_cmd = ZINJECT_DATA_FAULT; + if (!error) + error = EIO; + } else { + record.zi_iotype = io_type; } if (translate_record(type, argv[0], range, level, &record, pool, @@ -1434,8 +1495,6 @@ main(int argc, char **argv) libzfs_fini(g_zfs); return (1); } - if (!error) - error = EIO; } /* diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c index 2eec9a95e24c..fef602736705 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -26,6 +26,7 @@ /* * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright (c) 2025, Klara, Inc. */ #include <libintl.h> @@ -52,7 +53,7 @@ typedef struct zpool_node { zpool_handle_t *zn_handle; uu_avl_node_t zn_avlnode; - int zn_mark; + hrtime_t zn_last_refresh; } zpool_node_t; struct zpool_list { @@ -62,6 +63,7 @@ struct zpool_list { uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; zfs_type_t zl_type; + hrtime_t zl_last_refresh; }; static int @@ -81,26 +83,30 @@ zpool_compare(const void *larg, const void *rarg, void *unused) * of known pools. */ static int -add_pool(zpool_handle_t *zhp, void *data) +add_pool(zpool_handle_t *zhp, zpool_list_t *zlp) { - zpool_list_t *zlp = data; - zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); + zpool_node_t *node, *new = safe_malloc(sizeof (zpool_node_t)); uu_avl_index_t idx; - node->zn_handle = zhp; - uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); - if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { + new->zn_handle = zhp; + uu_avl_node_init(new, &new->zn_avlnode, zlp->zl_pool); + + node = uu_avl_find(zlp->zl_avl, new, NULL, &idx); + if (node == NULL) { if (zlp->zl_proplist && zpool_expand_proplist(zhp, zlp->zl_proplist, zlp->zl_type, zlp->zl_literal) != 0) { zpool_close(zhp); - free(node); + free(new); return (-1); } - uu_avl_insert(zlp->zl_avl, node, idx); + new->zn_last_refresh = zlp->zl_last_refresh; + uu_avl_insert(zlp->zl_avl, new, idx); } else { + zpool_refresh_stats_from_handle(node->zn_handle, zhp); + node->zn_last_refresh = zlp->zl_last_refresh; zpool_close(zhp); - free(node); + free(new); return (-1); } @@ -108,6 +114,18 @@ add_pool(zpool_handle_t *zhp, void *data) } /* + * add_pool(), but always returns 0. This allows zpool_iter() to continue + * even if a pool exists in the tree, or we fail to get the properties for + * a new one. + */ +static int +add_pool_cb(zpool_handle_t *zhp, void *data) +{ + (void) add_pool(zhp, data); + return (0); +} + +/* * Create a list of pools based on the given arguments. If we're given no * arguments, then iterate over all pools in the system and add them to the AVL * tree. 
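
The new -E case above validates its millisecond argument with the usual strtol() idiom: the whole string must be consumed and the value must be positive before it is converted with MSEC2NSEC(). The same check as a stand-alone helper:

#include <stdio.h>
#include <stdlib.h>

static long
parse_positive_ms(const char *arg)
{
	char *end;
	long ms;

	ms = strtol(arg, &end, 10);
	if (ms <= 0 || end == arg || *end != '\0') {
		(void) fprintf(stderr,
		    "invalid delay '%s': must be a positive duration\n",
		    arg);
		return (-1);
	}
	return (ms);
}
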
Otherwise, add only those pool explicitly specified on the command @@ -135,9 +153,10 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, zlp->zl_type = type; zlp->zl_literal = literal; + zlp->zl_last_refresh = gethrtime(); if (argc == 0) { - (void) zpool_iter(g_zfs, add_pool, zlp); + (void) zpool_iter(g_zfs, add_pool_cb, zlp); zlp->zl_findall = B_TRUE; } else { int i; @@ -159,15 +178,61 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, } /* - * Search for any new pools, adding them to the list. We only add pools when no - * options were given on the command line. Otherwise, we keep the list fixed as - * those that were explicitly specified. + * Refresh the state of all pools on the list. Additionally, if no options were + * given on the command line, add any new pools and remove any that are no + * longer available. */ -void -pool_list_update(zpool_list_t *zlp) +int +pool_list_refresh(zpool_list_t *zlp) { - if (zlp->zl_findall) - (void) zpool_iter(g_zfs, add_pool, zlp); + zlp->zl_last_refresh = gethrtime(); + + if (!zlp->zl_findall) { + /* + * This list is a fixed list of pools, so we must not add + * or remove any. Just walk over them and refresh their + * state. + */ + int navail = 0; + for (zpool_node_t *node = uu_avl_first(zlp->zl_avl); + node != NULL; node = uu_avl_next(zlp->zl_avl, node)) { + boolean_t missing; + zpool_refresh_stats(node->zn_handle, &missing); + navail += !missing; + node->zn_last_refresh = zlp->zl_last_refresh; + } + return (navail); + } + + /* Search for any new pools and add them to the list. */ + (void) zpool_iter(g_zfs, add_pool_cb, zlp); + + /* Walk the list of existing pools, and update or remove them. */ + zpool_node_t *node, *next; + for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next) { + next = uu_avl_next(zlp->zl_avl, node); + + /* + * Skip any that were refreshed and are online; they were added + * by zpool_iter() and are already up to date. + */ + if (node->zn_last_refresh == zlp->zl_last_refresh && + zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL) + continue; + + /* Refresh and remove if necessary. */ + boolean_t missing; + zpool_refresh_stats(node->zn_handle, &missing); + if (missing) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } else { + node->zn_last_refresh = zlp->zl_last_refresh; + } + } + + return (uu_avl_numnodes(zlp->zl_avl)); } /* @@ -191,23 +256,6 @@ pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, } /* - * Remove the given pool from the list. When running iostat, we want to remove - * those pools that no longer exist. - */ -void -pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) -{ - zpool_node_t search, *node; - - search.zn_handle = zhp; - if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { - uu_avl_remove(zlp->zl_avl, node); - zpool_close(node->zn_handle); - free(node); - } -} - -/* * Free all the handles associated with this list. */ void diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 2c46ad0df895..1feec55c0e8b 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -33,7 +33,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com> * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> - * Copyright (c) 2021, 2023, Klara Inc. + * Copyright (c) 2021, 2023, 2025, Klara, Inc. 
* Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP. */ @@ -5761,24 +5761,6 @@ children: return (ret); } -static int -refresh_iostat(zpool_handle_t *zhp, void *data) -{ - iostat_cbdata_t *cb = data; - boolean_t missing; - - /* - * If the pool has disappeared, remove it from the list and continue. - */ - if (zpool_refresh_stats(zhp, &missing) != 0) - return (-1); - - if (missing) - pool_list_remove(cb->cb_list, zhp); - - return (0); -} - /* * Callback to print out the iostats for the given pool. */ @@ -6359,15 +6341,14 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data) * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely - * on pool_list_update() to detect the addition of new pools. Configuration - * changes are all handled within libzfs. + * on pool_list_refresh() to detect the addition and removal of pools. + * Configuration changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; - int npools; float interval = 0; unsigned long count = 0; zpool_list_t *list; @@ -6618,10 +6599,24 @@ zpool_do_iostat(int argc, char **argv) return (1); } + int last_npools = 0; for (;;) { - if ((npools = pool_list_count(list)) == 0) + /* + * Refresh all pools in list, adding or removing pools as + * necessary. + */ + int npools = pool_list_refresh(list); + if (npools == 0) { (void) fprintf(stderr, gettext("no pools available\n")); - else { + } else { + /* + * If the list of pools has changed since last time + * around, reset the iteration count to force the + * header to be redisplayed. + */ + if (last_npools != npools) + cb.cb_iteration = 0; + /* * If this is the first iteration and -y was supplied * we skip any printing. @@ -6630,15 +6625,6 @@ zpool_do_iostat(int argc, char **argv) cb.cb_iteration == 0); /* - * Refresh all statistics. This is done as an - * explicit step before calculating the maximum name - * width, so that any * configuration changes are - * properly accounted for. - */ - (void) pool_list_iter(list, B_FALSE, refresh_iostat, - &cb); - - /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. 
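
pool_list_refresh() above uses a generation stamp: every pool the zpool_iter() sweep sees is tagged with the gethrtime() of that refresh, and a follow-up walk re-checks anything that missed the stamp or is unavailable, dropping pools that have disappeared. A self-contained toy model of that reconciliation; the node type is illustrative, not libzfs's:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct pnode {
	uint64_t	last_refresh;	/* stamp of the sweep that saw it */
	bool		missing;	/* set when a re-check finds it gone */
	struct pnode	*next;
};

static void
reconcile(struct pnode **head, uint64_t now)
{
	struct pnode **np, *n;

	for (np = head; (n = *np) != NULL; ) {
		if (n->last_refresh == now && !n->missing) {
			np = &n->next;	/* freshly stamped: nothing to do */
			continue;
		}
		/* Stale or unavailable entry: unlink if gone, else stamp. */
		if (n->missing) {
			*np = n->next;
			free(n);
		} else {
			n->last_refresh = now;
			np = &n->next;
		}
	}
}
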
*/ @@ -6691,6 +6677,7 @@ zpool_do_iostat(int argc, char **argv) if (skip) { (void) fflush(stdout); (void) fsleep(interval); + last_npools = npools; continue; } @@ -6728,6 +6715,8 @@ zpool_do_iostat(int argc, char **argv) (void) fflush(stdout); (void) fsleep(interval); + + last_npools = npools; } pool_list_free(list); diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h index 5ab7cb9750f1..3af23c52bd45 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h @@ -76,11 +76,10 @@ typedef struct zpool_list zpool_list_t; zpool_list_t *pool_list_get(int, char **, zprop_list_t **, zfs_type_t, boolean_t, int *); -void pool_list_update(zpool_list_t *); +int pool_list_refresh(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); int pool_list_count(zpool_list_t *); -void pool_list_remove(zpool_list_t *, zpool_handle_t *); extern libzfs_handle_t *g_zfs; diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 684b46a2d673..088c0108e911 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -609,22 +609,28 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) ZPOOL_CONFIG_PATH, &path) == 0); /* + * Skip active spares they should never cause + * the pool to be evaluated as inconsistent. + */ + if (is_spare(NULL, path)) + continue; + + /* * If we have a raidz/mirror that combines disks - * with files, report it as an error. + * with files, only report it as an error when + * fatal is set to ensure all the replication + * checks aren't skipped in check_replication(). */ - if (!dontreport && type != NULL && + if (fatal && !dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication " - "level: %s contains both " - "files and devices\n"), - rep.zprl_type); - else - return (NULL); + vdev_error(gettext( + "mismatched replication " + "level: %s contains both " + "files and devices\n"), + rep.zprl_type); dontreport = B_TRUE; } diff --git a/sys/contrib/openzfs/contrib/intel_qat/readme.md b/sys/contrib/openzfs/contrib/intel_qat/readme.md index 7e45d395bb80..04c299b6404c 100644 --- a/sys/contrib/openzfs/contrib/intel_qat/readme.md +++ b/sys/contrib/openzfs/contrib/intel_qat/readme.md @@ -8,7 +8,7 @@ This contrib contains community compatibility patches to get Intel QAT working o These patches are based on the following Intel QAT version: [1.7.l.4.10.0-00014](https://01.org/sites/default/files/downloads/qat1.7.l.4.10.0-00014.tar.gz) -When using QAT with above kernels versions, the following patches needs to be applied using: +When using QAT with the above kernel versions, the following patches need to be applied using: patch -p1 < _$PATCH_ _Where $PATCH refers to the path of the patch in question_ diff --git a/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py index 971aa1d0d493..bad1af2d1671 100644 --- a/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py +++ b/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py @@ -4223,7 +4223,7 @@ class _TempPool(object): self.getRoot().reset() return - # On the Buildbot builders this may fail with "pool is busy" + # On the CI builders this may fail with "pool is busy" # Retry 5 
times before raising an error retry = 0 while True: diff --git a/sys/contrib/openzfs/etc/init.d/README.md b/sys/contrib/openzfs/etc/init.d/README.md index da780fdc1222..3852dd9a6b2e 100644 --- a/sys/contrib/openzfs/etc/init.d/README.md +++ b/sys/contrib/openzfs/etc/init.d/README.md @@ -1,5 +1,5 @@ DESCRIPTION - These script were written with the primary intention of being portable and + These scripts were written with the primary intention of being portable and usable on as many systems as possible. This is, in practice, usually not possible. But the intention is there. diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index 3fcdc176a621..14930fb90622 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -479,6 +479,8 @@ _LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, const char **, _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); +_LIBZFS_H void zpool_refresh_stats_from_handle(zpool_handle_t *, + zpool_handle_t *); _LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); _LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *); diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h index 076dab8ba6dc..214f3ea0e787 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h @@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ -/* - * All the io_*() helper functions below can operate on a bio, or a rq, but - * not both. The older submit_bio() codepath will pass a bio, and the - * newer blk-mq codepath will pass a rq. 
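
The helper deleted above was, per its comment, one of the io_*() shims that choose behavior from whichever of (bio, rq) is non-NULL: the newer blk-mq path passes a request and wins the dispatch, the legacy submit_bio() path passes a bio; presumably io_data_dir() simply lost its last caller while the convention survives in the remaining helpers. Its shape, restated with stand-in types since the Linux definitions are out of scope here:

#include <stdbool.h>
#include <stddef.h>

struct bio;		/* stand-ins for the Linux types */
struct request;

/* Stand-ins for op_is_write(req_op(rq)) and bio_data_dir(bio). */
extern bool rq_is_write(const struct request *rq);
extern bool bio_is_write(const struct bio *bio);

enum { IO_READ, IO_WRITE };

static int
io_data_dir_sketch(struct bio *bio, struct request *rq)
{
	if (rq != NULL)
		return (rq_is_write(rq) ? IO_WRITE : IO_READ);
	return (bio_is_write(bio) ? IO_WRITE : IO_READ);
}
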
- */ -static inline int -io_data_dir(struct bio *bio, struct request *rq) -{ - if (rq != NULL) { - if (op_is_write(req_op(rq))) { - return (WRITE); - } else { - return (READ); - } - } - return (bio_data_dir(bio)); -} - static inline int io_is_flush(struct bio *bio, struct request *rq) { diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index 49ab9d3db795..662fd81c5ee1 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -748,6 +748,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc" +#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ diff --git a/sys/contrib/openzfs/include/sys/range_tree.h b/sys/contrib/openzfs/include/sys/range_tree.h index 0f6884682459..0f6def36f9f6 100644 --- a/sys/contrib/openzfs/include/sys/range_tree.h +++ b/sys/contrib/openzfs/include/sys/range_tree.h @@ -238,8 +238,7 @@ zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) } static inline void -zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, - uint64_t fill) +zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { @@ -277,7 +276,7 @@ static inline void zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); - zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); + zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); } typedef void zfs_range_tree_func_t(void *arg, uint64_t start, uint64_t size); diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index 66db16b33c51..f172f2af6f07 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -1030,7 +1030,7 @@ extern void spa_import_progress_set_notes_nolog(spa_t *spa, extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); -extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, +extern void spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); @@ -1084,6 +1084,7 @@ extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 07a959db3447..62b062984d36 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -265,6 +265,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of 
vdevs in normal class */ + uint64_t spa_max_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ diff --git a/sys/contrib/openzfs/include/sys/zfs_ioctl.h b/sys/contrib/openzfs/include/sys/zfs_ioctl.h index 8174242abdac..cfe11f43bb8e 100644 --- a/sys/contrib/openzfs/include/sys/zfs_ioctl.h +++ b/sys/contrib/openzfs/include/sys/zfs_ioctl.h @@ -455,6 +455,7 @@ typedef enum zinject_type { ZINJECT_DECRYPT_FAULT, ZINJECT_DELAY_IMPORT, ZINJECT_DELAY_EXPORT, + ZINJECT_DELAY_READY, } zinject_type_t; typedef enum zinject_iotype { diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index a8acb83b4c2f..acb0a03a36b2 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -718,6 +718,7 @@ extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); +extern hrtime_t zio_handle_ready_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/sys/contrib/openzfs/lib/libuutil/libuutil.abi b/sys/contrib/openzfs/lib/libuutil/libuutil.abi index 6c736c61e4a5..2a740afa07ca 100644 --- a/sys/contrib/openzfs/lib/libuutil/libuutil.abi +++ b/sys/contrib/openzfs/lib/libuutil/libuutil.abi @@ -616,6 +616,7 @@ <array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'> <subrange length='23' type-id='7359adad' id='fdd0f594'/> </array-type-def> + <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'> <subrange length='4' type-id='7359adad' id='16fe7105'/> </array-type-def> @@ -1020,13 +1021,6 @@ <array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'> <subrange length='3' type-id='7359adad' id='56f209d2'/> </array-type-def> - <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'> - <subrange length='1' type-id='7359adad' id='52f813b4'/> - </array-type-def> - <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'> - <subrange length='12' type-id='7359adad' id='84827bdc'/> - </array-type-def> - <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_special' type-id='26a90f95' visibility='default'/> @@ -1061,93 +1055,6 @@ <var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/> </data-member> </class-decl> - <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/> - <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/> - <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/> - <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/> - <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/> - <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'> - <data-member access='public' layout-offset-in-bits='0'> - <var-decl name='tv_sec' type-id='49659421' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='64'> - <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' 
layout-offset-in-bits='96'> - <var-decl name='__reserved' type-id='3158a266' visibility='default'/> - </data-member> - </class-decl> - <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'> - <data-member access='public' layout-offset-in-bits='0'> - <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='32'> - <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='64'> - <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='128'> - <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='160'> - <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='192'> - <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='224'> - <var-decl name='stx_mode' type-id='d315442e' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='240'> - <var-decl name='__spare0' type-id='811205dc' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='256'> - <var-decl name='stx_ino' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='320'> - <var-decl name='stx_size' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='384'> - <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='448'> - <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='512'> - <var-decl name='stx_atime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='640'> - <var-decl name='stx_btime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='768'> - <var-decl name='stx_ctime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='896'> - <var-decl name='stx_mtime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1024'> - <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1056'> - <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1088'> - <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1120'> - <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1152'> - <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1216'> - <var-decl name='__spare2' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' 
layout-offset-in-bits='1280'> - <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/> - </data-member> - </class-decl> <class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/> @@ -1237,8 +1144,6 @@ <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/> <pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/> <qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/> - <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/> - <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/> <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='3cad23cd'/> @@ -1254,14 +1159,6 @@ <parameter type-id='95e97e5e'/> <return type-id='26a90f95'/> </function-decl> - <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'> - <parameter type-id='95e97e5e'/> - <parameter type-id='9d26089a'/> - <parameter type-id='95e97e5e'/> - <parameter type-id='f0981eeb'/> - <parameter type-id='31d265b7'/> - <return type-id='95e97e5e'/> - </function-decl> <function-decl name='__fprintf_chk' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='95e97e5e'/> diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index 184ea4a55b43..f988d27a286a 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -571,6 +571,7 @@ <elf-symbol name='zpool_props_refresh' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_read_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_refresh_stats' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zpool_refresh_stats_from_handle' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_reguid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_reopen_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -641,7 +642,7 @@ <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='spa_feature_table' size='2632' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> - <elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_deleg_perm_tab' size='544' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol 
name='zfs_max_dataset_nesting' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_userquota_prop_prefixes' size='96' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -1458,103 +1459,8 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'> - <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'> - <subrange length='1' type-id='7359adad' id='52f813b4'/> - </array-type-def> - <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'> - <subrange length='12' type-id='7359adad' id='84827bdc'/> - </array-type-def> - <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/> - <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/> - <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/> - <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/> - <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/> - <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'> - <data-member access='public' layout-offset-in-bits='0'> - <var-decl name='tv_sec' type-id='49659421' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='64'> - <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='96'> - <var-decl name='__reserved' type-id='3158a266' visibility='default'/> - </data-member> - </class-decl> - <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'> - <data-member access='public' layout-offset-in-bits='0'> - <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='32'> - <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='64'> - <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='128'> - <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='160'> - <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='192'> - <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='224'> - <var-decl name='stx_mode' type-id='d315442e' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='240'> - <var-decl name='__spare0' type-id='811205dc' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='256'> - <var-decl name='stx_ino' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='320'> - <var-decl name='stx_size' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='384'> - <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='448'> - <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' 
layout-offset-in-bits='512'> - <var-decl name='stx_atime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='640'> - <var-decl name='stx_btime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='768'> - <var-decl name='stx_ctime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='896'> - <var-decl name='stx_mtime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1024'> - <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1056'> - <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1088'> - <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1120'> - <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1152'> - <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1216'> - <var-decl name='__spare2' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1280'> - <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/> - </data-member> - </class-decl> <pointer-type-def type-id='56fe4a37' size-in-bits='64' id='b6b61d2f'/> <qualified-type-def type-id='b6b61d2f' restrict='yes' id='3cad23cd'/> - <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/> - <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/> <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='3cad23cd'/> @@ -1566,14 +1472,6 @@ <parameter type-id='822cd80b'/> <return type-id='95e97e5e'/> </function-decl> - <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'> - <parameter type-id='95e97e5e'/> - <parameter type-id='9d26089a'/> - <parameter type-id='95e97e5e'/> - <parameter type-id='f0981eeb'/> - <parameter type-id='31d265b7'/> - <return type-id='95e97e5e'/> - </function-decl> </abi-instr> <abi-instr address-size='64' path='lib/libspl/timestamp.c' language='LANG_C99'> <typedef-decl name='nl_item' type-id='95e97e5e' id='03b79a94'/> @@ -3194,6 +3092,10 @@ <parameter type-id='dace003f'/> <return type-id='80f4b756'/> </function-decl> + <function-decl name='fnvlist_dup' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='22cce67b'/> + <return type-id='5ce45b60'/> + </function-decl> <function-decl name='fnvpair_value_nvlist' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='3fa542f0'/> <return type-id='5ce45b60'/> @@ -3238,6 +3140,11 @@ <parameter type-id='37e3bd22' name='missing'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='zpool_refresh_stats_from_handle' mangled-name='zpool_refresh_stats_from_handle' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_refresh_stats_from_handle'> + <parameter type-id='4c81de99' name='dzhp'/> + <parameter type-id='4c81de99' name='szhp'/> + <return type-id='48b5725f'/> + </function-decl> 
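A minimal usage sketch for the new libzfs entry point declared above. Only zpool_refresh_stats() and zpool_refresh_stats_from_handle() come from this change; the helper name, the handle names, and the control flow are invented for illustration. The destination handle takes over the source handle's config, config size, and pool state, sparing a second round-trip into the kernel.

/*
 * Hypothetical helper: "cached" and "fresh" are two zpool_handle_t's
 * for the same pool.  Fetch fresh stats from the kernel once, then
 * copy them into the cached handle.
 */
static void
pool_cache_refresh(zpool_handle_t *cached, zpool_handle_t *fresh)
{
	boolean_t missing = B_FALSE;

	if (zpool_refresh_stats(fresh, &missing) == 0 && !missing)
		zpool_refresh_stats_from_handle(cached, fresh);
}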
<function-decl name='zpool_skip_pool' mangled-name='zpool_skip_pool' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_skip_pool'> <parameter type-id='80f4b756' name='poolname'/> <return type-id='c19b74c3'/> @@ -9398,10 +9305,6 @@ <parameter type-id='5ce45b60'/> <return type-id='48b5725f'/> </function-decl> - <function-decl name='fnvlist_dup' visibility='default' binding='global' size-in-bits='64'> - <parameter type-id='22cce67b'/> - <return type-id='5ce45b60'/> - </function-decl> <function-decl name='spl_pagesize' mangled-name='spl_pagesize' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='spl_pagesize'> <return type-id='b59d7dce'/> </function-decl> @@ -9774,8 +9677,8 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='module/zcommon/zfs_deleg.c' language='LANG_C99'> - <array-type-def dimensions='1' type-id='fa1870fd' size-in-bits='4224' id='55e705e7'> - <subrange length='33' type-id='7359adad' id='6a5934df'/> + <array-type-def dimensions='1' type-id='fa1870fd' size-in-bits='4352' id='55f84f08'> + <subrange length='34' type-id='7359adad' id='6a6a7e00'/> </array-type-def> <array-type-def dimensions='1' type-id='fa1870fd' size-in-bits='infinite' id='7c00e69d'> <subrange length='infinite' id='031f2035'/> @@ -9805,30 +9708,31 @@ <enumerator name='ZFS_DELEG_NOTE_PROMOTE' value='5'/> <enumerator name='ZFS_DELEG_NOTE_RENAME' value='6'/> <enumerator name='ZFS_DELEG_NOTE_SEND' value='7'/> - <enumerator name='ZFS_DELEG_NOTE_RECEIVE' value='8'/> - <enumerator name='ZFS_DELEG_NOTE_ALLOW' value='9'/> - <enumerator name='ZFS_DELEG_NOTE_USERPROP' value='10'/> - <enumerator name='ZFS_DELEG_NOTE_MOUNT' value='11'/> - <enumerator name='ZFS_DELEG_NOTE_SHARE' value='12'/> - <enumerator name='ZFS_DELEG_NOTE_USERQUOTA' value='13'/> - <enumerator name='ZFS_DELEG_NOTE_GROUPQUOTA' value='14'/> - <enumerator name='ZFS_DELEG_NOTE_USERUSED' value='15'/> - <enumerator name='ZFS_DELEG_NOTE_GROUPUSED' value='16'/> - <enumerator name='ZFS_DELEG_NOTE_USEROBJQUOTA' value='17'/> - <enumerator name='ZFS_DELEG_NOTE_GROUPOBJQUOTA' value='18'/> - <enumerator name='ZFS_DELEG_NOTE_USEROBJUSED' value='19'/> - <enumerator name='ZFS_DELEG_NOTE_GROUPOBJUSED' value='20'/> - <enumerator name='ZFS_DELEG_NOTE_HOLD' value='21'/> - <enumerator name='ZFS_DELEG_NOTE_RELEASE' value='22'/> - <enumerator name='ZFS_DELEG_NOTE_DIFF' value='23'/> - <enumerator name='ZFS_DELEG_NOTE_BOOKMARK' value='24'/> - <enumerator name='ZFS_DELEG_NOTE_LOAD_KEY' value='25'/> - <enumerator name='ZFS_DELEG_NOTE_CHANGE_KEY' value='26'/> - <enumerator name='ZFS_DELEG_NOTE_PROJECTUSED' value='27'/> - <enumerator name='ZFS_DELEG_NOTE_PROJECTQUOTA' value='28'/> - <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJUSED' value='29'/> - <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJQUOTA' value='30'/> - <enumerator name='ZFS_DELEG_NOTE_NONE' value='31'/> + <enumerator name='ZFS_DELEG_NOTE_SEND_RAW' value='8'/> + <enumerator name='ZFS_DELEG_NOTE_RECEIVE' value='9'/> + <enumerator name='ZFS_DELEG_NOTE_ALLOW' value='10'/> + <enumerator name='ZFS_DELEG_NOTE_USERPROP' value='11'/> + <enumerator name='ZFS_DELEG_NOTE_MOUNT' value='12'/> + <enumerator name='ZFS_DELEG_NOTE_SHARE' value='13'/> + <enumerator name='ZFS_DELEG_NOTE_USERQUOTA' value='14'/> + <enumerator name='ZFS_DELEG_NOTE_GROUPQUOTA' value='15'/> + <enumerator name='ZFS_DELEG_NOTE_USERUSED' value='16'/> + <enumerator name='ZFS_DELEG_NOTE_GROUPUSED' value='17'/> + <enumerator name='ZFS_DELEG_NOTE_USEROBJQUOTA' value='18'/> + <enumerator 
name='ZFS_DELEG_NOTE_GROUPOBJQUOTA' value='19'/> + <enumerator name='ZFS_DELEG_NOTE_USEROBJUSED' value='20'/> + <enumerator name='ZFS_DELEG_NOTE_GROUPOBJUSED' value='21'/> + <enumerator name='ZFS_DELEG_NOTE_HOLD' value='22'/> + <enumerator name='ZFS_DELEG_NOTE_RELEASE' value='23'/> + <enumerator name='ZFS_DELEG_NOTE_DIFF' value='24'/> + <enumerator name='ZFS_DELEG_NOTE_BOOKMARK' value='25'/> + <enumerator name='ZFS_DELEG_NOTE_LOAD_KEY' value='26'/> + <enumerator name='ZFS_DELEG_NOTE_CHANGE_KEY' value='27'/> + <enumerator name='ZFS_DELEG_NOTE_PROJECTUSED' value='28'/> + <enumerator name='ZFS_DELEG_NOTE_PROJECTQUOTA' value='29'/> + <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJUSED' value='30'/> + <enumerator name='ZFS_DELEG_NOTE_PROJECTOBJQUOTA' value='31'/> + <enumerator name='ZFS_DELEG_NOTE_NONE' value='32'/> </enum-decl> <typedef-decl name='zfs_deleg_note_t' type-id='729d4547' id='4613c173'/> <class-decl name='zfs_deleg_perm_tab' size-in-bits='128' is-struct='yes' visibility='default' id='5aa05c1f'> diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_config.c b/sys/contrib/openzfs/lib/libzfs/libzfs_config.c index 0d2102191389..9d704e4303ff 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_config.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_config.c @@ -308,6 +308,23 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) } /* + * Copies the pool config and state from szhp to dzhp. szhp and dzhp must + * represent the same pool. Used by pool_list_refresh() to avoid another + * round-trip into the kernel to get stats already collected earlier in the + * function. + */ +void +zpool_refresh_stats_from_handle(zpool_handle_t *dzhp, zpool_handle_t *szhp) +{ + VERIFY0(strcmp(dzhp->zpool_name, szhp->zpool_name)); + nvlist_free(dzhp->zpool_old_config); + dzhp->zpool_old_config = dzhp->zpool_config; + dzhp->zpool_config = fnvlist_dup(szhp->zpool_config); + dzhp->zpool_config_size = szhp->zpool_config_size; + dzhp->zpool_state = szhp->zpool_state; +} + +/* * The following environment variables are undocumented * and should be used for testing purposes only: * diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi index 7464b3adb254..263cad045f7a 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi @@ -617,6 +617,7 @@ <array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'> <subrange length='23' type-id='7359adad' id='fdd0f594'/> </array-type-def> + <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'> <subrange length='4' type-id='7359adad' id='16fe7105'/> </array-type-def> @@ -988,13 +989,6 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'> - <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'> - <subrange length='1' type-id='7359adad' id='52f813b4'/> - </array-type-def> - <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'> - <subrange length='12' type-id='7359adad' id='84827bdc'/> - </array-type-def> - <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_special' type-id='26a90f95' visibility='default'/> @@ -1029,93 +1023,6 
@@ <var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/> </data-member> </class-decl> - <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/> - <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/> - <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/> - <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/> - <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/> - <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'> - <data-member access='public' layout-offset-in-bits='0'> - <var-decl name='tv_sec' type-id='49659421' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='64'> - <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='96'> - <var-decl name='__reserved' type-id='3158a266' visibility='default'/> - </data-member> - </class-decl> - <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'> - <data-member access='public' layout-offset-in-bits='0'> - <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='32'> - <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='64'> - <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='128'> - <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='160'> - <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='192'> - <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='224'> - <var-decl name='stx_mode' type-id='d315442e' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='240'> - <var-decl name='__spare0' type-id='811205dc' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='256'> - <var-decl name='stx_ino' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='320'> - <var-decl name='stx_size' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='384'> - <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='448'> - <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='512'> - <var-decl name='stx_atime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='640'> - <var-decl name='stx_btime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='768'> - <var-decl name='stx_ctime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='896'> - <var-decl name='stx_mtime' type-id='94101016' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1024'> - <var-decl name='stx_rdev_major' 
type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1056'> - <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1088'> - <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1120'> - <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1152'> - <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1216'> - <var-decl name='__spare2' type-id='d3130597' visibility='default'/> - </data-member> - <data-member access='public' layout-offset-in-bits='1280'> - <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/> - </data-member> - </class-decl> <class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/> @@ -1191,8 +1098,6 @@ <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/> <pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/> <qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/> - <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/> - <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/> <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='3cad23cd'/> @@ -1208,14 +1113,6 @@ <parameter type-id='95e97e5e'/> <return type-id='26a90f95'/> </function-decl> - <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'> - <parameter type-id='95e97e5e'/> - <parameter type-id='9d26089a'/> - <parameter type-id='95e97e5e'/> - <parameter type-id='f0981eeb'/> - <parameter type-id='31d265b7'/> - <return type-id='95e97e5e'/> - </function-decl> <function-decl name='stat64' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='9d26089a'/> <parameter type-id='f1cadedf'/> diff --git a/sys/contrib/openzfs/man/man8/zinject.8 b/sys/contrib/openzfs/man/man8/zinject.8 index 1d9e43aed5ec..704f6a7accd8 100644 --- a/sys/contrib/openzfs/man/man8/zinject.8 +++ b/sys/contrib/openzfs/man/man8/zinject.8 @@ -138,6 +138,20 @@ This injector is automatically cleared after the import is finished. . .It Xo .Nm zinject +.Fl E Ar delay +.Op Fl a +.Op Fl m +.Op Fl f Ar freq +.Op Fl l Ar level +.Op Fl r Ar range +.Op Fl T Ar iotype +.Op Fl t Ar type Ns | Ns Fl b Ar bookmark +.Xc +Inject pipeline ready stage delays for the given object or bookmark. +The delay is specified in milliseconds. +. +.It Xo +.Nm zinject .Fl I .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs .Ar pool diff --git a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 index cf69060da5ce..adae47f82eb1 100644 --- a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 @@ -65,10 +65,10 @@ property). .Cm upgrade .Fl v .Xc -Displays legacy ZFS versions supported by the this version of ZFS. +Displays legacy ZFS versions supported by this version of ZFS. See .Xr zpool-features 7 -for a description of feature flags features supported by this version of ZFS. 
+for a description of features supported by this version of ZFS. .It Xo .Nm zpool .Cm upgrade diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 393bfaa65ff5..ebc2c0eeb6d2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS) return (0); } +static void +warn_deprecated_sysctl(const char *old, const char *new) +{ + printf("WARNING: sysctl vfs.zfs.%s is deprecated. Use vfs.zfs.%s instead.\n", + old, new); +} + int param_set_arc_max(SYSCTL_HANDLER_ARGS) { @@ -185,9 +192,17 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_max = arc_c_max; + if (arg2 != 0) + warn_deprecated_sysctl("arc_max", "arc.max"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_max, "LU", + "Maximum ARC size in bytes (LEGACY)"); + int param_set_arc_min(SYSCTL_HANDLER_ARGS) { @@ -209,9 +224,17 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_min = arc_c_min; + if (arg2 != 0) + warn_deprecated_sysctl("arc_min", "arc.min"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_min, "LU", + "Minimum ARC size in bytes (LEGACY)"); + extern uint_t zfs_arc_free_target; int @@ -232,9 +255,22 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) zfs_arc_free_target = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_free_target", "arc.free_target"); + return (0); } +/* + * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on + * pagedaemon initialization. + */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim" + " (LEGACY)"); + int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { @@ -250,9 +286,193 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) arc_no_grow_shift = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_no_grow_shift, "I", + "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); + +extern uint64_t l2arc_write_max; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, + CTLFLAG_RWTUN, &l2arc_write_max, 0, + "Max write bytes per interval (LEGACY)"); + +extern uint64_t l2arc_write_boost; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, + CTLFLAG_RWTUN, &l2arc_write_boost, 0, + "Extra write bytes during device warmup (LEGACY)"); + +extern uint64_t l2arc_headroom; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, + CTLFLAG_RWTUN, &l2arc_headroom, 0, + "Number of max device writes to precache (LEGACY)"); + +extern uint64_t l2arc_headroom_boost; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, + CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, + "Compressed l2arc_headroom multiplier (LEGACY)"); + +extern uint64_t l2arc_feed_secs; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, + CTLFLAG_RWTUN, &l2arc_feed_secs, 0, + "Seconds between L2ARC writing (LEGACY)"); + +extern uint64_t l2arc_feed_min_ms; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, + CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, + "Min feed interval in milliseconds (LEGACY)"); + +extern 
int l2arc_noprefetch; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, + CTLFLAG_RWTUN, &l2arc_noprefetch, 0, + "Skip caching prefetched buffers (LEGACY)"); + +extern int l2arc_feed_again; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, + CTLFLAG_RWTUN, &l2arc_feed_again, 0, + "Turbo L2ARC warmup (LEGACY)"); + +extern int l2arc_norw; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, + CTLFLAG_RWTUN, &l2arc_norw, 0, + "No reads during writes (LEGACY)"); + +static int +param_get_arc_state_size(SYSCTL_HANDLER_ARGS) +{ + arc_state_t *state = (arc_state_t *)arg1; + int64_t val; + + val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +extern arc_state_t ARC_anon; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_anon, 0, param_get_arc_state_size, "Q", + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in anonymous state"); + +extern arc_state_t ARC_mru; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru, 0, param_get_arc_state_size, "Q", + "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mru state"); + +extern arc_state_t ARC_mru_ghost; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", + "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mru ghost state"); + +extern arc_state_t ARC_mfu; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu, 0, param_get_arc_state_size, "Q", + "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mfu state"); + +extern arc_state_t ARC_mfu_ghost; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", + "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mfu ghost state"); + +extern arc_state_t ARC_uncached; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, + 
CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_uncached, 0, param_get_arc_state_size, "Q", + "size of uncached state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, + &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in uncached state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, + &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in uncached state"); + +extern arc_state_t ARC_l2c_only; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_l2c_only, 0, param_get_arc_state_size, "Q", + "size of l2c_only state"); + +/* dbuf.c */ + +/* dmu.c */ + +/* dmu_zfetch.c */ + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); + +extern uint32_t zfetch_max_distance; + +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, + CTLFLAG_RWTUN, &zfetch_max_distance, 0, + "Max bytes to prefetch per stream (LEGACY)"); + +extern uint32_t zfetch_max_idistance; + +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, + CTLFLAG_RWTUN, &zfetch_max_idistance, 0, + "Max bytes to prefetch indirects for per stream (LEGACY)"); + +/* dsl_pool.c */ + +/* dnode.c */ + +/* dsl_scan.c */ + /* metaslab.c */ int @@ -313,6 +533,19 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); +extern uint_t zfs_remove_max_segment; + +SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, + CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, + "Largest contiguous segment ZFS will attempt to allocate when removing" + " a device"); + +extern int zfs_removal_suspend_progress; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, + CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, + "Ensures certain actions can happen while in the middle of a removal"); + /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy @@ -532,9 +765,18 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_min_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("min_auto_ashift", + "vdev.min_auto_ashift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, + param_set_min_auto_ashift, "IU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); + int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { @@ -551,9 +793,19 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_max_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("max_auto_ashift", + "vdev.max_auto_ashift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, + param_set_max_auto_ashift, "IU", + "Max ashift used when optimizing for logical -> physical sector size on" + " new top-level vdevs. (LEGACY)"); + /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. @@ -575,6 +827,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. 
Power of 2 greater than 4096."); +extern int vdev_validate_skip; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, + CTLFLAG_RDTUN, &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + +/* vdev_mirror.c */ + +/* vdev_queue.c */ + +extern uint_t zfs_vdev_max_active; + +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, + CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device." + " (LEGACY)"); + /* zio.c */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 0dd2ecd7fd8d..3ddbfcb97184 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = { .f_isfd = 1, .f_detach = zvol_filter_detach, .f_event = zvol_filter_vnode, + .f_copy = knote_triv_copy, }; extern uint_t zfs_geom_probe_vdev_key; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index bac166fcd89e..967a018640e1 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -484,7 +484,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (rq != NULL) { + /* + * Flush & trim requests go down the zvol_write codepath. Or + * more specifically: + * + * If request is a write, or if it's op_is_sync() and not a + * read, or if it's a flush, or if it's a discard, then send the + * request down the write path. 
+ */ + if (op_is_write(rq->cmd_flags) || + (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || + req_op(rq) == REQ_OP_FLUSH || + op_is_discard(rq->cmd_flags)) { + rw = WRITE; + } else { + rw = READ; + } + } else { + rw = bio_data_dir(bio); + } if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 864e3898b365..9190ae0362ea 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -364,8 +364,8 @@ zfs_prop_init(void) static const zprop_index_t xattr_table[] = { { "off", ZFS_XATTR_OFF }, - { "on", ZFS_XATTR_SA }, { "sa", ZFS_XATTR_SA }, + { "on", ZFS_XATTR_SA }, { "dir", ZFS_XATTR_DIR }, { NULL } }; diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 591e2dade59e..b677f90280d7 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq; static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -static arc_state_t ARC_anon; -/* */ arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -/* */ arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; -static arc_state_t ARC_uncached; +arc_state_t ARC_anon; +arc_state_t ARC_mru; +arc_state_t ARC_mru_ghost; +arc_state_t ARC_mfu; +arc_state_t ARC_mfu_ghost; +arc_state_t ARC_l2c_only; +arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, @@ -832,15 +832,15 @@ typedef struct arc_async_flush { #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -static int l2arc_feed_again = B_TRUE; /* turbo warmup */ -static int l2arc_norw = B_FALSE; /* no reads during writes */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 3d3a9c713568..51165d0bf723 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -57,19 +57,19 @@ static unsigned int zfetch_max_sec_reap = 2; /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -static unsigned int zfetch_max_distance = 8 * 1024 * 1024; +unsigned int 
zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -static unsigned int zfetch_max_distance = 64 * 1024 * 1024; +unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ -static unsigned int zfetch_hole_shift = 2; +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 6c150d31c669..e88d394b5229 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -2656,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } /* + * Adjust *offset to the next (or previous) block byte offset at lvl. + * Returns FALSE if *offset would overflow or underflow. + */ +static boolean_t +dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) +{ + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int span = lvl * epbs + dn->dn_datablkshift; + uint64_t blkid, maxblkid; + + if (span >= 8 * sizeof (uint64_t)) + return (B_FALSE); + + blkid = *offset >> span; + maxblkid = 1ULL << (8 * sizeof (*offset) - span); + if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) + *offset = (blkid + 1) << span; + else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) + *offset = (blkid << span) - 1; + else + return (B_FALSE); + + return (B_TRUE); +} + +/* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, @@ -2682,7 +2708,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t initial_offset = *offset; + uint64_t matched = *offset; int lvl, maxlvl; int error = 0; @@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, maxlvl = dn->dn_phys->dn_nlevels; - for (lvl = minlvl; lvl <= maxlvl; lvl++) { + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); - if (error != ESRCH) + if (error == 0 && lvl > minlvl) { + --lvl; + matched = *offset; + } else if (error == ESRCH && lvl < maxlvl && + dnode_next_block(dn, flags, &matched, lvl)) { + /* + * Continue search at next/prev offset in lvl+1 block. + * + * Usually we only search upwards at the start of the + * search as higher level blocks point at a matching + * minlvl block in most cases, but we backtrack if not. + * + * This can happen for txg > 0 searches if the block + * contains only BPs/dnodes freed at that txg. It also + * happens if we are still syncing out the tree, and + * some BP's at higher levels are not updated yet. + * + * We must adjust offset to avoid coming back to the + * same offset and getting stuck looping forever. This + * also deals with the case where offset is already at + * the beginning or end of the object. 
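+ *
+ * Illustrative arithmetic for the block step (example values, not
+ * part of the original comment): with 128 KiB data blocks
+ * (dn_datablkshift = 17) and dn_indblkshift = 17, epbs =
+ * 17 - SPA_BLKPTRSHIFT = 10, so at lvl = 1 the span is
+ * 10 + 17 = 27 bits and one L1 block covers 2^27 bytes (128 MiB).
+ * A forward dnode_next_block() then rounds *offset up to the next
+ * 128 MiB boundary before the search resumes one level up.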
+ */ + ++lvl; + *offset = matched; + } else { break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + } } /* @@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, error = 0; } - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? - initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 7db72b9b04b0..fd46127b6068 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index ea2d2c7227c8..d73195f1a21f 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. */ - zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); zfs_range_tree_stat_incr(rt, &rs_tmp); diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index cf28955b0c50..f615591e826b 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 6f7c060f97f8..0bead6d49666 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, - int mmp_flag) + int priority_flag) { (void) tag; int wlocks_held = 0; @@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || - (!mmp_flag && scl->scl_write_wanted)) { + (!priority_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } /* - * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * The spa_config_enter_priority() allows the mmp thread to cut in front of * 
outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is @@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) */ void -spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } @@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + +/* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to @@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index fc6d445f9785..c8d7280387a2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; -static int vdev_validate_skip = B_FALSE; +int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of @@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index c44f654b0261..0d4fdaa77ba0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, 
vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index e69e5598939e..c12713b107bf 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -122,7 +122,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -static uint_t zfs_vdev_max_active = 1000; +uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2ce0121324ad..2f7a739da241 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -105,7 +105,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -137,7 +137,7 @@ uint_t vdev_removal_max_span = 32 * 1024; * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ -static int zfs_removal_suspend_progress = 0; +int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 4cf8912d4269..aeea58bedfe4 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio) ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]); if (vd == NULL) { - if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) { + /* + * A deadlock workaround. The ddt_prune_unique_entries() + * -> prune_candidates_sync() code path takes the + * SCL_ZIO reader lock and may request it again here. + * If there is another thread that wants the SCL_ZIO + * writer lock, then scl_write_wanted will be set. + * Thus, the spa_config_enter_priority() is used to + * ignore pending writer requests. + * + * The locking should be revised to remove the need + * for this workaround. If that's not workable then + * it should only be applied to the zios involved in + * the pruning process. This impacts the read/write + * I/O balance while pruning. + */ + if (spa->spa_active_ddt_prune) + spa_config_enter_priority(spa, SCL_ZIO, zio, + RW_READER); + else + spa_config_enter(spa, SCL_ZIO, zio, + RW_READER); + } /* * The mirror_ops handle multiple DVAs in a single BP. 
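The effect of the priority variant used above is easiest to see outside the kernel. The following is a minimal userspace model of the reader path of spa_config_enter_impl() (shown in the spa_misc.c hunk earlier), written with POSIX threads; the scl_model type and function names are illustrative and are not OpenZFS API. A normal reader waits while a writer holds the lock or has queued, whereas a priority reader waits only on an active writer, which is what lets the MMP and DDT-prune paths avoid blocking behind queued writers:

#include <pthread.h>
#include <stdbool.h>

/* Illustrative model of one SCL lock's reader gate (not OpenZFS code). */
struct scl_model {
    pthread_mutex_t sm_lock;
    pthread_cond_t  sm_cv;
    bool            sm_writer;       /* a writer currently holds the lock */
    int             sm_write_wanted; /* writers queued behind readers */
    int             sm_readers;
};

static void
scl_model_enter_read(struct scl_model *sm, bool priority)
{
    pthread_mutex_lock(&sm->sm_lock);
    /*
     * A normal reader also defers to queued writers; a priority reader
     * ignores sm_write_wanted, so it cannot block behind a writer that
     * is itself waiting for the current readers to drain.
     */
    while (sm->sm_writer || (!priority && sm->sm_write_wanted > 0))
        pthread_cond_wait(&sm->sm_cv, &sm->sm_lock);
    sm->sm_readers++;
    pthread_mutex_unlock(&sm->sm_lock);
}

static void
scl_model_exit_read(struct scl_model *sm)
{
    pthread_mutex_lock(&sm->sm_lock);
    if (--sm->sm_readers == 0)
        pthread_cond_broadcast(&sm->sm_cv);
    pthread_mutex_unlock(&sm->sm_lock);
}

The design trade-off is the one the zio.c comment names: readers that skip the writer queue keep time-critical work moving, but starve writers if overused.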
@@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio) return (NULL); } + if (zio_injection_enabled) { + hrtime_t target = zio_handle_ready_delay(zio); + if (target != 0 && zio->io_target_timestamp == 0) { + zio->io_stage >>= 1; + zio->io_target_timestamp = target; + zio_delay_interrupt(zio); + return (NULL); + } + } + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index 981a1be4847c..287577018ed1 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); } +/* + * For testing, inject a delay before ready state. + */ +hrtime_t +zio_handle_ready_delay(zio_t *zio) +{ + inject_handler_t *handler; + hrtime_t now = gethrtime(); + hrtime_t target = 0; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DELAY_READY) + continue; + + /* If this handler matches, inject the delay */ + if (zio_match_iotype(zio, handler->zi_record.zi_iotype) && + zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + zio_match_dva(zio), &handler->zi_record, zio->io_error)) { + target = now + (hrtime_t)handler->zi_record.zi_timer; + break; + } + } + + rw_exit(&inject_lock); + return (target); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { diff --git a/sys/contrib/openzfs/scripts/zfs-tests.sh b/sys/contrib/openzfs/scripts/zfs-tests.sh index 04f3b6f32cb8..5a0a1a609448 100755 --- a/sys/contrib/openzfs/scripts/zfs-tests.sh +++ b/sys/contrib/openzfs/scripts/zfs-tests.sh @@ -38,6 +38,7 @@ DEBUG="" CLEANUP="yes" CLEANUPALL="no" KMSG="" +TIMEOUT_DEBUG="" LOOPBACK="yes" STACK_TRACER="no" FILESIZE="4G" @@ -364,6 +365,7 @@ OPTIONS: -k Disable cleanup after test failure -K Log test names to /dev/kmsg -f Use files only, disables block device tests + -O Dump debugging info to /dev/kmsg on test timeout -S Enable stack tracer (negative performance impact) -c Only create and populate constrained path -R Automatically rerun failing tests @@ -402,7 +404,7 @@ $0 -x EOF } -while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do +while getopts 'hvqxkKfScRmOn:d:Ds:r:?t:T:u:I:' OPTION; do case $OPTION in h) usage @@ -445,6 +447,9 @@ while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do export NFS=1 . 
"$nfsfile" ;; + O) + TIMEOUT_DEBUG="yes" + ;; d) FILEDIR="$OPTARG" ;; @@ -773,6 +778,7 @@ msg "${TEST_RUNNER}" \ "${DEBUG:+-D}" \ "${KMEMLEAK:+-m}" \ "${KMSG:+-K}" \ + "${TIMEOUT_DEBUG:+-O}" \ "-c \"${RUNFILES}\"" \ "-T \"${TAGS}\"" \ "-i \"${STF_SUITE}\"" \ @@ -783,6 +789,7 @@ msg "${TEST_RUNNER}" \ ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ ${KMSG:+-K} \ + ${TIMEOUT_DEBUG:+-O} \ -c "${RUNFILES}" \ -T "${TAGS}" \ -i "${STF_SUITE}" \ diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 2b002830c82f..9f531411fbe1 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -168,10 +168,10 @@ tags = ['functional', 'cli_root', 'zinject'] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', - 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', - 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup', - 'zdb_tunables'] + 'zdb_display_block', 'zdb_encrypted', 'zdb_encrypted_raw', + 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', + 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', + 'zdb_backup', 'zdb_tunables'] pre = post = tags = ['functional', 'cli_root', 'zdb'] @@ -395,8 +395,9 @@ tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', - 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_warn_create', + 'zpool_add_warn_degraded', 'zpool_add_warn_removal', 'add-o_ashift', + 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] @@ -490,6 +491,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 +[tests/functional/cli_root/zpool_iostat] +tests = ['zpool_iostat_interval_all', 'zpool_iostat_interval_some'] +tags = ['functional', 'cli_root', 'zpool_iostat'] + [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] @@ -1085,7 +1090,8 @@ tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', - 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat'] + 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_014_pos', + 'xattr_compat'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] diff --git a/sys/contrib/openzfs/tests/runfiles/sanity.run b/sys/contrib/openzfs/tests/runfiles/sanity.run index b56ffc3a4a2d..249b415029c4 100644 --- a/sys/contrib/openzfs/tests/runfiles/sanity.run +++ b/sys/contrib/openzfs/tests/runfiles/sanity.run @@ -622,7 +622,7 @@ tags = ['functional', 'vdev_zaps'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', - 'xattr_011_pos', 'xattr_013_pos', 'xattr_compat'] + 'xattr_011_pos', 'xattr_013_pos', 'xattr_014_pos', 
'xattr_compat'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in b/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in index 2158208be6e5..d2c1185e4a94 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in @@ -34,6 +34,7 @@ from select import select from subprocess import PIPE from subprocess import Popen from subprocess import check_output +from subprocess import run from threading import Timer from time import time, CLOCK_MONOTONIC from os.path import exists @@ -187,6 +188,63 @@ User: %s ''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user) def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False): + + """ + We're about to kill a command due to a timeout. + If we're running with the -O option, then dump debug info about the + process with the highest CPU usage to /dev/kmsg (Linux only). This can + help debug the timeout. + + Debug info includes: + - 30 lines from 'top' + - /proc/<PID>/stack output of process with highest CPU usage + - Last lines strace-ing process with highest CPU usage + """ + if exists("/dev/kmsg"): + c = """ +TOP_OUT="$(COLUMNS=160 top -b -n 1 | head -n 30)" +read -r PID CMD <<< $(echo "$TOP_OUT" | /usr/bin/awk \ "/COMMAND/{ + print_next=1 + next +} +{ + if (print_next == 1) { + print \\$1\\" \\"\\$12 + exit + } +}") +echo "##### ZTS timeout debug #####" +echo "----- top -----" +echo "$TOP_OUT" +echo "----- /proc/$PID/stack ($CMD) -----" +cat /proc/$PID/stack +echo "----- strace ($CMD) -----" +TMPFILE="$(mktemp --suffix=ZTS)" +/usr/bin/strace -k --stack-traces -p $PID &> "$TMPFILE" & +sleep 0.1 +killall strace +tail -n 30 $TMPFILE +rm "$TMPFILE" +echo "##### /proc/sysrq-trigger stack #####" +""" + c = "sudo bash -c '" + c + "'" + data = run(c, capture_output=True, shell=True, text=True) + out = data.stdout + try: + kp = Popen([SUDO, "sh", "-c", + "echo '" + out + "' > /dev/kmsg"]) + kp.wait() + + """ + Trigger kernel stack traces + """ + kp = Popen([SUDO, "sh", "-c", + "echo l > /proc/sysrq-trigger"]) + kp.wait() + except Exception: + pass + """ Kill a running command due to timeout, or ^C from the keyboard. If sudo is required, this user was verified previously. 
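For reference, the dump above reaches the kernel log through /dev/kmsg, where each write(2) becomes one log record visible in dmesg. A minimal C equivalent of the harness's `echo ... > /dev/kmsg` step (Linux only, needs appropriate privileges; this is a sketch for illustration, not part of the patch):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Emit one record into the kernel log ring buffer (shows up in dmesg). */
static int
kmsg_write(const char *msg)
{
    int fd = open("/dev/kmsg", O_WRONLY);

    if (fd == -1)
        return (-1);
    /* Each write(2) to /dev/kmsg is treated as a single log record. */
    ssize_t n = write(fd, msg, strlen(msg));
    (void) close(fd);
    return (n == -1 ? -1 : 0);
}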
@@ -1129,6 +1187,9 @@ def parse_args(): parser.add_option('-o', action='callback', callback=options_cb, default=BASEDIR, dest='outputdir', type='string', metavar='outputdir', help='Specify an output directory.') + parser.add_option('-O', action='store_true', default=False, + dest='timeout_debug', + help='Dump debugging info to /dev/kmsg on test timeout') parser.add_option('-i', action='callback', callback=options_cb, default=TESTDIR, dest='testdir', type='string', metavar='testdir', help='Specify a test directory.') diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index 54b50c9dba77..127ea188f17f 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -76,8 +76,8 @@ READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled -REMOVAL_SUSPEND_PROGRESS vdev.removal_suspend_progress zfs_removal_suspend_progress -REMOVE_MAX_SEGMENT vdev.remove_max_segment zfs_remove_max_segment +REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress +REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms RESILVER_DEFER_PERCENT resilver_defer_percent zfs_resilver_defer_percent SCAN_LEGACY scan_legacy zfs_scan_legacy diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 1517f90e99a5..678c01b58f94 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -197,6 +197,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ + functional/cli_root/zpool_iostat/zpool_iostat.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ @@ -640,6 +641,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ + functional/cli_root/zdb/zdb_encrypted_raw.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ @@ -1027,7 +1029,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ - functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_create.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_removal.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ @@ -1178,6 +1182,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ 
functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \ + functional/cli_root/zpool_iostat/setup.ksh \ + functional/cli_root/zpool_iostat/cleanup.ksh \ + functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh \ + functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ @@ -2226,6 +2234,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/xattr/xattr_011_pos.ksh \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ + functional/xattr/xattr_014_pos.ksh \ functional/xattr/xattr_compat.ksh \ functional/zap_shrink/cleanup.ksh \ functional/zap_shrink/zap_shrink_001_pos.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh new file mode 100755 index 000000000000..85d267d5402f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# 'zdb -K ...' should enable reading from a raw-encrypted dataset +# +# STRATEGY: +# 1. Create an encrypted dataset +# 2. Write some data to a file +# 3. Run zdb -dddd on the file, confirm it can't be read +# 4. Run zdb -K ... -ddddd on the file, confirm it can be read +# + +verify_runnable "both" + +dataset="$TESTPOOL/$TESTFS2" +file="$TESTDIR2/somefile" +keyfile="$TEST_BASE_DIR/keyfile" + +function cleanup +{ + datasetexists "$dataset" && destroy_dataset "$dataset" -f + rm -f "$keyfile" + default_cleanup_noexit +} + +log_onexit cleanup + +log_must default_setup_noexit $DISKS + +log_assert "'zdb -K' should enable reading from a raw-encrypted dataset" + +# The key must be 32 bytes long. 
+echo -n "$RAWKEY" > "$keyfile" + +log_must zfs create -o mountpoint="$TESTDIR2" \ + -o encryption=on -o keyformat=raw -o keylocation="file://$keyfile" \ + "$dataset" + +echo 'my great encrypted text' > "$file" + +typeset -i obj=$(ls -i "$file" | cut -d' ' -f1) +typeset -i size=$(wc -c < "$file") + +log_note "test file $file is objid $obj, size $size" + +sync_pool "$TESTPOOL" true + +log_must eval "zdb -dddd $dataset $obj | grep -q 'object encrypted'" + +log_must eval "zdb -K $keyfile -dddd $dataset $obj | grep -q 'size\s$size$'" + +log_pass "'zdb -K' enables reading from a raw-encrypted dataset" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib index 091d65bb4f33..74780bb02141 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib @@ -27,6 +27,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib @@ -89,3 +90,44 @@ function save_dump_dev fi echo $dumpdev } + +function zpool_create_add_setup +{ + typeset -i i=0 + + while ((i < 10)); do + log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i + + eval vdev$i=$TEST_BASE_DIR/vdev$i + ((i += 1)) + done + + if is_linux; then + vdev_lo="$(losetup -f "$vdev4" --show)" + elif is_freebsd; then + vdev_lo=/dev/"$(mdconfig -a -t vnode -f "$vdev4")" + else + vdev_lo="$(lofiadm -a "$vdev4")" + fi +} + +function zpool_create_add_cleanup +{ + datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + + if [[ -e $vdev_lo ]]; then + if is_linux; then + log_must losetup -d "$vdev_lo" + elif is_freebsd; then + log_must mdconfig -d -u "$vdev_lo" + else + log_must lofiadm -d "$vdev_lo" + fi + fi + + typeset -i i=0 + while ((i < 10)); do + rm -f $TEST_BASE_DIR/vdev$i + ((i += 1)) + done +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh index df085a2ec746..661e55998d8d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh @@ -23,67 +23,51 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -# -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib # # DESCRIPTION: -# Verify zpool add succeed when adding vdevs with matching redundancy. +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy for a healthy pool. # # STRATEGY: # 1. Create several files == $MINVDEVSIZE. # 2. Verify 'zpool add' succeeds with matching redundancy. # 3. Verify 'zpool add' warns with differing redundancy. -# 4. Verify 'zpool add' warns with differing redundancy after removal. 
# verify_runnable "global" -function cleanup -{ - datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - - typeset -i i=0 - while ((i < 10)); do - rm -f $TEST_BASE_DIR/vdev$i - ((i += 1)) - done -} - +log_assert "Verify 'zpool add' warns for differing redundancy." +log_onexit zpool_create_add_cleanup -log_assert "Verify 'zpool add' succeed with keywords combination." -log_onexit cleanup +zpool_create_add_setup -# 1. Create several files == $MINVDEVSIZE. typeset -i i=0 -while ((i < 10)); do - log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i - - eval vdev$i=$TEST_BASE_DIR/vdev$i - ((i += 1)) -done +typeset -i j=0 set -A redundancy0_create_args \ "$vdev0" set -A redundancy1_create_args \ "mirror $vdev0 $vdev1" \ - "raidz1 $vdev0 $vdev1" + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" set -A redundancy2_create_args \ "mirror $vdev0 $vdev1 $vdev2" \ - "raidz2 $vdev0 $vdev1 $vdev2" + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" set -A redundancy3_create_args \ "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ - "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" set -A redundancy0_add_args \ "$vdev5" \ @@ -93,21 +77,19 @@ set -A redundancy1_add_args \ "mirror $vdev5 $vdev6" \ "raidz1 $vdev5 $vdev6" \ "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ - "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" + "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \ + "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8" set -A redundancy2_add_args \ "mirror $vdev5 $vdev6 $vdev7" \ - "raidz2 $vdev5 $vdev6 $vdev7" + "raidz2 $vdev5 $vdev6 $vdev7" \ + "draid2 $vdev5 $vdev6 $vdev7" set -A redundancy3_add_args \ "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ - "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" - -set -A log_args "log" "$vdev4" -set -A cache_args "cache" "$vdev4" -set -A spare_args "spare" "$vdev4" - -typeset -i j=0 + "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid3 $vdev5 $vdev6 $vdev7 $vdev8" function zpool_create_add { @@ -148,30 +130,6 @@ function zpool_create_forced_add done } -function zpool_create_rm_add -{ - typeset -n create_args=$1 - typeset -n add_args=$2 - typeset -n rm_args=$3 - - i=0 - while ((i < ${#create_args[@]})); do - j=0 - while ((j < ${#add_args[@]})); do - log_must zpool create $TESTPOOL1 ${create_args[$i]} - log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} - log_must zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool remove $TESTPOOL1 ${rm_args[1]} - log_mustnot zpool add $TESTPOOL1 ${rm_args[1]} - log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} - log_must zpool destroy -f $TESTPOOL1 - - ((j += 1)) - done - ((i += 1)) - done -} - # 2. Verify 'zpool add' succeeds with matching redundancy. zpool_create_add redundancy0_create_args redundancy0_add_args zpool_create_add redundancy1_create_args redundancy1_add_args @@ -195,17 +153,4 @@ zpool_create_forced_add redundancy3_create_args redundancy0_add_args zpool_create_forced_add redundancy3_create_args redundancy1_add_args zpool_create_forced_add redundancy3_create_args redundancy2_add_args -# 4. Verify 'zpool add' warns with differing redundancy after removal. 
-zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args - -zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args - -zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args - -log_pass "'zpool add' succeed with keywords combination." +log_pass "Verify 'zpool add' warns for differing redundancy." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh new file mode 100755 index 000000000000..313eb3666f27 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh @@ -0,0 +1,204 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib + +# +# DESCRIPTION: +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy for a degraded pool. +# +# STRATEGY: +# 1. Create several files == $MINVDEVSIZE. +# 2. Verify 'zpool add' succeeds with matching redundancy +# 3. Verify 'zpool add' warns with differing redundancy when +# a. Degraded pool with replaced mismatch vdev (file vs disk) +# b. Degraded pool dRAID distributed spare active +# c. Degraded pool hot spare active +# + +verify_runnable "global" + +log_assert "Verify 'zpool add' warns for differing redundancy." 
+log_onexit zpool_create_add_cleanup + +zpool_create_add_setup + +set -A redundancy1_create_args \ + "mirror $vdev0 $vdev1" \ + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" + +set -A redundancy2_create_args \ + "mirror $vdev0 $vdev1 $vdev2" \ + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" + +set -A redundancy3_create_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_add_args \ + "mirror $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \ + "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8" + +set -A redundancy2_add_args \ + "mirror $vdev5 $vdev6 $vdev7" \ + "raidz2 $vdev5 $vdev6 $vdev7" \ + "draid2 $vdev5 $vdev6 $vdev7" + +set -A redundancy3_add_args \ + "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ + "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid3 $vdev5 $vdev6 $vdev7 $vdev8" + +set -A redundancy1_create_draid_args \ + "draid1:1s $vdev0 $vdev1 $vdev2" + +set -A redundancy2_create_draid_args \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev3" + +set -A redundancy3_create_draid_args \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_create_spare_args \ + "mirror $vdev0 $vdev1 spare $vdev_lo" \ + "raidz1 $vdev0 $vdev1 spare $vdev_lo" \ + "draid1 $vdev0 $vdev1 spare $vdev_lo" + +set -A redundancy2_create_spare_args \ + "mirror $vdev0 $vdev1 $vdev2 spare $vdev_lo" \ + "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" \ + "draid2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" + +set -A redundancy3_create_spare_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \ + "draid3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" + +set -A replace_args "$vdev1" "$vdev_lo" +set -A draid1_args "$vdev1" "draid1-0-0" +set -A draid2_args "$vdev1" "draid2-0-0" +set -A draid3_args "$vdev1" "draid3-0-0" + +typeset -i i=0 +typeset -i j=0 + +function zpool_create_degraded_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool offline -f $TESTPOOL1 ${rm_args[0]} + log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool destroy -f $TESTPOOL1 + log_must zpool labelclear -f ${rm_args[0]} + + ((j += 1)) + done + ((i += 1)) + done +} + +function zpool_create_forced_degraded_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool offline -f $TESTPOOL1 ${rm_args[0]} + log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} + log_must zpool destroy -f $TESTPOOL1 + log_must zpool labelclear -f ${rm_args[0]} + + ((j += 1)) + done + ((i += 1)) + done +} + +# 2. Verify 'zpool add' succeeds with matching redundancy and a degraded pool. 
+zpool_create_degraded_add redundancy1_create_args redundancy1_add_args replace_args +zpool_create_degraded_add redundancy2_create_args redundancy2_add_args replace_args +zpool_create_degraded_add redundancy3_create_args redundancy3_add_args replace_args + +# 3. Verify 'zpool add' warns with differing redundancy and a degraded pool. +# +# a. Degraded pool with replaced mismatch vdev (file vs disk) +zpool_create_forced_degraded_add redundancy1_create_args redundancy2_add_args replace_args +zpool_create_forced_degraded_add redundancy1_create_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy2_create_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy2_create_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy3_create_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy3_create_args redundancy2_add_args replace_args + +# b. Degraded pool dRAID distributed spare active + +zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy2_add_args draid1_args +zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy3_add_args draid1_args + +zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy1_add_args draid2_args +zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy3_add_args draid2_args + +zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy1_add_args draid3_args +zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy2_add_args draid3_args + +# c. Degraded pool hot spare active +zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy2_add_args replace_args +zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy2_add_args replace_args + +log_pass "Verify 'zpool add' warns for differing redundancy." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh new file mode 100755 index 000000000000..782858e301ac --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib + +# +# DESCRIPTION: +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy after removal. +# +# STRATEGY: +# 1. Create several files == $MINVDEVSIZE. +# 2. Verify 'zpool add' warns with differing redundancy after removal. +# + +verify_runnable "global" + +log_assert "Verify 'zpool add' warns for differing redundancy." +log_onexit zpool_create_add_cleanup + +zpool_create_add_setup + +typeset -i i=0 +typeset -i j=0 + +set -A redundancy1_create_args \ + "mirror $vdev0 $vdev1" \ + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" + +set -A redundancy2_create_args \ + "mirror $vdev0 $vdev1 $vdev2" \ + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" + +set -A redundancy3_create_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_add_args \ + "mirror $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \ + "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8" + +set -A redundancy2_add_args \ + "mirror $vdev5 $vdev6 $vdev7" \ + "raidz2 $vdev5 $vdev6 $vdev7" \ + "draid2 $vdev5 $vdev6 $vdev7" + +set -A redundancy3_add_args \ + "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ + "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid3 $vdev5 $vdev6 $vdev7 $vdev8" + +set -A log_args "log" "$vdev_lo" +set -A cache_args "cache" "$vdev_lo" +set -A spare_args "spare" "$vdev_lo" + + +function zpool_create_rm_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool remove $TESTPOOL1 ${rm_args[1]} + log_mustnot zpool add $TESTPOOL1 ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool destroy -f $TESTPOOL1 + + ((j += 1)) + done + ((i += 1)) + done +} + +# 2. Verify 'zpool add' warns with differing redundancy after removal. 
+zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args + +zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args + +zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh new file mode 100755 index 000000000000..099b5426031d --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# +# +. $STF_SUITE/include/libtest.shlib + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh new file mode 100755 index 000000000000..3529a0ccc015 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# +# +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib new file mode 100644 index 000000000000..ea4b0bd2756d --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib @@ -0,0 +1,235 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# Since we want to make sure that iostat responds correctly as pools appear and +# disappear, we run it in the background and capture its output to a file. +# Once we're done, we parse the output and ensure it matches what we'd expect +# from the operations we performed. +# +# Because iostat is producing output every interval, it may produce the "same" +# output for each step of the change; in fact, we want that to make sure we +# don't miss anything. So, we describe what we expect as a series of "chunks". +# Each chunk is a particular kind of output, which may repeat. Current known +# chunk types are: +# +# NOPOOL: the text "no pools available" +# HEADER: three lines, starting with "capacity", "pool" and "----" respectively. +# (the rough shape of the normal iostat header). +# POOL1: a line starting with "pool1" (stats line for a pool of that name) +# POOL2: a line starting with "pool2" +# POOLBOTH: three lines, starting with "pool1", "pool2" (either order) and +# "-----" respectively. (the pool stat output for multiple pools) +# +# (the parser may produce other chunks in a failed parse to assist with +# debugging, but they should never be part of the "wanted" output. See the +# parser commentary below). +# +# To help recognise the start of a new interval output, we run iostat with the +# -T u option, which will output a numeric timestamp before each header or +# second-or-later pool stat after the header. +# +# To keep the test run shorter, we use a subsecond interval, but to make sure +# nothing is missed, we sleep for three intervals after each change. + +typeset _iostat_out=$(mktemp) +typeset _iostat_pid="" + +function cleanup_iostat { + if [[ -n $_iostat_pid ]] ; then + kill -KILL $_iostat_pid || true + fi + rm -f $_iostat_out +} + +function start_iostat { + zpool iostat -T u $@ 0.1 > $_iostat_out 2>&1 & + _iostat_pid=$! +} + +function stop_iostat { + kill -TERM $_iostat_pid + wait $_iostat_pid + _iostat_pid="" +} + +function delay_iostat { + sleep 0.3 +} + +typeset -a _iostat_expect +function expect_iostat { + typeset chunk=$1 + _iostat_expect+=($chunk) +} + +# Parse the output. The `state` var is used to track state across +# multiple lines. 
The `last` var and the `_got_iostat` function are used +# to record the completed chunks, and to collapse repetitions. +typeset -a _iostat_got +typeset _iostat_last="" +typeset _iostat_state="" + +function _got_iostat { + typeset chunk=$1 + if [[ -n $chunk && $_iostat_last != $chunk ]] ; then + _iostat_last=$chunk + _iostat_got+=($chunk) + fi + _iostat_state="" +} + +function verify_iostat { + + cat $_iostat_out | while read line ; do + + # The "no pools available" text has no timestamp or other + # header, and should never appear in the middle of multiline + # chunk, so we can close any in-flight state. + if [[ $line = "no pools available" ]] ; then + _got_iostat $_iostat_state + _got_iostat "NOPOOL" + continue + fi + + # A run of digits alone on the line is a timestamp (the `-T u` + # switch to `iostat`). It closes any in-flight state as a + # complete chunk, and indicates the start of a new chunk. + if [[ -z ${line/#+([0-9])/} ]] ; then + _got_iostat $_iostat_state + _iostat_state="TIMESTAMP" + continue + fi + + # For this test, the first word of each line should be unique, + # so we extract it and use it for simplicity. + typeset first=${line%% *} + + # Header is emitted whenever the pool list changes. It has + # three lines: + # + # capacity operations bandwidth + # pool alloc free read write read write + # ---------- ----- ----- ----- ----- ----- ----- + # + # Each line moves the state; when we get to a run of dashes, we + # commit. Note that we check for one-or-more dashes, because + # the width can vary depending on the length of pool name. + # + if [[ $_iostat_state = "TIMESTAMP" && + $first = "capacity" ]] ; then + _iostat_state="INHEADER1" + continue + fi + if [[ $_iostat_state = "INHEADER1" && + $first = "pool" ]] ; then + _iostat_state="INHEADER2" + continue + fi + if [[ $_iostat_state = "INHEADER2" && + -z ${first/#+(-)/} ]] ; then + # Headers never repeat, so if the last committed chunk + # was a header, we commit this one as EXTRAHEADER so we + # can see it in the error output. + if [[ $_iostat_last = "HEADER" ]] ; then + _got_iostat "EXTRAHEADER" + elif [[ $_iostat_last != "EXTRAHEADER" ]] ; then + _got_iostat "HEADER" + fi + _iostat_state="HEADER" + continue + fi + + # A pool stat line looks like: + # + # pool1 147K 240M 0 0 0 0 + # + # If there are multiple pools, iostat follows them with a + # separator of dashed lines: + # + # pool1 147K 240M 0 0 0 0 + # pool2 147K 240M 0 0 0 0 + # ---------- ----- ----- ----- ----- ----- ----- + # + # Stats rows always start after a timestamp or a header. If the + # header was emitted, we won't see a timestamp here (it goes + # before the header). + # + # Because our test exercises both pools on their own and + # together, we allow pools in either order. In practice they + # are sorted, but that's a side-effect of the implementation + # (see zpool_compare()), so we're not going to rely on it here. + if [[ $first = "pool1" ]] || [[ $first = "pool2" ]] ; then + + # First line, track which one we saw. If it's a + # standalone line, it will be committed by the next + # NOPOOL or TIMESTAMP above (or the `_got_iostat` after + # the loop if this is the last line). + if [[ $_iostat_state == "TIMESTAMP" || + $_iostat_state == "HEADER" ]] ; then + if [[ $first = "pool1" ]] ; then + _iostat_state="POOL1" + elif [[ $first = "pool2" ]] ; then + _iostat_state="POOL2" + fi + continue + fi + + # If this is the second pool, we're in a multi-pool + # block, and need to look for the separator to close it + # out. 
+ if [[ $_iostat_state = "POOL1" && $first = "pool2" ]] || + [[ $_iostat_state = "POOL2" && $first = "pool1" ]] ; + then + _iostat_state="INPOOLBOTH" + continue + fi + fi + + # Separator after the stats block. + if [[ $_iostat_state = "INPOOLBOTH" && + -z ${first/#+(-)/} ]] ; then + _got_iostat "POOLBOTH" + continue + fi + + # Anything else will fall through to here. We commit any + # in-flight state, then "UNKNOWN", all to help with debugging. + if [[ $_iostat_state != "UNKNOWN" ]] ; then + _got_iostat $_iostat_state + _got_iostat "UNKNOWN" + fi + done + + # Close out any remaining state. + _got_iostat $_iostat_state + + # Compare what we wanted with what we got, and pass/fail the test! + if [[ "${_iostat_expect[*]}" != "${_iostat_got[*]}" ]] ; then + log_note "expected: ${_iostat_expect[*]}" + log_note " got: ${_iostat_got[*]}" + log_fail "zpool iostat did not produce expected output" + fi +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh new file mode 100755 index 000000000000..8e040058ec3e --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# `zpool iostat <N>` should keep running and update the pools it displays as +# pools are created/destroyed/imported/exported. + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib + +typeset vdev1=$(mktemp) +typeset vdev2=$(mktemp) + +function cleanup { + cleanup_iostat + + poolexists pool1 && destroy_pool pool1 + poolexists pool2 && destroy_pool pool2 + rm -f $vdev1 $vdev2 +} + +log_must mkfile $MINVDEVSIZE $vdev1 $vdev2 + +expect_iostat "NOPOOL" + +start_iostat + +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL1" +log_must zpool create pool1 $vdev1 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOLBOTH" +log_must zpool create pool2 $vdev2 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool export -a +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL2" +log_must zpool import -d $vdev2 pool2 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOLBOTH" +log_must zpool import -d $vdev1 pool1 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL2" +log_must zpool destroy pool1 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool destroy pool2 +delay_iostat + +stop_iostat + +verify_iostat + +log_pass "zpool iostat in interval mode follows pool updates" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh new file mode 100755 index 000000000000..ab1f258aa1cd --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# `zpool iostat <pools> <N>` should keep running and only show the listed pools. + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib + +typeset vdev1=$(mktemp) +typeset vdev2=$(mktemp) + +function cleanup { + cleanup_iostat + + poolexists pool1 && destroy_pool pool1 + poolexists pool2 && destroy_pool pool2 + rm -f $vdev1 $vdev2 +} + +log_must mkfile $MINVDEVSIZE $vdev1 $vdev2 + +log_must zpool create pool1 $vdev1 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL1" +start_iostat pool1 +delay_iostat + +log_must zpool create pool2 $vdev2 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool export -a +delay_iostat + +log_must zpool import -d $vdev2 pool2 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL1" +log_must zpool import -d $vdev1 pool1 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool destroy pool1 +delay_iostat + +log_must zpool destroy pool2 +delay_iostat + +stop_iostat + +verify_iostat + +log_pass "zpool iostat in interval mode with pools follows listed pool updates" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib index 0a402e71ee68..345239b88680 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib @@ -1234,10 +1234,10 @@ function verify_fs_aedsx typeset oldval set -A modes "on" "off" oldval=$(get_prop $perm $fs) - if [[ $oldval == "on" ]]; then - n=1 - elif [[ $oldval == "off" ]]; then + if [[ $oldval == "off" ]]; then n=0 + else + n=1 fi log_note "$user zfs set $perm=${modes[$n]} $fs" user_run $user zfs set $perm=${modes[$n]} $fs diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh index 26153aafbc02..0e79e9b8b70c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/setup.ksh @@ -39,6 +39,6 @@ verify_runnable "global" # create a pool without any features -log_must mkfile 128m $TMPDEV +log_must truncate -s $MINVDEVSIZE $TMPDEV log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh index d6bd69b7e134..e81d07794689 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh @@ -35,17 +35,19 @@ verify_runnable "global" -TESTFILE="$TESTDIR/file.bin" - log_assert "User accounting upgrade should not be executed on readonly pool" log_onexit cleanup_upgrade # 1. Create a pool with the feature@userobj_accounting disabled to simulate # a legacy pool from a previous ZFS version. -log_must zpool create -d -m $TESTDIR $TESTPOOL $TMPDEV +log_must zpool create -d $TESTPOOL $TMPDEV +log_must zfs create $TESTPOOL/$TESTFS + +MNTPNT=$(get_prop mountpoint $TESTPOOL/$TESTFS) +TESTFILE="$MNTPNT/file.bin" # 2. Create a file on the "legacy" dataset -log_must touch $TESTDIR/file.bin +log_must touch $TESTFILE # 3. Enable feature@userobj_accounting on the pool and verify it is only # "enabled" and not "active": upgrading starts when the filesystem is mounted @@ -54,12 +56,12 @@ log_must test "enabled" == "$(get_pool_prop 'feature@userobj_accounting' $TESTPO # 4. 
Export the pool and re-import it readonly, without mounting any filesystem log_must zpool export $TESTPOOL -log_must zpool import -o readonly=on -N -d "$(dirname $TMPDEV)" $TESTPOOL +log_must zpool import -o readonly=on -N -d $TEST_BASE_DIR $TESTPOOL # 5. Try to mount the root dataset manually without the "ro" option, then verify # filesystem status and the pool feature status (not "active") to ensure the # pool "readonly" status is enforced. -log_must mount -t zfs -o zfsutil $TESTPOOL $TESTDIR +log_must zfs mount -R $TESTPOOL log_must stat "$TESTFILE" log_mustnot touch "$TESTFILE" log_must test "enabled" == "$(get_pool_prop 'feature@userobj_accounting' $TESTPOOL)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh new file mode 100755 index 000000000000..d4c9a0a41816 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_014_pos.ksh @@ -0,0 +1,53 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/xattr/xattr_common.kshlib + +# +# DESCRIPTION: +# The default xattr should be shown as 'sa', not 'on', for clarity. +# +# STRATEGY: +# 1. Create a filesystem. +# 2. Verify that the xattr value is shown as 'sa'. +# 3. Manually set the value to 'dir', 'sa', 'on', and 'off'. +# 4. Verify that it is shown as 'dir', 'sa', 'sa', and 'off'. +# + +log_assert "The default and specific xattr values are displayed correctly." + +set -A args "dir" "sa" "on" "off" +set -A display "dir" "sa" "sa" "off" + +log_must eval "[[ 'sa' == '$(zfs get -Hpo value xattr $TESTPOOL)' ]]" + +for i in `seq 0 3`; do + log_must zfs set xattr="${args[$i]}" $TESTPOOL + log_must eval "[[ '${display[$i]}' == '$(zfs get -Hpo value xattr $TESTPOOL)' ]]" +done +log_pass "The default and specific xattr values are displayed correctly." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh index 571a698eb63a..502ebada22dc 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -50,17 +50,53 @@ fi typeset datafile1="$(mktemp -t zvol_misc_fua1.XXXXXX)" typeset datafile2="$(mktemp -t zvol_misc_fua2.XXXXXX)" +typeset datafile3="$(mktemp -t zvol_misc_fua3_log.XXXXXX)" typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL +typeset DISK1=${DISKS%% *} function cleanup { - rm "$datafile1" "$datafile2" + log_must zpool remove $TESTPOOL $datafile3 + rm "$datafile1" "$datafile2" "$datafile3" +} + +# Prints the total number of sync writes for a vdev +# $1: vdev +function get_sync +{ + zpool iostat -p -H -v -r $TESTPOOL $1 | \ + awk '/[0-9]+$/{s+=$4+$5} END{print s}' } function do_test { # Wait for udev to create symlinks to our zvol block_device_wait $zvolpath + # Write using sync (creates FLUSH calls after writes, but not FUA) + old_vdev_writes=$(get_sync $DISK1) + old_log_writes=$(get_sync $datafile3) + + log_must fio --name=write_iops --size=5M \ + --ioengine=libaio --verify=0 --bs=4K \ + --iodepth=1 --rw=randwrite --group_reporting=1 \ + --filename=$zvolpath --sync=1 + + vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes)) + log_writes=$(( $(get_sync $datafile3) - $old_log_writes)) + + # When we're doing sync writes, we should see many more writes go to + # the log vs the first vdev. Experiments show anywhere from a 160-320x + # ratio of writes to the log vs the first vdev (due to some straggler + # writes to the first vdev). + # + # Check that we have a large ratio (100x) of sync writes going to the + # log device + ratio=$(($log_writes / $vdev_writes)) + log_note "Got $log_writes log writes, $vdev_writes vdev writes." + if [ $ratio -lt 100 ] ; then + log_fail "Expected > 100x more log writes than vdev writes. 
" + fi + # Create a data file log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5 @@ -81,6 +117,8 @@ log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)" log_onexit cleanup log_must zfs set compression=off $TESTPOOL/$TESTVOL +log_must truncate -s 100M $datafile3 +log_must zpool add $TESTPOOL log $datafile3 log_note "Testing without blk-mq" diff --git a/sys/crypto/chacha20/chacha.c b/sys/crypto/chacha20/chacha.c index 52f7e18c651c..cb06003b0ecf 100644 --- a/sys/crypto/chacha20/chacha.c +++ b/sys/crypto/chacha20/chacha.c @@ -138,7 +138,7 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) for (;;) { if (bytes < 64) { #ifndef KEYSTREAM_ONLY - for (i = 0;i < bytes;++i) tmp[i] = m[i]; + for (i = 0; i < bytes; ++i) tmp[i] = m[i]; m = tmp; #endif ctarget = c; @@ -160,7 +160,7 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) x13 = j13; x14 = j14; x15 = j15; - for (i = 20;i > 0;i -= 2) { + for (i = 20; i > 0; i -= 2) { QUARTERROUND( x0, x4, x8,x12) QUARTERROUND( x1, x5, x9,x13) QUARTERROUND( x2, x6,x10,x14) @@ -240,7 +240,7 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) if (bytes <= 64) { if (bytes < 64) { - for (i = 0;i < bytes;++i) ctarget[i] = c[i]; + for (i = 0; i < bytes; ++i) ctarget[i] = c[i]; } x->input[12] = j12; x->input[13] = j13; diff --git a/sys/crypto/openssl/ossl_sha256.c b/sys/crypto/openssl/ossl_sha256.c index 4613a9409b44..50cb9739d114 100644 --- a/sys/crypto/openssl/ossl_sha256.c +++ b/sys/crypto/openssl/ossl_sha256.c @@ -74,11 +74,11 @@ ossl_sha256_init(void *c_) unsigned int nn; \ switch ((c)->md_len) \ { case SHA224_DIGEST_LENGTH: \ - for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \ + for (nn=0; nn < SHA224_DIGEST_LENGTH / 4; nn++) \ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \ break; \ case SHA256_DIGEST_LENGTH: \ - for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \ + for (nn=0; nn < SHA256_DIGEST_LENGTH / 4; nn++) \ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \ break; \ default: \ diff --git a/sys/dev/aac/aac_linux.c b/sys/dev/aac/aac_linux.c index 609315f50939..65008c562342 100644 --- a/sys/dev/aac/aac_linux.c +++ b/sys/dev/aac/aac_linux.c @@ -52,15 +52,7 @@ #define AAC_LINUX_IOCTL_MIN 0x0000 #define AAC_LINUX_IOCTL_MAX 0x21ff -static linux_ioctl_function_t aac_linux_ioctl; -static struct linux_ioctl_handler aac_linux_handler = {aac_linux_ioctl, - AAC_LINUX_IOCTL_MIN, - AAC_LINUX_IOCTL_MAX}; - -SYSINIT (aac_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &aac_linux_handler); -SYSUNINIT(aac_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &aac_linux_handler); +LINUX_IOCTL_SET(aac, AAC_LINUX_IOCTL_MIN, AAC_LINUX_IOCTL_MAX); static int aac_linux_modevent(module_t mod, int type, void *data) diff --git a/sys/dev/aacraid/aacraid_linux.c b/sys/dev/aacraid/aacraid_linux.c index 267dd84c65d5..6b6259ed7059 100644 --- a/sys/dev/aacraid/aacraid_linux.c +++ b/sys/dev/aacraid/aacraid_linux.c @@ -54,15 +54,7 @@ #define AAC_LINUX_IOCTL_MIN 0x0000 #define AAC_LINUX_IOCTL_MAX 0x21ff -static linux_ioctl_function_t aacraid_linux_ioctl; -static struct linux_ioctl_handler aacraid_linux_handler = {aacraid_linux_ioctl, - AAC_LINUX_IOCTL_MIN, - AAC_LINUX_IOCTL_MAX}; - -SYSINIT (aacraid_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &aacraid_linux_handler); -SYSUNINIT(aacraid_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &aacraid_linux_handler); +LINUX_IOCTL_SET(aacraid, AAC_LINUX_IOCTL_MIN, AAC_LINUX_IOCTL_MAX); 
static int aacraid_linux_modevent(module_t mod, int type, void *data) diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c index 7f9ca6e39df8..3f0a7b40245d 100644 --- a/sys/dev/acpica/acpi.c +++ b/sys/dev/acpica/acpi.c @@ -3468,10 +3468,10 @@ acpi_EnterSleepState(struct acpi_softc *sc, enum power_stype stype) return_ACPI_STATUS (AE_OK); } - EVENTHANDLER_INVOKE(power_suspend_early); + EVENTHANDLER_INVOKE(power_suspend_early, stype); stop_all_proc(); suspend_all_fs(); - EVENTHANDLER_INVOKE(power_suspend); + EVENTHANDLER_INVOKE(power_suspend, stype); #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); @@ -3632,7 +3632,7 @@ backout: resume_all_fs(); resume_all_proc(); - EVENTHANDLER_INVOKE(power_resume); + EVENTHANDLER_INVOKE(power_resume, stype); /* Allow another sleep request after a while. */ callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME); diff --git a/sys/dev/acpica/acpi_apei.c b/sys/dev/acpica/acpi_apei.c index 9cfd46c97430..624c81ad1b4f 100644 --- a/sys/dev/acpica/acpi_apei.c +++ b/sys/dev/acpica/acpi_apei.c @@ -754,7 +754,7 @@ apei_detach(device_t dev) apei_nmi = NULL; apei_nmi_nges = NULL; if (sc->nges.swi_ih != NULL) { - swi_remove(&sc->nges.swi_ih); + swi_remove(sc->nges.swi_ih); sc->nges.swi_ih = NULL; } if (acpi_get_handle(dev) != NULL) { diff --git a/sys/dev/acpica/acpi_timer.c b/sys/dev/acpica/acpi_timer.c index 3d51a4211b80..b20912e2f5fb 100644 --- a/sys/dev/acpica/acpi_timer.c +++ b/sys/dev/acpica/acpi_timer.c @@ -34,6 +34,7 @@ #include <sys/module.h> #include <sys/sysctl.h> #include <sys/timetc.h> +#include <sys/power.h> #include <machine/bus.h> #include <machine/resource.h> @@ -69,8 +70,10 @@ bool acpi_timer_disabled = false; static void acpi_timer_identify(driver_t *driver, device_t parent); static int acpi_timer_probe(device_t dev); static int acpi_timer_attach(device_t dev); -static void acpi_timer_resume_handler(struct timecounter *); -static void acpi_timer_suspend_handler(struct timecounter *); +static void acpi_timer_resume_handler(struct timecounter *, + enum power_stype); +static void acpi_timer_suspend_handler(struct timecounter *, + enum power_stype); static u_int acpi_timer_get_timecount(struct timecounter *tc); static u_int acpi_timer_get_timecount_safe(struct timecounter *tc); static int acpi_timer_sysctl_freq(SYSCTL_HANDLER_ARGS); @@ -235,7 +238,7 @@ acpi_timer_attach(device_t dev) } static void -acpi_timer_resume_handler(struct timecounter *newtc) +acpi_timer_resume_handler(struct timecounter *newtc, enum power_stype stype) { struct timecounter *tc; @@ -251,7 +254,7 @@ acpi_timer_resume_handler(struct timecounter *newtc) } static void -acpi_timer_suspend_handler(struct timecounter *newtc) +acpi_timer_suspend_handler(struct timecounter *newtc, enum power_stype stype) { struct timecounter *tc; diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c index 82f56fc0d19e..2b4cb37275a6 100644 --- a/sys/dev/ahci/ahci_pci.c +++ b/sys/dev/ahci/ahci_pci.c @@ -467,28 +467,6 @@ ahci_ata_probe(device_t dev) } static int -ahci_pci_read_msix_bars(device_t dev, uint8_t *table_bar, uint8_t *pba_bar) -{ - int cap_offset = 0, ret; - uint32_t val; - - if ((table_bar == NULL) || (pba_bar == NULL)) - return (EINVAL); - - ret = pci_find_cap(dev, PCIY_MSIX, &cap_offset); - if (ret != 0) - return (EINVAL); - - val = pci_read_config(dev, cap_offset + PCIR_MSIX_TABLE, 4); - *table_bar = PCIR_BAR(val & PCIM_MSIX_BIR_MASK); - - val = pci_read_config(dev, cap_offset + PCIR_MSIX_PBA, 4); - *pba_bar = PCIR_BAR(val & PCIM_MSIX_BIR_MASK); - - return (0); 
-} - -static int ahci_pci_attach(device_t dev) { struct ahci_controller *ctlr = device_get_softc(dev); @@ -496,7 +474,6 @@ ahci_pci_attach(device_t dev) uint32_t devid = pci_get_devid(dev); uint8_t revid = pci_get_revid(dev); int msi_count, msix_count; - uint8_t table_bar = 0, pba_bar = 0; uint32_t caps, pi; msi_count = pci_msi_count(dev); @@ -584,20 +561,11 @@ ahci_pci_attach(device_t dev) if (ctlr->quirks & AHCI_Q_NOMSIX) msix_count = 0; - /* Read MSI-x BAR IDs if supported */ - if (msix_count > 0) { - error = ahci_pci_read_msix_bars(dev, &table_bar, &pba_bar); - if (error == 0) { - ctlr->r_msix_tab_rid = table_bar; - ctlr->r_msix_pba_rid = pba_bar; - } else { - /* Failed to read BARs, disable MSI-x */ - msix_count = 0; - } - } - /* Allocate resources for MSI-x table and PBA */ if (msix_count > 0) { + ctlr->r_msix_tab_rid = pci_msix_table_bar(dev); + ctlr->r_msix_pba_rid = pci_msix_pba_bar(dev); + /* * Allocate new MSI-x table only if not * allocated before. @@ -608,8 +576,8 @@ ahci_pci_attach(device_t dev) ctlr->r_msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ctlr->r_msix_tab_rid, RF_ACTIVE); if (ctlr->r_msix_table == NULL) { - ahci_free_mem(dev); - return (ENXIO); + msix_count = 0; + goto no_msix; } } @@ -624,12 +592,12 @@ ahci_pci_attach(device_t dev) ctlr->r_msix_pba = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ctlr->r_msix_pba_rid, RF_ACTIVE); if (ctlr->r_msix_pba == NULL) { - ahci_free_mem(dev); - return (ENXIO); + msix_count = 0; } } } +no_msix: pci_enable_busmaster(dev); /* Reset controller */ if ((error = ahci_pci_ctlr_reset(dev)) != 0) { diff --git a/sys/dev/ahci/ahciem.c b/sys/dev/ahci/ahciem.c index c9e6c35f4233..012796aa5d6f 100644 --- a/sys/dev/ahci/ahciem.c +++ b/sys/dev/ahci/ahciem.c @@ -479,7 +479,7 @@ ahci_em_emulate_ses_on_led(device_t dev, union ccb *ccb) else ads->common.bytes[0] |= SES_OBJSTAT_NOTINSTALLED; if (ch->disablephy) - ads->common.bytes[3] |= SESCTL_DEVOFF; + ads->bytes[2] |= SESCTL_DEVOFF; ahci_putch(ch); } ccb->ccb_h.status = CAM_REQ_CMP; diff --git a/sys/dev/aic7xxx/aic79xx.c b/sys/dev/aic7xxx/aic79xx.c index 2b5015b20e41..cee45fa5cc8a 100644 --- a/sys/dev/aic7xxx/aic79xx.c +++ b/sys/dev/aic7xxx/aic79xx.c @@ -7788,8 +7788,8 @@ ahd_abort_scbs(struct ahd_softc *ahd, int target, char channel, } if (role != ROLE_TARGET) { - for (;i < maxtarget; i++) { - for (j = minlun;j < maxlun; j++) { + for (; i < maxtarget; i++) { + for (j = minlun; j < maxlun; j++) { u_int scbid; u_int tcl; diff --git a/sys/dev/aic7xxx/aic7xxx.c b/sys/dev/aic7xxx/aic7xxx.c index c09876e9f589..18f68b806948 100644 --- a/sys/dev/aic7xxx/aic7xxx.c +++ b/sys/dev/aic7xxx/aic7xxx.c @@ -5903,8 +5903,8 @@ ahc_abort_scbs(struct ahc_softc *ahc, int target, char channel, } if (role != ROLE_TARGET) { - for (;i < maxtarget; i++) { - for (j = minlun;j < maxlun; j++) { + for (; i < maxtarget; i++) { + for (j = minlun; j < maxlun; j++) { u_int scbid; u_int tcl; diff --git a/sys/dev/atkbdc/psm.c b/sys/dev/atkbdc/psm.c index 8563b5f93aa2..137758b104d3 100644 --- a/sys/dev/atkbdc/psm.c +++ b/sys/dev/atkbdc/psm.c @@ -5287,6 +5287,7 @@ static const struct filterops psmfiltops = { .f_isfd = 1, .f_detach = psmfilter_detach, .f_event = psmfilter, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/cyapa/cyapa.c b/sys/dev/cyapa/cyapa.c index ed755f992949..464b03c0ab64 100644 --- a/sys/dev/cyapa/cyapa.c +++ b/sys/dev/cyapa/cyapa.c @@ -1121,7 +1121,8 @@ static int cyapafilt(struct knote *, long); static const struct filterops cyapa_filtops = { .f_isfd = 1, .f_detach = cyapafiltdetach, - 
.f_event = cyapafilt + .f_event = cyapafilt, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/dc/if_dc.c b/sys/dev/dc/if_dc.c index bed74c3b6181..5c1d7ff30976 100644 --- a/sys/dev/dc/if_dc.c +++ b/sys/dev/dc/if_dc.c @@ -999,7 +999,7 @@ dc_setfilt_21143(struct dc_softc *sc) else DC_CLRBIT(sc, DC_NETCFG, DC_NETCFG_RX_ALLMULTI); - if_foreach_llmaddr(ifp, dc_hash_maddr_21143, sp); + if_foreach_llmaddr(ifp, dc_hash_maddr_21143, sc); if (if_getflags(ifp) & IFF_BROADCAST) { h = dc_mchash_le(sc, if_getbroadcastaddr(ifp)); diff --git a/sys/dev/enetc/if_enetc.c b/sys/dev/enetc/if_enetc.c index 808397b229a7..53002f9d73ce 100644 --- a/sys/dev/enetc/if_enetc.c +++ b/sys/dev/enetc/if_enetc.c @@ -848,7 +848,7 @@ enetc_hash_vid(uint16_t vid) bool bit; int i; - for (i = 0;i < 6;i++) { + for (i = 0; i < 6; i++) { bit = vid & BIT(i); bit ^= !!(vid & BIT(i + 6)); hash |= bit << i; @@ -1020,7 +1020,7 @@ enetc_msix_intr_assign(if_ctx_t ctx, int msix) ENETC_RBICR0_ICEN | ENETC_RBICR0_SET_ICPT(ENETC_RX_INTR_PKT_THR)); } vector = 0; - for (i = 0;i < sc->tx_num_queues; i++, vector++) { + for (i = 0; i < sc->tx_num_queues; i++, vector++) { tx_queue = &sc->tx_queues[i]; snprintf(irq_name, sizeof(irq_name), "txq%d", i); iflib_softirq_alloc_generic(ctx, &tx_queue->irq, @@ -1130,7 +1130,7 @@ enetc_isc_txd_encap(void *data, if_pkt_info_t ipi) } /* Now add remaining descriptors. */ - for (;i < ipi->ipi_nsegs; i++) { + for (; i < ipi->ipi_nsegs; i++) { desc = &queue->ring[pidx]; bzero(desc, sizeof(*desc)); desc->addr = segs[i].ds_addr; diff --git a/sys/dev/evdev/cdev.c b/sys/dev/evdev/cdev.c index 9fe1299a0937..dd4115cdfc71 100644 --- a/sys/dev/evdev/cdev.c +++ b/sys/dev/evdev/cdev.c @@ -96,6 +96,7 @@ static const struct filterops evdev_cdev_filterops = { .f_attach = NULL, .f_detach = evdev_kqdetach, .f_event = evdev_kqread, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/evdev/uinput.c b/sys/dev/evdev/uinput.c index 9ac9fee8a157..76a530479c02 100644 --- a/sys/dev/evdev/uinput.c +++ b/sys/dev/evdev/uinput.c @@ -104,6 +104,7 @@ static const struct filterops uinput_filterops = { .f_attach = NULL, .f_detach = uinput_kqdetach, .f_event = uinput_kqread, + .f_copy = knote_triv_copy, }; struct uinput_cdev_state diff --git a/sys/dev/fdt/fdt_slicer.c b/sys/dev/fdt/fdt_slicer.c index 3ba4eddf8b61..50112db5cfae 100644 --- a/sys/dev/fdt/fdt_slicer.c +++ b/sys/dev/fdt/fdt_slicer.c @@ -45,7 +45,7 @@ static int fill_slices(device_t dev, const char *provider, struct flash_slice *slices, int *slices_num); -static void fdt_slicer_init(void); +static void fdt_slicer_init(void *); static int fill_slices_from_node(phandle_t node, struct flash_slice *slices, int *count) @@ -138,7 +138,7 @@ fill_slices(device_t dev, const char *provider __unused, } static void -fdt_slicer_init(void) +fdt_slicer_init(void *dummy __unused) { flash_register_slicer(fill_slices, FLASH_SLICES_TYPE_NAND, false); @@ -147,7 +147,7 @@ fdt_slicer_init(void) } static void -fdt_slicer_cleanup(void) +fdt_slicer_cleanup(void *dummy __unused) { flash_register_slicer(NULL, FLASH_SLICES_TYPE_NAND, true); diff --git a/sys/dev/gpio/gpioc.c b/sys/dev/gpio/gpioc.c index 6c6f79227166..517f7752daad 100644 --- a/sys/dev/gpio/gpioc.c +++ b/sys/dev/gpio/gpioc.c @@ -158,7 +158,8 @@ static const struct filterops gpioc_read_filterops = { .f_attach = NULL, .f_detach = gpioc_kqdetach, .f_event = gpioc_kqread, - .f_touch = NULL + .f_touch = NULL, + .f_copy = knote_triv_copy, }; static struct gpioc_pin_event * diff --git a/sys/dev/hid/hidraw.c 
b/sys/dev/hid/hidraw.c index 4855843cd265..5b5e9b58f8bd 100644 --- a/sys/dev/hid/hidraw.c +++ b/sys/dev/hid/hidraw.c @@ -182,6 +182,7 @@ static const struct filterops hidraw_filterops_read = { .f_isfd = 1, .f_detach = hidraw_kqdetach, .f_event = hidraw_kqread, + .f_copy = knote_triv_copy, }; static void diff --git a/sys/dev/hid/u2f.c b/sys/dev/hid/u2f.c index 08f1a5ceedba..e1f696d72f01 100644 --- a/sys/dev/hid/u2f.c +++ b/sys/dev/hid/u2f.c @@ -132,6 +132,7 @@ static struct filterops u2f_filterops_read = { .f_isfd = 1, .f_detach = u2f_kqdetach, .f_event = u2f_kqread, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/hptmv/entry.c b/sys/dev/hptmv/entry.c index 5c4718bf582f..f3d58f285b39 100644 --- a/sys/dev/hptmv/entry.c +++ b/sys/dev/hptmv/entry.c @@ -430,7 +430,7 @@ static void device_change(IAL_ADAPTER_T *pAdapter , MV_U8 channelIndex, int plug if(pVDev->pParent) { int iMember; - for(iMember = 0; iMember < pVDev->pParent->u.array.bArnMember; iMember++) + for (iMember = 0; iMember < pVDev->pParent->u.array.bArnMember; iMember++) if((PVDevice)pVDev->pParent->u.array.pMember[iMember] == pVDev) pVDev->pParent->u.array.pMember[iMember] = NULL; pVDev->pParent = NULL; @@ -984,7 +984,7 @@ fRegisterVdevice(IAL_ADAPTER_T *pAdapter) PVBus pVBus; int i,j; - for(i=0;i<MV_SATA_CHANNELS_NUM;i++) { + for (i = 0; i < MV_SATA_CHANNELS_NUM; i++) { pPhysical = &(pAdapter->VDevices[i]); pLogical = pPhysical; while (pLogical->pParent) pLogical = pLogical->pParent; @@ -1027,8 +1027,7 @@ GetSpareDisk(_VBUS_ARG PVDevice pArray) PVDevice pVDevice, pFind = NULL; int i; - for(i=0;i<MV_SATA_CHANNELS_NUM;i++) - { + for (i=0; i < MV_SATA_CHANNELS_NUM; i++) { pVDevice = &pAdapter->VDevices[i]; if(!pVDevice) continue; @@ -1356,7 +1355,7 @@ unregister: goto unregister; } - for (i=0; i<MAX_COMMAND_BLOCKS_FOR_EACH_VBUS; i++) { + for (i = 0; i < MAX_COMMAND_BLOCKS_FOR_EACH_VBUS; i++) { FreeCommand(_VBUS_P &(pAdapter->pCommandBlocks[i])); } @@ -1370,7 +1369,7 @@ unregister: memset((void *)pAdapter->pbus_dmamap, 0, sizeof(struct _BUS_DMAMAP) * MAX_QUEUE_COMM); pAdapter->pbus_dmamap_list = 0; - for (i=0; i < MAX_QUEUE_COMM; i++) { + for (i = 0; i < MAX_QUEUE_COMM; i++) { PBUS_DMAMAP pmap = &(pAdapter->pbus_dmamap[i]); pmap->pAdapter = pAdapter; dmamap_put(pmap); @@ -1398,7 +1397,7 @@ unregister: pAdapter->prdTableAlignedAddr = (PUCHAR)(((ULONG_PTR)pAdapter->prdTableAddr + 0x1f) & ~(ULONG_PTR)0x1fL); { PUCHAR PRDTable = pAdapter->prdTableAlignedAddr; - for (i=0; i<PRD_TABLES_FOR_VBUS; i++) + for (i = 0; i < PRD_TABLES_FOR_VBUS; i++) { /* KdPrint(("i=%d,pAdapter->pFreePRDLink=%p\n",i,pAdapter->pFreePRDLink)); */ FreePRDTable(pAdapter, PRDTable); @@ -1447,7 +1446,7 @@ unregister: } #ifdef SUPPORT_ARRAY - for(i = MAX_ARRAY_DEVICE - 1; i >= 0; i--) { + for (i = MAX_ARRAY_DEVICE - 1; i >= 0; i--) { pVDev = ArrayTables(i); mArFreeArrayTable(pVDev); } @@ -1467,7 +1466,7 @@ unregister: _vbus_p->nInstances = 1; fRegisterVdevice(pAdapter); - for (channel=0;channel<MV_SATA_CHANNELS_NUM;channel++) { + for (channel = 0; channel < MV_SATA_CHANNELS_NUM; channel++) { pVDev = _vbus_p->pVDevice[channel]; if (pVDev && pVDev->vf_online) fCheckBootable(pVDev); @@ -1567,7 +1566,7 @@ fResetActiveCommands(PVBus _vbus_p) { MV_SATA_ADAPTER *pMvSataAdapter = &((IAL_ADAPTER_T *)_vbus_p->OsExt)->mvSataAdapter; MV_U8 channel; - for (channel=0;channel< MV_SATA_CHANNELS_NUM;channel++) { + for (channel = 0; channel < MV_SATA_CHANNELS_NUM; channel++) { if (pMvSataAdapter->sataChannel[channel] && 
pMvSataAdapter->sataChannel[channel]->outstandingCommands) MvSataResetChannel(pMvSataAdapter,channel); } @@ -1590,7 +1589,7 @@ check_cmds: dataxfer_poll(); xor_poll(); #endif - for (channel=0;channel< MV_SATA_CHANNELS_NUM;channel++) { + for (channel = 0; channel < MV_SATA_CHANNELS_NUM; channel++) { pMvSataChannel = pMvSataAdapter->sataChannel[channel]; if (pMvSataChannel && pMvSataChannel->outstandingCommands) { @@ -1716,7 +1715,7 @@ fDeviceSendCommand(_VBUS_ARG PCommand pCmd) MV_BOOLEAN is48bit; MV_U8 channel; - int i=0; + int i = 0; DECLARE_BUFFER(FPSCAT_GATH, tmpSg); @@ -2141,7 +2140,7 @@ FlushAdapter(IAL_ADAPTER_T *pAdapter) hpt_printk(("flush all devices\n")); /* flush all devices */ - for (i=0; i<MAX_VDEVICE_PER_VBUS; i++) { + for (i = 0; i < MAX_VDEVICE_PER_VBUS; i++) { PVDevice pVDev = pAdapter->VBus.pVDevice[i]; if(pVDev) fFlushVDev(pVDev); } @@ -2174,7 +2173,7 @@ Check_Idle_Call(IAL_ADAPTER_T *pAdapter) { int i; PVDevice pArray; - for(i = 0; i < MAX_ARRAY_PER_VBUS; i++){ + for (i = 0; i < MAX_ARRAY_PER_VBUS; i++) { if ((pArray=ArrayTables(i))->u.array.dArStamp==0) continue; else if (pArray->u.array.rf_auto_rebuild) { @@ -2378,7 +2377,7 @@ hpt_free_ccb(union ccb **ccb_Q, union ccb *ccb) static void hpt_worker_thread(void) { - for(;;) { + for (;;) { mtx_lock(&DpcQueue_Lock); while (DpcQueue_First!=DpcQueue_Last) { ST_HPT_DPC p; @@ -2418,7 +2417,7 @@ static void hpt_worker_thread(void) mtx_lock(&pAdapter->lock); _vbus_p = &pAdapter->VBus; - for (i=0;i<MAX_ARRAY_PER_VBUS;i++) + for (i = 0; i < MAX_ARRAY_PER_VBUS; i++) { if ((pArray=ArrayTables(i))->u.array.dArStamp==0) continue; @@ -2472,7 +2471,7 @@ launch_worker_thread(void) int i; PVDevice pVDev; - for(i = 0; i < MAX_ARRAY_PER_VBUS; i++) + for (i = 0; i < MAX_ARRAY_PER_VBUS; i++) if ((pVDev=ArrayTables(i))->u.array.dArStamp==0) continue; else{ diff --git a/sys/dev/hptmv/gui_lib.c b/sys/dev/hptmv/gui_lib.c index d78fdcca69d2..f11044db733a 100644 --- a/sys/dev/hptmv/gui_lib.c +++ b/sys/dev/hptmv/gui_lib.c @@ -86,8 +86,7 @@ check_VDevice_valid(PVDevice p) while(pAdapter != NULL) { _vbus_p = &pAdapter->VBus; - for (i=0;i<MAX_ARRAY_PER_VBUS;i++) - { + for (i = 0; i<MAX_ARRAY_PER_VBUS; i++) { pVDevice=ArrayTables(i); if ((pVDevice->u.array.dArStamp != 0) && (pVDevice == p)) return 0; @@ -244,9 +243,9 @@ static void get_array_info(PVDevice pVDevice, PHPT_ARRAY_INFO pArrayInfo) if(pVDevice->u.array.pMember[i] != NULL) pArrayInfo->Members[pArrayInfo->nDisk++] = VDEV_TO_ID(pVDevice->u.array.pMember[i]); - for(i=pArrayInfo->nDisk; i<MAX_ARRAY_MEMBERS; i++) + for (i = pArrayInfo->nDisk; i < MAX_ARRAY_MEMBERS; i++) pArrayInfo->Members[i] = INVALID_DEVICEID; - } +} static void get_array_info_v2(PVDevice pVDevice, PHPT_ARRAY_INFO_V2 pArrayInfo) { @@ -266,7 +265,7 @@ static void get_array_info_v2(PVDevice pVDevice, PHPT_ARRAY_INFO_V2 pArrayInfo) if(pVDevice->u.array.pMember[i] != NULL) pArrayInfo->Members[pArrayInfo->nDisk++] = VDEV_TO_ID(pVDevice->u.array.pMember[i]); - for(i=pArrayInfo->nDisk; i<MAX_ARRAY_MEMBERS_V2; i++) + for (i = pArrayInfo->nDisk; i < MAX_ARRAY_MEMBERS_V2; i++) pArrayInfo->Members[i] = INVALID_DEVICEID; } #endif @@ -461,8 +460,7 @@ found: pInfo->IoPort = 0; pInfo->ControlPort = 0; - for (i=0; i<2 ;i++) - { + for (i = 0; i < 2; i++) { pInfo->Devices[i] = (DEVICEID)INVALID_DEVICEID; } diff --git a/sys/dev/hptmv/hptproc.c b/sys/dev/hptmv/hptproc.c index 38fe61ee7e04..328750d9034c 100644 --- a/sys/dev/hptmv/hptproc.c +++ b/sys/dev/hptmv/hptproc.c @@ -107,7 +107,7 @@ hpt_set_asc_info(IAL_ADAPTER_T *pAdapter, char *buffer,int 
length) return -EINVAL; } - for (i=0;i<MV_SATA_CHANNELS_NUM;i++) + for (i = 0; i < MV_SATA_CHANNELS_NUM; i++) if(i == ichan) goto rebuild; diff --git a/sys/dev/ice/ice_common.c b/sys/dev/ice/ice_common.c index ad4ea4c8e7a1..b895f661bc46 100644 --- a/sys/dev/ice/ice_common.c +++ b/sys/dev/ice/ice_common.c @@ -213,6 +213,15 @@ int ice_set_mac_type(struct ice_hw *hw) case ICE_DEV_ID_E830_L_QSFP: case ICE_DEV_ID_E830C_SFP: case ICE_DEV_ID_E830_L_SFP: + case ICE_DEV_ID_E835CC_BACKPLANE: + case ICE_DEV_ID_E835CC_QSFP56: + case ICE_DEV_ID_E835CC_SFP: + case ICE_DEV_ID_E835C_BACKPLANE: + case ICE_DEV_ID_E835C_QSFP: + case ICE_DEV_ID_E835C_SFP: + case ICE_DEV_ID_E835_L_BACKPLANE: + case ICE_DEV_ID_E835_L_QSFP: + case ICE_DEV_ID_E835_L_SFP: hw->mac_type = ICE_MAC_E830; break; default: diff --git a/sys/dev/ice/ice_devids.h b/sys/dev/ice/ice_devids.h index 3f91e9dfbcaf..74712c61ae8e 100644 --- a/sys/dev/ice/ice_devids.h +++ b/sys/dev/ice/ice_devids.h @@ -62,6 +62,24 @@ #define ICE_DEV_ID_E830C_SFP 0x12DA /* Intel(R) Ethernet Controller E830-L for SFP */ #define ICE_DEV_ID_E830_L_SFP 0x12DE +/* Intel(R) Ethernet Controller E835-CC for backplane */ +#define ICE_DEV_ID_E835CC_BACKPLANE 0x1248 +/* Intel(R) Ethernet Controller E835-CC for QSFP */ +#define ICE_DEV_ID_E835CC_QSFP56 0x1249 +/* Intel(R) Ethernet Controller E835-CC for SFP */ +#define ICE_DEV_ID_E835CC_SFP 0x124A +/* Intel(R) Ethernet Controller E835-C for backplane */ +#define ICE_DEV_ID_E835C_BACKPLANE 0x1261 +/* Intel(R) Ethernet Controller E835-C for QSFP */ +#define ICE_DEV_ID_E835C_QSFP 0x1262 +/* Intel(R) Ethernet Controller E835-C for SFP */ +#define ICE_DEV_ID_E835C_SFP 0x1263 +/* Intel(R) Ethernet Controller E835-L for backplane */ +#define ICE_DEV_ID_E835_L_BACKPLANE 0x1265 +/* Intel(R) Ethernet Controller E835-L for QSFP */ +#define ICE_DEV_ID_E835_L_QSFP 0x1266 +/* Intel(R) Ethernet Controller E835-L for SFP */ +#define ICE_DEV_ID_E835_L_SFP 0x1267 /* Intel(R) Ethernet Controller E810-C for backplane */ #define ICE_DEV_ID_E810C_BACKPLANE 0x1591 /* Intel(R) Ethernet Controller E810-C for QSFP */ diff --git a/sys/dev/ice/ice_drv_info.h b/sys/dev/ice/ice_drv_info.h index 2a51a7394424..46965f4124bc 100644 --- a/sys/dev/ice/ice_drv_info.h +++ b/sys/dev/ice/ice_drv_info.h @@ -218,6 +218,45 @@ static const pci_vendor_info_t ice_vendor_info_array[] = { "Intel(R) Ethernet Network Adapter E830-XXV-2"), PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E830_L_SFP, "Intel(R) Ethernet Connection E830-L for SFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_BACKPLANE, + "Intel(R) Ethernet Connection E835-CC for backplane"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0001, 0, + "Intel(R) Ethernet Network Adapter E835-C-Q2"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0002, 0, + "Intel(R) Ethernet Network Adapter E835-C-Q2 for OCP 3.0"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0003, 0, + "Intel(R) Ethernet Network Adapter E835-CC-Q1"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0004, 0, + "Intel(R) Ethernet Network Adapter E835-CC-Q1 for OCP 3.0"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + "Intel(R) Ethernet Connection E835-CC for QSFP56"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0001, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-2 for OCP 3.0"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0003, 
0, + "Intel(R) Ethernet Network Adapter E835-XXV-2"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0004, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-4 for OCP 3.0"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + "Intel(R) Ethernet Connection E835-CC for SFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835C_BACKPLANE, + "Intel(R) Ethernet Connection E835-C for backplane"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835C_QSFP, + "Intel(R) Ethernet Connection E835-C for QSFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835C_SFP, + "Intel(R) Ethernet Connection E835-C for SFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835_L_BACKPLANE, + "Intel(R) Ethernet Connection E835-L for backplane"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835_L_QSFP, + "Intel(R) Ethernet Connection E835-L for QSFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835_L_SFP, + "Intel(R) Ethernet Connection E835-L for SFP"), PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E825C_BACKPLANE, "Intel(R) Ethernet Connection E825-C for backplane"), PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E825C_QSFP, diff --git a/sys/dev/iommu/iommu_gas.c b/sys/dev/iommu/iommu_gas.c index ffa8dc096adc..80e37341b3dc 100644 --- a/sys/dev/iommu/iommu_gas.c +++ b/sys/dev/iommu/iommu_gas.c @@ -77,7 +77,7 @@ static int iommu_check_free; #endif static void -intel_gas_init(void) +intel_gas_init(void *dummy __unused) { iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY", diff --git a/sys/dev/ipmi/ipmi_linux.c b/sys/dev/ipmi/ipmi_linux.c index 05eb30a0aa77..58872de12003 100644 --- a/sys/dev/ipmi/ipmi_linux.c +++ b/sys/dev/ipmi/ipmi_linux.c @@ -66,15 +66,7 @@ #define L_IPMICTL_SET_MY_LUN_CMD _IOW(IPMI_IOC_MAGIC, 19, unsigned int) #define L_IPMICTL_GET_MY_LUN_CMD _IOW(IPMI_IOC_MAGIC, 20, unsigned int) -static linux_ioctl_function_t ipmi_linux_ioctl; -static struct linux_ioctl_handler ipmi_linux_handler = {ipmi_linux_ioctl, - IPMI_LINUX_IOCTL_MIN, - IPMI_LINUX_IOCTL_MAX}; - -SYSINIT (ipmi_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &ipmi_linux_handler); -SYSUNINIT(ipmi_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &ipmi_linux_handler); +LINUX_IOCTL_SET(ipmi, IPMI_LINUX_IOCTL_MIN, IPMI_LINUX_IOCTL_MAX); static int ipmi_linux_modevent(module_t mod, int type, void *data) diff --git a/sys/dev/iwx/if_iwx.c b/sys/dev/iwx/if_iwx.c index 8422fcb787c3..04ed09f04604 100644 --- a/sys/dev/iwx/if_iwx.c +++ b/sys/dev/iwx/if_iwx.c @@ -4805,6 +4805,8 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_packet *pkt, static void iwx_clear_oactive(struct iwx_softc *sc, struct iwx_tx_ring *ring) { + IWX_ASSERT_LOCKED(sc); + if (ring->queued < iwx_lomark) { sc->qfullmsk &= ~(1 << ring->qid); if (sc->qfullmsk == 0 /* && ifq_is_oactive(&ifp->if_snd) */) { @@ -4890,11 +4892,19 @@ iwx_rx_bmiss(struct iwx_softc *sc, struct iwx_rx_packet *pkt, bus_dmamap_sync(sc->rxq.data_dmat, data->map, BUS_DMASYNC_POSTREAD); + IWX_DPRINTF(sc, IWX_DEBUG_BEACON, + "%s: mac_id=%u, cmslrx=%u, cmb=%u, neb=%d, nrb=%u\n", + __func__, + le32toh(mbn->mac_id), + le32toh(mbn->consec_missed_beacons_since_last_rx), + le32toh(mbn->consec_missed_beacons), + le32toh(mbn->num_expected_beacons), + le32toh(mbn->num_recvd_beacons)); + missed = le32toh(mbn->consec_missed_beacons_since_last_rx); if (missed > vap->iv_bmissthreshold) { ieee80211_beacon_miss(ic); } - } static int @@ -5491,6 +5501,9 @@ iwx_tx_fill_cmd(struct iwx_softc *sc, struct iwx_node *in, /* for non-data, use the lowest supported rate */ ridx = min_ridx; 
*flags |= IWX_TX_FLAGS_CMD_RATE; + } else if (ni->ni_flags & IEEE80211_NODE_VHT) { + /* TODO: VHT - the ridx / rate array doesn't have VHT rates yet */ + ridx = iwx_min_basic_rate(ic); } else if (ni->ni_flags & IEEE80211_NODE_HT) { ridx = iwx_mcs2ridx[ieee80211_node_get_txrate_dot11rate(ni) & ~IEEE80211_RATE_MCS]; @@ -5622,6 +5635,8 @@ iwx_tx(struct iwx_softc *sc, struct mbuf *m, struct ieee80211_node *ni) struct mbuf *m1; size_t txcmd_size; + IWX_ASSERT_LOCKED(sc); + wh = mtod(m, struct ieee80211_frame *); type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK; subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK; @@ -7308,97 +7323,107 @@ iwx_rs_init(struct iwx_softc *sc, struct iwx_node *in) return iwx_rs_init_v3(sc, in); } -static void -iwx_rs_update(struct iwx_softc *sc, struct iwx_tlc_update_notif *notif) + +/** + * @brief Turn the given TX rate control notification into an ieee80211_node_txrate + * + * This populates the given node's TX rate from the rate control notification. + * + * @param sc driver softc + * @param notif firmware notification + * @param ni ieee80211_node to update + * @returns true if updated, false if not + */ +static bool +iwx_rs_update_node_txrate(struct iwx_softc *sc, + const struct iwx_tlc_update_notif *notif, struct ieee80211_node *ni) { struct ieee80211com *ic = &sc->sc_ic; - struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); - struct ieee80211_node *ni = (void *)vap->iv_bss; + /* XXX TODO: create an inline function in if_iwxreg.h? */ + static int cck_idx_to_rate[] = { 2, 4, 11, 22, 2, 2, 2, 2 }; + static int ofdm_idx_to_rate[] = { 12, 18, 24, 36, 48, 72, 96, 108 }; - struct ieee80211_rateset *rs = &ni->ni_rates; uint32_t rate_n_flags; - uint8_t plcp, rval; - int i, cmd_ver, rate_n_flags_ver2 = 0; - - if (notif->sta_id != IWX_STATION_ID || - (le32toh(notif->flags) & IWX_TLC_NOTIF_FLAG_RATE) == 0) - return; + uint32_t type; + /* Extract the rate and command version */ rate_n_flags = le32toh(notif->rate); + if (sc->sc_rate_n_flags_version != 2) { + net80211_ic_printf(ic, + "%s: unsupported rate_n_flags version (%d)\n", + __func__, + sc->sc_rate_n_flags_version); + return (false); + } + if (sc->sc_debug & IWX_DEBUG_TXRATE) print_ratenflags(__func__, __LINE__, rate_n_flags, sc->sc_rate_n_flags_version); - cmd_ver = iwx_lookup_notif_ver(sc, IWX_DATA_PATH_GROUP, - IWX_TLC_MNG_UPDATE_NOTIF); - if (cmd_ver != IWX_FW_CMD_VER_UNKNOWN && cmd_ver >= 3) - rate_n_flags_ver2 = 1; - - if (rate_n_flags_ver2) { - uint32_t mod_type = (rate_n_flags & IWX_RATE_MCS_MOD_TYPE_MSK); - if (mod_type == IWX_RATE_MCS_HT_MSK) { - - ieee80211_node_set_txrate_dot11rate(ni, - IWX_RATE_HT_MCS_INDEX(rate_n_flags) | - IEEE80211_RATE_MCS); - IWX_DPRINTF(sc, IWX_DEBUG_TXRATE, - "%s:%d new MCS: %d rate_n_flags: %x\n", - __func__, __LINE__, - ieee80211_node_get_txrate_dot11rate(ni) & ~IEEE80211_RATE_MCS, - rate_n_flags); - return; - } - } else { - if (rate_n_flags & IWX_RATE_MCS_HT_MSK_V1) { - ieee80211_node_set_txrate_dot11rate(ni, 
ofdm_idx_to_rate[rate_n_flags & IWX_RATE_LEGACY_RATE_MSK]); + return (true); + case IWX_RATE_MCS_HT_MSK: + /* + * TODO: the current API doesn't include channel width + * and other flags, so we can't accurately store them yet! + * + * channel width: (flags & IWX_RATE_MCS_CHAN_WIDTH_MSK) + * >> IWX_RATE_MCS_CHAN_WIDTH_POS) + * LDPC: (flags & (1 << 16)) + */ + ieee80211_node_set_txrate_ht_mcsrate(ni, + IWX_RATE_HT_MCS_INDEX(rate_n_flags)); + return (true); + case IWX_RATE_MCS_VHT_MSK: + /* TODO: same comment on channel width, etc above */ + ieee80211_node_set_txrate_vht_rate(ni, + IWX_RATE_VHT_MCS_CODE(rate_n_flags), + IWX_RATE_VHT_MCS_NSS(rate_n_flags)); + return (true); + default: + net80211_ic_printf(ic, + "%s: unsupported chosen rate type in " + "IWX_RATE_MCS_MOD_TYPE (%d)\n", __func__, + type >> IWX_RATE_MCS_MOD_TYPE_POS); + return (false); } - if (rate_n_flags_ver2) { - const struct ieee80211_rateset *rs; - uint32_t ridx = (rate_n_flags & IWX_RATE_LEGACY_RATE_MSK); - if (rate_n_flags & IWX_RATE_MCS_LEGACY_OFDM_MSK) - rs = &ieee80211_std_rateset_11a; - else - rs = &ieee80211_std_rateset_11b; - if (ridx < rs->rs_nrates) - rval = (rs->rs_rates[ridx] & IEEE80211_RATE_VAL); - else - rval = 0; - } else { - plcp = (rate_n_flags & IWX_RATE_LEGACY_RATE_MSK_V1); + /* Default: if we get here, we didn't successfully update anything */ + return (false); +} - rval = 0; - for (i = IWX_RATE_1M_INDEX; i < nitems(iwx_rates); i++) { - if (iwx_rates[i].plcp == plcp) { - rval = iwx_rates[i].rate; - break; - } - } - } +/** + * @brief Process a firmware rate control update and update net80211. + * + * Since firmware is doing rate control, this just needs to update + * the txrate in the ieee80211_node entry. + */ +static void +iwx_rs_update(struct iwx_softc *sc, struct iwx_tlc_update_notif *notif) +{ + struct ieee80211com *ic = &sc->sc_ic; + struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); + /* XXX TODO: get a node ref! */ + struct ieee80211_node *ni = (void *)vap->iv_bss; - if (rval) { - uint8_t rv; - for (i = 0; i < rs->rs_nrates; i++) { - rv = rs->rs_rates[i] & IEEE80211_RATE_VAL; - if (rv == rval) { - ieee80211_node_set_txrate_dot11rate(ni, i); - break; - } - } - IWX_DPRINTF(sc, IWX_DEBUG_TXRATE, - "%s:%d new rate %d\n", __func__, __LINE__, - ieee80211_node_get_txrate_dot11rate(ni)); - } + /* + * For now the iwx driver only supports a single vdev with a single + * node; it doesn't yet support ibss/hostap/multiple vdevs. 
+ */ + if (notif->sta_id != IWX_STATION_ID || + (le32toh(notif->flags) & IWX_TLC_NOTIF_FLAG_RATE) == 0) + return; + + iwx_rs_update_node_txrate(sc, notif, ni); } static int @@ -8526,6 +8551,8 @@ iwx_start(struct iwx_softc *sc) struct ieee80211_node *ni; struct mbuf *m; + IWX_ASSERT_LOCKED(sc); + while (sc->qfullmsk == 0 && (m = mbufq_dequeue(&sc->sc_snd)) != NULL) { ni = (struct ieee80211_node *)m->m_pkthdr.rcvif; if (iwx_tx(sc, m, ni) != 0) { @@ -8985,10 +9012,10 @@ iwx_rx_pkt(struct iwx_softc *sc, struct iwx_rx_data *data, struct mbuf *ml) break; case IWX_MISSED_BEACONS_NOTIFICATION: + IWX_DPRINTF(sc, IWX_DEBUG_BEACON, + "%s: IWX_MISSED_BEACONS_NOTIFICATION\n", + __func__); iwx_rx_bmiss(sc, pkt, data); - DPRINTF(("%s: IWX_MISSED_BEACONS_NOTIFICATION\n", - __func__)); - ieee80211_beacon_miss(ic); break; case IWX_MFUART_LOAD_NOTIFICATION: diff --git a/sys/dev/iwx/if_iwxreg.h b/sys/dev/iwx/if_iwxreg.h index 6755b93fa0ba..f3d1f078b48e 100644 --- a/sys/dev/iwx/if_iwxreg.h +++ b/sys/dev/iwx/if_iwxreg.h @@ -5176,6 +5176,10 @@ enum { #define IWX_RATE_HT_MCS_INDEX(r) ((((r) & IWX_RATE_MCS_NSS_MSK) >> 1) | \ ((r) & IWX_RATE_HT_MCS_CODE_MSK)) +#define IWX_RATE_VHT_MCS_CODE(r) ((r) & IWX_RATE_HT_MCS_CODE_MSK) +#define IWX_RATE_VHT_MCS_NSS(r) \ + ((((r) & IWX_RATE_MCS_NSS_MSK) == 0) >> IWX_RATE_MCS_NSS_POS) + /* Bits 7-5: reserved */ /* diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index 6d08bd49bc04..1d36fd11f368 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -192,6 +192,8 @@ static int ixgbe_if_i2c_req(if_ctx_t, struct ifi2creq *); static bool ixgbe_if_needs_restart(if_ctx_t, enum iflib_restart_event); int ixgbe_intr(void *); +static int ixgbe_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data); + /************************************************************************ * Function prototypes ************************************************************************/ @@ -239,6 +241,13 @@ static void ixgbe_setup_vlan_hw_support(if_ctx_t); static void ixgbe_config_gpie(struct ixgbe_softc *); static void ixgbe_config_delay_values(struct ixgbe_softc *); +static void ixgbe_add_debug_sysctls(struct ixgbe_softc *sc); +static void ixgbe_add_debug_dump_sysctls(struct ixgbe_softc *sc); +static int ixgbe_debug_dump_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd); +static u8 ixgbe_debug_dump_print_cluster(struct ixgbe_softc *sc, + struct sbuf *sbuf, u8 cluster_id); +static int ixgbe_nvm_access_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd); + /* Sysctl handlers */ static int ixgbe_sysctl_flowcntl(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_advertise(SYSCTL_HANDLER_ARGS); @@ -260,6 +269,9 @@ static int ixgbe_sysctl_wol_enable(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_wufc(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_tso_tcp_flags_mask(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_debug_dump_set_clusters(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_dump_debug_dump(SYSCTL_HANDLER_ARGS); + /* Deferred interrupt tasklets */ static void ixgbe_handle_msf(void *); static void ixgbe_handle_mod(void *); @@ -330,6 +342,7 @@ static device_method_t ixgbe_if_methods[] = { DEVMETHOD(ifdi_get_counter, ixgbe_if_get_counter), DEVMETHOD(ifdi_i2c_req, ixgbe_if_i2c_req), DEVMETHOD(ifdi_needs_restart, ixgbe_if_needs_restart), + DEVMETHOD(ifdi_priv_ioctl, ixgbe_if_priv_ioctl), #ifdef PCI_IOV DEVMETHOD(ifdi_iov_init, ixgbe_if_iov_init), DEVMETHOD(ifdi_iov_uninit, ixgbe_if_iov_uninit), @@ -1015,6 +1028,8 @@ ixgbe_if_attach_pre(if_ctx_t ctx) if (hw->mac.type == ixgbe_mac_E610) 
ixgbe_init_aci(hw); + sc->do_debug_dump = false; + if (hw->mac.ops.fw_recovery_mode && hw->mac.ops.fw_recovery_mode(hw)) { device_printf(dev, @@ -1395,6 +1410,248 @@ ixgbe_if_needs_restart(if_ctx_t ctx __unused, enum iflib_restart_event event) } /************************************************************************ + * ixgbe_if_priv_ioctl - Ioctl handler for driver + * + * Handler for custom driver specific ioctls + * + * return 0 on success, positive on failure + ************************************************************************/ +static int +ixgbe_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data) +{ + struct ixgbe_softc *sc = iflib_get_softc(ctx); + struct ifdrv *ifd; + device_t dev = sc->dev; + + /* Make sure the command type is valid */ + switch (command) { + case SIOCSDRVSPEC: + case SIOCGDRVSPEC: + /* Accepted commands */ + break; + case SIOCGPRIVATE_0: + /* + * Although we do not support this ioctl command, it's expected + * that iflib will forward it to the IFDI_PRIV_IOCTL handler. + * Do not print a message in this case. + */ + return (ENOTSUP); + default: + /* + * If we get a different command for this function, it's + * definitely unexpected, so log a message indicating what + * command we got for debugging purposes. + */ + device_printf(dev, + "%s: unexpected ioctl command %08lx\n", + __func__, command); + return (EINVAL); + } + + ifd = (struct ifdrv *)data; + + switch (ifd->ifd_cmd) { + case IXGBE_NVM_ACCESS: + IOCTL_DEBUGOUT("ioctl: NVM ACCESS"); + return (ixgbe_nvm_access_ioctl(sc, ifd)); + case IXGBE_DEBUG_DUMP: + IOCTL_DEBUGOUT("ioctl: DEBUG DUMP"); + return (ixgbe_debug_dump_ioctl(sc, ifd)); + default: + IOCTL_DEBUGOUT1( + "ioctl: UNKNOWN SIOC(S|G)DRVSPEC (0x%X) command\n", + (int)ifd->ifd_cmd); + return (EINVAL); + } + + return (0); +} + +/************************************************************************ + * ixgbe_nvm_access_ioctl + * + * Handles an NVM access ioctl request + ************************************************************************/ +static int +ixgbe_nvm_access_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd) +{ + struct ixgbe_nvm_access_data *data; + struct ixgbe_nvm_access_cmd *cmd; + struct ixgbe_hw *hw = &sc->hw; + size_t ifd_len = ifd->ifd_len; + size_t malloc_len; + device_t dev = sc->dev; + u8 *nvm_buffer; + s32 error = 0; + + /* + * ifioctl forwards SIOCxDRVSPEC to iflib without conducting + * a privilege check. Subsequently, iflib passes the ioctl to the driver + * without verifying privileges. To prevent non-privileged threads from + * accessing this interface, perform a privilege check at this point. + */ + error = priv_check(curthread, PRIV_DRIVER); + if (error) + return (error); + + if (ifd_len < sizeof(*cmd)) { + device_printf(dev, + "%s: ifdrv length is too small. Got %zu, " + "but expected %zu\n", + __func__, ifd_len, sizeof(*cmd)); + return (EINVAL); + } + + if (ifd->ifd_data == NULL) { + device_printf(dev, "%s: No ifd data buffer.\n", + __func__); + return (EINVAL); + } + + malloc_len = max(ifd_len, sizeof(*data) + sizeof(*cmd)); + + nvm_buffer = (u8 *)malloc(malloc_len, M_IXGBE, M_ZERO | M_NOWAIT); + if (!nvm_buffer) + return (ENOMEM); + + /* Copy the NVM access command and data in from user space */ + error = copyin(ifd->ifd_data, nvm_buffer, ifd_len); + if (error) { + device_printf(dev, "%s: Failed to copy data in, error: %d\n", + __func__, error); + goto cleanup_free_nvm_buffer; + } + + /* + * The NVM command structure is immediately followed by data which + * varies in size based on the command. 
+ */ + cmd = (struct ixgbe_nvm_access_cmd *)nvm_buffer; + data = (struct ixgbe_nvm_access_data *) + (nvm_buffer + sizeof(struct ixgbe_nvm_access_cmd)); + + /* Handle the NVM access request */ + error = ixgbe_handle_nvm_access(hw, cmd, data); + if (error) { + device_printf(dev, "%s: NVM access request failed, error %d\n", + __func__, error); + } + + /* Copy the possibly modified contents of the handled request out */ + error = copyout(nvm_buffer, ifd->ifd_data, ifd_len); + if (error) { + device_printf(dev, "%s: Copying response back to " + "user space failed, error %d\n", + __func__, error); + goto cleanup_free_nvm_buffer; + } + +cleanup_free_nvm_buffer: + free(nvm_buffer, M_IXGBE); + return (error); +} + +/************************************************************************ + * ixgbe_debug_dump_ioctl + * + * Makes debug dump of internal FW/HW data. + ************************************************************************/ +static int +ixgbe_debug_dump_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd) +{ + struct ixgbe_debug_dump_cmd *dd_cmd; + struct ixgbe_hw *hw = &sc->hw; + size_t ifd_len = ifd->ifd_len; + device_t dev = sc->dev; + s32 error = 0; + + if (!(sc->feat_en & IXGBE_FEATURE_DBG_DUMP)) + return (ENODEV); + + /* Data returned from ACI command */ + u16 ret_buf_size = 0; + u16 ret_next_cluster = 0; + u16 ret_next_table = 0; + u32 ret_next_index = 0; + + /* + * ifioctl forwards SIOCxDRVSPEC to iflib without conducting + * a privilege check. Subsequently, iflib passes the ioctl to the driver + * without verifying privileges. To prevent non-privileged threads from + * accessing this interface, perform a privilege check at this point. + */ + error = priv_check(curthread, PRIV_DRIVER); + if (error) + return (error); + + if (ifd_len < sizeof(*dd_cmd)) { + device_printf(dev, + "%s: ifdrv length is too small. 
Got %zu, " + "but expected %zu\n", + __func__, ifd_len, sizeof(*dd_cmd)); + return (EINVAL); + } + + if (ifd->ifd_data == NULL) { + device_printf(dev, "%s: No ifd data buffer.\n", + __func__); + return (EINVAL); + } + + dd_cmd = (struct ixgbe_debug_dump_cmd *)malloc(ifd_len, M_IXGBE, + M_NOWAIT | M_ZERO); + if (!dd_cmd) { + error = ENOMEM; + goto out; + } + /* copy data from userspace */ + error = copyin(ifd->ifd_data, dd_cmd, ifd_len); + if (error) { + device_printf(dev, "%s: Failed to copy data in, error: %d\n", + __func__, error); + goto out; + } + + /* ACI command requires buf_size arg to be greater than 0 */ + if (dd_cmd->data_size == 0) { + device_printf(dev, "%s: data_size must be greater than 0\n", + __func__); + error = EINVAL; + goto out; + } + + /* Zero the data buffer memory space */ + memset(dd_cmd->data, 0, ifd_len - sizeof(*dd_cmd)); + + error = ixgbe_aci_get_internal_data(hw, dd_cmd->cluster_id, + dd_cmd->table_id, dd_cmd->offset, dd_cmd->data, dd_cmd->data_size, + &ret_buf_size, &ret_next_cluster, &ret_next_table, &ret_next_index); + if (error) { + device_printf(dev, + "%s: Failed to get internal FW/HW data, error: %d\n", + __func__, error); + goto out; + } + + dd_cmd->cluster_id = ret_next_cluster; + dd_cmd->table_id = ret_next_table; + dd_cmd->offset = ret_next_index; + dd_cmd->data_size = ret_buf_size; + + error = copyout(dd_cmd, ifd->ifd_data, ifd->ifd_len); + if (error) { + device_printf(dev, + "%s: Failed to copy data out, error: %d\n", + __func__, error); + } + +out: + free(dd_cmd, M_IXGBE); + + return (error); +} + +/************************************************************************ + * ixgbe_add_media_types ************************************************************************/ static void @@ -2883,6 +3140,264 @@ ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) } /* ixgbe_sysctl_interrupt_rate_handler */ /************************************************************************ + * ixgbe_debug_dump_print_cluster + ************************************************************************/ +static u8 +ixgbe_debug_dump_print_cluster(struct ixgbe_softc *sc, struct sbuf *sbuf, + u8 cluster_id) +{ + u16 data_buf_size = IXGBE_ACI_MAX_BUFFER_SIZE; + device_t dev = sc->dev; + struct ixgbe_hw *hw = &sc->hw; + const u8 reserved_buf[8] = {}; + int max_aci_calls = 1000; + int error, counter = 0; + u8 *data_buf; + + /* Input parameters / loop variables */ + u16 table_id = 0; + u32 offset = 0; + + /* Data returned from ACI command */ + u16 ret_buf_size = 0; + u16 ret_next_cluster = 0; + u16 ret_next_table = 0; + u32 ret_next_index = 0; + + data_buf = (u8 *)malloc(data_buf_size, M_IXGBE, M_NOWAIT | M_ZERO); + if (!data_buf) + return (0); + + DEBUGOUT2("%s: dumping cluster id (relative) %d\n", + __func__, cluster_id); + + do { + DEBUGOUT3("table_id 0x%04x offset 0x%08x buf_size %d\n", + table_id, offset, data_buf_size); + + error = ixgbe_aci_get_internal_data(hw, cluster_id, table_id, + offset, data_buf, data_buf_size, &ret_buf_size, + &ret_next_cluster, &ret_next_table, &ret_next_index); + if (error) { + device_printf(dev, + "%s: Failed to get internal FW/HW data, error: %d, " + "last aci status: %d\n", + __func__, error, hw->aci.last_status); + break; + } + + DEBUGOUT3("ret_table_id 0x%04x ret_offset 0x%08x " + "ret_buf_size %d\n", + ret_next_table, ret_next_index, ret_buf_size); + + /* Print cluster id */ + u32 print_cluster_id = (u32)cluster_id; + sbuf_bcat(sbuf, &print_cluster_id, sizeof(print_cluster_id)); + /* Print table id */ + u32 print_table_id = 
(u32)table_id; + sbuf_bcat(sbuf, &print_table_id, sizeof(print_table_id)); + /* Print table length */ + u32 print_table_length = (u32)ret_buf_size; + sbuf_bcat(sbuf, &print_table_length, + sizeof(print_table_length)); + /* Print current offset */ + u32 print_curr_offset = offset; + sbuf_bcat(sbuf, &print_curr_offset, sizeof(print_curr_offset)); + /* Print reserved bytes */ + sbuf_bcat(sbuf, reserved_buf, sizeof(reserved_buf)); + /* Print data */ + sbuf_bcat(sbuf, data_buf, ret_buf_size); + + /* Prepare for the next loop spin */ + memset(data_buf, 0, data_buf_size); + + bool last_index = (ret_next_index == 0xffffffff); + bool last_table = ((ret_next_table == 0xff || + ret_next_table == 0xffff) && + last_index); + + if (last_table) { + /* End of the cluster */ + DEBUGOUT1("End of the cluster ID %d\n", cluster_id); + break; + } else if (last_index) { + /* End of the table */ + table_id = ret_next_table; + offset = 0; + } else { + /* More data left in the table */ + offset = ret_next_index; + } + } while (++counter < max_aci_calls); + + if (counter >= max_aci_calls) + device_printf(dev, "Exceeded nr of ACI calls for cluster %d\n", + cluster_id); + + free(data_buf, M_IXGBE); + + return (++cluster_id); +} /* ixgbe_debug_dump_print_cluster */ + +/************************************************************************ + * ixgbe_sysctl_debug_dump_set_clusters + * + * Sets the clusters to dump from FW when a Debug Dump is requested. + ************************************************************************/ +static int +ixgbe_sysctl_debug_dump_set_clusters(SYSCTL_HANDLER_ARGS) +{ + struct ixgbe_softc *sc = (struct ixgbe_softc *)arg1; + u32 clusters = sc->debug_dump_cluster_mask; + device_t dev = sc->dev; + int error; + + error = sysctl_handle_32(oidp, &clusters, 0, req); + if ((error) || !req->newptr) + return (error); + + if (clusters & ~(IXGBE_DBG_DUMP_VALID_CLUSTERS_MASK)) { + device_printf(dev, + "%s: Unrecognized parameter: %u\n", + __func__, clusters); + sc->debug_dump_cluster_mask = + IXGBE_ACI_DBG_DUMP_CLUSTER_ID_INVALID; + return (EINVAL); + } + + sc->debug_dump_cluster_mask = clusters; + + return (0); +} /* ixgbe_sysctl_debug_dump_set_clusters */ + +/************************************************************************ + * ixgbe_sysctl_dump_debug_dump + ************************************************************************/ +static int +ixgbe_sysctl_dump_debug_dump(SYSCTL_HANDLER_ARGS) +{ + struct ixgbe_softc *sc = (struct ixgbe_softc *)arg1; + device_t dev = sc->dev; + struct sbuf *sbuf; + int error = 0; + + UNREFERENCED_PARAMETER(arg2); + + if (!sc->do_debug_dump) { + if (req->oldptr == NULL && req->newptr == NULL) { + error = SYSCTL_OUT(req, 0, 0); + return (error); + } + + char input_buf[2] = ""; + error = sysctl_handle_string(oidp, input_buf, + sizeof(input_buf), req); + if ((error) || (req->newptr == NULL)) + return (error); + + if (input_buf[0] == '1') { + if (sc->debug_dump_cluster_mask == + IXGBE_ACI_DBG_DUMP_CLUSTER_ID_INVALID) { + device_printf(dev, + "Debug Dump failed because an invalid " + "cluster was specified.\n"); + return (EINVAL); + } + + sc->do_debug_dump = true; + return (0); + } + + return (EINVAL); + } + + /* Caller just wants the upper bound for size */ + if (req->oldptr == NULL && req->newptr == NULL) { + size_t est_output_len = IXGBE_DBG_DUMP_BASE_SIZE; + if (sc->debug_dump_cluster_mask & 0x2) + est_output_len += IXGBE_DBG_DUMP_BASE_SIZE; + error = SYSCTL_OUT(req, 0, est_output_len); + return (error); + } + + sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req); + 
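	/*
	 * A plausible userland flow for these two handlers, inferred from
	 * the node layout in ixgbe_add_debug_dump_sysctls() and the sysctl
	 * descriptions in ixgbe.h; the "ix" unit number 0 is a placeholder:
	 *
	 *   sysctl dev.ix.0.debug.dump.clusters=0x3      # 0x1 Link, 0x2 CSR
	 *   sysctl dev.ix.0.debug.dump.dump=1            # arm the dump
	 *   sysctl -b dev.ix.0.debug.dump.dump > fw.bin  # read it as binary
	 */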
sbuf_clear_flags(sbuf, SBUF_INCLUDENUL); + + DEBUGOUT("FW Debug Dump running...\n"); + + if (sc->debug_dump_cluster_mask) { + for (u8 id = 0; id <= IXGBE_ACI_DBG_DUMP_CLUSTER_ID_MAX; id++) { + if (sc->debug_dump_cluster_mask & BIT(id)) { + DEBUGOUT1("Dumping cluster ID %u...\n", id); + ixgbe_debug_dump_print_cluster(sc, sbuf, id); + } + } + } else { + u8 next_cluster_id = 0; + do { + DEBUGOUT1("Dumping cluster ID %u...\n", + next_cluster_id); + next_cluster_id = ixgbe_debug_dump_print_cluster(sc, + sbuf, next_cluster_id); + } while (next_cluster_id != 0 && + next_cluster_id <= IXGBE_ACI_DBG_DUMP_CLUSTER_ID_MAX); + } + + sbuf_finish(sbuf); + sbuf_delete(sbuf); + + sc->do_debug_dump = false; + + return (error); +} /* ixgbe_sysctl_dump_debug_dump */ + +/************************************************************************ + * ixgbe_add_debug_dump_sysctls + ************************************************************************/ +static void +ixgbe_add_debug_dump_sysctls(struct ixgbe_softc *sc) +{ + struct sysctl_oid_list *debug_list, *dump_list; + struct sysctl_oid *dump_node; + struct sysctl_ctx_list *ctx; + device_t dev = sc->dev; + + ctx = device_get_sysctl_ctx(dev); + debug_list = SYSCTL_CHILDREN(sc->debug_sysctls); + + dump_node = SYSCTL_ADD_NODE(ctx, debug_list, OID_AUTO, "dump", + CTLFLAG_RD, NULL, "Internal FW/HW Dump"); + dump_list = SYSCTL_CHILDREN(dump_node); + + SYSCTL_ADD_PROC(ctx, dump_list, OID_AUTO, "clusters", + CTLTYPE_U32 | CTLFLAG_RW, sc, 0, + ixgbe_sysctl_debug_dump_set_clusters, "SU", + IXGBE_SYSCTL_DESC_DEBUG_DUMP_SET_CLUSTER); + + SYSCTL_ADD_PROC(ctx, dump_list, OID_AUTO, "dump", + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + ixgbe_sysctl_dump_debug_dump, "", + IXGBE_SYSCTL_DESC_DUMP_DEBUG_DUMP); +} /* ixgbe_add_debug_dump_sysctls */ + +static void +ixgbe_add_debug_sysctls(struct ixgbe_softc *sc) +{ + struct sysctl_oid_list *ctx_list; + struct sysctl_ctx_list *ctx; + device_t dev = sc->dev; + + ctx = device_get_sysctl_ctx(dev); + ctx_list = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + sc->debug_sysctls = SYSCTL_ADD_NODE(ctx, ctx_list, OID_AUTO, "debug", + CTLFLAG_RD, NULL, "Debug Sysctls"); + + if (sc->feat_en & IXGBE_FEATURE_DBG_DUMP) + ixgbe_add_debug_dump_sysctls(sc); +} /* ixgbe_add_debug_sysctls */ + +/************************************************************************ * ixgbe_add_device_sysctls ************************************************************************/ static void @@ -2992,6 +3507,8 @@ ixgbe_add_device_sysctls(if_ctx_t ctx) CTLTYPE_INT | CTLFLAG_RW, sc, 0, ixgbe_sysctl_eee_state, "I", "EEE Power Save State"); } + + ixgbe_add_debug_sysctls(sc); } /* ixgbe_add_device_sysctls */ /************************************************************************ @@ -5182,6 +5699,7 @@ ixgbe_init_device_features(struct ixgbe_softc *sc) break; case ixgbe_mac_E610: sc->feat_cap |= IXGBE_FEATURE_RECOVERY_MODE; + sc->feat_cap |= IXGBE_FEATURE_DBG_DUMP; break; default: break; @@ -5203,6 +5721,9 @@ ixgbe_init_device_features(struct ixgbe_softc *sc) /* Recovery mode */ if (sc->feat_cap & IXGBE_FEATURE_RECOVERY_MODE) sc->feat_en |= IXGBE_FEATURE_RECOVERY_MODE; + /* FW Debug Dump */ + if (sc->feat_cap & IXGBE_FEATURE_DBG_DUMP) + sc->feat_en |= IXGBE_FEATURE_DBG_DUMP; /* Enabled via global sysctl... 
*/ /* Flow Director */ diff --git a/sys/dev/ixgbe/ixgbe.h b/sys/dev/ixgbe/ixgbe.h index 844064bf8543..624b71acabea 100644 --- a/sys/dev/ixgbe/ixgbe.h +++ b/sys/dev/ixgbe/ixgbe.h @@ -46,6 +46,7 @@ #include <sys/module.h> #include <sys/sockio.h> #include <sys/eventhandler.h> +#include <sys/priv.h> #include <net/if.h> #include <net/if_var.h> @@ -475,6 +476,20 @@ struct ixgbe_softc { u32 feat_cap; u32 feat_en; u16 lse_mask; + + struct sysctl_oid *debug_sysctls; + u32 debug_dump_cluster_mask; + bool do_debug_dump; +}; + +struct ixgbe_debug_dump_cmd { + u32 offset; /* offset to read/write from table, in bytes */ + u8 cluster_id; /* also used to get next cluster id */ + u16 table_id; + u16 data_size; /* size of data field, in bytes */ + u16 reserved1; + u32 reserved2; + u8 data[]; }; /* Precision Time Sync (IEEE 1588) defines */ @@ -499,6 +514,43 @@ struct ixgbe_softc { #define IXGBE_PHY_CURRENT_TEMP 0xC820 #define IXGBE_PHY_OVERTEMP_STATUS 0xC830 +/** + * The ioctl command number used by NVM update for accessing the driver for + * NVM access commands. + */ +#define IXGBE_NVM_ACCESS \ + (((((((('E' << 4) + '1') << 4) + 'K') << 4) + 'G') << 4) | 5) + +/* + * The ioctl command number used by a userspace tool for accessing the driver + * for getting debug dump data from the firmware. + */ +#define IXGBE_DEBUG_DUMP \ + (((((((('E' << 4) + '1') << 4) + 'K') << 4) + 'G') << 4) | 6) + +/* Debug Dump related definitions */ +#define IXGBE_ACI_DBG_DUMP_CLUSTER_ID_INVALID 0xFFFFFF +#define IXGBE_ACI_DBG_DUMP_CLUSTER_ID_BASE 50 +#define IXGBE_ACI_DBG_DUMP_CLUSTER_ID_MAX 1 + +#define IXGBE_DBG_DUMP_VALID_CLUSTERS_MASK 0x3 +#define IXGBE_DBG_DUMP_BASE_SIZE (2 * 1024 * 1024) + +#define IXGBE_SYSCTL_DESC_DEBUG_DUMP_SET_CLUSTER \ +"\nSelect clusters to dump with \"dump\" sysctl" \ +"\nFlags:" \ +"\n\t 0x1 - Link" \ +"\n\t 0x2 - Full CSR Space, excluding RCW registers" \ +"\n\t" \ +"\nUse \"sysctl -x\" to view flags properly." + +#define IXGBE_SYSCTL_DESC_DUMP_DEBUG_DUMP \ +"\nWrite 1 to output a FW debug dump containing the clusters " \ +"specified by the \"clusters\" sysctl" \ +"\nThe \"-b\" flag must be used in order to dump this data " \ +"as binary data because" \ +"\nthis data is opaque and not a string." + /* Sysctl help messages; displayed with sysctl -d */ #define IXGBE_SYSCTL_DESC_ADV_SPEED \ "\nControl advertised link speed using these flags:\n" \ diff --git a/sys/dev/ixgbe/ixgbe_features.h b/sys/dev/ixgbe/ixgbe_features.h index 0cef334a185f..bee9040319d8 100644 --- a/sys/dev/ixgbe/ixgbe_features.h +++ b/sys/dev/ixgbe/ixgbe_features.h @@ -57,6 +57,7 @@ #define IXGBE_FEATURE_LEGACY_IRQ (u32)(1 << 12) #define IXGBE_FEATURE_NEEDS_CTXD (u32)(1 << 13) #define IXGBE_FEATURE_RECOVERY_MODE (u32)(1 << 15) +#define IXGBE_FEATURE_DBG_DUMP (u32)(1 << 16) /* Check for OS support. 
Undefine features if not included in the OS */ #ifndef PCI_IOV diff --git a/sys/dev/ixl/if_ixl.c b/sys/dev/ixl/if_ixl.c index 261f76055901..bfaf6cd69e58 100644 --- a/sys/dev/ixl/if_ixl.c +++ b/sys/dev/ixl/if_ixl.c @@ -1480,17 +1480,33 @@ ixl_if_multi_set(if_ctx_t ctx) struct ixl_pf *pf = iflib_get_softc(ctx); struct ixl_vsi *vsi = &pf->vsi; struct i40e_hw *hw = vsi->hw; + enum i40e_status_code status; int mcnt; + if_t ifp = iflib_get_ifp(ctx); IOCTL_DEBUGOUT("ixl_if_multi_set: begin"); /* Delete filters for removed multicast addresses */ ixl_del_multi(vsi, false); - mcnt = min(if_llmaddr_count(iflib_get_ifp(ctx)), MAX_MULTICAST_ADDR); + mcnt = min(if_llmaddr_count(ifp), MAX_MULTICAST_ADDR); if (__predict_false(mcnt == MAX_MULTICAST_ADDR)) { - i40e_aq_set_vsi_multicast_promiscuous(hw, + /* Check if promiscuous mode is already enabled; if so, return */ + if (vsi->flags & IXL_FLAGS_MC_PROMISC) + return; + + status = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid, TRUE, NULL); + if (status != I40E_SUCCESS) { + if_printf(ifp, "Failed to enable multicast promiscuous " + "mode, status: %s\n", i40e_stat_str(hw, status)); + } else { + if_printf(ifp, "Enabled multicast promiscuous mode\n"); + + /* Set the flag to track promiscuous mode */ + vsi->flags |= IXL_FLAGS_MC_PROMISC; + } + /* Delete all existing MC filters */ ixl_del_multi(vsi, true); return; } @@ -1693,6 +1709,13 @@ ixl_if_promisc_set(if_ctx_t ctx, int flags) return (err); err = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid, multi, NULL); + + /* Update the multicast promiscuous flag based on the new state */ + if (multi) + vsi->flags |= IXL_FLAGS_MC_PROMISC; + else + vsi->flags &= ~IXL_FLAGS_MC_PROMISC; + return (err); } diff --git a/sys/dev/ixl/ixl.h b/sys/dev/ixl/ixl.h index 95379448b570..ab0f38307d90 100644 --- a/sys/dev/ixl/ixl.h +++ b/sys/dev/ixl/ixl.h @@ -202,6 +202,7 @@ #define IXL_FLAGS_KEEP_TSO6 (1 << 1) #define IXL_FLAGS_USES_MSIX (1 << 2) #define IXL_FLAGS_IS_VF (1 << 3) +#define IXL_FLAGS_MC_PROMISC (1 << 4) #define IXL_VSI_IS_PF(v) ((v->flags & IXL_FLAGS_IS_VF) == 0) #define IXL_VSI_IS_VF(v) ((v->flags & IXL_FLAGS_IS_VF) != 0) diff --git a/sys/dev/ixl/ixl_pf_main.c b/sys/dev/ixl/ixl_pf_main.c index 1752efc02fff..b62619ced5cb 100644 --- a/sys/dev/ixl/ixl_pf_main.c +++ b/sys/dev/ixl/ixl_pf_main.c @@ -593,24 +593,29 @@ ixl_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) * Routines for multicast and vlan filter management. * *********************************************************************/ + +/** + * ixl_add_multi - Add multicast filters to the hardware + * @vsi: The VSI structure + * + * In case the number of multicast filters in the IFP exceeds 127 entries, + * multicast promiscuous mode will be enabled and the filters will be removed + * from the hardware. + */ void ixl_add_multi(struct ixl_vsi *vsi) { if_t ifp = vsi->ifp; - struct i40e_hw *hw = vsi->hw; int mcnt = 0; struct ixl_add_maddr_arg cb_arg; IOCTL_DEBUGOUT("ixl_add_multi: begin"); - mcnt = if_llmaddr_count(ifp); - if (__predict_false(mcnt >= MAX_MULTICAST_ADDR)) { - i40e_aq_set_vsi_multicast_promiscuous(hw, - vsi->seid, TRUE, NULL); - /* delete all existing MC filters */ - ixl_del_multi(vsi, true); - return; - } + /* + * There is no need to check whether the number of multicast addresses + * exceeds the MAX_MULTICAST_ADDR threshold and to set promiscuous mode + * here, as all callers already handle this case. 
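The new IXL_FLAGS_MC_PROMISC flag makes the overflow path idempotent: once the admin-queue command succeeds, later calls return early instead of re-issuing it. A minimal userland sketch of that pattern, with a stub standing in for i40e_aq_set_vsi_multicast_promiscuous():

#include <stdbool.h>
#include <stdio.h>

#define MAX_MCAST 128	/* stand-in for MAX_MULTICAST_ADDR */

struct vsi_state {
	bool mc_promisc;	/* mirrors IXL_FLAGS_MC_PROMISC */
};

/* Hypothetical firmware call; always succeeds in this sketch. */
static int
set_mc_promisc(bool on)
{
	printf("fw: multicast promiscuous %s\n", on ? "on" : "off");
	return (0);
}

/* Mirrors the ixl_if_multi_set() flow: once the filter list overflows,
 * enable promiscuous mode exactly once and remember it in a flag so
 * repeated calls don't re-issue the admin-queue command. */
static void
multi_set(struct vsi_state *vsi, int mcnt)
{
	if (mcnt < MAX_MCAST)
		return;		/* filter list fits in hardware */
	if (vsi->mc_promisc)
		return;		/* already enabled, nothing to do */
	if (set_mc_promisc(true) == 0)
		vsi->mc_promisc = true;
}

int
main(void)
{
	struct vsi_state vsi = { .mc_promisc = false };

	multi_set(&vsi, 130);	/* overflow: enables promisc once */
	multi_set(&vsi, 130);	/* no-op: flag already set */
	return (0);
}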
+ */ cb_arg.vsi = vsi; LIST_INIT(&cb_arg.to_add); @@ -633,30 +638,103 @@ ixl_match_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) return (0); } +/** + * ixl_dis_multi_promisc - Disable multicast promiscuous mode + * @vsi: The VSI structure + * @vsi_mcnt: Number of multicast filters in the VSI + * + * Disable multicast promiscuous mode based on the number of entries in the + * IFP and the VSI, then re-add the multicast filters. + */ +static void +ixl_dis_multi_promisc(struct ixl_vsi *vsi, int vsi_mcnt) +{ + struct ifnet *ifp = vsi->ifp; + struct i40e_hw *hw = vsi->hw; + int ifp_mcnt = 0; + enum i40e_status_code status; + + /* + * Check if multicast promiscuous mode was actually enabled. + * If promiscuous mode was not enabled, don't attempt to disable it. + * Also, don't disable if IFF_PROMISC or IFF_ALLMULTI is set. + */ + if (!(vsi->flags & IXL_FLAGS_MC_PROMISC) || + (if_getflags(ifp) & (IFF_PROMISC | IFF_ALLMULTI))) + return; + + ifp_mcnt = if_llmaddr_count(ifp); + /* + * Equal counts or an empty ifp list mean the list has not changed; in + * that case avoid disabling multicast promiscuous mode, as it was not + * previously enabled. The case where multicast promiscuous mode has + * been enabled is vsi_mcnt == 0 && ifp_mcnt > 0. + */ + if (ifp_mcnt == vsi_mcnt || ifp_mcnt == 0 || + ifp_mcnt >= MAX_MULTICAST_ADDR) + return; + + status = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid, + FALSE, NULL); + if (status != I40E_SUCCESS) { + if_printf(ifp, "Failed to disable multicast promiscuous " + "mode, status: %s\n", i40e_stat_str(hw, status)); + + return; + } + + /* Clear the flag since promiscuous mode is now disabled */ + vsi->flags &= ~IXL_FLAGS_MC_PROMISC; + if_printf(ifp, "Disabled multicast promiscuous mode\n"); + + ixl_add_multi(vsi); +} + +/** + * ixl_del_multi - Delete multicast filters from the hardware + * @vsi: The VSI structure + * @all: Bool to determine if all the multicast filters should be removed + * + * In case the number of multicast filters in the IFP drops to 127 entries + * or fewer, multicast promiscuous mode will be disabled and the filters will + * be reapplied to the hardware. 
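The three early returns in ixl_dis_multi_promisc() can be read as a single predicate. A compact restatement, with MAX_MCAST as a stand-in for MAX_MULTICAST_ADDR and the expected output in comments:

#include <stdbool.h>
#include <stdio.h>

#define MAX_MCAST 128	/* stand-in for MAX_MULTICAST_ADDR */

/* The three guards from ixl_dis_multi_promisc() as a pure predicate:
 * only disable promiscuous mode when it is actually on, the user has
 * not requested promiscuity, and the filter list shrank into range. */
static bool
should_disable_mc_promisc(bool mc_promisc, bool user_promisc,
    int ifp_mcnt, int vsi_mcnt)
{
	if (!mc_promisc || user_promisc)
		return (false);
	if (ifp_mcnt == vsi_mcnt || ifp_mcnt == 0)
		return (false);	/* list unchanged since overflow */
	if (ifp_mcnt >= MAX_MCAST)
		return (false);	/* still too many filters */
	return (true);
}

int
main(void)
{
	printf("%d\n", should_disable_mc_promisc(true, false, 100, 0)); /* 1 */
	printf("%d\n", should_disable_mc_promisc(true, false, 200, 0)); /* 0 */
	return (0);
}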
+ */ void ixl_del_multi(struct ixl_vsi *vsi, bool all) { - struct ixl_ftl_head to_del; + int to_del_cnt = 0, vsi_mcnt = 0; if_t ifp = vsi->ifp; struct ixl_mac_filter *f, *fn; - int mcnt = 0; + struct ixl_ftl_head to_del; IOCTL_DEBUGOUT("ixl_del_multi: begin"); LIST_INIT(&to_del); /* Search for removed multicast addresses */ LIST_FOREACH_SAFE(f, &vsi->ftl, ftle, fn) { - if ((f->flags & IXL_FILTER_MC) == 0 || - (!all && (if_foreach_llmaddr(ifp, ixl_match_maddr, f) == 0))) + if ((f->flags & IXL_FILTER_MC) == 0) + continue; + + /* Count all the multicast filters in the VSI for comparison */ + vsi_mcnt++; + + if (!all && if_foreach_llmaddr(ifp, ixl_match_maddr, f) != 0) continue; LIST_REMOVE(f, ftle); LIST_INSERT_HEAD(&to_del, f, ftle); - mcnt++; + to_del_cnt++; } - if (mcnt > 0) - ixl_del_hw_filters(vsi, &to_del, mcnt); + if (to_del_cnt > 0) { + ixl_del_hw_filters(vsi, &to_del, to_del_cnt); + return; + } + + ixl_dis_multi_promisc(vsi, vsi_mcnt); + + IOCTL_DEBUGOUT("ixl_del_multi: end"); } void diff --git a/sys/dev/mfi/mfi_linux.c b/sys/dev/mfi/mfi_linux.c index 8ed8baa3858a..9541ff37336a 100644 --- a/sys/dev/mfi/mfi_linux.c +++ b/sys/dev/mfi/mfi_linux.c @@ -53,15 +53,7 @@ #define MFI_LINUX_IOCTL_MIN 0x4d00 #define MFI_LINUX_IOCTL_MAX 0x4d04 -static linux_ioctl_function_t mfi_linux_ioctl; -static struct linux_ioctl_handler mfi_linux_handler = {mfi_linux_ioctl, - MFI_LINUX_IOCTL_MIN, - MFI_LINUX_IOCTL_MAX}; - -SYSINIT (mfi_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &mfi_linux_handler); -SYSUNINIT(mfi_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &mfi_linux_handler); +LINUX_IOCTL_SET(mfi, MFI_LINUX_IOCTL_MIN, MFI_LINUX_IOCTL_MAX); static struct linux_device_handler mfi_device_handler = { "mfi", "megaraid_sas", "mfi0", "megaraid_sas_ioctl_node", -1, 0, 1}; diff --git a/sys/dev/mii/mv88e151x.c b/sys/dev/mii/mv88e151x.c index 618ad81471c9..fb03b2a7a917 100644 --- a/sys/dev/mii/mv88e151x.c +++ b/sys/dev/mii/mv88e151x.c @@ -97,7 +97,7 @@ mv88e151x_attach(device_t dev) { const struct mii_attach_args *ma; struct mii_softc *sc; - uint32_t cop_cap, cop_extcap; + uint32_t cop_cap = 0, cop_extcap = 0; sc = device_get_softc(dev); ma = device_get_ivars(dev); @@ -224,10 +224,12 @@ mv88e151x_fiber_status(struct mii_softc *phy) else if (reg & MV88E151X_STATUS_LINK && reg & MV88E151X_STATUS_SYNC && (reg & MV88E151X_STATUS_ENERGY) == 0) { - if ((reg & MV88E151X_STATUS_SPEED_MASK) == + if (((reg & MV88E151X_STATUS_SPEED_MASK) >> + MV88E151X_STATUS_SPEED_SHIFT) == MV88E151X_STATUS_SPEED_1000) mii->mii_media_active |= IFM_1000_SX; - else if ((reg & MV88E151X_STATUS_SPEED_MASK) == + else if (((reg & MV88E151X_STATUS_SPEED_MASK) >> + MV88E151X_STATUS_SPEED_SHIFT) == MV88E151X_STATUS_SPEED_100) mii->mii_media_active |= IFM_100_FX; else diff --git a/sys/dev/mps/mps_sas.c b/sys/dev/mps/mps_sas.c index d69c8ea5fded..fa0f817ed67b 100644 --- a/sys/dev/mps/mps_sas.c +++ b/sys/dev/mps/mps_sas.c @@ -858,7 +858,7 @@ mps_detach_sas(struct mps_softc *sc) if (sassc->devq != NULL) cam_simq_free(sassc->devq); - for(i=0; i< sassc->maxtargets ;i++) { + for (i = 0; i < sassc->maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPT2); @@ -3396,7 +3396,7 @@ mpssas_realloc_targets(struct mps_softc *sc, int maxtargets) * the allocated LUNs for each target and then the target buffer * itself. 
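The mv88e151x.c fix above shifts the speed field down before comparing it; a masked-but-unshifted value can never match a small enum constant unless the field happens to sit at bit 0. A runnable illustration (the register layout used here is illustrative, not taken from the datasheet):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical layout: a 2-bit speed field at bits 15:14, in the style
 * of the 88E151x status register. */
#define STATUS_SPEED_MASK	0xc000
#define STATUS_SPEED_SHIFT	14
#define STATUS_SPEED_1000	2

int
main(void)
{
	uint16_t reg = 2 << STATUS_SPEED_SHIFT;	/* link at 1000 Mb/s */

	/* Buggy: compares the still-shifted field against a small enum. */
	printf("unshifted: %s\n",
	    (reg & STATUS_SPEED_MASK) == STATUS_SPEED_1000 ?
	    "match" : "no match");
	/* Fixed, as in the mv88e151x.c change: shift before comparing. */
	printf("shifted:   %s\n",
	    ((reg & STATUS_SPEED_MASK) >> STATUS_SPEED_SHIFT) ==
	    STATUS_SPEED_1000 ? "match" : "no match");
	return (0);
}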
*/ - for (i=0; i< maxtargets; i++) { + for (i = 0; i < maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPT2); diff --git a/sys/dev/mpt/mpt_raid.c b/sys/dev/mpt/mpt_raid.c index 5ff08ffcf2b3..2b868f6ef070 100644 --- a/sys/dev/mpt/mpt_raid.c +++ b/sys/dev/mpt/mpt_raid.c @@ -830,7 +830,7 @@ mpt_is_raid_volume(struct mpt_softc *mpt, target_id_t tgt) } ioc_vol = mpt->ioc_page2->RaidVolume; ioc_last_vol = ioc_vol + mpt->ioc_page2->NumActiveVolumes; - for (;ioc_vol != ioc_last_vol; ioc_vol++) { + for (; ioc_vol != ioc_last_vol; ioc_vol++) { if (ioc_vol->VolumeID == tgt) { return (1); } @@ -1406,7 +1406,7 @@ mpt_refresh_raid_data(struct mpt_softc *mpt) ioc_vol = mpt->ioc_page2->RaidVolume; ioc_last_vol = ioc_vol + mpt->ioc_page2->NumActiveVolumes; - for (;ioc_vol != ioc_last_vol; ioc_vol++) { + for (; ioc_vol != ioc_last_vol; ioc_vol++) { struct mpt_raid_volume *mpt_vol; mpt_vol = mpt->raid_volumes + ioc_vol->VolumePageNumber; diff --git a/sys/dev/mrsas/mrsas_linux.c b/sys/dev/mrsas/mrsas_linux.c index d7d48740a204..b06788fffc82 100644 --- a/sys/dev/mrsas/mrsas_linux.c +++ b/sys/dev/mrsas/mrsas_linux.c @@ -67,15 +67,7 @@ #define MRSAS_LINUX_IOCTL_MIN 0x4d00 #define MRSAS_LINUX_IOCTL_MAX 0x4d01 -static linux_ioctl_function_t mrsas_linux_ioctl; -static struct linux_ioctl_handler mrsas_linux_handler = {mrsas_linux_ioctl, - MRSAS_LINUX_IOCTL_MIN, -MRSAS_LINUX_IOCTL_MAX}; - -SYSINIT(mrsas_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &mrsas_linux_handler); -SYSUNINIT(mrsas_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &mrsas_linux_handler); +LINUX_IOCTL_SET(mrsas, MRSAS_LINUX_IOCTL_MIN, MRSAS_LINUX_IOCTL_MAX); static struct linux_device_handler mrsas_device_handler = {"mrsas", "megaraid_sas", "mrsas0", "megaraid_sas_ioctl_node", -1, 0, 1}; diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8cc543d54c2e..9fb4370129f3 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -738,6 +738,7 @@ nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror) out_rem: vm_map_remove(kernel_map, e->kva, e->kva + e->size); + e->obj = NULL; /* reference consumed by vm_map_remove() */ out_rel: vm_object_deallocate(e->obj); e->obj = NULL; @@ -1406,19 +1407,34 @@ netmap_knwrite(struct knote *kn, long hint) return netmap_knrw(kn, hint, POLLOUT); } +static int +netmap_kncopy(struct knote *kn, struct proc *p1) +{ + struct netmap_priv_d *priv; + struct nm_selinfo *si; + + priv = kn->kn_hook; + si = priv->np_si[kn->kn_filter == EVFILT_WRITE ? NR_TX : NR_RX]; + NMG_LOCK(); + si->kqueue_users++; + NMG_UNLOCK(); + return (0); +} + static const struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, + .f_copy = netmap_kncopy, }; static const struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, + .f_copy = netmap_kncopy, }; - /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). 
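LINUX_IOCTL_SET(mfi, ...) and LINUX_IOCTL_SET(mrsas, ...) above each replace four pieces of registration boilerplate. Going by the macro body removed from tdfx_linux.h later in this commit (the shared definition now presumably lives in compat/linux/linux_ioctl.h and may differ in detail), the mfi one-liner expands to roughly:

static linux_ioctl_function_t mfi_linux_ioctl;
static struct linux_ioctl_handler mfi_handler = {mfi_linux_ioctl,
    MFI_LINUX_IOCTL_MIN, MFI_LINUX_IOCTL_MAX};
SYSINIT(mfiregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
    linux_ioctl_register_handler, &mfi_handler);
SYSUNINIT(mfiunregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
    linux_ioctl_unregister_handler, &mfi_handler);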
diff --git a/sys/dev/nfe/if_nfe.c b/sys/dev/nfe/if_nfe.c index 4625c2616562..265181ef7ad0 100644 --- a/sys/dev/nfe/if_nfe.c +++ b/sys/dev/nfe/if_nfe.c @@ -2078,7 +2078,7 @@ nfe_rxeof(struct nfe_softc *sc, int count, int *rx_npktsp) bus_dmamap_sync(sc->rxq.rx_desc_tag, sc->rxq.rx_desc_map, BUS_DMASYNC_POSTREAD); - for (prog = 0;;NFE_INC(sc->rxq.cur, NFE_RX_RING_COUNT), vtag = 0) { + for (prog = 0; ; NFE_INC(sc->rxq.cur, NFE_RX_RING_COUNT), vtag = 0) { if (count <= 0) break; count--; @@ -2192,7 +2192,7 @@ nfe_jrxeof(struct nfe_softc *sc, int count, int *rx_npktsp) bus_dmamap_sync(sc->jrxq.jrx_desc_tag, sc->jrxq.jrx_desc_map, BUS_DMASYNC_POSTREAD); - for (prog = 0;;NFE_INC(sc->jrxq.jcur, NFE_JUMBO_RX_RING_COUNT), + for (prog = 0; ; NFE_INC(sc->jrxq.jcur, NFE_JUMBO_RX_RING_COUNT), vtag = 0) { if (count <= 0) break; diff --git a/sys/dev/null/null.c b/sys/dev/null/null.c index 8525eb9543c3..b5725de30bef 100644 --- a/sys/dev/null/null.c +++ b/sys/dev/null/null.c @@ -61,12 +61,14 @@ static int zero_ev(struct knote *kn, long hint); static const struct filterops one_fop = { .f_isfd = 1, - .f_event = one_ev + .f_event = one_ev, + .f_copy = knote_triv_copy, }; static const struct filterops zero_fop = { .f_isfd = 1, - .f_event = zero_ev + .f_event = zero_ev, + .f_copy = knote_triv_copy, }; static struct cdevsw full_cdevsw = { diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c index ead91f0d01fe..d119f9877aaa 100644 --- a/sys/dev/nvme/nvme.c +++ b/sys/dev/nvme/nvme.c @@ -51,7 +51,7 @@ int32_t nvme_retry_count; MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations"); static void -nvme_init(void) +nvme_init(void *dummy __unused) { uint32_t i; @@ -62,7 +62,7 @@ nvme_init(void) SYSINIT(nvme_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_init, NULL); static void -nvme_uninit(void) +nvme_uninit(void *dummy __unused) { } diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h index 17c5cdb4db87..f4ea08f129c0 100644 --- a/sys/dev/nvme/nvme.h +++ b/sys/dev/nvme/nvme.h @@ -1507,9 +1507,7 @@ struct nvme_namespace_data { uint8_t eui64[8]; /** lba format support */ - uint32_t lbaf[16]; - - uint8_t reserved7[192]; + uint32_t lbaf[64]; uint8_t vendor_specific[3712]; } __packed __aligned(4); @@ -2155,8 +2153,6 @@ static inline void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN - int i; - s->nsze = le64toh(s->nsze); s->ncap = le64toh(s->ncap); s->nuse = le64toh(s->nuse); @@ -2175,7 +2171,7 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused) s->anagrpid = le32toh(s->anagrpid); s->nvmsetid = le16toh(s->nvmsetid); s->endgid = le16toh(s->endgid); - for (i = 0; i < 16; i++) + for (unsigned i = 0; i < nitems(s->lbaf); i++) s->lbaf[i] = le32toh(s->lbaf[i]); #endif } diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 52f9e12f8f9a..52e9fcbbebcd 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -463,13 +463,13 @@ static __inline void nvme_completion_poll(struct nvme_completion_poll_status *status) { int timeout = ticks + 10 * hz; - sbintime_t delta_t = SBT_1US; + sbintime_t delta = SBT_1US; while (!atomic_load_acq_int(&status->done)) { if (timeout - ticks < 0) panic("NVME polled command failed to complete within 10s."); - pause_sbt("nvme", delta_t, 0, C_PREL(1)); - delta_t = min(SBT_1MS, delta_t * 3 / 2); + pause_sbt("nvme", delta, 0, C_PREL(1)); + delta = min(SBT_1MS, delta + delta / 2); } } diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c index 
a06774a64761..7693aa6d54d3 100644 --- a/sys/dev/nvme/nvme_sim.c +++ b/sys/dev/nvme/nvme_sim.c @@ -391,7 +391,7 @@ nvme_sim_controller_fail(void *ctrlr_arg) struct nvme_consumer *consumer_cookie; static void -nvme_sim_init(void) +nvme_sim_init(void *dummy __unused) { if (nvme_use_nvd) return; @@ -404,7 +404,7 @@ SYSINIT(nvme_sim_register, SI_SUB_DRIVERS, SI_ORDER_ANY, nvme_sim_init, NULL); static void -nvme_sim_uninit(void) +nvme_sim_uninit(void *dummy __unused) { if (nvme_use_nvd) return; diff --git a/sys/dev/ocs_fc/ocs_mgmt.c b/sys/dev/ocs_fc/ocs_mgmt.c index 726b499f28ba..5b7f6557c017 100644 --- a/sys/dev/ocs_fc/ocs_mgmt.c +++ b/sys/dev/ocs_fc/ocs_mgmt.c @@ -226,7 +226,7 @@ ocs_mgmt_get_list(ocs_t *ocs, ocs_textbuf_t *textbuf) ocs_mgmt_start_unnumbered_section(textbuf, "ocs"); - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { access = 0; if (mgmt_table[i].get_handler) { access |= MGMT_MODE_RD; @@ -305,7 +305,7 @@ ocs_mgmt_get(ocs_t *ocs, char *name, ocs_textbuf_t *textbuf) if (ocs_strncmp(name, qualifier, strlen(qualifier)) == 0) { char *unqualified_name = name + strlen(qualifier) + 1; - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (ocs_strcmp(unqualified_name, mgmt_table[i].name) == 0) { if (mgmt_table[i].get_handler) { mgmt_table[i].get_handler(ocs, name, textbuf); @@ -387,7 +387,7 @@ ocs_mgmt_set(ocs_t *ocs, char *name, char *value) char *unqualified_name = name + strlen(qualifier) +1; /* See if it's a value I can set */ - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (ocs_strcmp(unqualified_name, mgmt_table[i].name) == 0) { if (mgmt_table[i].set_handler) { return mgmt_table[i].set_handler(ocs, name, value); @@ -469,7 +469,7 @@ ocs_mgmt_exec(ocs_t *ocs, char *action, void *arg_in, char *unqualified_name = action + strlen(qualifier) +1; /* See if it's an action I can perform */ - for (i=0;i<ARRAY_SIZE(mgmt_table); i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (ocs_strcmp(unqualified_name, mgmt_table[i].name) == 0) { if (mgmt_table[i].action_handler) { return mgmt_table[i].action_handler(ocs, action, arg_in, arg_in_length, @@ -527,7 +527,7 @@ ocs_mgmt_get_all(ocs_t *ocs, ocs_textbuf_t *textbuf) ocs_mgmt_start_unnumbered_section(textbuf, "ocs"); - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (mgmt_table[i].get_handler) { mgmt_table[i].get_handler(ocs, mgmt_table[i].name, textbuf); } else if (mgmt_table[i].action_handler) { @@ -1212,7 +1212,7 @@ get_sfp_a2(ocs_t *ocs, char *name, ocs_textbuf_t *textbuf) int buffer_remaining = (SFP_PAGE_SIZE * 3) + 1; int bytes_added; - for (i=0; i < bytes_read; i++) { + for (i = 0; i < bytes_read; i++) { bytes_added = ocs_snprintf(d, buffer_remaining, "%02x ", *s); ++s; d += bytes_added; @@ -2040,7 +2040,7 @@ get_profile_list(ocs_t *ocs, char *name, ocs_textbuf_t *textbuf) result_buf = ocs_malloc(ocs, BUFFER_SIZE, OCS_M_ZERO); bytes_left = BUFFER_SIZE; - for (i=0; i<result.list->num_descriptors; i++) { + for (i = 0; i < result.list->num_descriptors; i++) { sprintf(result_line, "0x%02x:%s\n", result.list->descriptors[i].profile_id, result.list->descriptors[i].profile_description); if (strlen(result_line) < bytes_left) { diff --git a/sys/dev/pci/controller/pci_n1sdp.c b/sys/dev/pci/controller/pci_n1sdp.c index 487041bc78e4..22f0ea27d45b 100644 --- a/sys/dev/pci/controller/pci_n1sdp.c +++ b/sys/dev/pci/controller/pci_n1sdp.c @@ -345,6 +345,17 @@ 
n1sdp_pcie_write_config(device_t dev, u_int bus, u_int slot, bus_space_write_4(t, h, offset & ~3, data); } +static int +n1sdp_pcie_acpi_request_feature(device_t pcib __unused, device_t dev __unused, + enum pci_feature feature __unused) +{ + /* + * HotPlug isn't supported on the N1SDP as it causes an interrupt storm + */ + return (EINVAL); +} + + static device_method_t n1sdp_pcie_acpi_methods[] = { DEVMETHOD(device_probe, n1sdp_pcie_acpi_probe), DEVMETHOD(device_attach, n1sdp_pcie_acpi_attach), @@ -352,6 +363,7 @@ static device_method_t n1sdp_pcie_acpi_methods[] = { /* pcib interface */ DEVMETHOD(pcib_read_config, n1sdp_pcie_read_config), DEVMETHOD(pcib_write_config, n1sdp_pcie_write_config), + DEVMETHOD(pcib_request_feature, n1sdp_pcie_acpi_request_feature), DEVMETHOD_END }; diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index 9e43a4c1909f..cde98cb62cef 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -240,6 +240,7 @@ struct pci_quirk { #define PCI_QUIRK_DISABLE_MSIX 5 /* MSI-X doesn't work */ #define PCI_QUIRK_MSI_INTX_BUG 6 /* PCIM_CMD_INTxDIS disables MSI */ #define PCI_QUIRK_REALLOC_BAR 7 /* Can't allocate memory at the default address */ +#define PCI_QUIRK_DISABLE_FLR 8 /* Function-Level Reset (FLR) not working. */ int arg1; int arg2; }; @@ -319,6 +320,13 @@ static const struct pci_quirk pci_quirks[] = { * expected place. */ { 0x98741002, PCI_QUIRK_REALLOC_BAR, 0, 0 }, + + /* + * With some MediaTek mt76 WiFi FLR does not work despite advertised. + */ + { 0x061614c3, PCI_QUIRK_DISABLE_FLR, 0, 0 }, /* mt76 7922 */ + + /* end of table */ { 0 } }; @@ -6740,6 +6748,8 @@ pcie_flr(device_t dev, u_int max_delay, bool force) if (!(pci_read_config(dev, cap + PCIER_DEVICE_CAP, 4) & PCIEM_CAP_FLR)) return (false); + if (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_FLR)) + return (false); /* * Disable busmastering to prevent generation of new diff --git a/sys/dev/ppc/ppc.c b/sys/dev/ppc/ppc.c index 9870379e2eba..de75f4747709 100644 --- a/sys/dev/ppc/ppc.c +++ b/sys/dev/ppc/ppc.c @@ -1389,7 +1389,7 @@ ppc_exec_microseq(device_t dev, struct ppb_microseq **p_msq) /* let's suppose the next instr. 
is the same */ prefetch: - for (;mi->opcode == MS_OP_RASSERT; INCR_PC) + for (; mi->opcode == MS_OP_RASSERT; INCR_PC) w_reg(mi->arg[0].i, ppc, (char)mi->arg[1].i); if (mi->opcode == MS_OP_DELAY) { diff --git a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c index 67e1d4ad2cab..c5b745bb78fb 100644 --- a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c +++ b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c @@ -89,6 +89,7 @@ static struct filterops adf_state_read_filterops = { .f_attach = NULL, .f_detach = adf_state_kqread_detach, .f_event = adf_state_kqread_event, + .f_copy = knote_triv_copy, }; static struct cdev *adf_processes_dev; diff --git a/sys/dev/smartpqi/smartpqi_event.c b/sys/dev/smartpqi/smartpqi_event.c index f000d9ce9db3..88dcf45dd08a 100644 --- a/sys/dev/smartpqi/smartpqi_event.c +++ b/sys/dev/smartpqi/smartpqi_event.c @@ -115,7 +115,7 @@ pqisrc_ack_all_events(void *arg1) pending_event = &softs->pending_events[0]; - for (i=0; i < PQI_NUM_SUPPORTED_EVENTS; i++) { + for (i = 0; i < PQI_NUM_SUPPORTED_EVENTS; i++) { if (pending_event->pending == true) { pending_event->pending = false; pqisrc_acknowledge_event(softs, pending_event); @@ -417,7 +417,7 @@ pqisrc_report_event_config(pqisrc_softstate_t *softs) softs->event_config.num_event_descriptors = MIN(event_config_p->num_event_descriptors, PQI_MAX_EVENT_DESCRIPTORS) ; - for (i=0; i < softs->event_config.num_event_descriptors ;i++){ + for (i = 0; i < softs->event_config.num_event_descriptors; i++) { softs->event_config.descriptors[i].event_type = event_config_p->descriptors[i].event_type; } @@ -477,7 +477,7 @@ pqisrc_set_event_config(pqisrc_softstate_t *softs) event_config_p->num_event_descriptors = softs->event_config.num_event_descriptors; - for (i=0; i < softs->event_config.num_event_descriptors ; i++){ + for (i = 0; i < softs->event_config.num_event_descriptors; i++) { event_config_p->descriptors[i].event_type = softs->event_config.descriptors[i].event_type; if( pqisrc_event_type_to_event_index(event_config_p->descriptors[i].event_type) != -1) diff --git a/sys/dev/smartpqi/smartpqi_queue.c b/sys/dev/smartpqi/smartpqi_queue.c index 2e80b01b5436..f05c951cd4f9 100644 --- a/sys/dev/smartpqi/smartpqi_queue.c +++ b/sys/dev/smartpqi/smartpqi_queue.c @@ -700,7 +700,7 @@ pqisrc_create_op_obq(pqisrc_softstate_t *softs, } else { int i = 0; DBG_WARN("Error Status Descriptors\n"); - for(i = 0; i < 4;i++) + for (i = 0; i < 4; i++) DBG_WARN(" %x ",admin_resp.resp_type.create_op_oq.status_desc[i]); } @@ -743,7 +743,7 @@ pqisrc_create_op_ibq(pqisrc_softstate_t *softs, } else { int i = 0; DBG_WARN("Error Status Decsriptors\n"); - for(i = 0; i < 4;i++) + for (i = 0; i < 4; i++) DBG_WARN(" %x ",admin_resp.resp_type.create_op_iq.status_desc[i]); } diff --git a/sys/dev/sym/sym_hipd.c b/sys/dev/sym/sym_hipd.c index fa65d544e17d..b4e5c1075fb4 100644 --- a/sys/dev/sym/sym_hipd.c +++ b/sys/dev/sym/sym_hipd.c @@ -3266,7 +3266,7 @@ static void sym_init (hcb_p np, int reason) * Reinitialize usrwide. * Prepare sync negotiation according to actual SCSI bus mode. 
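Referring back to the nvme_private.h hunk above: nvme_completion_poll() now backs off geometrically, growing the sleep interval by half after each nap and saturating at one millisecond. A standalone sketch of the arithmetic, with SBT_* definitions paraphrased from sys/time.h:

#include <stdint.h>
#include <stdio.h>

/* sbintime_t scale, paraphrased from sys/time.h */
#define SBT_1S	(1LL << 32)
#define SBT_1MS	(SBT_1S / 1000)
#define SBT_1US	(SBT_1S / 1000000)

int
main(void)
{
	int64_t delta = SBT_1US;
	int naps = 0;

	/* Grow the polling interval by half after each nap, saturating
	 * at 1 ms, as nvme_completion_poll() now does. */
	while (delta < SBT_1MS) {
		delta += delta / 2;
		if (delta > SBT_1MS)
			delta = SBT_1MS;
		naps++;
	}
	printf("interval saturates after %d naps\n", naps);
	return (0);
}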
*/ - for (i=0;i<SYM_CONF_MAX_TARGET;i++) { + for (i = 0; i < SYM_CONF_MAX_TARGET; i++) { tcb_p tp = &np->target[i]; tp->to_reset = 0; @@ -3715,7 +3715,7 @@ static void sym_log_hard_error(hcb_p np, u_short sist, u_char dstat) } printf ("%s: regdump:", sym_name(np)); - for (i=0; i<24;i++) + for (i = 0; i < 24; i++) printf (" %02x", (unsigned)INB_OFF(i)); printf (".\n"); @@ -5527,8 +5527,8 @@ static int sym_show_msg (u_char * msg) u_char i; printf ("%x",*msg); if (*msg==M_EXTENDED) { - for (i=1;i<8;i++) { - if (i-1>msg[1]) break; + for (i = 1; i < 8; i++) { + if (i - 1 > msg[1]) break; printf ("-%x",msg[i]); } return (i+1); @@ -6744,10 +6744,10 @@ restart_test: /* * Wait 'til done (with timeout) */ - for (i=0; i<SYM_SNOOP_TIMEOUT; i++) + for (i = 0; i < SYM_SNOOP_TIMEOUT; i++) if (INB(nc_istat) & (INTF|SIP|DIP)) break; - if (i>=SYM_SNOOP_TIMEOUT) { + if (i >= SYM_SNOOP_TIMEOUT) { printf ("CACHE TEST FAILED: timeout.\n"); return (0x20); } diff --git a/sys/dev/tdfx/tdfx_linux.c b/sys/dev/tdfx/tdfx_linux.c index f3410106bad2..777144d21bb6 100644 --- a/sys/dev/tdfx/tdfx_linux.c +++ b/sys/dev/tdfx/tdfx_linux.c @@ -42,7 +42,7 @@ LINUX_IOCTL_SET(tdfx, LINUX_IOCTL_TDFX_MIN, LINUX_IOCTL_TDFX_MAX); * Linux emulation IOCTL for /dev/tdfx */ static int -linux_ioctl_tdfx(struct thread *td, struct linux_ioctl_args* args) +tdfx_linux_ioctl(struct thread *td, struct linux_ioctl_args* args) { cap_rights_t rights; int error = 0; diff --git a/sys/dev/tdfx/tdfx_linux.h b/sys/dev/tdfx/tdfx_linux.h index b87cb41f38fe..9d012c12274b 100644 --- a/sys/dev/tdfx/tdfx_linux.h +++ b/sys/dev/tdfx/tdfx_linux.h @@ -35,18 +35,6 @@ #include <machine/../linux/linux_proto.h> #include <compat/linux/linux_ioctl.h> -/* - * This code was donated by Vladimir N. Silynaev to allow for defining - * ioctls within modules - */ -#define LINUX_IOCTL_SET(n,low,high) \ -static linux_ioctl_function_t linux_ioctl_##n; \ -static struct linux_ioctl_handler n##_handler = {linux_ioctl_##n, low, high}; \ -SYSINIT(n##register, SI_SUB_KLD, SI_ORDER_MIDDLE,\ -linux_ioctl_register_handler, &n##_handler); \ -SYSUNINIT(n##unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,\ -linux_ioctl_unregister_handler, &n##_handler); - /* Values for /dev/3dfx */ /* Query IOCTLs */ #define LINUX_IOCTL_TDFX_QUERY_BOARDS 0x3302 diff --git a/sys/dev/tws/tws.c b/sys/dev/tws/tws.c index af151c8c4f06..fccd6689a6aa 100644 --- a/sys/dev/tws/tws.c +++ b/sys/dev/tws/tws.c @@ -311,7 +311,7 @@ attach_fail_4: if (sc->cmd_tag) bus_dma_tag_destroy(sc->cmd_tag); attach_fail_3: - for(i=0;i<sc->irqs;i++) { + for (i = 0; i < sc->irqs; i++) { if ( sc->irq_res[i] ){ if (bus_release_resource(sc->tws_dev, SYS_RES_IRQ, sc->irq_res_id[i], sc->irq_res[i])) @@ -369,7 +369,7 @@ tws_detach(device_t dev) tws_teardown_intr(sc); /* Release irq resource */ - for(i=0;i<sc->irqs;i++) { + for (i = 0; i < sc->irqs; i++) { if ( sc->irq_res[i] ){ if (bus_release_resource(sc->tws_dev, SYS_RES_IRQ, sc->irq_res_id[i], sc->irq_res[i])) @@ -402,7 +402,7 @@ tws_detach(device_t dev) TWS_TRACE(sc, "bus release mem resource", 0, sc->reg_res_id); } - for ( i=0; i< tws_queue_depth; i++) { + for (i = 0; i < tws_queue_depth; i++) { if (sc->reqs[i].dma_map) bus_dmamap_destroy(sc->data_tag, sc->reqs[i].dma_map); callout_drain(&sc->reqs[i].timeout); @@ -432,7 +432,7 @@ tws_setup_intr(struct tws_softc *sc, int irqs) { int i, error; - for(i=0;i<irqs;i++) { + for (i = 0; i < irqs; i++) { if (!(sc->intr_handle[i])) { if ((error = bus_setup_intr(sc->tws_dev, sc->irq_res[i], INTR_TYPE_CAM | INTR_MPSAFE, @@ -452,7 +452,7 @@ 
tws_teardown_intr(struct tws_softc *sc) { int i; - for(i=0;i<sc->irqs;i++) { + for (i = 0; i < sc->irqs; i++) { if (sc->intr_handle[i]) { bus_teardown_intr(sc->tws_dev, sc->irq_res[i], sc->intr_handle[i]); @@ -669,8 +669,7 @@ tws_init_reqs(struct tws_softc *sc, u_int32_t dma_mem_size) bzero(cmd_buf, dma_mem_size); TWS_TRACE_DEBUG(sc, "phy cmd", sc->dma_mem_phys, 0); mtx_lock(&sc->q_lock); - for ( i=0; i< tws_queue_depth; i++) - { + for (i = 0; i < tws_queue_depth; i++) { if (bus_dmamap_create(sc->data_tag, 0, &sc->reqs[i].dma_map)) { /* log a ENOMEM failure msg here */ mtx_unlock(&sc->q_lock); diff --git a/sys/dev/tws/tws_services.c b/sys/dev/tws/tws_services.c index da8bbacc39f7..e5c3d45c533f 100644 --- a/sys/dev/tws/tws_services.c +++ b/sys/dev/tws/tws_services.c @@ -200,7 +200,7 @@ tws_init_qs(struct tws_softc *sc) { mtx_lock(&sc->q_lock); - for(int i=0;i<TWS_MAX_QS;i++) { + for (int i = 0; i < TWS_MAX_QS; i++) { sc->q_head[i] = NULL; sc->q_tail[i] = NULL; } diff --git a/sys/dev/usb/usb_dev.c b/sys/dev/usb/usb_dev.c index 293b0c72587f..e58d6a674ec0 100644 --- a/sys/dev/usb/usb_dev.c +++ b/sys/dev/usb/usb_dev.c @@ -1231,12 +1231,14 @@ static const struct filterops usb_filtops_write = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_write, + .f_copy = knote_triv_copy, }; static const struct filterops usb_filtops_read = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_read, + .f_copy = knote_triv_copy, }; /* ARGSUSED */ diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 460a508a60dc..4961b21180e1 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -120,18 +120,18 @@ vcpu_unlock_one(struct vcpu *vcpu) vcpu_set_state(vcpu, VCPU_IDLE, false); } +#ifndef __amd64__ static int -vcpu_lock_all(struct vmmdev_softc *sc) +vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; error = 0; - vm_slock_vcpus(sc->vm); - maxcpus = vm_get_maxcpus(sc->vm); + maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { - vcpu = vm_vcpu(sc->vm, i); + vcpu = vm_vcpu(vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); @@ -141,16 +141,32 @@ vcpu_lock_all(struct vmmdev_softc *sc) if (error) { for (j = 0; j < i; j++) { - vcpu = vm_vcpu(sc->vm, j); + vcpu = vm_vcpu(vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } - vm_unlock_vcpus(sc->vm); } return (error); } +#endif + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + int error; + + /* + * Serialize vcpu_lock_all() callers. Individual vCPUs are not locked + * in a consistent order so we need to serialize to avoid deadlocks. 
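vcpu_set_state_all() above keeps the old all-or-nothing shape: lock each vCPU in turn and, on failure, unlock the ones already taken. A userland analogue of that rollback pattern (pthreads; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

#define NCPU 4

static pthread_mutex_t vcpu_mtx[NCPU];

/* Mirrors the vcpu_set_state_all()/vcpu_lock_all() rollback: take
 * every vCPU in turn and, on the first failure, release the ones
 * already held so the caller never returns with a partial set. */
static int
lock_all(void)
{
	int error, i, j;

	for (i = 0; i < NCPU; i++) {
		error = pthread_mutex_trylock(&vcpu_mtx[i]);
		if (error != 0) {
			for (j = 0; j < i; j++)
				pthread_mutex_unlock(&vcpu_mtx[j]);
			return (error);
		}
	}
	return (0);
}

int
main(void)
{
	int i;

	for (i = 0; i < NCPU; i++)
		pthread_mutex_init(&vcpu_mtx[i], NULL);
	printf("lock_all: %s\n", lock_all() == 0 ? "ok" : "failed");
	return (0);
}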
+ */ + vm_lock_vcpus(sc->vm); + error = vcpu_set_state_all(sc->vm, VCPU_FROZEN); + if (error != 0) + vm_unlock_vcpus(sc->vm); + return (error); +} static void vcpu_unlock_all(struct vmmdev_softc *sc) diff --git a/sys/dev/vmm/vmm_mem.c b/sys/dev/vmm/vmm_mem.c index be59e37de33d..9df31c9ba133 100644 --- a/sys/dev/vmm/vmm_mem.c +++ b/sys/dev/vmm/vmm_mem.c @@ -26,10 +26,14 @@ static void vm_free_memmap(struct vm *vm, int ident); -void -vm_mem_init(struct vm_mem *mem) +int +vm_mem_init(struct vm_mem *mem, vm_offset_t lo, vm_offset_t hi) { + mem->mem_vmspace = vmmops_vmspace_alloc(lo, hi); + if (mem->mem_vmspace == NULL) + return (ENOMEM); sx_init(&mem->mem_segs_lock, "vm_mem_segs"); + return (0); } static bool @@ -93,10 +97,21 @@ vm_mem_destroy(struct vm *vm) for (int i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); + vmmops_vmspace_free(mem->mem_vmspace); + sx_xunlock(&mem->mem_segs_lock); sx_destroy(&mem->mem_segs_lock); } +struct vmspace * +vm_vmspace(struct vm *vm) +{ + struct vm_mem *mem; + + mem = vm_mem(vm); + return (mem->mem_vmspace); +} + void vm_slock_memsegs(struct vm *vm) { @@ -246,7 +261,7 @@ vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, struct vm_mem *mem; struct vm_mem_seg *seg; struct vm_mem_map *m, *map; - struct vmspace *vmspace; + struct vm_map *vmmap; vm_ooffset_t last; int i, error; @@ -282,19 +297,19 @@ vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, if (map == NULL) return (ENOSPC); - vmspace = vm_vmspace(vm); - error = vm_map_find(&vmspace->vm_map, seg->object, first, &gpa, - len, 0, VMFS_NO_SPACE, prot, prot, 0); + vmmap = &mem->mem_vmspace->vm_map; + error = vm_map_find(vmmap, seg->object, first, &gpa, len, 0, + VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { - error = vm_map_wire(&vmspace->vm_map, gpa, gpa + len, + error = vm_map_wire(vmmap, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { - vm_map_remove(&vmspace->vm_map, gpa, gpa + len); + vm_map_remove(vmmap, gpa, gpa + len); return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT); } diff --git a/sys/dev/vmm/vmm_mem.h b/sys/dev/vmm/vmm_mem.h index 856470cf2590..f3d22058c7b8 100644 --- a/sys/dev/vmm/vmm_mem.h +++ b/sys/dev/vmm/vmm_mem.h @@ -36,6 +36,7 @@ enum { struct vm; struct vm_object; +struct vmspace; struct vm_mem_seg { size_t len; @@ -56,12 +57,15 @@ struct vm_mem { struct vm_mem_map mem_maps[VM_MAX_MEMMAPS]; struct vm_mem_seg mem_segs[VM_MAX_MEMSEGS]; struct sx mem_segs_lock; + struct vmspace *mem_vmspace; }; -void vm_mem_init(struct vm_mem *mem); +int vm_mem_init(struct vm_mem *mem, vm_offset_t lo, vm_offset_t hi); void vm_mem_cleanup(struct vm *vm); void vm_mem_destroy(struct vm *vm); +struct vmspace *vm_vmspace(struct vm *vm); + /* * APIs that modify the guest memory map require all vcpus to be frozen. */ diff --git a/sys/dev/vmware/vmxnet3/if_vmx.c b/sys/dev/vmware/vmxnet3/if_vmx.c index 62b5f313a137..1a314ca6660e 100644 --- a/sys/dev/vmware/vmxnet3/if_vmx.c +++ b/sys/dev/vmware/vmxnet3/if_vmx.c @@ -2056,7 +2056,12 @@ vmxnet3_update_admin_status(if_ctx_t ctx) struct vmxnet3_softc *sc; sc = iflib_get_softc(ctx); - if (sc->vmx_ds->event != 0) + /* + * iflib may invoke this routine before vmxnet3_attach_post() has + * run, which is before the top level shared data area is + * initialized and the device made aware of it. 
+ */ + if (sc->vmx_ds != NULL && sc->vmx_ds->event != 0) vmxnet3_evintr(sc); vmxnet3_refresh_host_stats(sc); diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index b51ef6766de4..bcf67ddc9689 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -195,8 +195,8 @@ static void vt_update_static(void *); #ifndef SC_NO_CUTPASTE static void vt_mouse_paste(void); #endif -static void vt_suspend_handler(void *priv); -static void vt_resume_handler(void *priv); +static void vt_suspend_handler(void *priv, enum power_stype stype); +static void vt_resume_handler(void *priv, enum power_stype stype); SET_DECLARE(vt_drv_set, struct vt_driver); @@ -3330,7 +3330,7 @@ vt_replace_backend(const struct vt_driver *drv, void *softc) } static void -vt_suspend_handler(void *priv) +vt_suspend_handler(void *priv, enum power_stype stype) { struct vt_device *vd; @@ -3341,7 +3341,7 @@ vt_suspend_handler(void *priv) } static void -vt_resume_handler(void *priv) +vt_resume_handler(void *priv, enum power_stype stype) { struct vt_device *vd; diff --git a/sys/dev/xdma/xdma.c b/sys/dev/xdma/xdma.c index 62b781159d03..cdd9ad0b8f39 100644 --- a/sys/dev/xdma/xdma.c +++ b/sys/dev/xdma/xdma.c @@ -555,7 +555,7 @@ xdma_put(xdma_controller_t *xdma) } static void -xdma_init(void) +xdma_init(void *dummy __unused) { mtx_init(&xdma_mtx, "xDMA", NULL, MTX_DEF); diff --git a/sys/dev/xen/bus/xen_intr.c b/sys/dev/xen/bus/xen_intr.c index cb30b6efa484..2b5fa8fb7cd1 100644 --- a/sys/dev/xen/bus/xen_intr.c +++ b/sys/dev/xen/bus/xen_intr.c @@ -460,7 +460,7 @@ xen_intr_handle_upcall(void *unused __unused) return (FILTER_HANDLED); } -static int +static void xen_intr_init(void *dummy __unused) { shared_info_t *s = HYPERVISOR_shared_info; @@ -468,7 +468,7 @@ xen_intr_init(void *dummy __unused) int i; if (!xen_domain()) - return (0); + return; _Static_assert(is_valid_evtchn(0), "is_valid_evtchn(0) fails (unused by Xen, but valid by interface"); @@ -502,8 +502,6 @@ xen_intr_init(void *dummy __unused) if (bootverbose) printf("Xen interrupt system initialized\n"); - - return (0); } SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL); diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c index 123df4992894..2c61b48c0451 100644 --- a/sys/dev/xen/control/control.c +++ b/sys/dev/xen/control/control.c @@ -91,6 +91,7 @@ #include <sys/smp.h> #include <sys/eventhandler.h> #include <sys/timetc.h> +#include <sys/power.h> #include <geom/geom.h> @@ -175,12 +176,12 @@ xctrl_suspend(void) cpuset_t cpu_suspend_map; #endif - EVENTHANDLER_INVOKE(power_suspend_early); + EVENTHANDLER_INVOKE(power_suspend_early, POWER_STYPE_SUSPEND_TO_MEM); xs_lock(); stop_all_proc(); xs_unlock(); suspend_all_fs(); - EVENTHANDLER_INVOKE(power_suspend); + EVENTHANDLER_INVOKE(power_suspend, POWER_STYPE_SUSPEND_TO_MEM); #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); @@ -297,7 +298,7 @@ xctrl_suspend(void) resume_all_fs(); resume_all_proc(); - EVENTHANDLER_INVOKE(power_resume); + EVENTHANDLER_INVOKE(power_resume, POWER_STYPE_SUSPEND_TO_MEM); if (bootverbose) printf("System resumed after suspension\n"); diff --git a/sys/fs/cuse/cuse.c b/sys/fs/cuse/cuse.c index d63a7d4691cf..b2524324584a 100644 --- a/sys/fs/cuse/cuse.c +++ b/sys/fs/cuse/cuse.c @@ -195,12 +195,14 @@ static const struct filterops cuse_client_kqfilter_read_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_read_detach, .f_event = cuse_client_kqfilter_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops cuse_client_kqfilter_write_ops = { 
.f_isfd = 1, .f_detach = cuse_client_kqfilter_write_detach, .f_event = cuse_client_kqfilter_write_event, + .f_copy = knote_triv_copy, }; static d_open_t cuse_client_open; diff --git a/sys/fs/devfs/devfs_dir.c b/sys/fs/devfs/devfs_dir.c index 3dc87538017d..aad87606e738 100644 --- a/sys/fs/devfs/devfs_dir.c +++ b/sys/fs/devfs/devfs_dir.c @@ -162,7 +162,7 @@ int devfs_pathpath(const char *p1, const char *p2) { - for (;;p1++, p2++) { + for (;; p1++, p2++) { if (*p1 != *p2) { if (*p1 == '/' && *p2 == '\0') return (1); diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c index 57b3559731f7..75bc0357571f 100644 --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -126,11 +126,13 @@ static const struct filterops fuse_device_rfiltops = { .f_isfd = 1, .f_detach = fuse_device_filt_detach, .f_event = fuse_device_filt_read, + .f_copy = knote_triv_copy, }; static const struct filterops fuse_device_wfiltops = { .f_isfd = 1, .f_event = fuse_device_filt_write, + .f_copy = knote_triv_copy, }; /**************************** diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 5c28db29fc63..683ee2f7ad56 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -284,7 +284,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) struct mount *mp = vnode_mount(vp); int err; - if (fsess_not_impl(vnode_mount(vp), FUSE_FLUSH)) + if (fsess_not_impl(mp, FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); @@ -292,7 +292,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) return err; if (fufh->fuse_open_flags & FOPEN_NOFLUSH && - (!fsess_opt_writeback(vnode_mount(vp)))) + (!fsess_opt_writeback(mp))) return (0); fdisp_init(&fdi, sizeof(*ffi)); diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c index 7f5b29ca2085..ec95716ea485 100644 --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -216,10 +216,17 @@ NFSD_VNET_DEFINE_STATIC(u_char *, nfsrv_dnsname) = NULL; * marked 0 in this array, the code will still work, just not quite as * efficiently.) */ -static int nfs_bigreply[NFSV42_NPROCS] = { 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 }; +static bool nfs_bigreply[NFSV42_NPROCS] = { + [NFSPROC_GETACL] = true, + [NFSPROC_GETEXTATTR] = true, + [NFSPROC_LISTEXTATTR] = true, + [NFSPROC_LOOKUP] = true, + [NFSPROC_READ] = true, + [NFSPROC_READDIR] = true, + [NFSPROC_READDIRPLUS] = true, + [NFSPROC_READDS] = true, + [NFSPROC_READLINK] = true, +}; /* local functions */ static int nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep); @@ -232,6 +239,8 @@ static int nfsrv_getrefstr(struct nfsrv_descript *, u_char **, u_char **, static void nfsrv_refstrbigenough(int, u_char **, u_char **, int *); static uint32_t vtonfsv4_type(struct vattr *); static __enum_uint8(vtype) nfsv4tov_type(uint32_t, uint16_t *); +static void nfsv4_setsequence(struct nfsmount *, struct nfsrv_descript *, + struct nfsclsession *, bool, struct ucred *); static struct { int op; @@ -5021,9 +5030,9 @@ nfsv4_seqsess_cacherep(uint32_t slotid, struct nfsslot *slots, int repstat, /* * Generate the xdr for an NFSv4.1 Sequence Operation. 
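The nfs_bigreply conversion above swaps a 71-element positional array of 0s and 1s for designated initializers, so each big-reply procedure is flagged by name and every unnamed entry defaults to false. The same idiom in miniature (the opcode numbers here are illustrative only; the real NFSPROC_* values live elsewhere):

#include <stdbool.h>
#include <stdio.h>

enum { OP_NULL, OP_LOOKUP, OP_READ, OP_WRITE, NOPS };

/* Unnamed entries default to false; big-reply ops are flagged by name
 * instead of by position in a long list of 0s and 1s. */
static const bool bigreply[NOPS] = {
	[OP_LOOKUP] = true,
	[OP_READ] = true,
};

int
main(void)
{
	for (int op = 0; op < NOPS; op++)
		printf("op %d: %s\n", op, bigreply[op] ? "big" : "small");
	return (0);
}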
*/ -void +static void nfsv4_setsequence(struct nfsmount *nmp, struct nfsrv_descript *nd, - struct nfsclsession *sep, int dont_replycache, struct ucred *cred) + struct nfsclsession *sep, bool dont_replycache, struct ucred *cred) { uint32_t *tl, slotseq = 0; int error, maxslot, slotpos; @@ -5054,7 +5063,7 @@ nfsv4_setsequence(struct nfsmount *nmp, struct nfsrv_descript *nd, *tl++ = txdr_unsigned(slotseq); *tl++ = txdr_unsigned(slotpos); *tl++ = txdr_unsigned(maxslot); - if (dont_replycache == 0) + if (!dont_replycache) *tl = newnfs_true; else *tl = newnfs_false; diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 61083ecf2d66..16a76c060e78 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -361,8 +361,6 @@ int nfsv4_getipaddr(struct nfsrv_descript *, struct sockaddr_in *, int nfsv4_seqsession(uint32_t, uint32_t, uint32_t, struct nfsslot *, struct mbuf **, uint16_t); void nfsv4_seqsess_cacherep(uint32_t, struct nfsslot *, int, struct mbuf **); -void nfsv4_setsequence(struct nfsmount *, struct nfsrv_descript *, - struct nfsclsession *, int, struct ucred *); int nfsv4_sequencelookup(struct nfsmount *, struct nfsclsession *, int *, int *, uint32_t *, uint8_t *, bool); void nfsv4_freeslot(struct nfsclsession *, int, bool); diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h index 0a93878c859f..7bfdc20a3f67 100644 --- a/sys/fs/nullfs/null.h +++ b/sys/fs/nullfs/null.h @@ -35,7 +35,11 @@ #ifndef FS_NULL_H #define FS_NULL_H -#define NULLM_CACHE 0x0001 +#include <sys/ck.h> +#include <vm/uma.h> + +#define NULLM_CACHE 0x0001 +#define NULLM_NOUNPBYPASS 0x0002 struct null_mount { struct mount *nullm_vfs; @@ -50,7 +54,7 @@ struct null_mount { * A cache of vnode references */ struct null_node { - LIST_ENTRY(null_node) null_hash; /* Hash list */ + CK_SLIST_ENTRY(null_node) null_hash; /* Hash list */ struct vnode *null_lowervp; /* VREFed once */ struct vnode *null_vnode; /* Back pointer */ u_int null_flags; @@ -61,6 +65,7 @@ struct null_node { #define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) #define VTONULL(vp) ((struct null_node *)(vp)->v_data) +#define VTONULL_SMR(vp) ((struct null_node *)vn_load_v_data_smr(vp)) #define NULLTOV(xp) ((xp)->null_vnode) int nullfs_init(struct vfsconf *vfsp); @@ -78,10 +83,18 @@ struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno); #endif extern struct vop_vector null_vnodeops; +extern struct vop_vector null_vnodeops_no_unp_bypass; -#ifdef MALLOC_DECLARE -MALLOC_DECLARE(M_NULLFSNODE); -#endif +static inline bool +null_is_nullfs_vnode(struct vnode *vp) +{ + const struct vop_vector *op; + + op = vp->v_op; + return (op == &null_vnodeops || op == &null_vnodeops_no_unp_bypass); +} + +extern uma_zone_t null_node_zone; #ifdef NULLFS_DEBUG #define NULLFSDEBUG(format, args...) 
printf(format ,## args) diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c index 053614b6910d..a843ae44f121 100644 --- a/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -36,14 +36,19 @@ #include <sys/systm.h> #include <sys/kernel.h> #include <sys/lock.h> -#include <sys/rwlock.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/smr.h> #include <sys/vnode.h> #include <fs/nullfs/null.h> +#include <vm/uma.h> + +VFS_SMR_DECLARE; + /* * Null layer cache: * Each cache entry holds a reference to the lower vnode @@ -54,12 +59,12 @@ #define NULL_NHASH(vp) (&null_node_hashtbl[vfs_hash_index(vp) & null_hash_mask]) -static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl; +static CK_SLIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl; static struct rwlock null_hash_lock; static u_long null_hash_mask; static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table"); -MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part"); +uma_zone_t __read_mostly null_node_zone; static void null_hashins(struct mount *, struct null_node *); @@ -73,6 +78,10 @@ nullfs_init(struct vfsconf *vfsp) null_node_hashtbl = hashinit(desiredvnodes, M_NULLFSHASH, &null_hash_mask); rw_init(&null_hash_lock, "nullhs"); + null_node_zone = uma_zcreate("nullfs node", sizeof(struct null_node), + NULL, NULL, NULL, NULL, 0, UMA_ZONE_ZINIT); + VFS_SMR_ZONE_SET(null_node_zone); + return (0); } @@ -80,6 +89,7 @@ int nullfs_uninit(struct vfsconf *vfsp) { + uma_zdestroy(null_node_zone); rw_destroy(&null_hash_lock); hashdestroy(null_node_hashtbl, M_NULLFSHASH, null_hash_mask); return (0); } @@ -96,7 +106,7 @@ null_hashget_locked(struct mount *mp, struct vnode *lowervp) struct null_node *a; struct vnode *vp; - ASSERT_VOP_LOCKED(lowervp, "null_hashget"); + ASSERT_VOP_LOCKED(lowervp, __func__); rw_assert(&null_hash_lock, RA_LOCKED); /* @@ -106,18 +116,21 @@ null_hashget_locked(struct mount *mp, struct vnode *lowervp) * reference count (but NOT the lower vnode's VREF counter). */ hd = NULL_NHASH(lowervp); - LIST_FOREACH(a, hd, null_hash) { - if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { - /* - * Since we have the lower node locked the nullfs - * node can not be in the process of recycling. If - * it had been recycled before we grabed the lower - * lock it would not have been found on the hash. - */ - vp = NULLTOV(a); - vref(vp); - return (vp); - } + CK_SLIST_FOREACH(a, hd, null_hash) { + if (a->null_lowervp != lowervp) + continue; + /* + * Since we have the lower node locked the nullfs + * node cannot be in the process of recycling. If + * it had been recycled before we grabbed the lower + * lock it would not have been found on the hash. 
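The hash rework above moves the buckets from LIST to CK_SLIST so that readers can traverse while an insert is in flight; writers still serialize on null_hash_lock. A small usage sketch of the same macros, assuming ConcurrencyKit's ck_queue.h (the userland counterpart of the kernel's sys/ck.h) is available:

#include <ck_queue.h>	/* ConcurrencyKit; sys/ck.h in the kernel */
#include <stdio.h>

struct node {
	int key;
	CK_SLIST_ENTRY(node) hash;
};

static CK_SLIST_HEAD(bucket, node) head = CK_SLIST_HEAD_INITIALIZER(head);

int
main(void)
{
	struct node a = { .key = 1 }, b = { .key = 2 }, *np;

	/* Writers still serialize (nullfs keeps null_hash_lock for
	 * inserts and removals); CK only makes the reader side safe. */
	CK_SLIST_INSERT_HEAD(&head, &a, hash);
	CK_SLIST_INSERT_HEAD(&head, &b, hash);

	/* A reader may traverse concurrently with an insert, which is
	 * what lets null_hashget() run under vfs_smr_enter() instead
	 * of the hash rwlock. */
	CK_SLIST_FOREACH(np, &head, hash)
		printf("key %d\n", np->key);

	CK_SLIST_REMOVE(&head, &a, node, hash);
	return (0);
}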
+ */ + vp = NULLTOV(a); + VNPASS(!VN_IS_DOOMED(vp), vp); + if (vp->v_mount != mp) + continue; + vref(vp); + return (vp); } return (NULL); } @@ -126,17 +139,34 @@ struct vnode * null_hashget(struct mount *mp, struct vnode *lowervp) { struct null_node_hashhead *hd; + struct null_node *a; struct vnode *vp; + enum vgetstate vs; - hd = NULL_NHASH(lowervp); - if (LIST_EMPTY(hd)) - return (NULL); - - rw_rlock(&null_hash_lock); - vp = null_hashget_locked(mp, lowervp); - rw_runlock(&null_hash_lock); + ASSERT_VOP_LOCKED(lowervp, __func__); + rw_assert(&null_hash_lock, RA_UNLOCKED); - return (vp); + vfs_smr_enter(); + hd = NULL_NHASH(lowervp); + CK_SLIST_FOREACH(a, hd, null_hash) { + if (a->null_lowervp != lowervp) + continue; + /* + * See null_hashget_locked as to why the nullfs vnode can't be + * doomed here. + */ + vp = NULLTOV(a); + VNPASS(!VN_IS_DOOMED(vp), vp); + if (vp->v_mount != mp) + continue; + vs = vget_prep_smr(vp); + vfs_smr_exit(); + VNPASS(vs != VGET_NONE, vp); + vget_finish_ref(vp, vs); + return (vp); + } + vfs_smr_exit(); + return (NULL); } static void @@ -151,7 +181,7 @@ null_hashins(struct mount *mp, struct null_node *xp) hd = NULL_NHASH(xp->null_lowervp); #ifdef INVARIANTS - LIST_FOREACH(oxp, hd, null_hash) { + CK_SLIST_FOREACH(oxp, hd, null_hash) { if (oxp->null_lowervp == xp->null_lowervp && NULLTOV(oxp)->v_mount == mp) { VNASSERT(0, NULLTOV(oxp), @@ -159,7 +189,7 @@ null_hashins(struct mount *mp, struct null_node *xp) } } #endif - LIST_INSERT_HEAD(hd, xp, null_hash); + CK_SLIST_INSERT_HEAD(hd, xp, null_hash); } static void @@ -174,7 +204,7 @@ null_destroy_proto(struct vnode *vp, void *xp) VI_UNLOCK(vp); vgone(vp); vput(vp); - free(xp, M_NULLFSNODE); + uma_zfree_smr(null_node_zone, xp); } /* @@ -208,12 +238,14 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp) * Note that duplicate can only appear in hash if the lowervp is * locked LK_SHARED. */ - xp = malloc(sizeof(struct null_node), M_NULLFSNODE, M_WAITOK); + xp = uma_zalloc_smr(null_node_zone, M_WAITOK); - error = getnewvnode("nullfs", mp, &null_vnodeops, &vp); + error = getnewvnode("nullfs", mp, (MOUNTTONULLMOUNT(mp)->nullm_flags & + NULLM_NOUNPBYPASS) != 0 ? 
&null_vnodeops_no_unp_bypass : + &null_vnodeops, &vp); if (error) { vput(lowervp); - free(xp, M_NULLFSNODE); + uma_zfree_smr(null_node_zone, xp); return (error); } @@ -261,8 +293,8 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp) return (error); } - null_hashins(mp, xp); vn_set_state(vp, VSTATE_CONSTRUCTED); + null_hashins(mp, xp); rw_wunlock(&null_hash_lock); *vpp = vp; @@ -275,9 +307,11 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp) void null_hashrem(struct null_node *xp) { + struct null_node_hashhead *hd; + hd = NULL_NHASH(xp->null_lowervp); rw_wlock(&null_hash_lock); - LIST_REMOVE(xp, null_hash); + CK_SLIST_REMOVE(hd, xp, null_node, null_hash); rw_wunlock(&null_hash_lock); } diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 4cddf24a5745..170a3dd51cd8 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -85,6 +85,10 @@ nullfs_mount(struct mount *mp) char *target; int error, len; bool isvnunlocked; + static const char cache_opt_name[] = "cache"; + static const char nocache_opt_name[] = "nocache"; + static const char unixbypass_opt_name[] = "unixbypass"; + static const char nounixbypass_opt_name[] = "nounixbypass"; NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp); @@ -116,7 +120,7 @@ nullfs_mount(struct mount *mp) /* * Unlock lower node to avoid possible deadlock. */ - if (mp->mnt_vnodecovered->v_op == &null_vnodeops && + if (null_is_nullfs_vnode(mp->mnt_vnodecovered) && VOP_ISLOCKED(mp->mnt_vnodecovered) == LK_EXCLUSIVE) { VOP_UNLOCK(mp->mnt_vnodecovered); isvnunlocked = true; @@ -150,7 +154,7 @@ nullfs_mount(struct mount *mp) /* * Check multi null mount to avoid `lock against myself' panic. */ - if (mp->mnt_vnodecovered->v_op == &null_vnodeops) { + if (null_is_nullfs_vnode(mp->mnt_vnodecovered)) { nn = VTONULL(mp->mnt_vnodecovered); if (nn == NULL || lowerrootvp == nn->null_lowervp) { NULLFSDEBUG("nullfs_mount: multi null mount?\n"); @@ -205,9 +209,10 @@ nullfs_mount(struct mount *mp) MNT_IUNLOCK(mp); } - if (vfs_getopt(mp->mnt_optnew, "cache", NULL, NULL) == 0) { + if (vfs_getopt(mp->mnt_optnew, cache_opt_name, NULL, NULL) == 0) { xmp->nullm_flags |= NULLM_CACHE; - } else if (vfs_getopt(mp->mnt_optnew, "nocache", NULL, NULL) == 0) { + } else if (vfs_getopt(mp->mnt_optnew, nocache_opt_name, NULL, + NULL) == 0) { ; } else if (null_cache_vnodes && (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) == 0) { @@ -219,6 +224,13 @@ nullfs_mount(struct mount *mp) &xmp->notify_node); } + if (vfs_getopt(mp->mnt_optnew, unixbypass_opt_name, NULL, NULL) == 0) { + ; + } else if (vfs_getopt(mp->mnt_optnew, nounixbypass_opt_name, NULL, + NULL) == 0) { + xmp->nullm_flags |= NULLM_NOUNPBYPASS; + } + if (lowerrootvp == mp->mnt_vnodecovered) { vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE); lowerrootvp->v_vflag |= VV_CROSSLOCK; diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index e9d598014a2f..d4baabeb40ab 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -174,6 +174,8 @@ #include <sys/mount.h> #include <sys/mutex.h> #include <sys/namei.h> +#include <sys/proc.h> +#include <sys/smr.h> #include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/stat.h> @@ -185,6 +187,8 @@ #include <vm/vm_object.h> #include <vm/vnode_pager.h> +VFS_SMR_DECLARE; + static int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); @@ -274,7 +278,7 @@ 
null_bypass(struct vop_generic_args *ap) * that aren't. (We must always map first vp or vclean fails.) */ if (i != 0 && (*this_vp_p == NULL || - (*this_vp_p)->v_op != &null_vnodeops)) { + !null_is_nullfs_vnode(*this_vp_p))) { old_vps[i] = NULL; } else { old_vps[i] = *this_vp_p; @@ -768,83 +772,111 @@ null_rmdir(struct vop_rmdir_args *ap) } /* - * We need to process our own vnode lock and then clear the - * interlock flag as it applies only to our vnode, not the - * vnodes below us on the stack. + * We need to process our own vnode lock and then clear the interlock flag as + * it applies only to our vnode, not the vnodes below us on the stack. + * + * We have to hold the vnode here to solve a potential reclaim race. If we're + * forcibly vgone'd while we still have refs, a thread could be sleeping inside + * the lowervp's vop_lock routine. When we vgone we will drop our last ref to + * the lowervp, which would allow it to be reclaimed. The lowervp could then + * be recycled, in which case it is not legal to be sleeping in its VOP. We + * prevent it from being recycled by holding the vnode here. */ +static struct vnode * +null_lock_prep_with_smr(struct vop_lock1_args *ap) +{ + struct null_node *nn; + struct vnode *lvp; + + lvp = NULL; + + vfs_smr_enter(); + + nn = VTONULL_SMR(ap->a_vp); + if (__predict_true(nn != NULL)) { + lvp = nn->null_lowervp; + if (lvp != NULL && !vhold_smr(lvp)) + lvp = NULL; + } + + vfs_smr_exit(); + return (lvp); +} + +static struct vnode * +null_lock_prep_with_interlock(struct vop_lock1_args *ap) +{ + struct null_node *nn; + struct vnode *lvp; + + ASSERT_VI_LOCKED(ap->a_vp, __func__); + + ap->a_flags &= ~LK_INTERLOCK; + + lvp = NULL; + + nn = VTONULL(ap->a_vp); + if (__predict_true(nn != NULL)) { + lvp = nn->null_lowervp; + if (lvp != NULL) + vholdnz(lvp); + } + VI_UNLOCK(ap->a_vp); + return (lvp); +} + static int null_lock(struct vop_lock1_args *ap) { - struct vnode *vp = ap->a_vp; - int flags; - struct null_node *nn; struct vnode *lvp; - int error; + int error, flags; - if ((ap->a_flags & LK_INTERLOCK) == 0) - VI_LOCK(vp); - else - ap->a_flags &= ~LK_INTERLOCK; - flags = ap->a_flags; - nn = VTONULL(vp); + if (__predict_true((ap->a_flags & LK_INTERLOCK) == 0)) { + lvp = null_lock_prep_with_smr(ap); + if (__predict_false(lvp == NULL)) { + VI_LOCK(ap->a_vp); + lvp = null_lock_prep_with_interlock(ap); + } + } else { + lvp = null_lock_prep_with_interlock(ap); + } + + ASSERT_VI_UNLOCKED(ap->a_vp, __func__); + + if (__predict_false(lvp == NULL)) + return (vop_stdlock(ap)); + + VNPASS(lvp->v_holdcnt > 0, lvp); + error = VOP_LOCK(lvp, ap->a_flags); /* - * If we're still active we must ask the lower layer to - * lock as ffs has special lock considerations in its - * vop lock. + * We might have slept to get the lock and someone might have + * clean our vnode already, switching vnode lock from one in + * lowervp to v_lock in our own vnode structure. Handle this + * case by reacquiring correct lock in requested mode. */ - if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) { - /* - * We have to hold the vnode here to solve a potential - * reclaim race. If we're forcibly vgone'd while we - * still have refs, a thread could be sleeping inside - * the lowervp's vop_lock routine. When we vgone we will - * drop our last ref to the lowervp, which would allow it - * to be reclaimed. The lowervp could then be recycled, - * in which case it is not legal to be sleeping in its VOP. - * We prevent it from being recycled by holding the vnode - * here. 
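null_lock() above prefers the lower vnode's lock and falls back to the nullfs vnode's own lock if the node was reclaimed while the thread slept. A simplified userland analogue of that lock-then-revalidate shape (pthreads; the real code additionally holds the lower vnode so it cannot be freed mid-sleep, and the bare pointer re-check here only sketches the interlock):

#include <pthread.h>
#include <stdio.h>

struct stacked {
	pthread_mutex_t own_lock;	/* our v_lock analogue */
	pthread_mutex_t *lower_lock;	/* NULL once "reclaimed" */
};

/* Prefer the lower lock; if the node was reclaimed while we slept on
 * it, drop it and take our own lock in the requested mode instead. */
static void
stacked_lock(struct stacked *sp)
{
	pthread_mutex_t *lower;

	lower = sp->lower_lock;
	if (lower != NULL) {
		pthread_mutex_lock(lower);
		if (sp->lower_lock == NULL) {	/* lost the race */
			pthread_mutex_unlock(lower);
			pthread_mutex_lock(&sp->own_lock);
		}
		return;
	}
	pthread_mutex_lock(&sp->own_lock);
}

int
main(void)
{
	pthread_mutex_t lower = PTHREAD_MUTEX_INITIALIZER;
	struct stacked s = {
		.own_lock = PTHREAD_MUTEX_INITIALIZER,
		.lower_lock = &lower,
	};

	stacked_lock(&s);
	printf("locked via lower\n");
	return (0);
}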
- */ - vholdnz(lvp); - VI_UNLOCK(vp); - error = VOP_LOCK(lvp, flags); - - /* - * We might have slept to get the lock and someone might have - * clean our vnode already, switching vnode lock from one in - * lowervp to v_lock in our own vnode structure. Handle this - * case by reacquiring correct lock in requested mode. - */ - if (VTONULL(vp) == NULL && error == 0) { - ap->a_flags &= ~LK_TYPE_MASK; - switch (flags & LK_TYPE_MASK) { - case LK_SHARED: - ap->a_flags |= LK_SHARED; - break; - case LK_UPGRADE: - case LK_EXCLUSIVE: - ap->a_flags |= LK_EXCLUSIVE; - break; - default: - panic("Unsupported lock request %d\n", - ap->a_flags); - } - VOP_UNLOCK(lvp); - error = vop_stdlock(ap); + if (VTONULL(ap->a_vp) == NULL && error == 0) { + VOP_UNLOCK(lvp); + + flags = ap->a_flags; + ap->a_flags &= ~LK_TYPE_MASK; + switch (flags & LK_TYPE_MASK) { + case LK_SHARED: + ap->a_flags |= LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUSIVE: + ap->a_flags |= LK_EXCLUSIVE; + break; + default: + panic("Unsupported lock request %d\n", + flags); } - vdrop(lvp); - } else { - VI_UNLOCK(vp); error = vop_stdlock(ap); } - + vdrop(lvp); return (error); } -/* - * We need to process our own vnode unlock and then clear the - * interlock flag as it applies only to our vnode, not the - * vnodes below us on the stack. - */ static int null_unlock(struct vop_unlock_args *ap) { @@ -853,11 +885,20 @@ null_unlock(struct vop_unlock_args *ap) struct vnode *lvp; int error; + /* + * Contrary to null_lock, we don't need to hold the vnode around + * unlock. + * + * We hold the lock, which means we can't be racing against vgone. + * + * At the same time VOP_UNLOCK promises to not touch anything after + * it finishes unlock, just like we don't. + * + * vop_stdunlock for a doomed vnode matches doomed locking in null_lock. + */ nn = VTONULL(vp); if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) { - vholdnz(lvp); error = VOP_UNLOCK(lvp); - vdrop(lvp); } else { error = vop_stdunlock(ap); } @@ -961,7 +1002,7 @@ null_reclaim(struct vop_reclaim_args *ap) vunref(lowervp); else vput(lowervp); - free(xp, M_NULLFSNODE); + uma_zfree_smr(null_node_zone, xp); return (0); } @@ -1215,3 +1256,11 @@ struct vop_vector null_vnodeops = { .vop_copy_file_range = VOP_PANIC, }; VFS_VOP_VECTOR_REGISTER(null_vnodeops); + +struct vop_vector null_vnodeops_no_unp_bypass = { + .vop_default = &null_vnodeops, + .vop_unp_bind = vop_stdunp_bind, + .vop_unp_connect = vop_stdunp_connect, + .vop_unp_detach = vop_stdunp_detach, +}; +VFS_VOP_VECTOR_REGISTER(null_vnodeops_no_unp_bypass); diff --git a/sys/fs/p9fs/p9_transport.c b/sys/fs/p9fs/p9_transport.c index c82d81fedcd7..25eee984265c 100644 --- a/sys/fs/p9fs/p9_transport.c +++ b/sys/fs/p9fs/p9_transport.c @@ -34,9 +34,8 @@ TAILQ_HEAD(, p9_trans_module) transports; static void -p9_transport_init(void) +p9_transport_init(void *dummy __unused) { - TAILQ_INIT(&transports); } diff --git a/sys/fs/udf/osta.c b/sys/fs/udf/osta.c index f79b86993367..1a083d8c26b1 100644 --- a/sys/fs/udf/osta.c +++ b/sys/fs/udf/osta.c @@ -383,7 +383,7 @@ int UDFTransName( int maxFilenameLen; /* Translate extension, and store it in ext. */ for(index = 0; index<EXT_SIZE && - extIndex + index +1 < udfLen; index++ ) { + extIndex + index +1 < udfLen; index++) { current = udfName[extIndex + index + 1]; if (IsIllegal(current) || !UnicodeIsPrint(current)) { @@ -432,7 +432,7 @@ int UDFTransName( /* Place a translated extension at end, if found. 
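null_vnodeops_no_unp_bypass above overrides only the three unp ops and inherits everything else through vop_default. A toy dispatch table showing how such default-chaining resolves (simplified to one level; not the actual VFS resolution code):

#include <stdio.h>

struct ops {
	const struct ops *def;	/* like vop_default */
	int (*open)(void);
	int (*unp_bind)(void);
};

/* Fall back to the default vector when a slot is empty (one level of
 * chaining is enough for this sketch). */
#define RESOLVE(vec, op) \
	((vec)->op != NULL ? (vec)->op : (vec)->def->op)

static int bypass_open(void) { return (printf("bypass open\n")); }
static int bypass_unp(void) { return (printf("bypass unp_bind\n")); }
static int std_unp(void) { return (printf("vop_stdunp_bind\n")); }

static const struct ops null_ops = {
	.open = bypass_open,
	.unp_bind = bypass_unp,
};
static const struct ops null_ops_no_unp = {
	.def = &null_ops,
	.unp_bind = std_unp,	/* .open falls through to null_ops */
};

int
main(void)
{
	RESOLVE(&null_ops_no_unp, open)();	/* bypass open */
	RESOLVE(&null_ops_no_unp, unp_bind)();	/* vop_stdunp_bind */
	return (0);
}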
*/ if (hasExt) { newName[newIndex++] = PERIOD; - for (index = 0;index < localExtIndex ;index++ ) { + for (index = 0; index < localExtIndex; index++) { newName[newIndex++] = ext[index]; } } diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c index a14f9ca74305..b6d6db60ca3d 100644 --- a/sys/fs/unionfs/union_subr.c +++ b/sys/fs/unionfs/union_subr.c @@ -587,6 +587,7 @@ unionfs_find_node_status(struct unionfs_node *unp, struct thread *td) struct unionfs_node_status *unsp; pid_t pid; + MPASS(td != NULL); pid = td->td_proc->p_pid; ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); @@ -612,6 +613,7 @@ unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, struct unionfs_node_status *unsp; pid_t pid; + MPASS(td != NULL); pid = td->td_proc->p_pid; KASSERT(NULL != unspp, ("%s: NULL status", __func__)); diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index 627b2f6e9a1d..66fee97a07d5 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -814,7 +814,7 @@ unionfs_close(struct vop_close_args *ap) unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; - unsp = unionfs_find_node_status(unp, td); + unsp = (td != NULL) ? unionfs_find_node_status(unp, td) : NULL; if (unsp == NULL || (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0)) { @@ -2208,7 +2208,6 @@ unionfs_lock_restart: vholdnz(tvp); VI_UNLOCK(vp); error = VOP_LOCK(tvp, flags); - vdrop(tvp); if (error == 0 && (lvp_locked || VTOUNIONFS(vp) == NULL)) { /* * After dropping the interlock above, there exists a window @@ -2234,6 +2233,7 @@ unionfs_lock_restart: unp = VTOUNIONFS(vp); if (unp == NULL || unp->un_uppervp != NULL) { VOP_UNLOCK(tvp); + vdrop(tvp); /* * If we previously held the lock, the upgrade may * have temporarily dropped the lock, in which case @@ -2249,6 +2249,7 @@ unionfs_lock_restart: goto unionfs_lock_restart; } } + vdrop(tvp); return (error); } @@ -2259,7 +2260,6 @@ unionfs_unlock(struct vop_unlock_args *ap) struct vnode *vp; struct vnode *tvp; struct unionfs_node *unp; - int error; KASSERT_UNIONFS_VNODE(ap->a_vp); @@ -2271,11 +2271,7 @@ unionfs_unlock(struct vop_unlock_args *ap) tvp = (unp->un_uppervp != NULL ? 
unp->un_uppervp : unp->un_lowervp); - vholdnz(tvp); - error = VOP_UNLOCK(tvp); - vdrop(tvp); - - return (error); + return (VOP_UNLOCK(tvp)); } static int diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index 27c65f15d5e3..db0bc77a752f 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -82,6 +82,7 @@ static const struct filterops gdev_filterops_vnode = { .f_isfd = 1, .f_detach = gdev_filter_detach, .f_event = gdev_filter_vnode, + .f_copy = knote_triv_copy, }; static struct cdevsw g_dev_cdevsw = { diff --git a/sys/geom/part/g_part.c b/sys/geom/part/g_part.c index 4c0d0c3aa902..1e4236507fa4 100644 --- a/sys/geom/part/g_part.c +++ b/sys/geom/part/g_part.c @@ -122,13 +122,13 @@ struct g_part_alias_list { { "ntfs", G_PART_ALIAS_MS_NTFS }, { "openbsd-data", G_PART_ALIAS_OPENBSD_DATA }, { "prep-boot", G_PART_ALIAS_PREP_BOOT }, - { "solaris-boot", G_PART_ALIAS_SOLARIS_BOOT }, - { "solaris-root", G_PART_ALIAS_SOLARIS_ROOT }, - { "solaris-swap", G_PART_ALIAS_SOLARIS_SWAP }, - { "solaris-backup", G_PART_ALIAS_SOLARIS_BACKUP }, - { "solaris-var", G_PART_ALIAS_SOLARIS_VAR }, - { "solaris-home", G_PART_ALIAS_SOLARIS_HOME }, - { "solaris-altsec", G_PART_ALIAS_SOLARIS_ALTSEC }, + { "solaris-boot", G_PART_ALIAS_SOLARIS_BOOT }, + { "solaris-root", G_PART_ALIAS_SOLARIS_ROOT }, + { "solaris-swap", G_PART_ALIAS_SOLARIS_SWAP }, + { "solaris-backup", G_PART_ALIAS_SOLARIS_BACKUP }, + { "solaris-var", G_PART_ALIAS_SOLARIS_VAR }, + { "solaris-home", G_PART_ALIAS_SOLARIS_HOME }, + { "solaris-altsec", G_PART_ALIAS_SOLARIS_ALTSEC }, { "solaris-reserved", G_PART_ALIAS_SOLARIS_RESERVED }, { "u-boot-env", G_PART_ALIAS_U_BOOT_ENV }, { "vmware-reserved", G_PART_ALIAS_VMRESERVED }, diff --git a/sys/i386/acpica/acpi_wakeup.c b/sys/i386/acpica/acpi_wakeup.c index 2d60d5e037a0..96be64de017b 100644 --- a/sys/i386/acpica/acpi_wakeup.c +++ b/sys/i386/acpica/acpi_wakeup.c @@ -84,7 +84,7 @@ static cpuset_t suspcpus; static struct susppcb **susppcbs; #endif -static void acpi_stop_beep(void *); +static void acpi_stop_beep(void *, enum power_stype); #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *, int); @@ -100,7 +100,7 @@ static void acpi_wakeup_cpus(struct acpi_softc *); } while (0) static void -acpi_stop_beep(void *arg) +acpi_stop_beep(void *arg, enum power_stype stype) { if (acpi_resume_beep != 0) diff --git a/sys/i386/i386/elf_machdep.c b/sys/i386/i386/elf_machdep.c index 13769af0fbca..14c942942d08 100644 --- a/sys/i386/i386/elf_machdep.c +++ b/sys/i386/i386/elf_machdep.c @@ -92,7 +92,7 @@ struct sysentvec elf32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); -static Elf32_Brandinfo freebsd_brand_info = { +static const Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -103,11 +103,11 @@ static Elf32_Brandinfo freebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); -static Elf32_Brandinfo freebsd_brand_oinfo = { +static const Elf32_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -118,11 +118,11 @@ static Elf32_Brandinfo freebsd_brand_oinfo = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_oinfo); -static Elf32_Brandinfo kfreebsd_brand_info 
= { +static const Elf32_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -133,7 +133,7 @@ static Elf32_Brandinfo kfreebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; -SYSINIT(kelf32, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(kelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &kfreebsd_brand_info); diff --git a/sys/i386/i386/in_cksum_machdep.c b/sys/i386/i386/in_cksum_machdep.c index 27ab09d82da0..b658d85bc892 100644 --- a/sys/i386/i386/in_cksum_machdep.c +++ b/sys/i386/i386/in_cksum_machdep.c @@ -84,7 +84,7 @@ in_cksum_skip(struct mbuf *m, int len, int skip) } } - for (;m && len; m = m->m_next) { + for (; m && len; m = m->m_next) { if (m->m_len == 0) continue; w = mtod(m, u_short *); diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 6aac0e968362..3f659432552c 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -1605,7 +1605,7 @@ init386(int first) } static void -machdep_init_trampoline(void) +machdep_init_trampoline(void *dummy __unused) { struct region_descriptor r_gdt, r_idt; struct i386tss *tss; diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index b44f5e08bbcf..1cf0867d57c3 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -720,7 +720,7 @@ __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr) } static void -pmap_init_reserved_pages(void) +pmap_init_reserved_pages(void *dummy __unused) { struct pcpu *pc; vm_offset_t pages; diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c index 85877bf40997..14b5f64388d2 100644 --- a/sys/i386/linux/linux_sysvec.c +++ b/sys/i386/linux/linux_sysvec.c @@ -796,7 +796,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux_brandnote = { +static const Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, @@ -805,7 +805,7 @@ static Elf_Brandnote linux_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf32_Brandinfo linux_brand = { +static const Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -816,7 +816,7 @@ static Elf32_Brandinfo linux_brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_glibc2brand = { +static const Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -827,7 +827,7 @@ static Elf32_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_muslbrand = { +static const Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -839,7 +839,7 @@ static Elf32_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -Elf32_Brandinfo *linux_brandlist[] = { +const Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, @@ -849,7 +849,7 @@ Elf32_Brandinfo *linux_brandlist[] = { static int linux_elf_modevent(module_t mod, int type, void *data) { - Elf32_Brandinfo **brandinfo; + const Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 1bc2491a1a12..779158b41221 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -92,7 +92,7 @@ #define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE) static int __elfN(check_header)(const Elf_Ehdr *hdr); -static Elf_Brandinfo 
*__elfN(get_brandinfo)(struct image_params *imgp, +static const Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry); @@ -104,7 +104,7 @@ static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static bool __elfN(check_note)(struct image_params *imgp, - Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0, + const Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0, uint32_t *fctl0); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); @@ -227,11 +227,11 @@ SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx, CTLFLAG_RWTUN, &__elfN(allow_wx), 0, "Allow pages to be mapped simultaneously writable and executable"); -static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; +static const Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a)) -Elf_Brandnote __elfN(freebsd_brandnote) = { +const Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = NT_FREEBSD_ABI_TAG, @@ -254,7 +254,7 @@ __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) static int GNU_KFREEBSD_ABI_DESC = 3; -Elf_Brandnote __elfN(kfreebsd_brandnote) = { +const Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, @@ -286,7 +286,7 @@ kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) } int -__elfN(insert_brand_entry)(Elf_Brandinfo *entry) +__elfN(insert_brand_entry)(const Elf_Brandinfo *entry) { int i; @@ -305,7 +305,7 @@ __elfN(insert_brand_entry)(Elf_Brandinfo *entry) } int -__elfN(remove_brand_entry)(Elf_Brandinfo *entry) +__elfN(remove_brand_entry)(const Elf_Brandinfo *entry) { int i; @@ -321,7 +321,7 @@ __elfN(remove_brand_entry)(Elf_Brandinfo *entry) } bool -__elfN(brand_inuse)(Elf_Brandinfo *entry) +__elfN(brand_inuse)(const Elf_Brandinfo *entry) { struct proc *p; bool rval = false; @@ -338,12 +338,12 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry) return (rval); } -static Elf_Brandinfo * +static const Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; - Elf_Brandinfo *bi, *bi_m; + const Elf_Brandinfo *bi, *bi_m; bool ret, has_fctl0; int i, interp_name_len; @@ -492,7 +492,7 @@ __elfN(phdr_in_zero_page)(const Elf_Ehdr *hdr) static int __elfN(check_header)(const Elf_Ehdr *hdr) { - Elf_Brandinfo *bi; + const Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || @@ -1109,7 +1109,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) struct vmspace *vmspace; vm_map_t map; char *interp; - Elf_Brandinfo *brand_info; + const Elf_Brandinfo *brand_info; struct sysentvec *sv; u_long addr, baddr, entry, proghdr; u_long maxalign, maxsalign, mapsz, maxv, maxv1, anon_loc; @@ -1925,7 +1925,7 @@ __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, Elf_Phdr *phdr; Elf_Shdr *shdr; struct phdr_closure phc; - Elf_Brandinfo *bi; + const Elf_Brandinfo *bi; ehdr = (Elf_Ehdr *)hdr; bi = td->td_proc->p_elf_brandinfo; @@ -2831,7 +2831,7 @@ __elfN(parse_notes)(const struct image_params *imgp, const Elf_Note *checknote, } if ((const char *)note_end - (const char *)note 
< sizeof(Elf_Note)) { - uprintf("ELF note to short\n"); + uprintf("ELF note too short\n"); goto retf; } if (note->n_namesz != checknote->n_namesz || @@ -2839,9 +2839,9 @@ __elfN(parse_notes)(const struct image_params *imgp, const Elf_Note *checknote, note->n_type != checknote->n_type) goto nextnote; note_name = (const char *)(note + 1); - if (note_name + checknote->n_namesz >= - (const char *)note_end || strncmp(note_vendor, - note_name, checknote->n_namesz) != 0) + if (note_name + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + + note->n_descsz >= (const char *)note_end || + strncmp(note_vendor, note_name, checknote->n_namesz) != 0) goto nextnote; if (cb(note, cb_arg, &res)) @@ -2861,7 +2861,7 @@ ret: } struct brandnote_cb_arg { - Elf_Brandnote *brandnote; + const Elf_Brandnote *brandnote; int32_t *osrel; }; @@ -2883,7 +2883,7 @@ brandnote_cb(const Elf_Note *note, void *arg0, bool *res) return (true); } -static Elf_Note fctl_note = { +static const Elf_Note fctl_note = { .n_namesz = sizeof(FREEBSD_ABI_VENDOR), .n_descsz = sizeof(uint32_t), .n_type = NT_FREEBSD_FEATURE_CTL, @@ -2918,7 +2918,7 @@ note_fctl_cb(const Elf_Note *note, void *arg0, bool *res) * as for headers. */ static bool -__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, +__elfN(check_note)(struct image_params *imgp, const Elf_Brandnote *brandnote, int32_t *osrel, bool *has_fctl0, uint32_t *fctl0) { const Elf_Phdr *phdr; diff --git a/sys/kern/kern_boottrace.c b/sys/kern/kern_boottrace.c index 1fa87955a299..c83255bc74ee 100644 --- a/sys/kern/kern_boottrace.c +++ b/sys/kern/kern_boottrace.c @@ -579,7 +579,7 @@ sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS) } static void -boottrace_init(void) +boottrace_init(void *dummy __unused) { if (!boottrace_enabled) diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 19118eb7f275..a71a601733e5 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2486,7 +2486,7 @@ fdunshare(struct thread *td) if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; - tmp = fdcopy(p->p_fd); + tmp = fdcopy(p->p_fd, p); fdescfree(td); p->p_fd = tmp; } @@ -2515,14 +2515,17 @@ pdunshare(struct thread *td) * this is to ease callers, not catch errors. */ struct filedesc * -fdcopy(struct filedesc *fdp) +fdcopy(struct filedesc *fdp, struct proc *p1) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; + struct file *fp; int i, lastfile; + bool fork_pass; MPASS(fdp != NULL); + fork_pass = false; newfdp = fdinit(); FILEDESC_SLOCK(fdp); for (;;) { @@ -2533,10 +2536,35 @@ fdcopy(struct filedesc *fdp) fdgrowtable(newfdp, lastfile + 1); FILEDESC_SLOCK(fdp); } - /* copy all passable descriptors (i.e. not kqueue) */ + + /* + * Copy all passable descriptors (i.e. not kqueue), and + * prepare to handle copyable but not passable descriptors + * (kqueues). + * + * The copying pass is performed after all passable files are + * installed into the new file descriptor table, since kqueues + * need all referenced file descriptors to already be valid, + * including other kqueues. For the same reason the copying is + * itself done in two passes: first, not fully initialized + * ('empty') copyable files are installed into the new fd + * table, and then the subsystems are given a second pass to + * fill the copied file's backing structure with the actual + * content.
+ */ newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { - if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || + const struct fileops *ops; + + ops = ofde->fde_file->f_ops; + fp = NULL; + if ((ops->fo_flags & DFLAG_FORK) != 0 && + (ofde->fde_flags & UF_FOCLOSE) == 0) { + if (ops->fo_fork(newfdp, ofde->fde_file, &fp, p1, + curthread) != 0) + continue; + fork_pass = true; + } else if ((ops->fo_flags & DFLAG_PASSABLE) == 0 || (ofde->fde_flags & UF_FOCLOSE) != 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) @@ -2545,11 +2573,30 @@ fdcopy(struct filedesc *fdp) } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; + if (fp != NULL) + nfde->fde_file = fp; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } MPASS(newfdp->fd_freefile != -1); FILEDESC_SUNLOCK(fdp); + + /* + * Now handle copying kqueues, since all fds, including + * kqueues, are in place. + */ + if (__predict_false(fork_pass)) { + FILEDESC_FOREACH_FDE(newfdp, i, nfde) { + const struct fileops *ops; + + ops = nfde->fde_file->f_ops; + if ((ops->fo_flags & DFLAG_FORK) == 0 || + nfde->fde_file == NULL) + continue; + ops->fo_fork(newfdp, NULL, &nfde->fde_file, p1, + curthread); + } + } return (newfdp); } diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c index 7a2818c29b1a..a37cb23efed8 100644 --- a/sys/kern/kern_devctl.c +++ b/sys/kern/kern_devctl.c @@ -130,6 +130,7 @@ static const struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, + .f_copy = knote_triv_copy, }; static struct cdev *devctl_dev; @@ -140,7 +141,7 @@ static struct devctlbridge { } devctl_notify_hook = { .send_f = NULL }; static void -devctl_init(void) +devctl_init(void *dummy __unused) { int reserve; uma_zone_t z; diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 23d8dc9cf54a..1baa24d278bf 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -134,6 +134,7 @@ static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; +static fo_fork_t kqueue_fork; static const struct fileops kqueueops = { .fo_read = invfo_rdwr, @@ -148,7 +149,9 @@ static const struct fileops kqueueops = { .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_cmp = file_kcmp_generic, + .fo_fork = kqueue_fork, .fo_fill_kinfo = kqueue_fill_kinfo, + .fo_flags = DFLAG_FORK, }; static int knote_attach(struct knote *kn, struct kqueue *kq); @@ -156,7 +159,7 @@ static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); -static void knote_init(void); +static void knote_init(void *); static struct knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); @@ -176,6 +179,7 @@ static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); +static int filt_timercopy(struct knote *kn, struct proc *p1); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); @@ -187,11 +191,13 @@ static void filt_usertouch(struct knote *kn, struct kevent *kev, static const struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, + .f_copy = 
knote_triv_copy, }; static const struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, + .f_copy = knote_triv_copy, }; /* XXX - move to kern_proc.c? */ static const struct filterops proc_filtops = { @@ -199,12 +205,14 @@ static const struct filterops proc_filtops = { .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, + .f_copy = knote_triv_copy, }; static const struct filterops jail_filtops = { .f_isfd = 0, .f_attach = filt_jailattach, .f_detach = filt_jaildetach, .f_event = filt_jail, + .f_copy = knote_triv_copy, }; static const struct filterops timer_filtops = { .f_isfd = 0, @@ -212,12 +220,14 @@ static const struct filterops timer_filtops = { .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, + .f_copy = filt_timercopy, }; static const struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, + .f_copy = knote_triv_copy, }; static uma_zone_t knote_zone; @@ -347,6 +357,7 @@ filt_nullattach(struct knote *kn) static const struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, + .f_copy = knote_triv_copy, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ @@ -940,6 +951,30 @@ filt_timerattach(struct knote *kn) return (0); } +static int +filt_timercopy(struct knote *kn, struct proc *p) +{ + struct kq_timer_cb_data *kc_src, *kc; + + if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { + atomic_subtract_int(&kq_ncallouts, 1); + return (ENOMEM); + } + + kn->kn_status &= ~KN_DETACHED; + kc_src = kn->kn_ptr.p_v; + kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); + kc->kn = kn; + kc->p = p; + kc->flags = kc_src->flags & ~KQ_TIMER_CB_ENQUEUED; + kc->next = kc_src->next; + kc->to = kc_src->to; + kc->cpuid = PCPU_GET(cpuid); + callout_init(&kc->c, 1); + kqtimer_sched_callout(kc); + return (0); +} + static void filt_timerstart(struct knote *kn, sbintime_t to) { @@ -1151,7 +1186,7 @@ int sys_kqueue(struct thread *td, struct kqueue_args *uap) { - return (kern_kqueue(td, 0, NULL)); + return (kern_kqueue(td, 0, false, NULL)); } int @@ -1159,55 +1194,76 @@ sys_kqueuex(struct thread *td, struct kqueuex_args *uap) { int flags; - if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0) + if ((uap->flags & ~(KQUEUE_CLOEXEC | KQUEUE_CPONFORK)) != 0) return (EINVAL); flags = 0; if ((uap->flags & KQUEUE_CLOEXEC) != 0) flags |= O_CLOEXEC; - return (kern_kqueue(td, flags, NULL)); + return (kern_kqueue(td, flags, (uap->flags & KQUEUE_CPONFORK) != 0, + NULL)); } static void -kqueue_init(struct kqueue *kq) +kqueue_init(struct kqueue *kq, bool cponfork) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); + if (cponfork) + kq->kq_state |= KQ_CPONFORK; } -int -kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) +static int +kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip, + struct file **fpp, int flags, struct filecaps *fcaps, bool cponfork, + struct kqueue **kqp) { - struct filedesc *fdp; - struct kqueue *kq; - struct file *fp; struct ucred *cred; - int fd, error; + struct kqueue *kq; + int error; - fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); - error = falloc_caps(td, &fp, &fd, flags, fcaps); + error = fdip != NULL ? 
falloc_caps(td, fpp, fdip, flags, fcaps) : + _falloc_noinstall(td, fpp, 1); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). */ - kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); - kqueue_init(kq); + kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO); + kqueue_init(kq, cponfork); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); - FILEDESC_XLOCK(fdp); + if (fdip != NULL) + FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); - FILEDESC_XUNLOCK(fdp); + if (fdip != NULL) + FILEDESC_XUNLOCK(fdp); + + finit(*fpp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); + *kqp = kq; + return (0); +} + +int +kern_kqueue(struct thread *td, int flags, bool cponfork, struct filecaps *fcaps) +{ + struct kqueue *kq; + struct file *fp; + int fd, error; + + error = kern_kqueue_alloc(td, td->td_proc->p_fd, &fd, &fp, flags, + fcaps, cponfork, &kq); + if (error != 0) + return (error); - finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; @@ -1488,7 +1544,7 @@ kern_kevent_anonymous(struct thread *td, int nevents, struct kqueue kq = {}; int error; - kqueue_init(&kq); + kqueue_init(&kq, false); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); @@ -1576,7 +1632,7 @@ kqueue_fo_release(int filt) mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, - ("filter object refcount not valid on release")); + ("filter object %d refcount not valid on release", filt)); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } @@ -1855,17 +1911,8 @@ done: } static int -kqueue_acquire(struct file *fp, struct kqueue **kqp) +kqueue_acquire_ref(struct kqueue *kq) { - int error; - struct kqueue *kq; - - error = 0; - - kq = fp->f_data; - if (fp->f_type != DTYPE_KQUEUE || kq == NULL) - return (EINVAL); - *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); @@ -1873,8 +1920,22 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp) } kq->kq_refcnt++; KQ_UNLOCK(kq); + return (0); +} - return error; +static int +kqueue_acquire(struct file *fp, struct kqueue **kqp) +{ + struct kqueue *kq; + int error; + + kq = fp->f_data; + if (fp->f_type != DTYPE_KQUEUE || kq == NULL) + return (EINVAL); + error = kqueue_acquire_ref(kq); + if (error == 0) + *kqp = kq; + return (error); } static void @@ -2887,7 +2948,7 @@ knote_dequeue(struct knote *kn) } static void -knote_init(void) +knote_init(void *dummy __unused) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, @@ -2937,6 +2998,152 @@ noacquire: return (error); } +static int +kqueue_fork_alloc(struct filedesc *fdp, struct file *fp, struct file **fp1, + struct thread *td) +{ + struct kqueue *kq, *kq1; + int error; + + MPASS(fp->f_type == DTYPE_KQUEUE); + kq = fp->f_data; + if ((kq->kq_state & KQ_CPONFORK) == 0) + return (EOPNOTSUPP); + error = kqueue_acquire_ref(kq); + if (error != 0) + return (error); + error = kern_kqueue_alloc(td, fdp, NULL, fp1, 0, NULL, true, &kq1); + if (error == 0) { + kq1->kq_forksrc = kq; + (*fp1)->f_flag = fp->f_flag & (FREAD | FWRITE | FEXEC | + O_CLOEXEC | O_CLOFORK); + } else { + kqueue_release(kq, 0); + } + return (error); +} + +static void +kqueue_fork_copy_knote(struct kqueue *kq1, struct knote *kn, struct proc *p1, + struct filedesc *fdp) +{ + struct knote *kn1; + const struct filterops *fop; + int error; + + fop = kn->kn_fop; + if (fop->f_copy == NULL || (fop->f_isfd && + 
fdp->fd_files->fdt_ofiles[kn->kn_kevent.ident].fde_file == NULL)) + return; + error = kqueue_expand(kq1, fop, kn->kn_kevent.ident, M_WAITOK); + if (error != 0) + return; + + kn1 = knote_alloc(M_WAITOK); + *kn1 = *kn; + kn1->kn_status |= KN_DETACHED; + kn1->kn_status &= ~KN_QUEUED; + kn1->kn_kq = kq1; + error = fop->f_copy(kn1, p1); + if (error != 0) { + knote_free(kn1); + return; + } + (void)kqueue_fo_find(kn->kn_kevent.filter); + if (fop->f_isfd && !fhold(kn1->kn_fp)) { + fop->f_detach(kn1); + kqueue_fo_release(kn->kn_kevent.filter); + knote_free(kn1); + return; + } + if (kn->kn_knlist != NULL) + knlist_add(kn->kn_knlist, kn1, 0); + KQ_LOCK(kq1); + knote_attach(kn1, kq1); + kn1->kn_influx = 0; + if ((kn->kn_status & KN_QUEUED) != 0) + knote_enqueue(kn1); + KQ_UNLOCK(kq1); +} + +static void +kqueue_fork_copy_list(struct klist *knlist, struct knote *marker, + struct kqueue *kq, struct kqueue *kq1, struct proc *p1, + struct filedesc *fdp) +{ + struct knote *kn; + + KQ_OWNED(kq); + kn = SLIST_FIRST(knlist); + while (kn != NULL) { + if ((kn->kn_status & KN_DETACHED) != 0 || + (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0)) { + kn = SLIST_NEXT(kn, kn_link); + continue; + } + kn_enter_flux(kn); + SLIST_INSERT_AFTER(kn, marker, kn_link); + KQ_UNLOCK(kq); + kqueue_fork_copy_knote(kq1, kn, p1, fdp); + KQ_LOCK(kq); + kn_leave_flux(kn); + kn = SLIST_NEXT(marker, kn_link); + /* XXXKIB switch kn_link to LIST? */ + SLIST_REMOVE(knlist, marker, knote, kn_link); + } +} + +static int +kqueue_fork_copy(struct filedesc *fdp, struct file *fp, struct file *fp1, + struct proc *p1, struct thread *td) +{ + struct kqueue *kq, *kq1; + struct knote *marker; + int error, i; + + error = 0; + MPASS(fp == NULL); + MPASS(fp1->f_type == DTYPE_KQUEUE); + + kq1 = fp1->f_data; + kq = kq1->kq_forksrc; + marker = knote_alloc(M_WAITOK); + marker->kn_status = KN_MARKER; + + KQ_LOCK(kq); + for (i = 0; i < kq->kq_knlistsize; i++) { + kqueue_fork_copy_list(&kq->kq_knlist[i], marker, kq, kq1, + p1, fdp); + } + if (kq->kq_knhashmask != 0) { + for (i = 0; i <= kq->kq_knhashmask; i++) { + kqueue_fork_copy_list(&kq->kq_knhash[i], marker, kq, + kq1, p1, fdp); + } + } + kqueue_release(kq, 1); + kq1->kq_forksrc = NULL; + KQ_UNLOCK(kq); + + knote_free(marker); + return (error); +} + +static int +kqueue_fork(struct filedesc *fdp, struct file *fp, struct file **fp1, + struct proc *p1, struct thread *td) +{ + if (*fp1 == NULL) + return (kqueue_fork_alloc(fdp, fp, fp1, td)); + return (kqueue_fork_copy(fdp, fp, *fp1, p1, td)); +} + +int +knote_triv_copy(struct knote *kn __unused, struct proc *p1 __unused) +{ + return (0); +} + struct knote_status_export_bit { int kn_status_bit; int knt_status_bit; diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 0fc2d0e7f1bc..2bdd6faa025a 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -418,7 +418,7 @@ do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, #endif int error, i, orig_osrel; uint32_t orig_fctl0; - Elf_Brandinfo *orig_brandinfo; + const Elf_Brandinfo *orig_brandinfo; size_t freepath_size; static const char fexecv_proc_title[] = "(fexecv)"; @@ -1314,7 +1314,7 @@ exec_map_stack(struct image_params *imgp) MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } else { sharedpage_addr = sv->sv_shared_page_base; - vm_map_fixed(map, obj, 0, + error = vm_map_fixed(map, obj, 0, sharedpage_addr, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index a32b5a1b3354..c4b1c8201ff2 
100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -127,6 +127,27 @@ proc_realparent(struct proc *child) return (parent); } +static void +reaper_clear(struct proc *p, struct proc *rp) +{ + struct proc *p1; + bool clear; + + sx_assert(&proctree_lock, SX_XLOCKED); + LIST_REMOVE(p, p_reapsibling); + if (p->p_reapsubtree == 1) + return; + clear = true; + LIST_FOREACH(p1, &rp->p_reaplist, p_reapsibling) { + if (p1->p_reapsubtree == p->p_reapsubtree) { + clear = false; + break; + } + } + if (clear) + proc_id_clear(PROC_ID_REAP, p->p_reapsubtree); +} + void reaper_abandon_children(struct proc *p, bool exiting) { @@ -138,7 +159,7 @@ reaper_abandon_children(struct proc *p, bool exiting) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { - LIST_REMOVE(p2, p_reapsibling); + reaper_clear(p2, p); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); @@ -152,27 +173,6 @@ reaper_abandon_children(struct proc *p, bool exiting) p->p_treeflag &= ~P_TREE_REAPER; } -static void -reaper_clear(struct proc *p) -{ - struct proc *p1; - bool clear; - - sx_assert(&proctree_lock, SX_LOCKED); - LIST_REMOVE(p, p_reapsibling); - if (p->p_reapsubtree == 1) - return; - clear = true; - LIST_FOREACH(p1, &p->p_reaper->p_reaplist, p_reapsibling) { - if (p1->p_reapsubtree == p->p_reapsubtree) { - clear = false; - break; - } - } - if (clear) - proc_id_clear(PROC_ID_REAP, p->p_reapsubtree); -} - void proc_clear_orphan(struct proc *p) { @@ -807,7 +807,7 @@ kern_abort2(struct thread *td, const char *why, int nargs, void **uargs) } if (nargs > 0) { sbuf_putc(sb, '('); - for (i = 0;i < nargs; i++) + for (i = 0; i < nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_putc(sb, ')'); } @@ -972,7 +972,7 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options) sx_xunlock(PIDHASHLOCK(p->p_pid)); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); - reaper_clear(p); + reaper_clear(p, p->p_reaper); PROC_LOCK(p); proc_clear_orphan(p); PROC_UNLOCK(p); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 7f6abae187b3..8b237b6dbd17 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -423,7 +423,7 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread * pd = pdshare(p1->p_pd); else pd = pdcopy(p1->p_pd); - fd = fdcopy(p1->p_fd); + fd = fdcopy(p1->p_fd, p2); fdtol = NULL; } else { if (fr->fr_flags2 & FR2_SHARE_PATHS) diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c index 3f322b271400..a564393d3366 100644 --- a/sys/kern/kern_jaildesc.c +++ b/sys/kern/kern_jaildesc.c @@ -344,6 +344,7 @@ static const struct filterops jaildesc_kqops = { .f_isfd = 1, .f_detach = jaildesc_kqops_detach, .f_event = jaildesc_kqops_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/kern_jailmeta.c b/sys/kern/kern_jailmeta.c index 4e37eccad03a..91bb7155820d 100644 --- a/sys/kern/kern_jailmeta.c +++ b/sys/kern/kern_jailmeta.c @@ -599,22 +599,18 @@ SYSCTL_PROC(_security_jail, OID_AUTO, env, /* Setup and tear down. 
*/ -static int +static void jm_sysinit(void *arg __unused) { meta.osd_slot = osd_jail_register(jm_osd_destructor, meta.methods); env.osd_slot = osd_jail_register(jm_osd_destructor, env.methods); - - return (0); } -static int +static void jm_sysuninit(void *arg __unused) { osd_jail_deregister(meta.osd_slot); osd_jail_deregister(env.osd_slot); - - return (0); } SYSINIT(jailmeta, SI_SUB_DRIVERS, SI_ORDER_ANY, jm_sysinit, NULL); diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index d566bc01bc5e..e2f63cbc0c5a 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -435,7 +435,7 @@ linker_file_register_modules(linker_file_t lf) } static void -linker_init_kernel_modules(void) +linker_init_kernel_modules(void *dummy __unused) { sx_xlock(&kld_sx); diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 653ce1ee556b..fcbfbe64f854 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -303,7 +303,7 @@ sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS) */ #if MALLOC_DEBUG_MAXZONES > 1 static void -tunable_set_numzones(void) +tunable_set_numzones(void *dummy __unused) { TUNABLE_INT_FETCH("debug.malloc.numzones", @@ -1302,7 +1302,7 @@ mallocinit(void *dummy) #endif align, UMA_ZONE_MALLOC); } - for (;i <= size; i+= KMEM_ZBASE) + for (; i <= size; i+= KMEM_ZBASE) kmemsize[i >> KMEM_ZSHIFT] = indx; } } diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 8b5908f5219a..d67c70984528 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -503,8 +503,8 @@ _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line) /* * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * - * We call this if the lock is either contested (i.e. we need to go to - * sleep waiting for it), or if we need to recurse on it. + * We get here if lock profiling is enabled, the lock is already held by + * someone else or we are recursing on it. */ #if LOCK_DEBUG > 0 void @@ -660,13 +660,8 @@ retry_turnstile: } #endif - /* - * If the mutex isn't already contested and a failure occurs - * setting the contested bit, the mutex was either released - * or the state of the MTX_RECURSED bit changed. - */ - if ((v & MTX_CONTESTED) == 0 && - !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_CONTESTED)) { + if ((v & MTX_WAITERS) == 0 && + !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_WAITERS)) { goto retry_turnstile; } @@ -869,7 +864,7 @@ _thread_lock(struct thread *td) WITNESS_LOCK(&m->lock_object, LOP_EXCLUSIVE, file, line); return; } - _mtx_release_lock_quick(m); + atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED); slowpath_unlocked: spinlock_exit(); slowpath_noirq: @@ -959,7 +954,7 @@ retry: } if (m == td->td_lock) break; - _mtx_release_lock_quick(m); + atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED); } LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); @@ -1029,8 +1024,8 @@ thread_lock_set(struct thread *td, struct mtx *new) /* * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * - * We are only called here if the lock is recursed, contested (i.e. we - * need to wake up a blocked thread) or lockstat probe is active. + * We get here if lock profiling is enabled, the lock has waiters to + * wake up, or it is recursed. */ #if LOCK_DEBUG > 0 void @@ -1071,7 +1066,7 @@ __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v) * can be removed from the hash list if it is empty.
*/ turnstile_chain_lock(&m->lock_object); - _mtx_release_lock_quick(m); + atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED); ts = turnstile_lookup(&m->lock_object); MPASS(ts != NULL); if (LOCK_LOG_TEST(&m->lock_object, opts)) @@ -1207,7 +1202,7 @@ _mtx_destroy(volatile uintptr_t *c) if (!mtx_owned(m)) MPASS(mtx_unowned(m)); else { - MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); + MPASS((m->mtx_lock & (MTX_RECURSED|MTX_WAITERS)) == 0); /* Perform the non-mtx related part of mtx_unlock_spin(). */ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) { @@ -1359,8 +1354,8 @@ db_show_mtx(const struct lock_object *lock) db_printf("DESTROYED"); else { db_printf("OWNED"); - if (m->mtx_lock & MTX_CONTESTED) - db_printf(", CONTESTED"); + if (m->mtx_lock & MTX_WAITERS) + db_printf(", WAITERS"); if (m->mtx_lock & MTX_RECURSED) db_printf(", RECURSED"); } diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 7351e9cb6313..2aab151aba08 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -1312,7 +1312,7 @@ static struct kproc_desc racctd_kp = { }; static void -racctd_init(void) +racctd_init(void *dummy __unused) { if (!racct_enable) return; @@ -1322,7 +1322,7 @@ racctd_init(void) SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void -racct_init(void) +racct_init(void *dummy __unused) { if (!racct_enable) return; diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index 3854ffbeec29..cd66bff62608 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -300,7 +300,7 @@ static void rangelock_free_free(struct rl_q_entry *free); static void rangelock_noncheating_destroy(struct rangelock *lock); static void -rangelock_sys_init(void) +rangelock_sys_init(void *dummy __unused) { rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry), diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 4232c71f86fb..682ba86d23ff 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -209,7 +209,7 @@ static struct dict actionnames[] = { { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; -static void rctl_init(void); +static void rctl_init(void *); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; @@ -2175,7 +2175,7 @@ rctl_racct_release(struct racct *racct) } static void -rctl_init(void) +rctl_init(void *dummy __unused) { if (!racct_enable) diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c index 5b8398caaca9..f48d0e3d616b 100644 --- a/sys/kern/kern_sharedpage.c +++ b/sys/kern/kern_sharedpage.c @@ -130,8 +130,7 @@ shared_page_init(void *dummy __unused) shared_page_mapping = (char *)addr; } -SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init, - NULL); +SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, shared_page_init, NULL); /* * Push the timehands update to the shared page. 
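[Note on the SYSINIT conversions: the kern_sharedpage.c hunk above can drop its (sysinit_cfunc_t) cast because shared_page_init() already takes the void * cookie the sysinit machinery passes to every handler. The many init-routine signature changes in this commit (boottrace_init, devctl_init, knote_init, racctd_init, and others) follow the same rule. A minimal sketch of the convention follows; it is not part of this commit, and the module name and message are hypothetical.]

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>

/*
 * A SYSINIT handler receives a "void *" cookie (the last SYSINIT
 * argument), even when it ignores it.  Declaring the parameter as
 * "void *dummy __unused" instead of "(void)" lets SYSINIT reference
 * the function without the function-pointer casts this commit removes.
 */
static void
example_init(void *dummy __unused)
{
	printf("example: initialized\n");
}
SYSINIT(example, SI_SUB_EXEC, SI_ORDER_ANY, example_init, NULL);

[The C_SYSINIT conversions for the now-const ELF brand tables apply the same convention to const data; note those sites still cast the handler through sysinit_cfunc_t because it takes the brand table rather than an opaque cookie.]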
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 8efc0886988b..a55f3c761449 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -113,7 +113,7 @@ static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); -static void sigqueue_start(void); +static void sigqueue_start(void *); static void sigfastblock_setpend(struct thread *td, bool resched); static void sig_handle_first_stop(struct thread *td, struct proc *p, int sig); @@ -124,6 +124,7 @@ const struct filterops sig_filtops = { .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, + .f_copy = knote_triv_copy, }; static int kern_forcesigexit = 1; @@ -344,7 +345,7 @@ ast_sigsuspend(struct thread *td, int tda __unused) } static void -sigqueue_start(void) +sigqueue_start(void *dummy __unused) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 2a6f0989f6aa..5b7485c25cd7 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -90,7 +90,7 @@ static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp); -static void itimer_start(void); +static void itimer_start(void *); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); @@ -1170,7 +1170,7 @@ eventratecheck(struct timeval *lasttime, int *cureps, int maxeps) } static void -itimer_start(void) +itimer_start(void *dummy __unused) { static const struct kclock rt_clock = { .timer_create = realtimer_create, diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index bbebadc4c395..ebd203858b66 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -518,9 +518,15 @@ link_elf_init(void* arg) (void)link_elf_link_common_finish(linker_kernel_file); linker_kernel_file->flags |= LINKER_FILE_LINKED; TAILQ_INIT(&set_pcpu_list); + ef->pcpu_start = DPCPU_START; + ef->pcpu_stop = DPCPU_STOP; + ef->pcpu_base = DPCPU_START; #ifdef VIMAGE TAILQ_INIT(&set_vnet_list); vnet_save_init((void *)VNET_START, VNET_STOP - VNET_START); + ef->vnet_start = VNET_START; + ef->vnet_stop = VNET_STOP; + ef->vnet_base = VNET_START; #endif } diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c index 151aab96f9be..a3a53a39bfd6 100644 --- a/sys/kern/link_elf_obj.c +++ b/sys/kern/link_elf_obj.c @@ -70,6 +70,7 @@ typedef struct { void *addr; + void *origaddr; /* Used by debuggers. */ Elf_Off size; int flags; /* Section flags. */ int sec; /* Original section number. 
*/ @@ -492,7 +493,8 @@ link_elf_link_preload(linker_class_t cls, const char *filename, case SHT_FINI_ARRAY: if (shdr[i].sh_addr == 0) break; - ef->progtab[pb].addr = (void *)shdr[i].sh_addr; + ef->progtab[pb].addr = ef->progtab[pb].origaddr = + (void *)shdr[i].sh_addr; if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; #ifdef __amd64__ @@ -1088,6 +1090,8 @@ link_elf_load_file(linker_class_t cls, const char *filename, ef->progtab[pb].name = "<<NOBITS>>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { + ef->progtab[pb].origaddr = + (void *)(uintptr_t)mapbase; ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); if (ef->progtab[pb].addr == NULL) { @@ -1101,6 +1105,8 @@ link_elf_load_file(linker_class_t cls, const char *filename, #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { + ef->progtab[pb].origaddr = + (void *)(uintptr_t)mapbase; ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); if (ef->progtab[pb].addr == NULL) { diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c index 07a9cc0f57be..c4d0223d484f 100644 --- a/sys/kern/subr_devstat.c +++ b/sys/kern/subr_devstat.c @@ -415,7 +415,7 @@ sysctl_devstat(SYSCTL_HANDLER_ARGS) if (error != 0) return (error); - for (;nds != NULL;) { + while (nds != NULL) { error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); if (error != 0) return (error); diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c index 5380902e602f..aac35a56130e 100644 --- a/sys/kern/subr_log.c +++ b/sys/kern/subr_log.c @@ -79,6 +79,7 @@ static const struct filterops log_read_filterops = { .f_attach = NULL, .f_detach = logkqdetach, .f_event = logkqread, + .f_copy = knote_triv_copy, }; static struct logsoftc { diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c index 5c14e15830f4..c9a387a5e87b 100644 --- a/sys/kern/subr_pcpu.c +++ b/sys/kern/subr_pcpu.c @@ -140,7 +140,7 @@ uma_zone_t pcpu_zone_32; uma_zone_t pcpu_zone_64; static void -pcpu_zones_startup(void) +pcpu_zones_startup(void *dummy __unused) { pcpu_zone_4 = uma_zcreate("pcpu-4", 4, diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index db0ceb17b9f0..e2070ae3f865 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -766,7 +766,7 @@ reswitch: switch (ch = (u_char)*fmt++) { PCHAR(hex2ascii(*up & 0x0f)); up++; if (width) - for (q=p;*q;q++) + for (q = p; *q; q++) PCHAR(*q); } break; diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c index c2a0f67cae85..04ed107c933d 100644 --- a/sys/kern/sys_eventfd.c +++ b/sys/kern/sys_eventfd.c @@ -85,13 +85,16 @@ static int filt_eventfdwrite(struct knote *kn, long hint); static const struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, - .f_event = filt_eventfdread + .f_event = filt_eventfdread, + .f_copy = knote_triv_copy, }; + static const struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, - .f_event = filt_eventfdwrite + .f_event = filt_eventfdwrite, + .f_copy = knote_triv_copy, }; struct eventfd { diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 5606b36f772f..7d666da9f88b 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -729,7 +729,7 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; - int error, tmp, locked; + int error, f_flag, tmp, locked; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); @@ -782,30 +782,36 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) goto out; } + 
f_flag = 0; switch (com) { case FIONCLEX: fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE; - goto out; + break; case FIOCLEX: fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE; - goto out; - case FIONBIO: - if ((tmp = *(int *)data)) - atomic_set_int(&fp->f_flag, FNONBLOCK); - else - atomic_clear_int(&fp->f_flag, FNONBLOCK); - data = (void *)&tmp; break; + case FIONBIO: case FIOASYNC: - if ((tmp = *(int *)data)) - atomic_set_int(&fp->f_flag, FASYNC); - else - atomic_clear_int(&fp->f_flag, FASYNC); - data = (void *)&tmp; + f_flag = com == FIONBIO ? FNONBLOCK : FASYNC; + tmp = *(int *)data; + fsetfl_lock(fp); + if (((fp->f_flag & f_flag) != 0) != (tmp != 0)) { + error = fo_ioctl(fp, com, (void *)&tmp, td->td_ucred, + td); + if (error == 0) { + if (tmp != 0) + atomic_set_int(&fp->f_flag, f_flag); + else + atomic_clear_int(&fp->f_flag, f_flag); + } + } + fsetfl_unlock(fp); + break; + default: + error = fo_ioctl(fp, com, data, td->td_ucred, td); break; } - error = fo_ioctl(fp, com, data, td->td_ucred, td); out: switch (locked) { case LA_XLOCKED: diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 30527fdd4fd0..6531cea31423 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -181,20 +181,23 @@ static int filt_pipedump(struct proc *p, struct knote *kn, static const struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, - .f_event = filt_pipenotsup + .f_event = filt_pipenotsup, /* no userdump */ + .f_copy = knote_triv_copy, }; static const struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead, .f_userdump = filt_pipedump, + .f_copy = knote_triv_copy, }; static const struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite, .f_userdump = filt_pipedump, + .f_copy = knote_triv_copy, }; /* @@ -567,7 +570,7 @@ pipespace_new(struct pipe *cpipe, int size) static int curfail = 0; static struct timeval lastfail; - KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); + PIPE_LOCK_ASSERT(cpipe, MA_NOTOWNED); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: @@ -1679,8 +1682,7 @@ static void pipe_free_kmem(struct pipe *cpipe) { - KASSERT(!mtx_owned(PIPE_MTX(cpipe)), - ("pipe_free_kmem: pipe mutex locked")); + PIPE_LOCK_ASSERT(cpipe, MA_NOTOWNED); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c index acaf1241cb2e..c5db21544b0f 100644 --- a/sys/kern/sys_procdesc.c +++ b/sys/kern/sys_procdesc.c @@ -486,6 +486,7 @@ static const struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index c221106ae067..bc0725230cca 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -586,7 +586,7 @@ soaio_enqueue(struct task *task) } static void -soaio_init(void) +soaio_init(void *dummy __unused) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; diff --git a/sys/kern/tty.c b/sys/kern/tty.c index c8e2c561b7cf..067471eb949a 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -754,12 +754,14 @@ static const struct filterops tty_kqops_read = { .f_isfd = 1, .f_detach = tty_kqops_read_detach, .f_event = tty_kqops_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops tty_kqops_write = { .f_isfd = 1, .f_detach = tty_kqops_write_detach, 
.f_event = tty_kqops_write_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index 1291770a9ccb..2672935c2d89 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -491,11 +491,13 @@ static const struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index a8aec397b352..4c1bb1ff228e 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -281,11 +281,13 @@ static const struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, + .f_copy = knote_triv_copy, }; static const struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index fe2d8d056062..eb9544628137 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -191,16 +191,19 @@ static const struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, + .f_copy = knote_triv_copy, }; static const struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, + .f_copy = knote_triv_copy, }; static const struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, + .f_copy = knote_triv_copy, }; so_gen_t so_gencnt; /* generation count for sockets */ diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 340d84666459..807271488af2 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1069,6 +1069,21 @@ uipc_stream_sbspace(struct sockbuf *sb) return (min(space, mbspace)); } +/* + * UNIX version of generic sbwait() for writes. We wait on the peer's receive + * buffer, using our timeout. + */ +static int +uipc_stream_sbwait(struct socket *so, sbintime_t timeo) +{ + struct sockbuf *sb = &so->so_rcv; + + SOCK_RECVBUF_LOCK_ASSERT(so); + sb->sb_flags |= SB_WAIT; + return (msleep_sbt(&sb->sb_acc, SOCK_RECVBUF_MTX(so), PSOCK | PCATCH, + "sbwait", timeo, 0, 0)); +} + static int uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr, struct uio *uio0, struct mbuf *m, struct mbuf *c, int flags, @@ -1203,7 +1218,8 @@ restart: error = EWOULDBLOCK; goto out4; } - if ((error = sbwait(so2, SO_RCV)) != 0) { + if ((error = uipc_stream_sbwait(so2, + so->so_snd.sb_timeo)) != 0) { SOCK_RECVBUF_UNLOCK(so2); goto out4; } else @@ -1543,15 +1559,19 @@ restart: mc_init_m(&cmc, control); SOCK_RECVBUF_LOCK(so); - MPASS(!(sb->sb_state & SBS_CANTRCVMORE)); - - if (__predict_false(cmc.mc_len + sb->sb_ccc + - sb->sb_ctl > sb->sb_hiwat)) { + if (__predict_false( + (sb->sb_state & SBS_CANTRCVMORE) || + cmc.mc_len + sb->sb_ccc + sb->sb_ctl > + sb->sb_hiwat)) { /* - * Too bad, while unp_externalize() was - * failing, the other side had filled - * the buffer and we can't prepend data - * back. Losing data! + * While the lock was dropped and we + * were failing in unp_externalize(), + * the peer could have a) disconnected, + * or b) filled the buffer so that we + * can't prepend data back. + * These are two edge conditions that + * we just can't handle, so lose the + * data and return the error.
*/ SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); @@ -1835,11 +1855,13 @@ static const struct filterops uipc_write_filtops = { .f_isfd = 1, .f_detach = uipc_filt_sowdetach, .f_event = uipc_filt_sowrite, + .f_copy = knote_triv_copy, }; static const struct filterops uipc_empty_filtops = { .f_isfd = 1, .f_detach = uipc_filt_sowdetach, .f_event = uipc_filt_soempty, + .f_copy = knote_triv_copy, }; static int @@ -2397,7 +2419,7 @@ uipc_sendfile_wait(struct socket *so, off_t need, int *space) } if (!sockref) soref(so2); - error = sbwait(so2, SO_RCV); + error = uipc_stream_sbwait(so2, so->so_snd.sb_timeo); if (error == 0 && __predict_false(sb->sb_state & SBS_CANTRCVMORE)) error = EPIPE; diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index e63fa4c01434..60916a9fbd32 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -345,12 +345,14 @@ static const struct filterops aio_filtops = { .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, + .f_copy = knote_triv_copy, }; static const struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, - .f_event = filt_lio + .f_event = filt_lio, + .f_copy = knote_triv_copy, }; static eventhandler_tag exit_tag, exec_tag; diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c index b265a5ff3a62..e60d8426ee42 100644 --- a/sys/kern/vfs_inotify.c +++ b/sys/kern/vfs_inotify.c @@ -111,6 +111,7 @@ static const struct filterops inotify_rfiltops = { .f_isfd = 1, .f_detach = filt_inotifydetach, .f_event = filt_inotifyevent, + .f_copy = knote_triv_copy, }; static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures"); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 73e110c05bc1..58975f7ac932 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -6545,6 +6545,7 @@ const struct filterops fs_filtops = { .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent, + .f_copy = knote_triv_copy, }; static int @@ -6624,24 +6625,28 @@ static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static int filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin); +static int filt_vfscopy(struct knote *kn, struct proc *p1); static const struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread, .f_userdump = filt_vfsdump, + .f_copy = filt_vfscopy, }; static const struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite, .f_userdump = filt_vfsdump, + .f_copy = filt_vfscopy, }; static const struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode, .f_userdump = filt_vfsdump, + .f_copy = filt_vfscopy, }; static void @@ -6825,6 +6830,16 @@ filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin) return (0); } +static int +filt_vfscopy(struct knote *kn, struct proc *p1) +{ + struct vnode *vp; + + vp = (struct vnode *)kn->kn_hook; + vhold(vp); + return (0); +} + int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3d4567b6ab1e..a53df50c06bd 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -806,9 +806,12 @@ file_v_lock(struct file *fp, short lock_bit, short lock_wait_bit) flagsp = &fp->f_vflags; state = atomic_load_16(flagsp); - if ((state & lock_bit) == 0 && - atomic_cmpset_acq_16(flagsp, state, state | lock_bit)) - return; + for (;;) { + if ((state & 
lock_bit) != 0) + break; + if (atomic_fcmpset_acq_16(flagsp, &state, state | lock_bit)) + return; + } sleepq_lock(flagsp); state = atomic_load_16(flagsp); @@ -842,9 +845,12 @@ file_v_unlock(struct file *fp, short lock_bit, short lock_wait_bit) flagsp = &fp->f_vflags; state = atomic_load_16(flagsp); - if ((state & lock_wait_bit) == 0 && - atomic_cmpset_rel_16(flagsp, state, state & ~lock_bit)) - return; + for (;;) { + if ((state & lock_wait_bit) != 0) + break; + if (atomic_fcmpset_rel_16(flagsp, &state, state & ~lock_bit)) + return; + } sleepq_lock(flagsp); MPASS((*flagsp & lock_bit) != 0); @@ -864,10 +870,6 @@ foffset_lock(struct file *fp, int flags) FILE_V_FOFFSET_LOCK_WAITING); } - /* - * According to McKusick the vn lock was protecting f_offset here. - * It is now protected by the FOFFSET_LOCKED flag. - */ return (atomic_load_long(&fp->f_offset)); } diff --git a/sys/libkern/arc4random.c b/sys/libkern/arc4random.c index 016822e9f03c..6fca7c3c4e9d 100644 --- a/sys/libkern/arc4random.c +++ b/sys/libkern/arc4random.c @@ -156,7 +156,7 @@ chacha20_randomstir(struct chacha20_s *chacha20) * Initialize the contexts. */ static void -chacha20_init(void) +chacha20_init(void *dummy __unused) { struct chacha20_s *chacha20; @@ -176,7 +176,7 @@ SYSINIT(chacha20, SI_SUB_LOCK, SI_ORDER_ANY, chacha20_init, NULL); static void -chacha20_uninit(void) +chacha20_uninit(void *dummy __unused) { struct chacha20_s *chacha20; diff --git a/sys/libkern/x86/crc32_sse42.c b/sys/libkern/x86/crc32_sse42.c index b79c7afbeeb1..94ffdc178910 100644 --- a/sys/libkern/x86/crc32_sse42.c +++ b/sys/libkern/x86/crc32_sse42.c @@ -199,8 +199,10 @@ crc32c_shift(uint32_t zeros[][256], uint32_t crc) static void #ifndef _KERNEL __attribute__((__constructor__)) -#endif crc32c_init_hw(void) +#else +crc32c_init_hw(void *dummy __unused) +#endif { crc32c_zeros(crc32c_long, LONG); crc32c_zeros(crc32c_2long, 2 * LONG); diff --git a/sys/modules/Makefile b/sys/modules/Makefile index feb9778c23da..63a0b3260e6d 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -577,6 +577,7 @@ _mlx5ib= mlx5ib ${MACHINE_CPUARCH} == "i386" _ena= ena _gve= gve +_igc= igc # gcc13 and earlier lack __builtin_bitcountg used by linux emulation .if !(${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} < 140000) _iwlwifi= iwlwifi @@ -747,7 +748,6 @@ _et= et _ftgpio= ftgpio _ftwd= ftwd _exca= exca -_igc= igc _io= io _itwd= itwd _ix= ix diff --git a/sys/modules/aic7xxx/ahc/Makefile b/sys/modules/aic7xxx/ahc/Makefile index 3741d4fb666f..6f9bdcb1d8bd 100644 --- a/sys/modules/aic7xxx/ahc/Makefile +++ b/sys/modules/aic7xxx/ahc/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/dev/aic7xxx KMOD= ahc SUBDIR+= ahc_isa ahc_pci diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile index 2989ad580b97..7ebdc1d51945 100644 --- a/sys/modules/cxgb/Makefile +++ b/sys/modules/cxgb/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR= cxgb SUBDIR+= cxgb_t3fw diff --git a/sys/modules/dpdk_lpm4/Makefile b/sys/modules/dpdk_lpm4/Makefile index ff68fac78915..9bc2693aeffb 100644 --- a/sys/modules/dpdk_lpm4/Makefile +++ b/sys/modules/dpdk_lpm4/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/contrib/dpdk_rte_lpm KMOD= dpdk_lpm4 diff --git a/sys/modules/dpdk_lpm6/Makefile b/sys/modules/dpdk_lpm6/Makefile index f2248e5d1c1c..9de2c6650422 100644 --- a/sys/modules/dpdk_lpm6/Makefile +++ b/sys/modules/dpdk_lpm6/Makefile @@ 
-1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/contrib/dpdk_rte_lpm KMOD= dpdk_lpm6 diff --git a/sys/modules/fib_dxr/Makefile b/sys/modules/fib_dxr/Makefile index 7d1996ba510f..f8a28abe957a 100644 --- a/sys/modules/fib_dxr/Makefile +++ b/sys/modules/fib_dxr/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/netinet KMOD= fib_dxr diff --git a/sys/modules/if_enc/Makefile b/sys/modules/if_enc/Makefile index 449d869d6a21..bd865a0216a4 100644 --- a/sys/modules/if_enc/Makefile +++ b/sys/modules/if_enc/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/net KMOD= if_enc diff --git a/sys/modules/if_gif/Makefile b/sys/modules/if_gif/Makefile index efcd6952a8ac..5e3fda3a51c6 100644 --- a/sys/modules/if_gif/Makefile +++ b/sys/modules/if_gif/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/net ${SYSDIR}/netinet ${SYSDIR}/netinet6 KMOD= if_gif diff --git a/sys/modules/if_gre/Makefile b/sys/modules/if_gre/Makefile index 9f50708a14d7..58bd03c23785 100644 --- a/sys/modules/if_gre/Makefile +++ b/sys/modules/if_gre/Makefile @@ -1,6 +1,5 @@ SYSDIR?=${SRCTOP}/sys .PATH: ${SYSDIR}/net ${SYSDIR}/netinet ${SYSDIR}/netinet6 -.include "${SYSDIR}/conf/kern.opts.mk" KMOD= if_gre SRCS= if_gre.c opt_inet.h opt_inet6.h opt_rss.h diff --git a/sys/modules/iser/Makefile b/sys/modules/iser/Makefile index 615199ec97a3..ff08ae6f346a 100644 --- a/sys/modules/iser/Makefile +++ b/sys/modules/iser/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/dev/iser/ KMOD= iser diff --git a/sys/modules/ktest/Makefile b/sys/modules/ktest/Makefile index 151db53417df..d5f15576f38b 100644 --- a/sys/modules/ktest/Makefile +++ b/sys/modules/ktest/Makefile @@ -1,8 +1,6 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR= ktest \ ktest_example \ - ktest_netlink_message_writer + ktest_netlink_message_writer \ + ktest_tcphpts .include <bsd.subdir.mk> diff --git a/sys/modules/ktest/ktest/Makefile b/sys/modules/ktest/ktest/Makefile index 3d4f1a8c2cc0..9741662ef709 100644 --- a/sys/modules/ktest/ktest/Makefile +++ b/sys/modules/ktest/ktest/Makefile @@ -1,9 +1,5 @@ PACKAGE= tests - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - -.PATH: ${SYSDIR}/tests +.PATH: ${SRCTOP}/sys/tests KMOD= ktest SRCS= ktest.c diff --git a/sys/modules/ktest/ktest_example/Makefile b/sys/modules/ktest/ktest_example/Makefile index 2b572d867aa5..aacc8f0e4ca5 100644 --- a/sys/modules/ktest/ktest_example/Makefile +++ b/sys/modules/ktest/ktest_example/Makefile @@ -1,9 +1,8 @@ PACKAGE= tests -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" -.PATH: ${SYSDIR}/tests +.PATH: ${SRCTOP}/sys/tests KMOD= ktest_example SRCS= ktest_example.c diff --git a/sys/modules/ktest/ktest_netlink_message_writer/Makefile b/sys/modules/ktest/ktest_netlink_message_writer/Makefile index a91c45755d0d..3f05f9b26785 100644 --- a/sys/modules/ktest/ktest_netlink_message_writer/Makefile +++ b/sys/modules/ktest/ktest_netlink_message_writer/Makefile @@ -1,8 +1,6 @@ PACKAGE= tests SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/netlink KMOD= ktest_netlink_message_writer diff --git a/sys/modules/ktest/ktest_tcphpts/Makefile b/sys/modules/ktest/ktest_tcphpts/Makefile new file mode 100644 index 000000000000..b642c0cb4209 --- 
/dev/null +++ b/sys/modules/ktest/ktest_tcphpts/Makefile @@ -0,0 +1,13 @@ +PACKAGE= tests +WARNS?= 6 + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/netinet + +KMOD= ktest_tcphpts +SRCS= tcp_hpts_test.c + +.include <bsd.kmod.mk> + diff --git a/sys/modules/miiproxy/Makefile b/sys/modules/miiproxy/Makefile index 730bef4220cd..ab92ebe71b43 100644 --- a/sys/modules/miiproxy/Makefile +++ b/sys/modules/miiproxy/Makefile @@ -3,7 +3,7 @@ KMOD = miiproxy SRCS= miiproxy.c -SRCS+= bus_if.h mdio_if.h miibus_if.h opt_platform.h +SRCS+= bus_if.h device_if.h mdio_if.h miibus_if.h opt_platform.h CFLAGS+= -I${SRCTOP}/sys/dev/etherswitch .include <bsd.kmod.mk> diff --git a/sys/modules/netgraph/Makefile b/sys/modules/netgraph/Makefile index 94560d5c51d7..b2d65af16e7f 100644 --- a/sys/modules/netgraph/Makefile +++ b/sys/modules/netgraph/Makefile @@ -1,5 +1,3 @@ -# $Whistle: Makefile,v 1.5 1999/01/24 06:48:37 archie Exp $ - SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" diff --git a/sys/modules/netgraph/checksum/Makefile b/sys/modules/netgraph/checksum/Makefile index 4e2b1f547a40..bbbc7363d045 100644 --- a/sys/modules/netgraph/checksum/Makefile +++ b/sys/modules/netgraph/checksum/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - KMOD= ng_checksum SRCS= ng_checksum.c opt_inet.h opt_inet6.h diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index 17b52aec1893..8c114ac51538 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -2,9 +2,6 @@ # Compile netmap as a module, useful if you want a netmap bridge # or loadable drivers. -.include <bsd.own.mk> # FreeBSD 10 and earlier -# .include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${.CURDIR}/../../dev/netmap .PATH.h: ${.CURDIR}/../../net CFLAGS += -I${.CURDIR}/../../ -D INET -D VIMAGE diff --git a/sys/modules/opensolaris/Makefile b/sys/modules/opensolaris/Makefile index 98f52057e45e..7e2d5f9101ad 100644 --- a/sys/modules/opensolaris/Makefile +++ b/sys/modules/opensolaris/Makefile @@ -1,4 +1,4 @@ -SYSDIR?= ${SRCTOP}/sys +SYSDIR?=${SRCTOP}/sys .PATH: ${SYSDIR}/cddl/compat/opensolaris/kern .PATH: ${SYSDIR}/contrib/openzfs/module/os/freebsd/spl diff --git a/sys/modules/ow/Makefile b/sys/modules/ow/Makefile index 76fefe3e63be..7aa9d2de8183 100644 --- a/sys/modules/ow/Makefile +++ b/sys/modules/ow/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR = ow owc ow_temp .include <bsd.subdir.mk> diff --git a/sys/modules/qlnx/Makefile b/sys/modules/qlnx/Makefile index 2121f9d586a6..291b681c809e 100644 --- a/sys/modules/qlnx/Makefile +++ b/sys/modules/qlnx/Makefile @@ -31,9 +31,6 @@ # # -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR=qlnxe SUBDIR+=qlnxev SUBDIR+=qlnxr diff --git a/sys/modules/rtwn/Makefile b/sys/modules/rtwn/Makefile index 9afdd2084ecb..f15cbbe8236b 100644 --- a/sys/modules/rtwn/Makefile +++ b/sys/modules/rtwn/Makefile @@ -1,7 +1,5 @@ .PATH: ${SRCTOP}/sys/dev/rtwn - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" KMOD = rtwn SRCS = if_rtwn.c if_rtwn_tx.c if_rtwn_rx.c if_rtwn_beacon.c \ diff --git a/sys/modules/rtwn_pci/Makefile b/sys/modules/rtwn_pci/Makefile index ce2144121e88..3fea80d7d256 100644 --- a/sys/modules/rtwn_pci/Makefile +++ b/sys/modules/rtwn_pci/Makefile @@ -1,7 +1,5 @@ .PATH: ${SRCTOP}/sys/dev/rtwn/pci - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include 
"${SRCTOP}/sys/conf/kern.opts.mk" KMOD = if_rtwn_pci SRCS = rtwn_pci_attach.c rtwn_pci_reg.c rtwn_pci_rx.c rtwn_pci_tx.c \ diff --git a/sys/modules/rtwn_usb/Makefile b/sys/modules/rtwn_usb/Makefile index 16899b8a8c49..6a73276d088c 100644 --- a/sys/modules/rtwn_usb/Makefile +++ b/sys/modules/rtwn_usb/Makefile @@ -1,7 +1,5 @@ .PATH: ${SRCTOP}/sys/dev/rtwn/usb - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" KMOD = if_rtwn_usb SRCS = rtwn_usb_attach.c rtwn_usb_ep.c rtwn_usb_reg.c rtwn_usb_rx.c \ diff --git a/sys/modules/sound/driver/Makefile b/sys/modules/sound/driver/Makefile index ff9499fdf841..02703d4b591a 100644 --- a/sys/modules/sound/driver/Makefile +++ b/sys/modules/sound/driver/Makefile @@ -1,5 +1,4 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" # Modules that include binary-only blobs of microcode should be selectable by # MK_SOURCELESS_UCODE option (see below). diff --git a/sys/modules/sound/sound/Makefile b/sys/modules/sound/sound/Makefile index f3978e9bd9cc..169b1a2730ec 100644 --- a/sys/modules/sound/sound/Makefile +++ b/sys/modules/sound/sound/Makefile @@ -1,5 +1,4 @@ SYSDIR?=${SRCTOP}/sys - .PATH: ${SYSDIR}/dev/sound .PATH: ${SYSDIR}/dev/sound/pcm .PATH: ${SYSDIR}/dev/sound/midi diff --git a/sys/modules/tests/fib_lookup/Makefile b/sys/modules/tests/fib_lookup/Makefile index 7d6198396911..b78d4309f145 100644 --- a/sys/modules/tests/fib_lookup/Makefile +++ b/sys/modules/tests/fib_lookup/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/tests/fib_lookup KMOD= test_lookup diff --git a/sys/modules/vnic/Makefile b/sys/modules/vnic/Makefile index 7b975bfebe81..53e208328159 100644 --- a/sys/modules/vnic/Makefile +++ b/sys/modules/vnic/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - CFLAGS+= -DFDT SUBDIR = mrmlbus thunder_mdio thunder_bgx vnicpf vnicvf diff --git a/sys/modules/vnic/mrmlbus/Makefile b/sys/modules/vnic/mrmlbus/Makefile index a3581b7a79a5..a8fe9e5474e1 100644 --- a/sys/modules/vnic/mrmlbus/Makefile +++ b/sys/modules/vnic/mrmlbus/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/thunder_bgx/Makefile b/sys/modules/vnic/thunder_bgx/Makefile index 90df4b25df90..bf46c3194493 100644 --- a/sys/modules/vnic/thunder_bgx/Makefile +++ b/sys/modules/vnic/thunder_bgx/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/thunder_mdio/Makefile b/sys/modules/vnic/thunder_mdio/Makefile index 37032516f3ca..07cc583bfaf8 100644 --- a/sys/modules/vnic/thunder_mdio/Makefile +++ b/sys/modules/vnic/thunder_mdio/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/vnicpf/Makefile b/sys/modules/vnic/vnicpf/Makefile index 37cd29e6fdd8..3cd64d08a788 100644 --- a/sys/modules/vnic/vnicpf/Makefile +++ b/sys/modules/vnic/vnicpf/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/vnicvf/Makefile b/sys/modules/vnic/vnicvf/Makefile index c6ffaaa2c302..da938b7fd073 100644 --- a/sys/modules/vnic/vnicvf/Makefile +++ b/sys/modules/vnic/vnicvf/Makefile @@ -1,6 +1,3 @@ 
-SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index c595030ed4a0..db1b6f33a8ef 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -843,7 +843,7 @@ /* #undef ZFS_DEVICE_MINOR */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.4.99-72-FreeBSD_gb2196fbed" +#define ZFS_META_ALIAS "zfs-2.4.99-95-FreeBSD_g5605a6d79" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -852,7 +852,7 @@ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "6.16" +#define ZFS_META_KVER_MAX "6.17" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "4.18" @@ -873,7 +873,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "72-FreeBSD_gb2196fbed" +#define ZFS_META_RELEASE "95-FreeBSD_g5605a6d79" /* Define the project version. */ #define ZFS_META_VERSION "2.4.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 9eae1e8573c0..8a1802f5480b 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.4.99-72-gb2196fbed" +#define ZFS_META_GITREV "zfs-2.4.99-95-g5605a6d79" diff --git a/sys/net/bpf.c b/sys/net/bpf.c index a347dbe2eb73..f598733773d0 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -253,12 +253,14 @@ static const struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, + .f_copy = knote_triv_copy, }; static const struct filterops bpfwrite_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfwrite, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index c8dbb6aa8893..56bb90cce9bc 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -261,6 +261,7 @@ static const struct filterops tun_read_filterops = { .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, + .f_copy = knote_triv_copy, }; static const struct filterops tun_write_filterops = { @@ -268,6 +269,7 @@ static const struct filterops tun_write_filterops = { .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, + .f_copy = knote_triv_copy, }; static struct tuntap_driver { diff --git a/sys/net/route.c b/sys/net/route.c index 7a50bcc43e06..d2c9f3e39c17 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -89,7 +89,7 @@ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, * SI_ORDER_MIDDLE. 
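The filterops hunks throughout this change gain an .f_copy callback; most filters have no per-knote resources to duplicate and use knote_triv_copy, while filters that do (like the vnode filters earlier, whose filt_vfscopy takes an extra vhold) supply their own. A sketch of the non-trivial case, with hypothetical detach/event callbacks:

static int
filt_example_copy(struct knote *kn, struct proc *p1 __unused)
{
	struct vnode *vp;

	/* The duplicated knote must hold its own reference to the vnode. */
	vp = (struct vnode *)kn->kn_hook;
	vhold(vp);
	return (0);
}

static const struct filterops example_filtops = {
	.f_isfd = 1,
	.f_detach = filt_exampledetach,	/* hypothetical */
	.f_event = filt_exampleevent,	/* hypothetical */
	.f_copy = filt_example_copy,
};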
*/ static void -route_init(void) +route_init(void *dummy __unused) { nhops_init(); diff --git a/sys/net/route/route_tables.c b/sys/net/route/route_tables.c index 176ca43fa1c5..3b7bb1385d0e 100644 --- a/sys/net/route/route_tables.c +++ b/sys/net/route/route_tables.c @@ -186,7 +186,7 @@ rtables_prison_destructor(void *data) } static void -rtables_init(void) +rtables_init(void *dummy __unused) { osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_ATTACH] = rtables_check_proc_fib, diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index f0dcc973ca7c..be858428bb3e 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -309,7 +309,7 @@ rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc) } static void -rtsock_init(void) +rtsock_init(void *dummy __unused) { rtsbridge_orig_p = rtsock_callback_p; rtsock_callback_p = &rtsbridge; diff --git a/sys/net80211/ieee80211.c b/sys/net80211/ieee80211.c index 2b7cf635b9f5..1299f86ebdc7 100644 --- a/sys/net80211/ieee80211.c +++ b/sys/net80211/ieee80211.c @@ -2689,13 +2689,18 @@ ieee80211_channel_type_char(const struct ieee80211_channel *c) return 'f'; } -/* - * Determine whether the given key in the given VAP is a global key. +/** + * @brief Determine whether the given key in the given VAP is a global key. + * * (key index 0..3, shared between all stations on a VAP.) * * This is either a WEP key or a GROUP key. * * Note this will NOT return true if it is a IGTK key. + * + * @param vap the current VAP + * @param key ieee80211_key to use/check + * @returns true if it's a global/WEP key, false otherwise */ bool ieee80211_is_key_global(const struct ieee80211vap *vap, @@ -2705,8 +2710,23 @@ ieee80211_is_key_global(const struct ieee80211vap *vap, key < &vap->iv_nw_keys[IEEE80211_WEP_NKID]); } -/* - * Determine whether the given key in the given VAP is a unicast key. +/** + * @brief Determine whether the given key in the given VAP is a unicast key. + * + * This only returns true if it's a unicast key. + * + * Note: For now net80211 only supports a single unicast key, stored in + * an ieee80211_node entry. + * + * Code should use this to know if it's a unicast key and then call + * ieee80211_crypto_get_keyid() to get the 802.11 key ID (0..3 for + * unicast/global keys, 4..5 for IGTK keys.) Since the unicast + * and global key indexes "overlap", callers will need to check + * both the type and id. + * + * @param vap the current VAP + * @param key ieee80211_key to use/check + * @returns true if the key is a unicast key, false if it is not */ bool ieee80211_is_key_unicast(const struct ieee80211vap *vap, diff --git a/sys/net80211/ieee80211_crypto.c b/sys/net80211/ieee80211_crypto.c index 1e63ca46f28f..566f0b2e0c23 100644 --- a/sys/net80211/ieee80211_crypto.c +++ b/sys/net80211/ieee80211_crypto.c @@ -611,11 +611,15 @@ ieee80211_crypto_setkey(struct ieee80211vap *vap, struct ieee80211_key *key) return dev_key_set(vap, key); } -/* - * Return index if the key is a WEP key (0..3); -1 otherwise. +/** + * @brief Return index if the key is a WEP key (0..3); -1 otherwise. * * This is different to "get_keyid" which defaults to returning * 0 for unicast keys; it assumes that it won't be used for WEP. + * + * @param vap the current VAP + * @param k ieee80211_key to check + * @returns 0..3 if it's a global/WEP key, -1 otherwise. */ int ieee80211_crypto_get_key_wepidx(const struct ieee80211vap *vap, @@ -628,8 +632,18 @@ ieee80211_crypto_get_key_wepidx(const struct ieee80211vap *vap, return (-1); } -/* - * Note: only supports a single unicast key (0). 
+/** + * @brief Return the index of a unicast, global or IGTK key. + * + * Return the index of a key. For unicast keys the index is 0..1. + * For global/WEP keys it's 0..3. For IGTK keys it's 4..5. + * + * TODO: support >1 unicast key + * TODO: support IGTK keys + * + * @param vap the current VAP + * @param k ieee80211_key to check + * @returns 0..3 for a WEP/global key, 0..1 for a unicast key, 4..5 for an IGTK key */ uint8_t ieee80211_crypto_get_keyid(struct ieee80211vap *vap, struct ieee80211_key *k) @@ -641,6 +655,19 @@ ieee80211_crypto_get_keyid(struct ieee80211vap *vap, struct ieee80211_key *k) return (0); } +/** + * @brief Return the key to use for encrypting an mbuf frame to a node + * + * This routine chooses a suitable key to encrypt the given frame with. + * It doesn't do the encryption; it only chooses the key. If a key is not + * available then the routine will return NULL. + * + * It's up to the caller to enforce whether a key is absolutely required or not. + * + * @param ni The ieee80211_node to send the frame to + * @param m the mbuf to encrypt + * @returns the ieee80211_key to encrypt with, or NULL if there's no suitable key + */ struct ieee80211_key * ieee80211_crypto_get_txkey(struct ieee80211_node *ni, struct mbuf *m) { @@ -676,8 +703,28 @@ ieee80211_crypto_get_txkey(struct ieee80211_node *ni, struct mbuf *m) return &ni->ni_ucastkey; } -/* - * Add privacy headers appropriate for the specified key. +/** + * @brief Privacy encapsulate and encrypt the given mbuf. + * + * This routine handles the mechanics of encryption - expanding the + * mbuf to add privacy headers, IV, ICV, MIC, MMIC, and then encrypts + * the given mbuf if required. + * + * This should be called by the driver in its TX path as part of + * encapsulation before passing frames to the hardware/firmware + * queues. + * + * Drivers/hardware which do their encryption entirely in offload + * should still call this for completeness - it indicates to the + * driver that the frame itself should be encrypted. + * + * The driver should have set capability bits in the attach / + * key allocation path to disable various encapsulation/encryption + * features. + * + * @param ni ieee80211_node for this frame + * @param m mbuf to modify + * @returns the key used if the frame is to be encrypted, NULL otherwise */ struct ieee80211_key * ieee80211_crypto_encap(struct ieee80211_node *ni, struct mbuf *m) @@ -693,9 +740,31 @@ ieee80211_crypto_encap(struct ieee80211_node *ni, struct mbuf *m) return NULL; } -/* - * Validate and strip privacy headers (and trailer) for a - * received frame that has the WEP/Privacy bit set. +/** + * @brief Decapsulate and validate an encrypted frame. + * + * This handles an encrypted frame (one with the privacy bit set). + * It also obeys the key / config / receive packet flags for how + * the driver says it's already been processed. + * + * Unlike ieee80211_crypto_encap(), this isn't called in the driver. + * Instead, drivers pass in the potentially decrypted frame - fully, + * partially, or not at all - and net80211 will call this as appropriate. + * + * This handles NICs (like ath(4)) which have a variable size between + * the 802.11 header and 802.11 payload due to DMA alignment / encryption + * engine concerns. + * + * If the frame was decrypted and validated successfully then 1 is returned + * and the mbuf can be treated as an 802.11 frame. If it is not decrypted + * successfully or it was decrypted but failed validation/checks, then + * 0 is returned. 
+ * + * @param ni ieee80211_node for received frame + * @param m mbuf frame to receive + * @param hdrlen length of the 802.11 header, including trailing null bytes + * @param key pointer to ieee80211_key that will be set if appropriate + * @returns 0 if the frame wasn't decrypted/validated, 1 if decrypted/validated. */ int ieee80211_crypto_decap(struct ieee80211_node *ni, struct mbuf *m, int hdrlen, diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c index 3af56a228295..a8a767785fce 100644 --- a/sys/net80211/ieee80211_ht.c +++ b/sys/net80211/ieee80211_ht.c @@ -167,7 +167,7 @@ static ieee80211_send_action_func ht_send_action_ba_delba; static ieee80211_send_action_func ht_send_action_ht_txchwidth; static void -ieee80211_ht_init(void) +ieee80211_ht_init(void *dummy __unused) { /* * Setup HT parameters that depends on the clock frequency. diff --git a/sys/net80211/ieee80211_hwmp.c b/sys/net80211/ieee80211_hwmp.c index b69210768c54..084e67da13db 100644 --- a/sys/net80211/ieee80211_hwmp.c +++ b/sys/net80211/ieee80211_hwmp.c @@ -212,7 +212,7 @@ SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, inact, "mesh route inactivity timeout (ms)"); static void -ieee80211_hwmp_init(void) +ieee80211_hwmp_init(void *dummy __unused) { /* Default values as per amendment */ ieee80211_hwmp_pathtimeout = msecs_to_ticks(5*1000); diff --git a/sys/net80211/ieee80211_mesh.c b/sys/net80211/ieee80211_mesh.c index 3f0410a69e3c..7f2e8bdcb963 100644 --- a/sys/net80211/ieee80211_mesh.c +++ b/sys/net80211/ieee80211_mesh.c @@ -548,7 +548,7 @@ mesh_gatemode_cb(void *arg) } static void -ieee80211_mesh_init(void) +ieee80211_mesh_init(void *dummy __unused) { memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths)); diff --git a/sys/net80211/ieee80211_phy.c b/sys/net80211/ieee80211_phy.c index 7f53c717152b..b4d9b16907d2 100644 --- a/sys/net80211/ieee80211_phy.c +++ b/sys/net80211/ieee80211_phy.c @@ -348,7 +348,7 @@ ieee80211_setup_ratetable(struct ieee80211_rate_table *rt) /* Setup all rate tables */ static void -ieee80211_phy_init(void) +ieee80211_phy_init(void *dummy __unused) { static struct ieee80211_rate_table * const ratetables[] = { &ieee80211_half_table, diff --git a/sys/net80211/ieee80211_proto.c b/sys/net80211/ieee80211_proto.c index 0c161d98a55a..4918bf7d025f 100644 --- a/sys/net80211/ieee80211_proto.c +++ b/sys/net80211/ieee80211_proto.c @@ -459,7 +459,7 @@ static const struct ieee80211_authenticator auth_internal = { * Setup internal authenticators once; they are never unregistered. 
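A sketch of the TX-path usage the ieee80211_crypto_encap() comment above describes, assuming the frame's PROTECTED bit was set during encapsulation; driver_tx_enqueue() is a hypothetical driver hook:

static int
example_driver_transmit(struct ieee80211_node *ni, struct mbuf *m)
{
	struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
	struct ieee80211_key *k = NULL;

	if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) {
		k = ieee80211_crypto_encap(ni, m);
		if (k == NULL) {
			/* No suitable key; must not transmit in the clear. */
			m_freem(m);
			return (ENOBUFS);
		}
	}
	return (driver_tx_enqueue(ni, m, k));	/* hypothetical */
}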
*/ static void -ieee80211_auth_setup(void) +ieee80211_auth_setup(void *dummy __unused) { ieee80211_authenticator_register(IEEE80211_AUTH_OPEN, &auth_internal); ieee80211_authenticator_register(IEEE80211_AUTH_SHARED, &auth_internal); diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c index 10a5fc7f08ab..095c4108c768 100644 --- a/sys/net80211/ieee80211_vht.c +++ b/sys/net80211/ieee80211_vht.c @@ -102,7 +102,7 @@ vht_send_action_placeholder(struct ieee80211_node *ni, } static void -ieee80211_vht_init(void) +ieee80211_vht_init(void *dummy __unused) { ieee80211_recv_action_register(IEEE80211_ACTION_CAT_VHT, diff --git a/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c b/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c index 0181a67ac604..f35712cc8f69 100644 --- a/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c +++ b/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c @@ -81,9 +81,6 @@ const STRUCT_USB_HOST_ID ubt_rtl_devs[] = { USB_VPI(0x0bda, 0xb00c, 0) }, { USB_VPI(0x0bda, 0xc822, 0) }, - /* Realtek 8822CU Bluetooth devices */ - { USB_VPI(0x13d3, 0x3549, 0) }, - /* Realtek 8851BE Bluetooth devices */ { USB_VPI(0x13d3, 0x3600, 0) }, diff --git a/sys/netgraph/netflow/netflow.c b/sys/netgraph/netflow/netflow.c index 978d6fd0b54d..05c6062463be 100644 --- a/sys/netgraph/netflow/netflow.c +++ b/sys/netgraph/netflow/netflow.c @@ -960,7 +960,7 @@ struct ngnf_show_header *resp) list_id = 0; TAILQ_FOREACH(fle, &hsh->head, fle_hash) { - if (hsh->mtx.mtx_lock & MTX_CONTESTED) { + if (hsh->mtx.mtx_lock & MTX_WAITERS) { resp->hash_id = i; resp->list_id = list_id; mtx_unlock(&hsh->mtx); @@ -1111,7 +1111,7 @@ ng_netflow_expire(void *arg) * Interrupt thread wants this entry! * Quick! Quick! Bail out! */ - if (hsh->mtx.mtx_lock & MTX_CONTESTED) + if (hsh->mtx.mtx_lock & MTX_WAITERS) break; /* @@ -1150,7 +1150,7 @@ ng_netflow_expire(void *arg) * Interrupt thread wants this entry! * Quick! Quick! Bail out! */ - if (hsh->mtx.mtx_lock & MTX_CONTESTED) + if (hsh->mtx.mtx_lock & MTX_WAITERS) break; /* diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index c20a20cd983d..bc06616dbf93 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -271,7 +271,7 @@ cc_check_default(struct cc_algo *remove_cc) * Initialise CC subsystem on system boot. 
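The init-function signature changes running through these hunks (chacha20_init earlier, route_init, rtsock_init, the ieee80211 ones, cc_init just below) share one cause: SYSINIT dispatches through sysinit_cfunc_t, i.e. void (*)(void *), so handlers now declare the unused registration argument instead of being invoked through a function-pointer cast. A minimal sketch:

static void
example_init(void *dummy __unused)
{
	/* one-time subsystem initialization */
}
SYSINIT(example, SI_SUB_LOCK, SI_ORDER_ANY, example_init, NULL);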
*/ static void -cc_init(void) +cc_init(void *dummy __unused) { CC_LIST_LOCK_INIT(); STAILQ_INIT(&cc_list); diff --git a/sys/netinet/in_fib_algo.c b/sys/netinet/in_fib_algo.c index 123dacb409e7..95621c300064 100644 --- a/sys/netinet/in_fib_algo.c +++ b/sys/netinet/in_fib_algo.c @@ -767,7 +767,7 @@ struct fib_lookup_module flm_radix4 = { }; static void -fib4_algo_init(void) +fib4_algo_init(void *dummy __unused) { fib_module_register(&flm_bsearch4); diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c index f5b20c49ffd2..ba112afbf002 100644 --- a/sys/netinet/in_mcast.c +++ b/sys/netinet/in_mcast.c @@ -159,9 +159,6 @@ static struct ip_moptions * static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); -static struct ifnet * - inp_lookup_mcast_ifp(const struct inpcb *, - const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); @@ -1832,69 +1829,55 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) } /* - * Look up the ifnet to use for a multicast group membership, - * given the IPv4 address of an interface, and the IPv4 group address. - * - * This routine exists to support legacy multicast applications - * which do not understand that multicast memberships are scoped to - * specific physical links in the networking stack, or which need - * to join link-scope groups before IPv4 addresses are configured. - * - * Use this socket's current FIB number for any required FIB lookup. - * If ina is INADDR_ANY, look up the group address in the unicast FIB, - * and use its ifp; usually, this points to the default next-hop. - * - * If the FIB lookup fails, attempt to use the first non-loopback - * interface with multicast capability in the system as a - * last resort. The legacy IPv4 ASM API requires that we do - * this in order to allow groups to be joined when the routing - * table has not yet been populated during boot. - * - * Returns NULL if no ifp could be found, otherwise return referenced ifp. + * Look up the ifnet to join a multicast group membership via legacy + * IP_ADD_MEMBERSHIP or via more modern MCAST_JOIN_GROUP. * - * FUTURE: Implement IPv4 source-address selection. + * If the interface index was specified explicitly, just use it. If the + * address was specified (legacy), try to find matching interface. Else + * (index == 0 && no address) do a route lookup. If that fails for a modern + * MCAST_JOIN_GROUP return failure, for legacy IP_ADD_MEMBERSHIP find first + * multicast capable interface. 
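A condensed sketch of the lookup order the rewritten comment above describes; the two helpers are hypothetical stand-ins for the INADDR_TO_IFP and fib4_lookup() paths in the real body that follows:

static struct ifnet *
example_mcast_ifp(struct inpcb *inp, struct in_addr group,
    const struct in_addr *ina, u_int index)
{
	if (index != 0)				/* explicit ifindex wins */
		return (ifnet_byindex_ref(index));
	if (ina != NULL && !in_nullhost(*ina))	/* legacy: by address */
		return (example_ifp_by_addr(*ina));	/* hypothetical */
	/*
	 * Otherwise a unicast route lookup on the group address; only
	 * the legacy API then falls back to the first multicast-capable,
	 * non-loopback interface.
	 */
	return (example_ifp_by_route(inp, group));	/* hypothetical */
}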
*/ static struct ifnet * -inp_lookup_mcast_ifp(const struct inpcb *inp, - const struct sockaddr_in *gsin, const struct in_addr ina) +inp_lookup_mcast_ifp(const struct inpcb *inp, const struct in_addr maddr, +const struct in_addr *ina, const u_int index) { struct ifnet *ifp; struct nhop_object *nh; NET_EPOCH_ASSERT(); - KASSERT(inp != NULL, ("%s: inp must not be NULL", __func__)); - KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); - KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), - ("%s: not multicast", __func__)); - ifp = NULL; - if (!in_nullhost(ina)) { - INADDR_TO_IFP(ina, ifp); + if (index != 0) + return (ifnet_byindex_ref(index)); + + if (ina != NULL && !in_nullhost(*ina)) { + INADDR_TO_IFP(*ina, ifp); if (ifp != NULL) if_ref(ifp); - } else { - nh = fib4_lookup(inp->inp_inc.inc_fibnum, gsin->sin_addr, 0, NHR_NONE, 0); - if (nh != NULL) { - ifp = nh->nh_ifp; - if_ref(ifp); - } else { - struct in_ifaddr *ia; - struct ifnet *mifp; - - mifp = NULL; - CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - mifp = ia->ia_ifp; - if (!(mifp->if_flags & IFF_LOOPBACK) && - (mifp->if_flags & IFF_MULTICAST)) { - ifp = mifp; - if_ref(ifp); - break; - } + return (ifp); + } + + nh = fib4_lookup(inp->inp_inc.inc_fibnum, maddr, 0, NHR_NONE, 0); + if (nh != NULL) { + ifp = nh->nh_ifp; + if_ref(ifp); + return (ifp); + } + + if (ina != NULL) { + struct in_ifaddr *ia; + + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if (!(ia->ia_ifp->if_flags & IFF_LOOPBACK) && + (ia->ia_ifp->if_flags & IFF_MULTICAST)) { + ifp = ia->ia_ifp; + if_ref(ifp); + return (ifp); } } } - return (ifp); + return (NULL); } /* @@ -1926,13 +1909,13 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: { struct ip_mreqn mreqn; + bool mreq; - if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) - error = sooptcopyin(sopt, &mreqn, - sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); - else - error = sooptcopyin(sopt, &mreqn, - sizeof(struct ip_mreq), sizeof(struct ip_mreq)); + mreq = (sopt->sopt_valsize != sizeof(struct ip_mreqn)); + + error = sooptcopyin(sopt, &mreqn, + mreq ? sizeof(struct ip_mreq) : sizeof(struct ip_mreqn), + mreq ? sizeof(struct ip_mreq) : sizeof(struct ip_mreqn)); if (error) return (error); @@ -1943,12 +1926,9 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) return (EINVAL); NET_EPOCH_ENTER(et); - if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && - mreqn.imr_ifindex != 0) - ifp = ifnet_byindex_ref(mreqn.imr_ifindex); - else - ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, - mreqn.imr_address); + ifp = inp_lookup_mcast_ifp(inp, mreqn.imr_multiaddr, + mreq ? &mreqn.imr_address : NULL, + mreq ? 
0 : mreqn.imr_ifindex); NET_EPOCH_EXIT(et); break; } @@ -1971,8 +1951,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) ssa->sin.sin_addr = mreqs.imr_sourceaddr; NET_EPOCH_ENTER(et); - ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, - mreqs.imr_interface); + ifp = inp_lookup_mcast_ifp(inp, mreqs.imr_multiaddr, + &mreqs.imr_interface, 0); NET_EPOCH_EXIT(et); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); @@ -2013,7 +1993,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) return (EINVAL); NET_EPOCH_ENTER(et); - ifp = ifnet_byindex_ref(gsr.gsr_interface); + ifp = inp_lookup_mcast_ifp(inp, gsa->sin.sin_addr, NULL, + gsr.gsr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index c143d74a2f45..41f0a328daec 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -2181,7 +2181,7 @@ LibAliasInit(struct libalias *la) #undef malloc /* XXX: ugly */ la = malloc(sizeof *la, M_ALIAS, M_WAITOK | M_ZERO); #else - la = calloc(sizeof *la, 1); + la = calloc(1, sizeof *la); if (la == NULL) return (la); #endif diff --git a/sys/netinet/sctp_lock_bsd.h b/sys/netinet/sctp_lock_bsd.h index ec66be0cf371..a60983cb30e3 100644 --- a/sys/netinet/sctp_lock_bsd.h +++ b/sys/netinet/sctp_lock_bsd.h @@ -263,10 +263,10 @@ } while (0) #define SCTP_INP_LOCK_CONTENDED(_inp) \ - ((_inp)->inp_mtx.mtx_lock & MTX_CONTESTED) + ((_inp)->inp_mtx.mtx_lock & MTX_WAITERS) #define SCTP_INP_READ_CONTENDED(_inp) \ - ((_inp)->inp_rdata_mtx.mtx_lock & MTX_CONTESTED) + ((_inp)->inp_rdata_mtx.mtx_lock & MTX_WAITERS) #ifdef SCTP_LOCK_LOGGING #define SCTP_INP_RLOCK(_inp) do { \ @@ -337,7 +337,7 @@ } while (0) #define SCTP_ASOC_CREATE_LOCK_CONTENDED(_inp) \ - ((_inp)->inp_create_mtx.mtx_lock & MTX_CONTESTED) + ((_inp)->inp_create_mtx.mtx_lock & MTX_WAITERS) /* * For the majority of things (once we have found the association) we will diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 374b5595fcbc..5b89ca026e85 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -519,7 +519,7 @@ siftr_pkt_manager_thread(void *arg) if (log_buf != NULL) { alq_post_flags(siftr_alq, log_buf, 0); } - for (;cnt > 0; cnt--) { + for (; cnt > 0; cnt--) { pkt_node = STAILQ_FIRST(&tmp_pkt_queue); STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes); free(pkt_node, M_SIFTR_PKTNODE); diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 63bbe4bba11b..c54459bb5f01 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -39,15 +39,14 @@ * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules - * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The - * slot is the time from now that the stack wants to be called but it - * must be converted to tcp_hpts's notion of slot. This is done with - * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical + * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The + * usecs is the time from now that the stack wants to be called and is + * passing time directly in microseconds. 
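A sketch of the caller-side pattern the comment goes on to describe, assuming the usual tcp_in_hpts() helper for the re-entry check it recommends:

static int
example_tcp_output(struct tcpcb *tp)
{
	/* Near the top: bail if hpts already has us scheduled. */
	if (tcp_in_hpts(tp))
		return (0);
	/* ... build and send ... */
	tcp_hpts_insert(tp, 550, NULL);	/* re-arm ~550 microseconds out */
	return (0);
}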
So a typical * call from the tcp_output() routine might look like: * - * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); + * tcp_hpts_insert(tp, 550, NULL); * - * The above would schedule tcp_output() to be called in 550 useconds. + * The above would schedule tcp_output() to be called in 550 microseconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ack's). So it would add something like: @@ -149,27 +148,44 @@ #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #include <netinet/tcp_log_buf.h> #ifdef tcp_offload #include <netinet/tcp_offload.h> #endif -/* - * The hpts uses a 102400 wheel. The wheel - * defines the time in 10 usec increments (102400 x 10). - * This gives a range of 10usec - 1024ms to place - * an entry within. If the user requests more than - * 1.024 second, a remaineder is attached and the hpts - * when seeing the remainder will re-insert the - * inpcb forward in time from where it is until - * the remainder is zero. - */ +/* Global instance for TCP HPTS */ +struct tcp_hptsi *tcp_hptsi_pace; + +/* Default function table for production use. */ +const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = { + .microuptime = microuptime, + .swi_add = swi_add, + .swi_remove = swi_remove, + .swi_sched = swi_sched, + .intr_event_bind = intr_event_bind, + .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset, + .callout_init = callout_init, + .callout_reset_sbt_on = callout_reset_sbt_on, + ._callout_stop_safe = _callout_stop_safe, +}; -#define NUM_OF_HPTSI_SLOTS 102400 +#ifdef TCP_HPTS_KTEST +#define microuptime pace->funcs->microuptime +#define swi_add pace->funcs->swi_add +#define swi_remove pace->funcs->swi_remove +#define swi_sched pace->funcs->swi_sched +#define intr_event_bind pace->funcs->intr_event_bind +#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset +#define callout_init pace->funcs->callout_init +#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on +#define _callout_stop_safe pace->funcs->_callout_stop_safe +#endif -/* The number of connections after which the dynamic sleep logic kicks in. */ -#define DEFAULT_CONNECTION_THRESHOLD 100 +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); + +static void tcp_hpts_thread(void *ctx); /* * When using the hpts, a TCP stack must make sure @@ -204,87 +220,22 @@ * * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh * then we do a dynamic adjustment on the time we sleep. - * Our threshold is if the lateness of the first client served (in ticks) is + * Our threshold is if the lateness of the first client served (in slots) is * greater than or equal too slots_indicate_more_sleep (10ms - * or 10000 ticks). If we were that late, the actual sleep time - * is adjusted down by 50%. If the ticks_ran is less than - * slots_indicate_more_sleep (100 ticks or 1000usecs). + * or 10000 slots). If we were that late, the actual sleep time + * is adjusted down by 50%. If the slots_ran is less than + * slots_indicate_more_sleep (100 slots or 1000usecs). 
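A sketch of how the tcp_hptsi_funcs indirection introduced above is meant to be used under TCP_HPTS_KTEST: a test supplies its own table so time and SWI scheduling can be intercepted, then hands it to tcp_hptsi_create(). The stub names are hypothetical, and every member must be populated (unset members would be NULL):

static const struct tcp_hptsi_funcs example_test_funcs = {
	.microuptime = test_microuptime,		/* hypothetical stub */
	.swi_add = test_swi_add,			/* hypothetical stub */
	.swi_remove = test_swi_remove,			/* hypothetical stub */
	.swi_sched = test_swi_sched,			/* hypothetical stub */
	.intr_event_bind = test_intr_event_bind,
	.intr_event_bind_ithread_cpuset = test_intr_event_bind_cpuset,
	.callout_init = test_callout_init,
	.callout_reset_sbt_on = test_callout_reset_sbt_on,
	._callout_stop_safe = test_callout_stop_safe,
};

/* e.g.: pace = tcp_hptsi_create(&example_test_funcs, false); */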
* */ -/* Each hpts has its own p_mtx which is used for locking */ -#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) -#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) -#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx) -#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) -struct tcp_hpts_entry { - /* Cache line 0x00 */ - struct mtx p_mtx; /* Mutex for hpts */ - struct timeval p_mysleep; /* Our min sleep time */ - uint64_t syscall_cnt; - uint64_t sleeping; /* What the actual sleep was (if sleeping) */ - uint16_t p_hpts_active; /* Flag that says hpts is awake */ - uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ - uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ - uint32_t p_runningslot; /* Current tick we are at if we are running */ - uint32_t p_prev_slot; /* Previous slot we were on */ - uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ - uint32_t p_nxt_slot; /* The next slot outside the current range of - * slots that the hpts is running on. */ - int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t p_lasttick; /* Last tick before the current one */ - uint8_t p_direct_wake :1, /* boolean */ - p_on_min_sleep:1, /* boolean */ - p_hpts_wake_scheduled:1, /* boolean */ - hit_callout_thresh:1, - p_avail:4; - uint8_t p_fill[3]; /* Fill to 32 bits */ - /* Cache line 0x40 */ - struct hptsh { - TAILQ_HEAD(, tcpcb) head; - uint32_t count; - uint32_t gencnt; - } *p_hptss; /* Hptsi wheel */ - uint32_t p_hpts_sleep_time; /* Current sleep interval having a max - * of 255ms */ - uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ - uint32_t saved_lasttick; /* for logging */ - uint32_t saved_curtick; /* for logging */ - uint32_t saved_curslot; /* for logging */ - uint32_t saved_prev_slot; /* for logging */ - uint32_t p_delayed_by; /* How much were we delayed by */ - /* Cache line 0x80 */ - struct sysctl_ctx_list hpts_ctx; - struct sysctl_oid *hpts_root; - struct intr_event *ie; - void *ie_cookie; - uint16_t p_num; /* The hpts number one per cpu */ - uint16_t p_cpu; /* The hpts CPU */ - /* There is extra space in here */ - /* Cache line 0x100 */ - struct callout co __aligned(CACHE_LINE_SIZE); -} __aligned(CACHE_LINE_SIZE); - -static struct tcp_hptsi { - struct cpu_group **grps; - struct tcp_hpts_entry **rp_ent; /* Array of hptss */ - uint32_t *cts_last_ran; - uint32_t grp_cnt; - uint32_t rp_num_hptss; /* Number of hpts threads */ -} tcp_pace; - -static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -static int tcp_bind_threads = 1; +int tcp_bind_threads = 1; #else -static int tcp_bind_threads = 2; +int tcp_bind_threads = 2; #endif static int tcp_use_irq_cpu = 0; static int hpts_does_tp_logging = 0; - -static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); -static void tcp_hpts_thread(void *ctx); - +static int32_t tcp_hpts_precision = 120; int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD; static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; @@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TCP Hpts statistics"); -#define timersub(tvp, uvp, vvp) \ - do { \ - (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ - (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ - if ((vvp)->tv_usec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_usec += 1000000; \ - } \ - } while (0) - -static int32_t 
tcp_hpts_precision = 120; - -static struct hpts_domain_info { - int count; - int cpu[MAXCPU]; -} hpts_domains[MAXMEMDOM]; - counter_u64_t hpts_hopelessly_behind; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, @@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, &tcp_hpts_no_wake_over_thresh, 0, "When we are over the threshold on the pacer do we prohibit wakeups?"); -static uint16_t -hpts_random_cpu(void) +uint16_t +tcp_hptsi_random_cpu(struct tcp_hptsi *pace) { uint16_t cpuid; uint32_t ran; ran = arc4random(); - cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); + cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss); return (cpuid); } @@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, log.u_bbr.flex2 = hpts->p_cur_slot; log.u_bbr.flex3 = hpts->p_prev_slot; log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; log.u_bbr.flex6 = hpts->p_on_queue_cnt; log.u_bbr.flex7 = hpts->p_cpu; log.u_bbr.flex8 = (uint8_t)from_callout; log.u_bbr.inflight = slots_to_run; log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; log.u_bbr.timeStamp = tcp_tv_to_usec(tv); log.u_bbr.epoch = hpts->saved_curslot; log.u_bbr.lt_epoch = hpts->saved_prev_slot; @@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, } } +/* + * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI + * for the HPTS entry to run. + */ static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) +tcp_hpts_sleep_timeout(void *arg) { +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + struct tcp_hpts_entry *hpts; + + hpts = (struct tcp_hpts_entry *)arg; +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + swi_sched(hpts->ie_cookie, 0); +} + +/* + * Reset the HPTS callout timer with the provided timeval. Returns the results + * of the callout_reset_sbt_on() function. + */ +static int +tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv) +{ +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + sbintime_t sb; + +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + + /* Store off to make visible the actual sleep time */ + hpts->sleeping = tv->tv_usec; + + sb = tvtosbt(*tv); + return (callout_reset_sbt_on( + &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu, + (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)))); +} + +/* + * Schedules the SWI for the HTPS entry to run, if not already scheduled or + * running. 
+ */ +void +tcp_hpts_wake(struct tcp_hpts_entry *hpts) +{ +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + HPTS_MTX_ASSERT(hpts); +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { hpts->p_direct_wake = 0; return; @@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts) } static void -hpts_timeout_swi(void *arg) -{ - struct tcp_hpts_entry *hpts; - - hpts = (struct tcp_hpts_entry *)arg; - swi_sched(hpts->ie_cookie, 0); -} - -static void tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) { struct inpcb *inp = tptoinpcb(tp); @@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) } static struct tcp_hpts_entry * -tcp_hpts_lock(struct tcpcb *tp) +tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; INP_LOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; + hpts = pace->rp_ent[tp->t_hpts_cpu]; HPTS_LOCK(hpts); return (hpts); @@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp) * and has never received a first packet. */ void -tcp_hpts_init(struct tcpcb *tp) +__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp) { - if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) { - tp->t_hpts_cpu = hpts_random_cpu(); + tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace); MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET)); } } @@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp) * INP lock and then get the hpts lock. */ void -tcp_hpts_remove(struct tcpcb *tp) +__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; struct hptsh *hptsh; INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(tp); + hpts = tcp_hpts_lock(pace, tp); if (tp->t_in_hpts == IHPTS_ONQUEUE) { hptsh = &hpts->p_hptss[tp->t_hpts_slot]; tp->t_hpts_request = 0; @@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus) { /* * Given a slot on the wheel, what slot - * is that plus ticks out? + * is that plus slots out? */ - KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot)); + KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot)); return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS); } static inline int -tick_to_wheel(uint32_t cts_in_wticks) +cts_to_wheel(uint32_t cts) { /* - * Given a timestamp in ticks (so by - * default to get it to a real time one - * would multiply by 10.. i.e the number - * of ticks in a slot) map it to our limited - * space wheel. + * Given a timestamp in useconds map it to our limited space wheel. */ - return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); + return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS); } static inline int @@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * if ((hpts->p_hpts_active == 1) && (hpts->p_wheel_complete == 0)) { end_slot = hpts->p_runningslot; - /* Back up one tick */ + /* Back up one slot */ if (end_slot == 0) end_slot = NUM_OF_HPTSI_SLOTS - 1; else @@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * * not active, or we have * completed the pass over * the wheel, we can use the - * prev tick and subtract one from it. This puts us + * prev slot and subtract one from it. This puts us * as far out as possible on the wheel. 
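A worked example of the cts_to_wheel() mapping above, using the in-tree constants of 10us per slot and a 102400-slot wheel (about 1.024s of range):

static inline uint32_t
example_cts_to_wheel(uint32_t cts_usec)
{
	/* e.g. 12,345,678us / 10 = 1,234,567; 1,234,567 % 102400 = 5767 */
	return ((cts_usec / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);
}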
*/ end_slot = hpts->p_prev_slot; @@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * /* * Now we have close to the full wheel left minus the * time it has been since the pacer went to sleep. Note - * that wheel_tick, passed in, should be the current time + * that wheel_slot, passed in, should be the current time * from the perspective of the caller, mapped to the wheel. */ if (hpts->p_prev_slot != wheel_slot) @@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * #ifdef INVARIANTS static void check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, - uint32_t hptsslot, int line) + uint32_t hptsslot) { /* * Sanity checks for the pacer with invariants @@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, } #endif -uint32_t -tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) +void +__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, + struct hpts_diag *diag) { struct tcp_hpts_entry *hpts; struct timeval tv; - uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0; + uint32_t slot, wheel_cts, last_slot, need_new_to = 0; int32_t wheel_slot, maxslots; bool need_wakeup = false; @@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE)); /* + * Convert microseconds to slots for internal use. * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. */ - hpts = tcp_hpts_lock(tp); + slot = HPTS_USEC_TO_SLOTS(usecs); + hpts = tcp_hpts_lock(pace, tp); microuptime(&tv); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); @@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ diag->p_runningslot = hpts->p_runningslot; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; - diag->p_curtick = hpts->p_curtick; - diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; diag->p_on_min_sleep = hpts->p_on_min_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; @@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ * timeout is not 1. */ hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); + tcp_hpts_wake(hpts); } - slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); - return (slot_on); + return; } - /* Get the current time relative to the wheel */ - wheel_cts = tcp_tv_to_hpts_slot(&tv); - /* Map it onto the wheel */ - wheel_slot = tick_to_wheel(wheel_cts); + /* Get the current time stamp and map it onto the wheel */ + wheel_cts = tcp_tv_to_usec(&tv); + wheel_slot = cts_to_wheel(wheel_cts); /* Now what's the max we can place it at? 
*/ maxslots = max_slots_available(hpts, wheel_slot, &last_slot); if (diag) { @@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ tp->t_hpts_slot = last_slot; } if (diag) { - diag->slot_remaining = tp->t_hpts_request; + diag->time_remaining = tp->t_hpts_request; diag->inp_hptsslot = tp->t_hpts_slot; } #ifdef INVARIANTS - check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); + check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot); #endif if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) tcp_hpts_insert_internal(tp, hpts); @@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ } /* * Now how far is the hpts sleeping to? if active is 1, its - * up and ticking we do nothing, otherwise we may need to + * up and running we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); + tcp_hpts_wake(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; @@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ } else if (need_new_to) { int32_t co_ret; struct timeval tv; - sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; @@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } - tv.tv_usec = need_new_to; - sb = tvtosbt(tv); - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */ + co_ret = tcp_hpts_sleep(hpts, &tv); if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } - slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); - - return (slot_on); } static uint16_t -hpts_cpuid(struct tcpcb *tp, int *failed) +hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed) { struct inpcb *inp = tptoinpcb(tp); u_int cpuid; @@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) - return (hpts_random_cpu()); + return (tcp_hptsi_random_cpu(pace)); else return (cpuid); #endif @@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) */ if (inp->inp_flowtype == M_HASHTYPE_NONE) { counter_u64_add(cpu_uses_random, 1); - return (hpts_random_cpu()); + return (tcp_hptsi_random_cpu(pace)); } /* * Hash to a thread based on the flowid. 
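A condensed sketch of the policy hpts_cpuid() applies after this comment, with the RSS and NUMA refinements elided: unhashed connections get a random pacer, hashed ones map stably by flowid:

static uint16_t
example_hpts_cpu(struct tcp_hptsi *pace, struct inpcb *inp)
{
	if (inp->inp_flowtype == M_HASHTYPE_NONE)
		return (tcp_hptsi_random_cpu(pace));	/* no hash yet */
	return (inp->inp_flowid % pace->rp_num_hptss);	/* stable mapping */
}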
If we are using numa, @@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) #ifdef NUMA } else { /* Hash into the cpu's that use that domain */ - di = &hpts_domains[inp->inp_numa_domain]; + di = &pace->domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } #endif @@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) } } -static int32_t +static bool +tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run) +{ + return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT)); +} + +int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) { + struct tcp_hptsi *pace; struct tcpcb *tp; struct timeval tv; int32_t slots_to_run, i, error; @@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) int32_t wrap_loop_cnt = 0; int32_t slot_pos_of_endpoint = 0; int32_t orig_exit_slot; + uint32_t cts, cts_last_run; bool completed_measure, seen_endpoint; completed_measure = false; @@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); + + pace = hpts->p_hptsi; + MPASS(pace != NULL); + /* record previous info for any logging */ - hpts->saved_lasttick = hpts->p_lasttick; - hpts->saved_curtick = hpts->p_curtick; hpts->saved_curslot = hpts->p_cur_slot; hpts->saved_prev_slot = hpts->p_prev_slot; - hpts->p_lasttick = hpts->p_curtick; - hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); - orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + microuptime(&tv); + cts_last_run = pace->cts_last_ran[hpts->p_cpu]; + pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv); + + orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts); if ((hpts->p_on_queue_cnt == 0) || - (hpts->p_lasttick == hpts->p_curtick)) { + !tcp_hpts_different_slots(cts, cts_last_run)) { /* - * No time has yet passed, - * or nothing to do. + * Not enough time has yet passed or nothing to do. */ hpts->p_prev_slot = hpts->p_cur_slot; - hpts->p_lasttick = hpts->p_curtick; goto no_run; } again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); - if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) && - (hpts->p_on_queue_cnt != 0)) { + if ((hpts->p_on_queue_cnt != 0) && + ((cts - cts_last_run) > + ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) { /* * Wheel wrap is occuring, basically we * are behind and the distance between @@ -1238,7 +1214,7 @@ again: uint32_t runningslot; /* - * Calculate our delay, if there are no extra ticks there + * Calculate our delay, if there are no extra slots there * was not any (i.e. if slots_to_run == 1, no delay). */ hpts->p_delayed_by = (slots_to_run - (i + 1)) * @@ -1391,7 +1367,7 @@ again: * gets added to the hpts (not this one) * :-) */ - tcp_set_hpts(tp); + __tcp_set_hpts(pace, tp); } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ @@ -1450,10 +1426,12 @@ no_one: hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run - * more ticks (if we did not hit eno-bufs). + * more slots (if we did not hit eno-bufs). 
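Under the same constants, the wheel-wrap condition tested above works out to roughly one second; a sketch of the equivalent check:

static inline bool
example_wheel_wrapped(uint32_t cts, uint32_t cts_last_run)
{
	/* (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT ~= 1.024s */
	return ((cts - cts_last_run) >
	    ((NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT));
}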
*/ hpts->p_prev_slot = hpts->p_cur_slot; - hpts->p_lasttick = hpts->p_curtick; + microuptime(&tv); + cts_last_run = cts; + cts = tcp_tv_to_usec(&tv); if (!from_callout || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have @@ -1465,7 +1443,7 @@ no_one: * can never catch up :( * * We will just lie to this thread - * and let it thing p_curtick is + * and let it think p_curslot is * correct. When it next awakens * it will find itself further behind. */ @@ -1473,20 +1451,19 @@ no_one: counter_u64_add(hpts_hopelessly_behind, 1); goto no_run; } - hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + + hpts->p_cur_slot = cts_to_wheel(cts); if (!seen_endpoint) { /* We saw no endpoint but we may be looping */ orig_exit_slot = hpts->p_cur_slot; } - if ((wrap_loop_cnt < 2) && - (hpts->p_lasttick != hpts->p_curtick)) { + if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) { counter_u64_add(hpts_loops, 1); loop_cnt++; goto again; } no_run: - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); + pace->cts_last_ran[hpts->p_cpu] = cts; /* * Set flag to tell that we are done for * any slot input that happens during @@ -1494,25 +1471,36 @@ no_run: */ hpts->p_wheel_complete = 1; /* - * Now did we spend too long running input and need to run more ticks? - * Note that if wrap_loop_cnt < 2 then we should have the conditions - * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt - * is greater than 2, then the condtion most likely are *not* true. - * Also if we are called not from the callout, we don't run the wheel - * multiple times so the slots may not align either. - */ - KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || - (wrap_loop_cnt >= 2) || !from_callout), - ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, - hpts->p_prev_slot, hpts->p_cur_slot)); - KASSERT(((hpts->p_lasttick == hpts->p_curtick) - || (wrap_loop_cnt >= 2) || !from_callout), - ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, - hpts->p_lasttick, hpts->p_curtick)); - if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { - hpts->p_curtick = tcp_gethptstick(&tv); + * If enough time has elapsed that we should be processing the next + * slot(s), then we should have kept running and not marked the wheel as + * complete. + * + * But there are several other conditions where we would have stopped + * processing, so the prev/cur slots and cts variables won't match. + * These conditions are: + * + * - Calls not from callouts don't run multiple times + * - The wheel is empty + * - We've processed more than max_pacer_loops times + * - We've wrapped more than 2 times + * + * This assert catches when the logic above has violated this design. + * + */ + KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) || + (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) || + ((hpts->p_prev_slot == hpts->p_cur_slot) && + !tcp_hpts_different_slots(cts, cts_last_run))), + ("H:%p Shouldn't be done! 
prev_slot:%u, cur_slot:%u, " + "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d", + hpts, hpts->p_prev_slot, hpts->p_cur_slot, + cts_last_run, cts, loop_cnt, wrap_loop_cnt)); + + if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) { + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); + hpts->p_cur_slot = cts_to_wheel(cts); counter_u64_add(hpts_loops, 1); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } @@ -1526,16 +1514,16 @@ no_run: } void -tcp_set_hpts(struct tcpcb *tp) +__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(tp); + hpts = tcp_hpts_lock(pace, tp); if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { - tp->t_hpts_cpu = hpts_cpuid(tp, &failed); + tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed); if (failed == 0) tp->t_flags2 |= TF2_HPTS_CPU_SET; } @@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp) } static struct tcp_hpts_entry * -tcp_choose_hpts_to_run(void) +tcp_choose_hpts_to_run(struct tcp_hptsi *pace) { + struct timeval tv; int i, oldest_idx, start, end; uint32_t cts, time_since_ran, calc; - cts = tcp_get_usecs(NULL); + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); time_since_ran = 0; /* Default is all one group */ start = 0; - end = tcp_pace.rp_num_hptss; + end = pace->rp_num_hptss; /* * If we have more than one L3 group figure out which one * this CPU is in. */ - if (tcp_pace.grp_cnt > 1) { - for (i = 0; i < tcp_pace.grp_cnt; i++) { - if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { - start = tcp_pace.grps[i]->cg_first; - end = (tcp_pace.grps[i]->cg_last + 1); + if (pace->grp_cnt > 1) { + for (i = 0; i < pace->grp_cnt; i++) { + if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) { + start = pace->grps[i]->cg_first; + end = (pace->grps[i]->cg_last + 1); break; } } } oldest_idx = -1; for (i = start; i < end; i++) { - if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) - calc = cts - tcp_pace.cts_last_ran[i]; + if (TSTMP_GT(cts, pace->cts_last_ran[i])) + calc = cts - pace->cts_last_ran[i]; else calc = 0; if (calc > time_since_ran) { @@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void) } } if (oldest_idx >= 0) - return(tcp_pace.rp_ent[oldest_idx]); + return(pace->rp_ent[oldest_idx]); else - return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); + return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]); } static void @@ -1588,9 +1578,9 @@ __tcp_run_hpts(void) { struct epoch_tracker et; struct tcp_hpts_entry *hpts; - int ticks_ran; + int slots_ran; - hpts = tcp_choose_hpts_to_run(); + hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace); if (hpts->p_hpts_active) { /* Already active */ @@ -1606,12 +1596,11 @@ __tcp_run_hpts(void) hpts->syscall_cnt++; counter_u64_add(hpts_direct_call, 1); hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, false); + slots_ran = tcp_hptsi(hpts, false); /* We may want to adjust the sleep values here */ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { - if (ticks_ran > slots_indicate_less_sleep) { + if (slots_ran > slots_indicate_less_sleep) { struct timeval tv; - sbintime_t sb; hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) @@ -1635,13 +1624,8 @@ __tcp_run_hpts(void) * the dynamic value and set the on_min_sleep * flag so we will not be awoken. 
*/ - sb = tvtosbt(tv); - /* Store off to make visible the actual sleep time */ - hpts->sleeping = tv.tv_usec; - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } else if (ticks_ran < slots_indicate_more_sleep) { + (void)tcp_hpts_sleep(hpts, &tv); + } else if (slots_ran < slots_indicate_more_sleep) { /* For the further sleep, don't reschedule hpts */ hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) @@ -1658,17 +1642,22 @@ out_with_mtx: static void tcp_hpts_thread(void *ctx) { +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif struct tcp_hpts_entry *hpts; struct epoch_tracker et; struct timeval tv; - sbintime_t sb; - int ticks_ran; + int slots_ran; hpts = (struct tcp_hpts_entry *)ctx; +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif HPTS_LOCK(hpts); if (hpts->p_direct_wake) { /* Signaled by input or output with low occupancy count. */ - callout_stop(&hpts->co); + _callout_stop_safe(&hpts->co, 0); counter_u64_add(hpts_direct_awakening, 1); } else { /* Timed out, the normal case. */ @@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx) } hpts->sleeping = 0; hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, true); + slots_ran = tcp_hptsi(hpts, true); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { @@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx) * Only adjust sleep time if we were * called from the callout i.e. direct_wake == 0. */ - if (ticks_ran < slots_indicate_more_sleep) { + if (slots_ran < slots_indicate_more_sleep) { hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; - } else if (ticks_ran > slots_indicate_less_sleep) { + } else if (slots_ran > slots_indicate_less_sleep) { hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) hpts->p_mysleep.tv_usec = dynamic_min_sleep; @@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx) hpts->p_hpts_active = 0; back_to_sleep: hpts->p_direct_wake = 0; - sb = tvtosbt(tv); - /* Store off to make visible the actual sleep time */ - hpts->sleeping = tv.tv_usec; - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + (void)tcp_hpts_sleep(hpts, &tv); NET_EPOCH_EXIT(et); HPTS_UNLOCK(hpts); } -#undef timersub - static int32_t hpts_count_level(struct cpu_group *cg) { @@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g } } -static void -tcp_hpts_mod_load(void) +/* + * Initialize a tcp_hptsi structure. This performs the core initialization + * without starting threads. + */ +struct tcp_hptsi* +tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl) { + struct tcp_hptsi *pace; struct cpu_group *cpu_top; - int32_t error __diagused; - int32_t i, j, bound = 0, created = 0; + uint32_t i, j, cts; + int32_t count; size_t sz, asz; struct timeval tv; - sbintime_t sb; struct tcp_hpts_entry *hpts; - struct pcpu *pc; char unit[16]; uint32_t ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; - int count, domain; + KASSERT(funcs != NULL, ("funcs is NULL")); + + /* Allocate the main structure */ + pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO); + if (pace == NULL) + return (NULL); + + memset(pace, 0, sizeof(*pace)); + pace->funcs = funcs; + + /* Setup CPU topology information */ #ifdef SMP cpu_top = smp_topo(); #else cpu_top = NULL; #endif - tcp_pace.rp_num_hptss = ncpus; - hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); - hpts_loops = counter_u64_alloc(M_WAITOK); - back_tosleep = counter_u64_alloc(M_WAITOK); - combined_wheel_wrap = counter_u64_alloc(M_WAITOK); - wheel_wrap = counter_u64_alloc(M_WAITOK); - hpts_wake_timeout = counter_u64_alloc(M_WAITOK); - hpts_direct_awakening = counter_u64_alloc(M_WAITOK); - hpts_back_tosleep = counter_u64_alloc(M_WAITOK); - hpts_direct_call = counter_u64_alloc(M_WAITOK); - cpu_uses_flowid = counter_u64_alloc(M_WAITOK); - cpu_uses_random = counter_u64_alloc(M_WAITOK); + pace->rp_num_hptss = ncpus; - sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); - tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); - sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); - tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); - tcp_pace.grp_cnt = 0; + /* Allocate hpts entry array */ + sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *)); + pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); + + /* Allocate timestamp tracking array */ + sz = (sizeof(uint32_t) * pace->rp_num_hptss); + pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); + + /* Setup CPU groups */ if (cpu_top == NULL) { - tcp_pace.grp_cnt = 1; + pace->grp_cnt = 1; } else { /* Find out how many cache level 3 domains we have */ count = 0; - tcp_pace.grp_cnt = hpts_count_level(cpu_top); - if (tcp_pace.grp_cnt == 0) { - tcp_pace.grp_cnt = 1; + pace->grp_cnt = hpts_count_level(cpu_top); + if (pace->grp_cnt == 0) { + pace->grp_cnt = 1; } - sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); - tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); + sz = (pace->grp_cnt * sizeof(struct cpu_group *)); + pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK); /* Now populate the groups */ - if (tcp_pace.grp_cnt == 1) { + if (pace->grp_cnt == 1) { /* * All we need is the top level all cpu's are in * the same cache so when we use grp[0]->cg_mask @@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void) * all cpu's in it. The level here is probably * zero which is ok. */ - tcp_pace.grps[0] = cpu_top; + pace->grps[0] = cpu_top; } else { /* * Here we must find all the level three cache domains * and setup our pointers to them. */ count = 0; - hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); + hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top); } } + + /* Cache the current time for initializing the hpts entries */ + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); + + /* Initialize each hpts entry */ asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), + for (i = 0; i < pace->rp_num_hptss; i++) { + pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); - tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); - hpts = tcp_pace.rp_ent[i]; - /* - * Init all the hpts structures that are not specifically - * zero'd by the allocations. Also lets attach them to the - * appropriate sysctl block as well. 
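The split of the old tcp_hpts_mod_load() into tcp_hptsi_create() and tcp_hptsi_start() makes the pacer lifecycle usable outside of module load. A minimal usage sketch, assuming only the prototypes from tcp_hpts_internal.h (hptsi_lifecycle_example and its error handling are illustrative, not code from this change):

/* Minimal lifecycle sketch for an independent pacer instance. */
static int
hptsi_lifecycle_example(void)
{
	struct tcp_hptsi *pace;

	/* Allocate wheels and discover CPU topology; no threads yet. */
	pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, false);
	if (pace == NULL)
		return (ENOMEM);

	/* Spawn the per-CPU SWI threads and arm the initial callouts. */
	tcp_hptsi_start(pace);

	/* ... connections can now be paced via __tcp_hpts_insert(pace, ...) ... */

	/* Drain callouts and remove the SWI threads, then free everything. */
	tcp_hptsi_stop(pace);
	tcp_hptsi_destroy(pace);
	return (0);
}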
- */ - mtx_init(&hpts->p_mtx, "tcp_hpts_lck", - "hpts", MTX_DEF | MTX_DUPOK); - for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { - TAILQ_INIT(&hpts->p_hptss[j].head); - hpts->p_hptss[j].count = 0; - hpts->p_hptss[j].gencnt = 0; - } - sysctl_ctx_init(&hpts->hpts_ctx); - sprintf(unit, "%d", i); - hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), - OID_AUTO, - unit, - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - ""); - SYSCTL_ADD_INT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "out_qcnt", CTLFLAG_RD, - &hpts->p_on_queue_cnt, 0, - "Count TCB's awaiting output processing"); - SYSCTL_ADD_U16(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "active", CTLFLAG_RD, - &hpts->p_hpts_active, 0, - "Is the hpts active"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curslot", CTLFLAG_RD, - &hpts->p_cur_slot, 0, - "What the current running pacers goal"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "runtick", CTLFLAG_RD, - &hpts->p_runningslot, 0, - "What the running pacers current slot is"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curtick", CTLFLAG_RD, - &hpts->p_curtick, 0, - "What the running pacers last tick mapped to the wheel was"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "lastran", CTLFLAG_RD, - &tcp_pace.cts_last_ran[i], 0, - "The last usec tick that this hpts ran"); - SYSCTL_ADD_LONG(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "cur_min_sleep", CTLFLAG_RD, - &hpts->p_mysleep.tv_usec, - "What the running pacers is using for p_mysleep.tv_usec"); - SYSCTL_ADD_U64(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "now_sleeping", CTLFLAG_RD, - &hpts->sleeping, 0, - "What the running pacers is actually sleeping for"); - SYSCTL_ADD_U64(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "syscall_cnt", CTLFLAG_RD, - &hpts->syscall_cnt, 0, - "How many times we had syscalls on this hpts"); + pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, + M_WAITOK | M_ZERO); + hpts = pace->rp_ent[i]; + /* Basic initialization */ hpts->p_hpts_sleep_time = hpts_sleep_max; - hpts->p_num = i; - hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv); - hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); - hpts->p_cpu = 0xffff; + hpts->p_cpu = i; + pace->cts_last_ran[i] = cts; + hpts->p_cur_slot = cts_to_wheel(cts); + hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1); callout_init(&hpts->co, 1); + hpts->p_hptsi = pace; + mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", + MTX_DEF | MTX_DUPOK); + for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { + TAILQ_INIT(&hpts->p_hptss[j].head); + } + + /* Setup SYSCTL if requested */ + if (enable_sysctl) { + sysctl_ctx_init(&hpts->hpts_ctx); + sprintf(unit, "%d", i); + hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), + OID_AUTO, + unit, + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + ""); + SYSCTL_ADD_INT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "out_qcnt", CTLFLAG_RD, + &hpts->p_on_queue_cnt, 0, + "Count TCB's awaiting output processing"); + SYSCTL_ADD_U16(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "active", CTLFLAG_RD, + &hpts->p_hpts_active, 0, + "Is the hpts active"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, 
"curslot", CTLFLAG_RD, + &hpts->p_cur_slot, 0, + "What the current running pacers goal"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "runslot", CTLFLAG_RD, + &hpts->p_runningslot, 0, + "What the running pacers current slot is"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "lastran", CTLFLAG_RD, + &pace->cts_last_ran[i], 0, + "The last usec timestamp that this hpts ran"); + SYSCTL_ADD_LONG(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "cur_min_sleep", CTLFLAG_RD, + &hpts->p_mysleep.tv_usec, + "What the running pacers is using for p_mysleep.tv_usec"); + SYSCTL_ADD_U64(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "now_sleeping", CTLFLAG_RD, + &hpts->sleeping, 0, + "What the running pacers is actually sleeping for"); + SYSCTL_ADD_U64(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "syscall_cnt", CTLFLAG_RD, + &hpts->syscall_cnt, 0, + "How many times we had syscalls on this hpts"); + } } - /* Don't try to bind to NUMA domains if we don't have any */ - if (vm_ndomains == 1 && tcp_bind_threads == 2) - tcp_bind_threads = 0; - /* - * Now lets start ithreads to handle the hptss. - */ - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - hpts->p_cpu = i; + return (pace); +} + +/* + * Create threads for a tcp_hptsi structure and starts timers for the current + * (minimum) sleep interval. + */ +void +tcp_hptsi_start(struct tcp_hptsi *pace) +{ + struct tcp_hpts_entry *hpts; + struct pcpu *pc; + struct timeval tv; + uint32_t i, j; + int count, domain; + int error __diagused; + + KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL")); + + /* Start threads for each hpts entry */ + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + + KASSERT(hpts->ie_cookie == NULL, + ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i)); error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); KASSERT(error == 0, - ("Can't add hpts:%p i:%d err:%d", - hpts, i, error)); - created++; - hpts->p_mysleep.tv_sec = 0; - hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; + ("Can't add hpts:%p i:%d err:%d", hpts, i, error)); + if (tcp_bind_threads == 1) { - if (intr_event_bind(hpts->ie, i) == 0) - bound++; + (void)intr_event_bind(hpts->ie, i); } else if (tcp_bind_threads == 2) { /* Find the group for this CPU (i) and bind into it */ - for (j = 0; j < tcp_pace.grp_cnt; j++) { - if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { + for (j = 0; j < pace->grp_cnt; j++) { + if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) { if (intr_event_bind_ithread_cpuset(hpts->ie, - &tcp_pace.grps[j]->cg_mask) == 0) { - bound++; + &pace->grps[j]->cg_mask) == 0) { pc = pcpu_find(i); domain = pc->pc_domain; - count = hpts_domains[domain].count; - hpts_domains[domain].cpu[count] = i; - hpts_domains[domain].count++; + count = pace->domains[domain].count; + pace->domains[domain].cpu[count] = i; + pace->domains[domain].count++; break; } } } } + + hpts->p_mysleep.tv_sec = 0; + hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; - hpts->sleeping = tv.tv_usec; - sb = tvtosbt(tv); - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } - /* - * If we somehow have an empty domain, fall back to choosing - * among all htps threads. 
- */ - for (i = 0; i < vm_ndomains; i++) { - if (hpts_domains[i].count == 0) { - tcp_bind_threads = 0; - break; - } + (void)tcp_hpts_sleep(hpts, &tv); } - tcp_hpts_softclock = __tcp_run_hpts; - tcp_lro_hpts_init(); - printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", - created, bound, - tcp_bind_threads == 2 ? "NUMA domains" : "cpus"); } -static void -tcp_hpts_mod_unload(void) +/* + * Stop all callouts/threads for a tcp_hptsi structure. + */ +void +tcp_hptsi_stop(struct tcp_hptsi *pace) { + struct tcp_hpts_entry *hpts; int rv __diagused; + uint32_t i; - tcp_lro_hpts_uninit(); - atomic_store_ptr(&tcp_hpts_softclock, NULL); + KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL")); - for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { - struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i)); + KASSERT(hpts->ie_cookie != NULL, + ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i)); - rv = callout_drain(&hpts->co); + rv = _callout_stop_safe(&hpts->co, CS_DRAIN); MPASS(rv != 0); rv = swi_remove(hpts->ie_cookie); MPASS(rv == 0); + hpts->ie_cookie = NULL; + } +} - rv = sysctl_ctx_free(&hpts->hpts_ctx); - MPASS(rv == 0); +/* + * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create. + */ +void +tcp_hptsi_destroy(struct tcp_hptsi *pace) +{ + struct tcp_hpts_entry *hpts; + uint32_t i; + + KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL")); + KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL")); + + /* Cleanup each hpts entry */ + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + if (hpts != NULL) { + /* Cleanup SYSCTL if it was initialized */ + if (hpts->hpts_root != NULL) { + sysctl_ctx_free(&hpts->hpts_ctx); + } - mtx_destroy(&hpts->p_mtx); - free(hpts->p_hptss, M_TCPHPTS); - free(hpts, M_TCPHPTS); + mtx_destroy(&hpts->p_mtx); + free(hpts->p_hptss, M_TCPHPTS); + free(hpts, M_TCPHPTS); + } } - free(tcp_pace.rp_ent, M_TCPHPTS); - free(tcp_pace.cts_last_ran, M_TCPHPTS); + /* Cleanup main arrays */ + free(pace->rp_ent, M_TCPHPTS); + free(pace->cts_last_ran, M_TCPHPTS); #ifdef SMP - free(tcp_pace.grps, M_TCPHPTS); + free(pace->grps, M_TCPHPTS); #endif + /* Free the main structure */ + free(pace, M_TCPHPTS); +} + +static int +tcp_hpts_mod_load(void) +{ + int i; + + /* Don't try to bind to NUMA domains if we don't have any */ + if (vm_ndomains == 1 && tcp_bind_threads == 2) + tcp_bind_threads = 0; + + /* Create the tcp_hptsi structure */ + tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true); + if (tcp_hptsi_pace == NULL) + return (ENOMEM); + + /* Initialize global counters */ + hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); + hpts_loops = counter_u64_alloc(M_WAITOK); + back_tosleep = counter_u64_alloc(M_WAITOK); + combined_wheel_wrap = counter_u64_alloc(M_WAITOK); + wheel_wrap = counter_u64_alloc(M_WAITOK); + hpts_wake_timeout = counter_u64_alloc(M_WAITOK); + hpts_direct_awakening = counter_u64_alloc(M_WAITOK); + hpts_back_tosleep = counter_u64_alloc(M_WAITOK); + hpts_direct_call = counter_u64_alloc(M_WAITOK); + cpu_uses_flowid = counter_u64_alloc(M_WAITOK); + cpu_uses_random = counter_u64_alloc(M_WAITOK); + + /* Start the threads */ + tcp_hptsi_start(tcp_hptsi_pace); + + /* Enable the global HPTS softclock function */ + tcp_hpts_softclock = __tcp_run_hpts; + + /* Initialize LRO HPTS */ + tcp_lro_hpts_init(); + + /* + * If we somehow have an empty domain, fall back to choosing 
among all + * HPTS threads. + */ + for (i = 0; i < vm_ndomains; i++) { + if (tcp_hptsi_pace->domains[i].count == 0) { + tcp_bind_threads = 0; + break; + } + } + + printf("TCP HPTS started %u (%s) swi interrupt threads\n", + tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ? + "(unbounded)" : + (tcp_bind_threads == 1 ? "per-cpu" : "per-NUMA-domain")); + + return (0); +} + +static void +tcp_hpts_mod_unload(void) +{ + tcp_lro_hpts_uninit(); + + /* Disable the global HPTS softclock function */ + atomic_store_ptr(&tcp_hpts_softclock, NULL); + + tcp_hptsi_stop(tcp_hptsi_pace); + tcp_hptsi_destroy(tcp_hptsi_pace); + tcp_hptsi_pace = NULL; + + /* Cleanup global counters */ counter_u64_free(hpts_hopelessly_behind); counter_u64_free(hpts_loops); counter_u64_free(back_tosleep); @@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void) } static int -tcp_hpts_modevent(module_t mod, int what, void *arg) +tcp_hpts_mod_event(module_t mod, int what, void *arg) { - switch (what) { case MOD_LOAD: - tcp_hpts_mod_load(); - return (0); + return (tcp_hpts_mod_load()); case MOD_QUIESCE: /* * Since we are a dependency of TCP stack modules, they should @@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg) static moduledata_t tcp_hpts_module = { .name = "tcphpts", - .evhand = tcp_hpts_modevent, + .evhand = tcp_hpts_mod_event, }; DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index 6172baf2a062..6b05f9701ac2 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -28,19 +28,11 @@ /* Number of useconds represented by an hpts slot */ #define HPTS_USECS_PER_SLOT 10 -#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) -#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 static inline uint32_t -tcp_tv_to_hpts_slot(const struct timeval *sv) -{ - return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT)); -} - -static inline uint32_t tcp_tv_to_usec(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -66,7 +58,7 @@ struct hpts_diag { uint32_t p_runningslot; /* bbr->inflight */ uint32_t slot_req; /* bbr->flex3 x */ uint32_t inp_hptsslot; /* bbr->flex4 x */ - uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t time_remaining; /* bbr->flex5 x */ uint32_t have_slept; /* bbr->epoch x */ uint32_t hpts_sleep_time; /* bbr->applimited x */ uint32_t yet_to_sleep; /* bbr->lt_epoch x */ @@ -75,8 +67,6 @@ struct hpts_diag { uint32_t maxslots; /* bbr->delRate x */ uint32_t wheel_cts; /* bbr->rttProp x */ int32_t co_ret; /* bbr->pkts_out x */ - uint32_t p_curtick; /* upper bbr->cur_del_rate */ - uint32_t p_lasttick; /* lower bbr->cur_del_rate */ uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; @@ -92,13 +82,18 @@ struct hpts_diag { #ifdef _KERNEL +extern struct tcp_hptsi *tcp_hptsi_pace; + /* * The following are the definitions for the kernel HPTS interface for managing * the HPTS ring and the TCBs on it. 
*/ -void tcp_hpts_init(struct tcpcb *); -void tcp_hpts_remove(struct tcpcb *); +void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp) + +void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp) static inline bool tcp_in_hpts(struct tcpcb *tp) @@ -132,12 +127,13 @@ tcp_in_hpts(struct tcpcb *tp) * that INP_WLOCK() or from destroying your TCB where again * you should already have the INP_WLOCK(). */ -uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, - struct hpts_diag *diag); -#define tcp_hpts_insert(inp, slot) \ - tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) +void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, + struct hpts_diag *diag); +#define tcp_hpts_insert(tp, usecs, diag) \ + __tcp_hpts_insert(tcp_hptsi_pace, (tp), (usecs), (diag)) -void tcp_set_hpts(struct tcpcb *tp); +void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp); +#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp) extern int32_t tcp_min_hptsi_time; @@ -147,17 +143,6 @@ get_hpts_min_sleep_time(void) return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT); } -static inline uint32_t -tcp_gethptstick(struct timeval *sv) -{ - struct timeval tv; - - if (sv == NULL) - sv = &tv; - microuptime(sv); - return (tcp_tv_to_hpts_slot(sv)); -} - static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { @@ -180,12 +165,5 @@ tcp_get_usecs(struct timeval *tv) return (tcp_tv_to_usec(tv)); } -/* - * LRO HPTS initialization and uninitialization, only for internal use by the - * HPTS code. - */ -void tcp_lro_hpts_init(void); -void tcp_lro_hpts_uninit(void); - #endif /* _KERNEL */ #endif /* __tcp_hpts_h__ */ diff --git a/sys/netinet/tcp_hpts_internal.h b/sys/netinet/tcp_hpts_internal.h new file mode 100644 index 000000000000..8b33e03a6981 --- /dev/null +++ b/sys/netinet/tcp_hpts_internal.h @@ -0,0 +1,184 @@ +/*- + * Copyright (c) 2025 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
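The macro layer added to tcp_hpts.h above preserves the legacy single-argument API for regular kernel code while letting the unit tests name a private pacer instance. Illustratively (example_callers is a hypothetical caller, not part of the change):

/* Hypothetical callers showing the two sides of the macro indirection. */
void
example_callers(struct tcp_hptsi *test_pace, struct tcpcb *tp)
{
	/* Normal kernel code: expands to __tcp_hpts_insert(tcp_hptsi_pace, ...). */
	tcp_hpts_insert(tp, 500, NULL);

	/* Test code: addresses its own tcp_hptsi instance explicitly. */
	__tcp_hpts_insert(test_pace, tp, 500, NULL);
}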
+ */ + +#ifndef __tcp_hpts_internal_h__ +#define __tcp_hpts_internal_h__ + +/* + * TCP High Precision Timer System (HPTS) - Internal Definitions + * + * This header contains internal structures, constants, and interfaces that are + * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of + * the HPTS subsystem. + */ + +#if defined(_KERNEL) + +/* + * The hpts uses a 102400 wheel. The wheel + * defines the time in 10 usec increments (102400 x 10). + * This gives a range of 10usec - 1024ms to place + * an entry within. If the user requests more than + * 1.024 seconds, a remainder is attached and the hpts, + * when seeing the remainder, will re-insert the + * inpcb forward in time from where it is until + * the remainder is zero. + */ + +#define NUM_OF_HPTSI_SLOTS 102400 + +/* Convert microseconds to HPTS slots */ +#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) + +/* The number of connections after which the dynamic sleep logic kicks in. */ +#define DEFAULT_CONNECTION_THRESHOLD 100 + +extern int tcp_bind_threads; /* Thread binding configuration + * (0=none, 1=cpu, 2=numa) */ + +/* + * Abstraction layer controlling time, interrupts and callouts. + */ +struct tcp_hptsi_funcs { + void (*microuptime)(struct timeval *tv); + int (*swi_add)(struct intr_event **eventp, const char *name, + driver_intr_t handler, void *arg, int pri, enum intr_type flags, + void **cookiep); + int (*swi_remove)(void *cookie); + void (*swi_sched)(void *cookie, int flags); + int (*intr_event_bind)(struct intr_event *ie, int cpu); + int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie, + struct _cpuset *mask); + void (*callout_init)(struct callout *c, int mpsafe); + int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt, + sbintime_t precision, void (*func)(void *), void *arg, int cpu, + int flags); + int (*_callout_stop_safe)(struct callout *c, int flags); +}; + +/* Default function table for system operation */ +extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs; + +/* Each hpts has its own p_mtx which is used for locking */ +#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) +#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) +#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx) +#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) + +struct tcp_hpts_entry { + /* Cache line 0x00 */ + struct mtx p_mtx; /* Mutex for hpts */ + struct timeval p_mysleep; /* Our min sleep time */ + uint64_t syscall_cnt; + uint64_t sleeping; /* What the actual sleep was (if sleeping) */ + uint16_t p_hpts_active; /* Flag that says hpts is awake */ + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ + uint32_t p_runningslot; /* Current slot we are at if we are running */ + uint32_t p_prev_slot; /* Previous slot we were on */ + uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ + uint32_t p_nxt_slot; /* The next slot outside the current range + * of slots that the hpts is running on.
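Working the wheel-range comment above through concrete numbers (the split into an on-wheel distance plus a deferred remainder sketches the scheme described there; wheel_range_example is illustrative, not code from the header):

/* Worked example of the wheel range described above. */
static void
wheel_range_example(void)
{
	uint32_t usecs = 2500000;                   /* 2.5 s requested */
	uint32_t slots = HPTS_USEC_TO_SLOTS(usecs); /* (2500000 + 9) / 10 = 250000 */
	uint32_t max_dist = NUM_OF_HPTSI_SLOTS - 1; /* 102399 slots ~= 1.024 s */
	uint32_t deferred = slots - max_dist;       /* 147601 slots carried forward */

	(void)deferred;
}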
*/ + int32_t p_on_queue_cnt; /* Count on queue in this hpts */ + uint8_t p_direct_wake :1, /* boolean */ + p_on_min_sleep:1, /* boolean */ + p_hpts_wake_scheduled:1,/* boolean */ + hit_callout_thresh:1, + p_avail:4; + uint8_t p_fill[3]; /* Fill to 32 bits */ + /* Cache line 0x40 */ + struct hptsh { + TAILQ_HEAD(, tcpcb) head; + uint32_t count; + uint32_t gencnt; + } *p_hptss; /* Hptsi wheel */ + uint32_t p_hpts_sleep_time; /* Current sleep interval having a max + * of 255ms */ + uint32_t overidden_sleep; /* what was overridden by min-sleep for logging */ + uint32_t saved_curslot; /* for logging */ + uint32_t saved_prev_slot; /* for logging */ + uint32_t p_delayed_by; /* How much were we delayed by */ + /* Cache line 0x80 */ + struct sysctl_ctx_list hpts_ctx; + struct sysctl_oid *hpts_root; + struct intr_event *ie; + void *ie_cookie; + uint16_t p_cpu; /* The hpts CPU */ + struct tcp_hptsi *p_hptsi; /* Back pointer to parent hptsi structure */ + /* There is extra space in here */ + /* Cache line 0x100 */ + struct callout co __aligned(CACHE_LINE_SIZE); +} __aligned(CACHE_LINE_SIZE); + +struct tcp_hptsi { + struct cpu_group **grps; + struct tcp_hpts_entry **rp_ent; /* Array of hptss */ + uint32_t *cts_last_ran; + uint32_t grp_cnt; + uint32_t rp_num_hptss; /* Number of hpts threads */ + struct hpts_domain_info { + int count; + int cpu[MAXCPU]; + } domains[MAXMEMDOM]; /* Per-NUMA domain CPU assignments */ + const struct tcp_hptsi_funcs *funcs; /* Function table for testability */ +}; + +/* + * Core tcp_hptsi structure manipulation functions. + */ +struct tcp_hptsi* tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, + bool enable_sysctl); +void tcp_hptsi_destroy(struct tcp_hptsi *pace); +void tcp_hptsi_start(struct tcp_hptsi *pace); +void tcp_hptsi_stop(struct tcp_hptsi *pace); +uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace); +int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); + +void tcp_hpts_wake(struct tcp_hpts_entry *hpts); + +/* + * LRO HPTS initialization and uninitialization, only for internal use by the + * HPTS code. + */ +void tcp_lro_hpts_init(void); +void tcp_lro_hpts_uninit(void); + +#endif /* defined(_KERNEL) */ +#endif /* __tcp_hpts_internal_h__ */ diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c new file mode 100644 index 000000000000..c5dc9cb5b03b --- /dev/null +++ b/sys/netinet/tcp_hpts_test.c @@ -0,0 +1,1682 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <tests/ktest.h> +#include <sys/cdefs.h> +#include "opt_inet.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> +#include <dev/tcp_log/tcp_log_dev.h> +#include <netinet/tcp_log_buf.h> + +#undef tcp_hpts_init +#undef tcp_hpts_remove +#undef tcp_hpts_insert +#undef tcp_set_hpts + +/* Custom definitions that take the tcp_hptsi */ +#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp)) +#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp)) +#define tcp_hpts_insert(pace, tp, usecs, diag) \ + __tcp_hpts_insert((pace), (tp), (usecs), (diag)) +#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp)) + +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test"); + +static int test_exit_on_failure = true; +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "TCP HPTS test controls"); +SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW, + &test_exit_on_failure, 0, + "Exit HPTS test immediately on first failure (1) or continue running all tests (0)"); + +#define KTEST_VERIFY(x) do { \ + if (!(x)) { \ + KTEST_ERR(ctx, "FAIL: %s", #x); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s", #x); \ + } \ +} while (0) + +#define KTEST_EQUAL(x, y) do { \ + if ((x) != (y)) { \ + KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_NEQUAL(x, y) do { \ + if ((x) == (y)) { \ + KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_GREATER_THAN(x, y) do { \ + if ((x) <= (y)) { \ + KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_VERIFY_RET(x, y) do { \ + if (!(x)) { \ + KTEST_ERR(ctx, "FAIL: %s", #x); \ + if (test_exit_on_failure) \ + return (y); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s", #x); \ + } \ +} while (0) + +#ifdef TCP_HPTS_KTEST + +static void +dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts) +{ + KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts); + KTEST_LOG(ctx, " p_cur_slot: %u", hpts->p_cur_slot); + KTEST_LOG(ctx, " p_prev_slot: %u", hpts->p_prev_slot); + KTEST_LOG(ctx, " p_nxt_slot: %u", hpts->p_nxt_slot); + KTEST_LOG(ctx, " p_runningslot: %u", 
hpts->p_runningslot); + KTEST_LOG(ctx, " p_on_queue_cnt: %d", hpts->p_on_queue_cnt); + KTEST_LOG(ctx, " p_hpts_active: %u", hpts->p_hpts_active); + KTEST_LOG(ctx, " p_wheel_complete: %u", hpts->p_wheel_complete); + KTEST_LOG(ctx, " p_direct_wake: %u", hpts->p_direct_wake); + KTEST_LOG(ctx, " p_on_min_sleep: %u", hpts->p_on_min_sleep); + KTEST_LOG(ctx, " p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled); + KTEST_LOG(ctx, " hit_callout_thresh: %u", hpts->hit_callout_thresh); + KTEST_LOG(ctx, " p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time); + KTEST_LOG(ctx, " p_delayed_by: %u", hpts->p_delayed_by); + KTEST_LOG(ctx, " overidden_sleep: %u", hpts->overidden_sleep); + KTEST_LOG(ctx, " saved_curslot: %u", hpts->saved_curslot); + KTEST_LOG(ctx, " saved_prev_slot: %u", hpts->saved_prev_slot); + KTEST_LOG(ctx, " syscall_cnt: %lu", hpts->syscall_cnt); + KTEST_LOG(ctx, " sleeping: %lu", hpts->sleeping); + KTEST_LOG(ctx, " p_cpu: %u", hpts->p_cpu); + KTEST_LOG(ctx, " ie_cookie: %p", hpts->ie_cookie); + KTEST_LOG(ctx, " p_hptsi: %p", hpts->p_hptsi); + KTEST_LOG(ctx, " p_mysleep: %ld.%06ld", hpts->p_mysleep.tv_sec, hpts->p_mysleep.tv_usec); +} + +static void +dump_tcpcb(struct tcpcb *tp) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct inpcb *inp = &tp->t_inpcb; + + KTEST_LOG(ctx, "tcp_control_block(%p)", tp); + + /* HPTS-specific fields */ + KTEST_LOG(ctx, " t_in_hpts: %d", tp->t_in_hpts); + KTEST_LOG(ctx, " t_hpts_cpu: %u", tp->t_hpts_cpu); + KTEST_LOG(ctx, " t_hpts_slot: %d", tp->t_hpts_slot); + KTEST_LOG(ctx, " t_hpts_gencnt: %u", tp->t_hpts_gencnt); + KTEST_LOG(ctx, " t_hpts_request: %u", tp->t_hpts_request); + + /* LRO CPU field */ + KTEST_LOG(ctx, " t_lro_cpu: %u", tp->t_lro_cpu); + + /* TCP flags that affect HPTS */ + KTEST_LOG(ctx, " t_flags2: 0x%x", tp->t_flags2); + KTEST_LOG(ctx, " TF2_HPTS_CPU_SET: %s", (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO"); + KTEST_LOG(ctx, " TF2_HPTS_CALLS: %s", (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO"); + KTEST_LOG(ctx, " TF2_SUPPORTS_MBUFQ: %s", (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO"); + + /* Input PCB fields that HPTS uses */ + KTEST_LOG(ctx, " inp_flags: 0x%x", inp->inp_flags); + KTEST_LOG(ctx, " INP_DROPPED: %s", (inp->inp_flags & INP_DROPPED) ? "YES" : "NO"); + KTEST_LOG(ctx, " inp_flowid: 0x%x", inp->inp_flowid); + KTEST_LOG(ctx, " inp_flowtype: %u", inp->inp_flowtype); + KTEST_LOG(ctx, " inp_numa_domain: %d", inp->inp_numa_domain); +} + +/* Enum for call counting indices */ +enum test_call_counts { + CCNT_MICROUPTIME = 0, + CCNT_SWI_ADD, + CCNT_SWI_REMOVE, + CCNT_SWI_SCHED, + CCNT_INTR_EVENT_BIND, + CCNT_INTR_EVENT_BIND_CPUSET, + CCNT_CALLOUT_INIT, + CCNT_CALLOUT_RESET_SBT_ON, + CCNT_CALLOUT_STOP_SAFE, + CCNT_TCP_OUTPUT, + CCNT_TCP_TFB_DO_QUEUED_SEGMENTS, + CCNT_MAX +}; + +static uint32_t call_counts[CCNT_MAX]; + +static uint64_t test_time_usec = 0; + +/* + * Reset all test global variables to a clean state. 
+ */ +static void +test_hpts_init(void) +{ + memset(call_counts, 0, sizeof(call_counts)); + test_time_usec = 0; +} + +static void +test_microuptime(struct timeval *tv) +{ + call_counts[CCNT_MICROUPTIME]++; + tv->tv_sec = test_time_usec / 1000000; + tv->tv_usec = test_time_usec % 1000000; +} + +static int +test_swi_add(struct intr_event **eventp, const char *name, + driver_intr_t handler, void *arg, int pri, enum intr_type flags, + void **cookiep) +{ + call_counts[CCNT_SWI_ADD]++; + /* Simulate successful SWI creation */ + *eventp = (struct intr_event *)0xfeedface; /* Mock event */ + *cookiep = (void *)0xdeadbeef; /* Mock cookie */ + return (0); +} + +static int +test_swi_remove(void *cookie) +{ + call_counts[CCNT_SWI_REMOVE]++; + /* Simulate successful removal */ + return (0); +} + +static void +test_swi_sched(void *cookie, int flags) +{ + call_counts[CCNT_SWI_SCHED]++; + /* Simulate successful SWI scheduling */ +} + +static int +test_intr_event_bind(struct intr_event *ie, int cpu) +{ + call_counts[CCNT_INTR_EVENT_BIND]++; + /* Simulate successful binding */ + return (0); +} + +static int +test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask) +{ + call_counts[CCNT_INTR_EVENT_BIND_CPUSET]++; + /* Simulate successful cpuset binding */ + return (0); +} + +static void +test_callout_init(struct callout *c, int mpsafe) +{ + call_counts[CCNT_CALLOUT_INIT]++; + memset(c, 0, sizeof(*c)); +} + +static int +test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*func)(void *), void *arg, int cpu, int flags) +{ + call_counts[CCNT_CALLOUT_RESET_SBT_ON]++; + /* Return 1 to simulate successful timer scheduling */ + return (1); +} + +static int +test_callout_stop_safe(struct callout *c, int flags) +{ + call_counts[CCNT_CALLOUT_STOP_SAFE]++; + /* Return 1 to simulate successful timer stopping */ + return (1); +} + +static const struct tcp_hptsi_funcs test_funcs = { + .microuptime = test_microuptime, + .swi_add = test_swi_add, + .swi_remove = test_swi_remove, + .swi_sched = test_swi_sched, + .intr_event_bind = test_intr_event_bind, + .intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset, + .callout_init = test_callout_init, + .callout_reset_sbt_on = test_callout_reset_sbt_on, + ._callout_stop_safe = test_callout_stop_safe, +}; + +#define TP_REMOVE_FROM_HPTS(tp) tp->bits_spare +#define TP_LOG_TEST(tp) tp->t_log_state_set + +static int +test_tcp_output(struct tcpcb *tp) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; + struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + + call_counts[CCNT_TCP_OUTPUT]++; + if (TP_LOG_TEST(tp)) { + KTEST_LOG(ctx, "=> tcp_output(%p)", tp); + dump_tcpcb(tp); + dump_hpts_entry(ctx, hpts); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { + if (TP_LOG_TEST(tp)) + KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); + tcp_hpts_remove(pace, tp); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { + INP_WUNLOCK(&tp->t_inpcb); /* tcp_output unlocks on error */ + return (-1); /* Simulate tcp_output error */ + } + + return (0); +} + +static int +test_tfb_do_queued_segments(struct tcpcb *tp, int flag) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; + struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + + call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS]++; + KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag); + dump_tcpcb(tp); + dump_hpts_entry(ctx, hpts); + + 
if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { + if (TP_LOG_TEST(tp)) + KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); + tcp_hpts_remove(pace, tp); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { + INP_WUNLOCK(&tp->t_inpcb); /* do_queued_segments unlocks on error */ + return (-1); /* Simulate do_queued_segments error */ + } + + return (0); +} + +static struct tcp_function_block test_tcp_fb = { + .tfb_tcp_block_name = "hpts_test_tcp", + .tfb_tcp_output = test_tcp_output, + .tfb_do_queued_segments = test_tfb_do_queued_segments, +}; + +/* + * Create a minimally initialized tcpcb that can be safely inserted into HPTS. + * This function allocates and initializes all the fields that HPTS code + * reads or writes. + */ +static struct tcpcb * +test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace) +{ + struct tcpcb *tp; + + tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO); + if (tp) { + rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp", + RW_RECURSE | RW_DUPOK); + refcount_init(&tp->t_inpcb.inp_refcount, 1); + tp->t_inpcb.inp_pcbinfo = &V_tcbinfo; + tp->t_fb = &test_tcp_fb; + tp->t_hpts_cpu = HPTS_CPU_NONE; + STAILQ_INIT(&tp->t_inqueue); + tcp_hpts_init(pace, tp); + + /* Stuff some pointers in the tcb for test purposes. */ + tp->t_fb_ptr = ctx; + tp->t_tfo_pending = (unsigned int*)pace; + } + + return (tp); +} + +/* + * Free a test tcpcb created by test_hpts_create_tcpcb() + */ +static void +test_hpts_free_tcpcb(struct tcpcb *tp) +{ + if (tp == NULL) + return; + + INP_LOCK_DESTROY(&tp->t_inpcb); + free(tp, M_TCPHPTS); +} + +/* + * *********************************************** + * * KTEST functions for testing the HPTS module * + * *********************************************** + */ + +/* + * Validates that the HPTS module is properly loaded and initialized by checking + * that the minimum HPTS time is configured. + */ +KTEST_FUNC(module_load) +{ + test_hpts_init(); + KTEST_NEQUAL(tcp_min_hptsi_time, 0); + KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2); + KTEST_NEQUAL(tcp_hptsi_pace, NULL); + return (0); +} + +/* + * Validates the creation and destruction of tcp_hptsi structures, ensuring + * proper initialization of internal fields and clean destruction. + */ +KTEST_FUNC(hptsi_create_destroy) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + KTEST_NEQUAL(pace->rp_ent, NULL); + KTEST_NEQUAL(pace->cts_last_ran, NULL); + KTEST_VERIFY(pace->rp_num_hptss > 0); + KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */ + KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */ + KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */ + + /* Verify individual HPTS entries are properly initialized */ + for (uint32_t i = 0; i < pace->rp_num_hptss; i++) { + KTEST_NEQUAL(pace->rp_ent[i], NULL); + KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i); + KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace); + KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + } + + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcp_hptsi structures can be started and stopped properly, + * including verification that threads are created during start and cleaned up + * during stop operations. 
+ */ +KTEST_FUNC(hptsi_start_stop) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + + tcp_hptsi_start(pace); + + /* Verify that entries have threads started */ + struct tcp_hpts_entry *hpts = pace->rp_ent[0]; + KTEST_NEQUAL(hpts->ie_cookie, NULL); /* Should have SWI handler */ + KTEST_EQUAL(hpts->p_hptsi, pace); /* Should point to our pace */ + + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that multiple tcp_hptsi instances can coexist independently, with + * different configurations and CPU assignments without interfering with each + * other. + */ +KTEST_FUNC(hptsi_independence) +{ + struct tcp_hptsi *pace1, *pace2; + uint16_t cpu1, cpu2; + + test_hpts_init(); + + pace1 = tcp_hptsi_create(&test_funcs, false); + pace2 = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace1, NULL); + KTEST_NEQUAL(pace2, NULL); + KTEST_NEQUAL(pace2->rp_ent, NULL); + + cpu1 = tcp_hptsi_random_cpu(pace1); + cpu2 = tcp_hptsi_random_cpu(pace2); + KTEST_VERIFY(cpu1 < pace1->rp_num_hptss); + KTEST_VERIFY(cpu2 < pace2->rp_num_hptss); + + /* Verify both instances have independent entry arrays */ + KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent); + /* Verify they may have different CPU counts but both reasonable */ + KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU); + KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU); + + tcp_hptsi_destroy(pace1); + tcp_hptsi_destroy(pace2); + + return (0); +} + +/* + * Validates that custom function injection works correctly, ensuring that + * test-specific implementations of microuptime and others are properly + * called by the HPTS system. + */ +KTEST_FUNC(function_injection) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + KTEST_EQUAL(pace->funcs, &test_funcs); + KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0); + + tcp_hptsi_start(pace); + KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0); + KTEST_VERIFY(tcp_bind_threads == 0 || + call_counts[CCNT_INTR_EVENT_BIND] > 0 || + call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0); + + tcp_hptsi_stop(pace); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0); + KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0); + + tcp_hptsi_destroy(pace); + + /* Verify we have a reasonable balance of create/destroy calls */ + KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]); + + return (0); +} + +/* + * Validates that a tcpcb can be properly initialized for HPTS compatibility, + * ensuring all required fields are set correctly and function pointers are + * valid for safe HPTS operations. 
+ */ +KTEST_FUNC(tcpcb_initialization) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Verify the tcpcb is properly initialized for HPTS */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + KTEST_NEQUAL(tp->t_fb, NULL); + KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL); + KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0); + + /* Verify that HPTS-specific fields are initialized */ + KTEST_EQUAL(tp->t_hpts_gencnt, 0); + KTEST_EQUAL(tp->t_hpts_slot, 0); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_lro_cpu, 0); + KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss); + KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1); + KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED)); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcpcb structures can be successfully inserted into and removed + * from the HPTS wheel, with proper state tracking and slot assignment during + * the process. + */ +KTEST_FUNC(tcpcb_insertion) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + uint32_t timeout_usecs = 10; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0); + + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); + tcp_hpts_insert(pace, tp, timeout_usecs, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); + KTEST_VERIFY(tcp_in_hpts(tp)); + KTEST_VERIFY(tp->t_hpts_slot >= 0); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(timeout_usecs)); + //KTEST_EQUAL(tp->t_hpts_gencnt, 1); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_VERIFY(!tcp_in_hpts(tp)); + + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the core HPTS timer functionality by verifying that scheduled + * tcpcb entries trigger tcp_output calls at appropriate times, simulating + * real-world timer-driven TCP processing. 
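The timer test that follows is driven entirely by the injected clock: test_microuptime() materializes the global test_time_usec counter as a timeval, so advancing the counter stands in for elapsed time. A small sketch of the pattern (clock_example is illustrative; test_time_usec and test_microuptime are the harness globals defined above):

/* How the injected clock advances time for the test below. */
static void
clock_example(void)
{
	struct timeval tv;

	test_time_usec = 1000123;
	test_microuptime(&tv);	/* tv.tv_sec == 1, tv.tv_usec == 123 */

	test_time_usec += 500;	/* "wait" 500 usec before running the wheel */
	test_microuptime(&tv);	/* tv.tv_usec == 623 */
}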
+ */ +KTEST_FUNC(timer_functionality) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + struct tcpcb *tp; + int32_t slots_ran; + uint32_t i; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + /* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + dump_tcpcb(tp); + TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */ + + KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp); + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */ + tcp_hpts_insert(pace, tp, 500, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + dump_tcpcb(tp); + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + KTEST_EQUAL(hpts->p_prev_slot, 0); + KTEST_EQUAL(hpts->p_cur_slot, 0); + KTEST_EQUAL(hpts->p_runningslot, 0); + KTEST_EQUAL(hpts->p_nxt_slot, 1); + KTEST_EQUAL(hpts->p_hpts_active, 0); + + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + + /* Set our test flag to indicate the tcpcb should be removed from the + * wheel when tcp_output is called. */ + TP_REMOVE_FROM_HPTS(tp) = 1; + + /* Test early exit condition: advance time by insufficient amount */ + KTEST_LOG(ctx, "Testing early exit with insufficient time advancement"); + test_time_usec += 1; /* Very small advancement - should cause early exit */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Should return 0 slots due to insufficient time advancement */ + KTEST_EQUAL(slots_ran, 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */ + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */ + + /* Wait for 498 more usecs and trigger the HPTS workers and verify + * nothing happens yet (total 499 usec) */ + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + test_time_usec += 498; + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, pace->rp_ent[i]); + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49); + KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49); + } + + dump_tcpcb(tp); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + + /* Wait for 1 more usec and trigger the HPTS workers and verify it + * triggers tcp_output this time */ + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + test_time_usec += 1; + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, pace->rp_ent[i]); + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50); + 
KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50); + } + + dump_tcpcb(tp); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS scalability by creating and inserting a LOT of tcpcbs into + * the HPTS wheel, testing performance under high load conditions. + */ +KTEST_FUNC(scalability_tcpcbs) +{ + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + uint32_t i, num_tcpcbs = 100000, total_queued = 0; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Allocate array to hold pointers to all tcpcbs */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + /* Create a LOT of tcpcbs */ + KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs); + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + if (tcpcbs[i] == NULL) { + KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL"); + return (EINVAL); + } + } + + /* Insert all created tcpcbs into HPTS */ + KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS..."); + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + /* Insert with varying future timeouts to distribute across slots */ + tcp_hpts_insert(pace, tcpcbs[i], 100 + (i % 1000), NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Verify total queue counts across all CPUs */ + for (i = 0; i < pace->rp_num_hptss; i++) { + total_queued += pace->rp_ent[i]->p_on_queue_cnt; + } + KTEST_EQUAL(total_queued, num_tcpcbs); + + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + /* Remove all tcpcbs from HPTS */ + KTEST_LOG(ctx, "Removing all tcpcbs from HPTS..."); + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tcpcbs[i]); + } + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Verify all queues are now empty */ + for (i = 0; i < pace->rp_num_hptss; i++) { + if (pace->rp_ent[i]->p_on_queue_cnt != 0) { + KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0"); + return (EINVAL); + } + } + + for (i = 0; i < num_tcpcbs; i++) { + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates wheel wrap scenarios where the timer falls significantly behind + * and needs to process more than one full wheel revolution worth of slots. 
+ */ +KTEST_FUNC(wheel_wrap_recovery) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + uint32_t i, timeout_usecs, num_tcpcbs = 500; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Allocate array to hold pointers to tcpcbs */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + /* Create tcpcbs and insert them across many slots */ + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tcpcbs[i], NULL); + TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; + + timeout_usecs = ((i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs) * HPTS_USECS_PER_SLOT; /* Spread across slots */ + + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tcpcbs[i], timeout_usecs, NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Fast forward time significantly to trigger wheel wrap */ + test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; + + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i); + KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS-1); /* Should process all slots */ + KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot, + pace->rp_ent[i]->p_prev_slot); + } + + /* Cleanup */ + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tcpcbs[i]); + } + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs + * when a tcpcb is being processed by the HPTS thread but gets removed. 
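+ * The test forces one of two queued tcpcbs into IHPTS_MOVING with t_hpts_slot set to -1 (marked for removal) and verifies a wheel pass releases it while the other is processed normally.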
+ */ +KTEST_FUNC(tcpcb_moving_state) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2; + struct tcp_hpts_entry *hpts; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Create two tcpcbs on the same CPU/slot */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + KTEST_NEQUAL(tp2, NULL); + + /* Force them to the same CPU for predictable testing */ + tp1->t_hpts_cpu = 0; + tp2->t_hpts_cpu = 0; + + /* Insert both into the same slot */ + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, 100, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, 100, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + hpts = pace->rp_ent[0]; + + /* Manually transition tp1 to MOVING state to simulate race condition */ + HPTS_LOCK(hpts); + tp1->t_in_hpts = IHPTS_MOVING; + tp1->t_hpts_slot = -1; /* Mark for removal */ + HPTS_UNLOCK(hpts); + + /* Set time and run HPTS to process the moving state */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Shouldn't call on both */ + + /* tp1 should be cleaned up and removed */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + /* tp2 should have been processed normally */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are + * properly handled and re-inserted into appropriate future slots after + * the wheel processes enough slots to accommodate the original request. 
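+ * A timeout beyond the wheel capacity leaves the excess in t_hpts_request; the tcpcb is parked within the current wheel and re-queued each revolution until the remainder fits.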
+ */ +KTEST_FUNC(deferred_requests) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp, *tp2; + struct tcp_hpts_entry *hpts; + uint32_t large_timeout_usecs = (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; /* Beyond wheel capacity */ + uint32_t huge_timeout_usecs = (NUM_OF_HPTSI_SLOTS * 3) * HPTS_USECS_PER_SLOT; /* 3x wheel capacity */ + uint32_t initial_request; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + + /* Insert with a request that exceeds current wheel capacity */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, large_timeout_usecs, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + /* Verify it was inserted with a deferred request */ + dump_tcpcb(tp); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_VERIFY(tp->t_hpts_request > 0); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Advance time to process deferred requests */ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + + /* Process the wheel to handle deferred requests */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, hpts); + KTEST_GREATER_THAN(slots_ran, 0); + dump_tcpcb(tp); + KTEST_EQUAL(tp->t_hpts_request, 0); + + /* Test incremental deferred request processing over multiple cycles */ + KTEST_LOG(ctx, "Testing incremental deferred request processing"); + + /* Create a new connection with an even larger request */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */ + + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, huge_timeout_usecs, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify initial deferred request */ + initial_request = tp2->t_hpts_request; + KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS); + + /* Process one wheel cycle - should reduce but not eliminate request */ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Request should be reduced but not zero */ + KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request); + KTEST_VERIFY(tp2->t_hpts_request > 0); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */ + + /* For huge_timeout_usecs = NUM_OF_HPTSI_SLOTS * 3 * HPTS_USECS_PER_SLOT, we need ~3 cycles to complete. + * Each cycle can reduce the request by at most NUM_OF_HPTSI_SLOTS. 
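+ * The second cycle below should therefore shrink the request further while still leaving it non-zero.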
*/ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* After second cycle, request should be reduced significantly (likely by ~NUM_OF_HPTSI_SLOTS) */ + KTEST_VERIFY(tp2->t_hpts_request < initial_request); + KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */ + + /* Clean up second connection */ + INP_WLOCK(&tp2->t_inpcb); + if (tp2->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tp2); + } + INP_WUNLOCK(&tp2->t_inpcb); + test_hpts_free_tcpcb(tp2); + + /* Clean up */ + INP_WLOCK(&tp->t_inpcb); + if (tp->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tp); + } + INP_WUNLOCK(&tp->t_inpcb); + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates CPU assignment and affinity mechanisms, including flowid-based + * assignment, random fallback scenarios, and explicit CPU setting. Tests + * the actual cpu assignment logic in hpts_cpuid via tcp_set_hpts. + */ +KTEST_FUNC(cpu_assignment) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2, *tp2_dup, *tp3; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + + /* Test random CPU assignment (no flowid) */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE; + INP_WLOCK(&tp1->t_inpcb); + tcp_set_hpts(pace, tp1); + INP_WUNLOCK(&tp1->t_inpcb); + KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss); + KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET); + + /* Test flowid-based assignment */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; + tp2->t_inpcb.inp_flowid = 12345; + INP_WLOCK(&tp2->t_inpcb); + tcp_set_hpts(pace, tp2); + INP_WUNLOCK(&tp2->t_inpcb); + KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss); + KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET); + + /* With the same flowid, should get same CPU assignment */ + tp2_dup = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2_dup, NULL); + tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; + tp2_dup->t_inpcb.inp_flowid = 12345; + INP_WLOCK(&tp2_dup->t_inpcb); + tcp_set_hpts(pace, tp2_dup); + INP_WUNLOCK(&tp2_dup->t_inpcb); + KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu); + + /* Test explicit CPU setting */ + tp3 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp3, NULL); + tp3->t_hpts_cpu = 1; /* Assume we have at least 2 CPUs */ + tp3->t_flags2 |= TF2_HPTS_CPU_SET; + INP_WLOCK(&tp3->t_inpcb); + tcp_set_hpts(pace, tp3); + INP_WUNLOCK(&tp3->t_inpcb); + KTEST_EQUAL(tp3->t_hpts_cpu, 1); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + test_hpts_free_tcpcb(tp2_dup); + test_hpts_free_tcpcb(tp3); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates edge cases in slot calculation including boundary conditions + * around slot 0, maximum slots, and slot wrapping arithmetic. 
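+ * Covers a 0 usec timeout (immediate insertion), the largest timeout that still fits on the wheel, and a 1 usec timeout that exercises the usec-to-slot conversion.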
+ */ +KTEST_FUNC(slot_boundary_conditions) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Test insertion at slot 0 */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert immediately (0 timeout) */ + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + /* Test insertion at maximum slot value */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, NULL); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + /* Test very small timeout values */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, 1, NULL); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(1)); /* Should convert 1 usec to slot */ + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS behavior under high load conditions, including proper + * processing of many connections and connection count tracking. + */ +KTEST_FUNC(dynamic_sleep_adjustment) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + struct tcp_hpts_entry *hpts; + uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Create many connections to exceed threshold */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tcpcbs[i], NULL); + tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */ + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */ + tcp_hpts_insert(pace, tcpcbs[i], 100, NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + hpts = pace->rp_ent[0]; + dump_hpts_entry(ctx, hpts); + + /* Verify we're above threshold */ + KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD); + + /* Run HPTS to process many connections */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Verify HPTS processed slots and connections correctly */ + KTEST_GREATER_THAN(slots_ran, 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs); + + /* Verify all connections were removed from queue */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + /* Cleanup */ + for (i = 0; i < num_tcpcbs; i++) { + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates handling of concurrent insert/remove operations and race conditions + * between HPTS processing and 
user operations. + */ +KTEST_FUNC(concurrent_operations) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2; + struct tcp_hpts_entry *hpts; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp1 = test_hpts_create_tcpcb(ctx, pace); + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + KTEST_NEQUAL(tp2, NULL); + + /* Force all to CPU 0 */ + tp1->t_hpts_cpu = 0; + tp2->t_hpts_cpu = 0; + + /* Insert tp1 */ + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, 100, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Insert tp2 into same slot */ + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, 100, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify both are inserted */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Verify they're both assigned to the same slot */ + KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot); + + /* Verify queue count reflects both connections */ + KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */ + hpts = pace->rp_ent[tp1->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 2); + + /* Remove tp1 while tp2 is still there */ + INP_WLOCK(&tp1->t_inpcb); + tcp_hpts_remove(pace, tp1); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Verify tp1 removed, tp2 still there */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Verify queue count decreased by one */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + + /* Remove tp2 */ + INP_WLOCK(&tp2->t_inpcb); + tcp_hpts_remove(pace, tp2); + INP_WUNLOCK(&tp2->t_inpcb); + + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + + /* Verify queue is now completely empty */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the queued segments processing path via tfb_do_queued_segments, + * which is an alternative to direct tcp_output calls. 
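+ * A tcpcb with TF2_SUPPORTS_MBUFQ set and an mbuf on t_inqueue should be dispatched via tfb_do_queued_segments rather than tcp_output.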
+ */ +KTEST_FUNC(queued_segments_processing) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + struct mbuf *fake_mbuf; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + + /* Create a minimal fake mbuf that has valid STAILQ pointers */ + fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_NEQUAL(fake_mbuf, NULL); + + /* Set up for queued segments path */ + tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ); + STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_insert(pace, tp, 100, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Run HPTS and verify queued segments path is taken */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1); + + /* Connection should be removed from HPTS after processing */ + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + + /* Clean up the fake mbuf if it's still in the queue */ + if (!STAILQ_EMPTY(&tp->t_inqueue)) { + struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue); + STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); + free(m, M_TCPHPTS); + } + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the direct wake mechanism and wake inhibition logic when + * the connection count exceeds thresholds. + */ +KTEST_FUNC(direct_wake_mechanism) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Test direct wake when not over threshold */ + HPTS_LOCK(hpts); + hpts->p_on_queue_cnt = 50; /* Below threshold */ + hpts->p_hpts_wake_scheduled = 0; + tcp_hpts_wake(hpts); + KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 1); + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); + HPTS_UNLOCK(hpts); + + /* Reset for next test */ + hpts->p_hpts_wake_scheduled = 0; + call_counts[CCNT_SWI_SCHED] = 0; + + /* Test wake inhibition when over threshold */ + HPTS_LOCK(hpts); + hpts->p_on_queue_cnt = 200; /* Above threshold */ + hpts->p_direct_wake = 1; /* Request direct wake */ + tcp_hpts_wake(hpts); + KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 0); /* Should be inhibited */ + KTEST_EQUAL(hpts->p_direct_wake, 0); /* Should be cleared */ + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); /* No SWI scheduled */ + HPTS_UNLOCK(hpts); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS collision detection when attempting to run HPTS while + * it's already active. 
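+ * With p_hpts_active already set, a second entry into tcp_hptsi() that does not come from the callout must back off and report zero slots processed.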
+ */ +KTEST_FUNC(hpts_collision_detection) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + hpts = pace->rp_ent[0]; + + /* Mark HPTS as active */ + HPTS_LOCK(hpts); + hpts->p_hpts_active = 1; + HPTS_UNLOCK(hpts); + + /* Attempt to run HPTS again - should detect collision */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */ + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Should return 0 indicating no work done due to collision */ + KTEST_EQUAL(slots_ran, 0); + + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates generation count handling for race condition detection between + * HPTS processing and connection insertion/removal operations. + */ +KTEST_FUNC(generation_count_validation) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + struct tcpcb *tp1, *tp2; + uint32_t initial_gencnt, slot_to_test = 10; + uint32_t timeout_usecs = slot_to_test * HPTS_USECS_PER_SLOT; + uint32_t tp2_original_gencnt; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + hpts = pace->rp_ent[0]; + + /* Record initial generation count for the test slot */ + initial_gencnt = hpts->p_hptss[slot_to_test].gencnt; + + /* Create and insert first connection */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + tp1->t_hpts_cpu = 0; /* Force to CPU 0 */ + + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, timeout_usecs, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Verify connection stored the generation count */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test); + KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt); + + /* Create second connection but don't insert yet */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_hpts_cpu = 0; /* Force to CPU 0 */ + + /* Force generation count increment by processing the slot */ + test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Verify processing occurred */ + KTEST_VERIFY(slots_ran > 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + + /* Verify generation count was incremented */ + KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1); + + /* Verify first connection was processed and removed */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + + /* Insert second connection and record its generation count */ + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, timeout_usecs, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify connection was inserted successfully */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Record the generation count that tp2 received */ + tp2_original_gencnt = tp2->t_hpts_gencnt; + + /* Test generation count mismatch detection during processing */ + /* Manually set stale generation count to simulate race condition */ + tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */ + + /* Process the slot to trigger generation count validation */ + test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + 
NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Connection should be processed despite generation count mismatch */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */ + + /* The key test: HPTS should handle mismatched generation counts gracefully */ + KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */ + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +static const struct ktest_test_info tests[] = { + KTEST_INFO(module_load), + KTEST_INFO(hptsi_create_destroy), + KTEST_INFO(hptsi_start_stop), + KTEST_INFO(hptsi_independence), + KTEST_INFO(function_injection), + KTEST_INFO(tcpcb_initialization), + KTEST_INFO(tcpcb_insertion), + KTEST_INFO(timer_functionality), + KTEST_INFO(scalability_tcpcbs), + KTEST_INFO(wheel_wrap_recovery), + KTEST_INFO(tcpcb_moving_state), + KTEST_INFO(deferred_requests), + KTEST_INFO(cpu_assignment), + KTEST_INFO(slot_boundary_conditions), + KTEST_INFO(dynamic_sleep_adjustment), + KTEST_INFO(concurrent_operations), + KTEST_INFO(queued_segments_processing), + KTEST_INFO(direct_wake_mechanism), + KTEST_INFO(hpts_collision_detection), + KTEST_INFO(generation_count_validation), +}; + +#else /* TCP_HPTS_KTEST */ + +/* + * Stub to indicate that the TCP HPTS ktest is not enabled. + */ +KTEST_FUNC(module_load_without_tests) +{ + KTEST_LOG(ctx, "Warning: TCP HPTS ktest is not enabled"); + return (0); +} + +static const struct ktest_test_info tests[] = { + KTEST_INFO(module_load_without_tests), +}; + +#endif + +KTEST_MODULE_DECLARE(ktest_tcphpts, tests); +KTEST_MODULE_DEPEND(ktest_tcphpts, tcphpts); diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index dd27ec77c1af..2146b0cac48f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -219,7 +219,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); -VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; +VNET_DEFINE(int, tcp_autorcvbuf_max) = 8*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 64efa4bf060f..9b5baf115855 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1475,10 +1475,11 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) } /* create sequence number */ - lc->lro_mbuf_data[lc->lro_mbuf_count].seq = - (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | - (((uint64_t)mb->m_pkthdr.flowid) << 24) | - ((uint64_t)lc->lro_mbuf_count); + lc->lro_mbuf_data[lc->lro_mbuf_count].seq = lc->lro_mbuf_count; + if (M_HASHTYPE_ISHASH(mb)) + lc->lro_mbuf_data[lc->lro_mbuf_count].seq |= + (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | + (((uint64_t)mb->m_pkthdr.flowid) << 24); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c index 43587285fe26..ac1a27a4290a 100644 --- a/sys/netinet/tcp_lro_hpts.c +++ b/sys/netinet/tcp_lro_hpts.c @@ -29,6 +29,8 @@ #include "opt_inet6.h" #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/malloc.h> @@ -62,6 +64,7 @@ #include <netinet/tcp_lro.h> #include <netinet/tcp_var.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #ifdef 
TCP_BLACKBOX #include <netinet/tcp_log_buf.h> #endif diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 2dfb7faf56e3..208f72c4661c 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -123,7 +123,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); -VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; +VNET_DEFINE(int, tcp_autosndbuf_max) = 8*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index f2d7867df9b4..66983edcdd73 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -480,7 +480,7 @@ bbr_find_lowest_rsm(struct tcp_bbr *bbr); static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type); static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which); static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, @@ -489,7 +489,7 @@ bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, static void bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag); static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay); static void @@ -724,7 +724,7 @@ bbr_minseg(struct tcp_bbr *bbr) } static void -bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) +bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t pacing_delay, uint32_t tot_len) { struct inpcb *inp = tptoinpcb(tp); struct hpts_diag diag; @@ -751,40 +751,40 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_timer_exp = 0; prev_delay = bbr->r_ctl.rc_last_delay_val; if (bbr->r_ctl.rc_last_delay_val && - (slot == 0)) { + (pacing_delay == 0)) { /* * If a previous pacer delay was in place we * are not coming from the output side (where * we calculate a delay, more likely a timer). */ - slot = bbr->r_ctl.rc_last_delay_val; + pacing_delay = bbr->r_ctl.rc_last_delay_val; if (TSTMP_GT(cts, bbr->rc_pacer_started)) { /* Compensate for time passed */ delay_calc = cts - bbr->rc_pacer_started; - if (delay_calc <= slot) - slot -= delay_calc; + if (delay_calc <= pacing_delay) + pacing_delay -= delay_calc; } } /* Do we have early to make up for by pushing out the pacing time? */ if (bbr->r_agg_early_set) { - bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); - slot += bbr->r_ctl.rc_agg_early; + bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, pacing_delay, 0, bbr->r_agg_early_set, 2); + pacing_delay += bbr->r_ctl.rc_agg_early; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; } /* Are we running a total debt that needs to be compensated for? 
*/ if (bbr->r_ctl.rc_hptsi_agg_delay) { - if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { + if (pacing_delay > bbr->r_ctl.rc_hptsi_agg_delay) { /* We nuke the delay */ - slot -= bbr->r_ctl.rc_hptsi_agg_delay; + pacing_delay -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } else { /* We nuke some of the delay, put in a minimal 100usecs */ - bbr->r_ctl.rc_hptsi_agg_delay -= slot; - bbr->r_ctl.rc_last_delay_val = slot = 100; + bbr->r_ctl.rc_hptsi_agg_delay -= pacing_delay; + bbr->r_ctl.rc_last_delay_val = pacing_delay = 100; } } - bbr->r_ctl.rc_last_delay_val = slot; + bbr->r_ctl.rc_last_delay_val = pacing_delay; hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (bbr->rc_in_persist == 0) { @@ -810,7 +810,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; hpts_timeout = delayed_ack; } - if (slot) { + if (pacing_delay) { /* Mark that we have a pacing timer up */ BBR_STAT_INC(bbr_paced_segments); bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; @@ -820,7 +820,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && - (slot == 0)) { + (pacing_delay == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* @@ -849,7 +849,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ if (left < hpts_timeout) hpts_timeout = left; } - if (bbr->r_ctl.rc_incr_tmrs && slot && + if (bbr->r_ctl.rc_incr_tmrs && pacing_delay && (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * If configured to do so, and the timer is either @@ -867,7 +867,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * this extra delay but this is easier and being more * conservative is probably better. */ - hpts_timeout += slot; + hpts_timeout += pacing_delay; } if (hpts_timeout) { /* @@ -879,10 +879,10 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } else bbr->r_ctl.rc_timer_exp = 0; - if ((slot) && + if ((pacing_delay) && (bbr->rc_use_google || bbr->output_error_seen || - (slot <= hpts_timeout)) ) { + (pacing_delay <= hpts_timeout)) ) { /* * Tell LRO that it can queue packets while * we pace. @@ -900,17 +900,15 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; bbr->rc_pacer_started = cts; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), - __LINE__, &diag); + tcp_hpts_insert(tp, pacing_delay, &diag); bbr->rc_timer_first = 0; bbr->bbr_timer_src = frm; - bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); + bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 1); bbr_log_hpts_diag(bbr, cts, &diag); } else if (hpts_timeout) { - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); /* - * We add the flag here as well if the slot is set, + * We add the flag here as well if the pacing delay is set, * since hpts will call in to clear the queue first before * calling the output routine (which does our timers). * We don't want to set the flag if its just a timer @@ -919,7 +917,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * on a keep-alive timer and a request comes in for * more data. 
*/ - if (slot) + if (pacing_delay) bbr->rc_pacer_started = cts; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { @@ -936,12 +934,12 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ TF2_DONT_SACK_QUEUE); } bbr->bbr_timer_src = frm; - bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); + bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 0); bbr_log_hpts_diag(bbr, cts, &diag); bbr->rc_timer_first = 1; } bbr->rc_tmr_stopped = 0; - bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); + bbr_log_type_bbrsnd(bbr, tot_len, pacing_delay, delay_calc, cts, frm, prev_delay); } static void @@ -1033,8 +1031,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock } /* * Ok the timer originally started is not what we want now. We will - * force the hpts to be stopped if any, and restart with the slot - * set to what was in the saved slot. + * force the hpts to be stopped if any, and restart with the pacing + * delay set to what was in the saved delay. */ wrong_timer: if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { @@ -2397,7 +2395,7 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; - log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex5 = diag->time_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -2411,9 +2409,6 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) log.u_bbr.bw_inuse = diag->wheel_slot; log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.delRate = diag->maxslots; - log.u_bbr.cur_del_rate = diag->p_curtick; - log.u_bbr.cur_del_rate <<= 32; - log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, @@ -2473,7 +2468,7 @@ bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, } static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; @@ -2483,7 +2478,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = to; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = slot; + log.u_bbr.flex4 = pacing_delay; log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.pkts_out = bbr->rc_tp->t_flags2; @@ -2733,13 +2728,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, } static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = del_by; log.u_bbr.flex3 = prev_delay; log.u_bbr.flex4 = line; @@ -5205,7 +5200,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t left = 
bbr->r_ctl.rc_timer_exp - cts; ret = -3; bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); - tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left)); + tcp_hpts_insert(tp, left, NULL); return (1); } bbr->rc_tmr_stopped = 0; @@ -5254,7 +5249,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) else time_since_send = 0; if (bbr->r_ctl.rc_last_delay_val > time_since_send) { - /* Cut down our slot time */ + /* Cut down our pacing delay */ bbr->r_ctl.rc_last_delay_val -= time_since_send; } else { bbr->r_ctl.rc_last_delay_val = 0; @@ -5888,7 +5883,7 @@ bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next - * slot (11). + * send (11). * */ INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -11856,7 +11851,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) struct bbr_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; - int32_t slot = 0; + int32_t pacing_delay = 0; struct inpcb *inp; struct sockbuf *sb; bool hpts_calling; @@ -11986,8 +11981,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) delay_calc -= bbr->r_ctl.rc_last_delay_val; else { /* - * We are early setup to adjust - * our slot time. + * We are early setup to adjust our pacing delay. */ uint64_t merged_val; @@ -12104,7 +12098,7 @@ again: #endif error = 0; tso = 0; - slot = 0; + pacing_delay = 0; mtu = 0; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sb_offset = tp->snd_max - tp->snd_una; @@ -12126,7 +12120,7 @@ recheck_resend: tot_len = tp->t_maxseg; if (hpts_calling) /* Retry in a ms */ - slot = 1001; + pacing_delay = 1001; goto just_return_nolock; } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); @@ -12699,9 +12693,9 @@ just_return: SOCK_SENDBUF_UNLOCK(so); just_return_nolock: if (tot_len) - slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) - slot = 0; + pacing_delay = 0; if (tot_len == 0) { if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= tp->snd_wnd) { @@ -12751,7 +12745,7 @@ just_return_nolock: /* Dont update the time if we did not send */ bbr->r_ctl.rc_last_delay_val = 0; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); + bbr_start_hpts_timer(bbr, tp, cts, 9, pacing_delay, tot_len); bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ @@ -12787,7 +12781,7 @@ send: flags &= ~TH_FIN; if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { /* Lets not send this */ - slot = 0; + pacing_delay = 0; goto just_return; } } @@ -13053,7 +13047,7 @@ send: /* * We have outstanding data, don't send a fin by itself!. 
*/ - slot = 0; + pacing_delay = 0; goto just_return; } /* @@ -13763,7 +13757,7 @@ nomore: if (tp->snd_cwnd < maxseg) tp->snd_cwnd = maxseg; } - slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; BBR_STAT_INC(bbr_saw_enobuf); if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_hdwr_pacing_enobuf, 1); @@ -13812,18 +13806,18 @@ nomore: } /* * Nuke all other things that can interfere - * with slot + * with pacing delay */ if ((tot_len + len) && (len >= tp->t_maxseg)) { - slot = bbr_get_pacing_delay(bbr, + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, (tot_len + len), cts, 0); - if (slot < bbr_error_base_paceout) - slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + if (pacing_delay < bbr_error_base_paceout) + pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; } else - slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 10, slot, + bbr_start_hpts_timer(bbr, tp, cts, 10, pacing_delay, tot_len); return (error); } @@ -13841,9 +13835,9 @@ nomore: } /* FALLTHROUGH */ default: - slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); + bbr_start_hpts_timer(bbr, tp, cts, 11, pacing_delay, 0); return (error); } #ifdef STATS @@ -13981,12 +13975,12 @@ skip_again: tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { /* - * Calculate/Re-Calculate the hptsi slot in usecs based on + * Calculate/Re-Calculate the hptsi timeout in usecs based on * what we have sent so far */ - slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) - slot = 0; + pacing_delay = 0; } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: @@ -13999,8 +13993,8 @@ enobufs: (more_to_rxt || ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { /* Rack cheats and shotguns out all rxt's 1ms apart */ - if (slot > 1000) - slot = 1000; + if (pacing_delay > 1000) + pacing_delay = 1000; } if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { /* @@ -14014,7 +14008,7 @@ enobufs: tcp_bbr_tso_size_check(bbr, cts); } } - bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); + bbr_start_hpts_timer(bbr, tp, cts, 12, pacing_delay, tot_len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; @@ -14132,8 +14126,7 @@ bbr_switch_failed(struct tcpcb *tp) } } else toval = HPTS_USECS_PER_SLOT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), - __LINE__, &diag); + tcp_hpts_insert(tp, toval, &diag); bbr_log_hpts_diag(bbr, cts, &diag); } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 11ef5ba706c5..0320479e882e 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -250,11 +250,11 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the co static int32_t rack_persist_min = 250000; /* 250usec */ static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */ -static uint32_t rack_max_reduce = 10; /* 
Percent we can reduce slot by */ +static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ -static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */ static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ @@ -278,7 +278,7 @@ static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ -static int32_t rack_slot_reduction = 4; +static int32_t rack_pacing_delay_reduction = 4; static int32_t rack_wma_divisor = 8; /* For WMA calculation */ static int32_t rack_cwnd_block_ends_measure = 0; static int32_t rack_rwnd_block_ends_measure = 0; @@ -478,7 +478,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack, uint16_t flex7, uint8_t mod); static void -rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality); static struct rack_sendmap * @@ -1107,7 +1107,7 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "burst_reduces", CTLFLAG_RW, - &rack_slot_reduction, 4, + &rack_pacing_delay_reduction, 4, "When doing only burst mitigation what is the reduce divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1399,7 +1399,7 @@ rack_init_sysctls(void) SYSCTL_CHILDREN(rack_timers), OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, &rack_max_reduce, 10, - "Max percentage we will reduce slot by for pacing when we are behind"); + "Max percentage we will reduce pacing delay by for pacing when we are behind"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmin", CTLFLAG_RW, @@ -2700,7 +2700,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t } static void -rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) { if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -2710,7 +2710,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.flex1 = rack->rc_tp->t_srtt; log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = slot; + log.u_bbr.flex4 = pacing_delay; log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = rack->rc_in_persist; @@ -3034,14 +3034,14 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, } static void -rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, 
int line) { if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; if (rack->rack_no_prr) log.u_bbr.flex2 = 0; else @@ -3139,7 +3139,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg } static void -rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, +rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay, uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) { if (tcp_bblogging_on(rack->rc_tp)) { @@ -3148,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = reason; if (rack->rack_no_prr) @@ -6482,7 +6482,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; - log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex5 = diag->time_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -6497,9 +6497,6 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.timeStamp = cts; log.u_bbr.delRate = diag->maxslots; - log.u_bbr.cur_del_rate = diag->p_curtick; - log.u_bbr.cur_del_rate <<= 32; - log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -6532,14 +6529,14 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin static void rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, - int32_t slot, uint32_t tot_len_this_send, int sup_rack) + int32_t usecs, uint32_t tot_len_this_send, int sup_rack) { struct hpts_diag diag; struct inpcb *inp = tptoinpcb(tp); struct timeval tv; uint32_t delayed_ack = 0; uint32_t hpts_timeout; - uint32_t entry_slot = slot; + uint32_t entry_usecs = usecs; uint8_t stopped; uint32_t left = 0; uint32_t us_cts; @@ -6560,7 +6557,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, rack->r_ctl.rc_hpts_flags = 0; us_cts = tcp_get_usecs(&tv); /* Now early/late accounting */ - rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0); if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { /* * We have a early carry over set, @@ -6571,7 +6568,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * penalize the next timer for being awoke * by an ack aka the rc_agg_early (non-paced mode). */ - slot += rack->r_ctl.rc_agg_early; + usecs += rack->r_ctl.rc_agg_early; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; } @@ -6583,29 +6580,29 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * really depends on what * the current pacing time is. */ - if (rack->r_ctl.rc_agg_delayed >= slot) { + if (rack->r_ctl.rc_agg_delayed >= usecs) { /* * We can't compensate for it all. * And we have to have some time * on the clock. We always have a min - * 10 slots (10 x 10 i.e. 100 usecs). 
+ * 10 HPTS timer units (10 x 10 i.e. 100 usecs). */ - if (slot <= HPTS_USECS_PER_SLOT) { + if (usecs <= HPTS_USECS_PER_SLOT) { /* We gain delay */ - rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot); - slot = HPTS_USECS_PER_SLOT; + rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs); + usecs = HPTS_USECS_PER_SLOT; } else { /* We take off some */ - rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT); - slot = HPTS_USECS_PER_SLOT; + rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT); + usecs = HPTS_USECS_PER_SLOT; } } else { - slot -= rack->r_ctl.rc_agg_delayed; + usecs -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; /* Make sure we have 100 useconds at minimum */ - if (slot < HPTS_USECS_PER_SLOT) { - rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot; - slot = HPTS_USECS_PER_SLOT; + if (usecs < HPTS_USECS_PER_SLOT) { + rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs; + usecs = HPTS_USECS_PER_SLOT; } if (rack->r_ctl.rc_agg_delayed == 0) rack->r_late = 0; @@ -6614,17 +6611,17 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, /* r_use_hpts_min is on and so is DGP */ uint32_t max_red; - max_red = (slot * rack->r_ctl.max_reduction) / 100; + max_red = (usecs * rack->r_ctl.max_reduction) / 100; if (max_red >= rack->r_ctl.rc_agg_delayed) { - slot -= rack->r_ctl.rc_agg_delayed; + usecs -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; } else { - slot -= max_red; + usecs -= max_red; rack->r_ctl.rc_agg_delayed -= max_red; } } if ((rack->r_use_hpts_min == 1) && - (slot > 0) && + (usecs > 0) && (rack->dgp_on == 1)) { /* * We are enforcing a min pacing timer @@ -6633,8 +6630,8 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, uint32_t min; min = get_hpts_min_sleep_time(); - if (min > slot) { - slot = min; + if (min > usecs) { + usecs = min; } } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); @@ -6652,7 +6649,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && - (slot == 0)) { + (usecs == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* @@ -6709,10 +6706,10 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } - rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); if ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0) && - (hpts_timeout < slot) && + (hpts_timeout < usecs) && (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * We have no good estimate yet for the @@ -6722,7 +6719,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * pace that long since we know the calculation * so far is not accurate. */ - slot = hpts_timeout; + usecs = hpts_timeout; } /** * Turn off all the flags for queuing by default. The @@ -6754,11 +6751,11 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * so LRO can call into us. 
*/ tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); - if (slot) { + if (usecs) { rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; - rack->r_ctl.rc_last_output_to = us_cts + slot; + rack->r_ctl.rc_last_output_to = us_cts + usecs; /* - * A pacing timer (slot) is being set, in + * A pacing timer (usecs microseconds) is being set, in * such a case we cannot send (we are blocked by * the timer). So lets tell LRO that it should not * wake us unless there is a SACK. Note this only @@ -6799,20 +6796,18 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, } if ((rack->use_rack_rr) && (rack->r_rr_config < 2) && - ((hpts_timeout) && (hpts_timeout < slot))) { + ((hpts_timeout) && (hpts_timeout < usecs))) { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); } else { - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), - __LINE__, &diag); + tcp_hpts_insert(tp, usecs, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 1); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 1); } } else if (hpts_timeout) { /* @@ -6824,22 +6819,21 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * at the start of this block) are good enough. */ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { - panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", - tp, rack, tot_len_this_send, cts, slot, hpts_timeout); + panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?", + tp, rack, tot_len_this_send, cts, usecs, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; - if (slot) - rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); + if (usecs) + rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__); } static void @@ -6870,6 +6864,18 @@ rack_mark_lost(struct tcpcb *tp, } } +static inline void +rack_mark_nolonger_lost(struct tcp_rack *rack, struct rack_sendmap *rsm) +{ + KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + rsm->r_flags &= ~RACK_WAS_LOST; + if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) + rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; + else + rack->r_ctl.rc_considered_lost = 0; +} + /* * RACK Timer, here we simply do logging and house keeping. 
* the normal rack_output() function will call the @@ -8016,7 +8022,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; ret = -3; left = rack->r_ctl.rc_timer_exp - cts; - tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); + tcp_hpts_insert(tp, left, NULL); rack_log_to_processing(rack, cts, ret, left); return (1); } @@ -8136,13 +8142,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, * remove the lost desgination and reduce the * bytes considered lost. */ - rsm->r_flags &= ~RACK_WAS_LOST; - KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), - ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); - if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) - rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; - else - rack->r_ctl.rc_considered_lost = 0; + rack_mark_nolonger_lost(rack, rsm); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; @@ -9498,6 +9498,11 @@ do_rest_ofb: if (rsm->r_flags & RACK_WAS_LOST) { int my_chg; + /* + * Note here we do not use our rack_mark_nolonger_lost() function + * since we are moving our data pointer around and the + * ack'ed side is already not considered lost. + */ my_chg = (nrsm->r_end - nrsm->r_start); KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); @@ -9665,16 +9670,11 @@ do_rest_ofb: changed += (rsm->r_end - rsm->r_start); /* You get a count for acking a whole segment or more */ if (rsm->r_flags & RACK_WAS_LOST) { - int my_chg; - - my_chg = (rsm->r_end - rsm->r_start); - rsm->r_flags &= ~RACK_WAS_LOST; - KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), - ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); - if (my_chg <= rack->r_ctl.rc_considered_lost) - rack->r_ctl.rc_considered_lost -= my_chg; - else - rack->r_ctl.rc_considered_lost = 0; + /* + * Here we can use the inline function since + * the rsm is truly marked lost and now no longer lost. + */ + rack_mark_nolonger_lost(rack, rsm); } rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ @@ -9857,6 +9857,10 @@ do_rest_ofb: if (rsm->r_flags & RACK_WAS_LOST) { int my_chg; + /* + * Note here we are using hookery again so we can't + * use our rack_mark_nolonger_lost() function. + */ my_chg = (nrsm->r_end - nrsm->r_start); KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); @@ -9958,16 +9962,10 @@ do_rest_ofb: rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); if (rsm->r_flags & RACK_WAS_LOST) { - int my_chg; - - my_chg = (rsm->r_end - rsm->r_start); - rsm->r_flags &= ~RACK_WAS_LOST; - KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), - ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); - if (my_chg <= rack->r_ctl.rc_considered_lost) - rack->r_ctl.rc_considered_lost -= my_chg; - else - rack->r_ctl.rc_considered_lost = 0; + /* + * Here it is safe to use our function. + */ + rack_mark_nolonger_lost(rack, rsm); } rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); @@ -10368,13 +10366,7 @@ more: * and yet before retransmitting we get an ack * which can happen due to reordering. 
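+ * This path now calls rack_mark_nolonger_lost() rather than open-coding the rc_considered_lost accounting.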
*/ - rsm->r_flags &= ~RACK_WAS_LOST; - KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), - ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); - if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) - rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; - else - rack->r_ctl.rc_considered_lost = 0; + rack_mark_nolonger_lost(rack, rsm); } rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; @@ -10482,12 +10474,7 @@ more: * which can happen due to reordering. In this * case its only a partial ack of the send. */ - KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)), - ("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm, rack, th_ack)); - if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)) - rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start; - else - rack->r_ctl.rc_considered_lost = 0; + rack_mark_nolonger_lost(rack, rsm); } /* * Clear the dup ack count for @@ -14377,8 +14364,7 @@ rack_switch_failed(struct tcpcb *tp) } } else toval = HPTS_USECS_PER_SLOT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), - __LINE__, &diag); + tcp_hpts_insert(tp, toval, &diag); rack_log_hpts_diag(rack, cts, &diag, &tv); } @@ -14973,8 +14959,7 @@ rack_init(struct tcpcb *tp, void **ptr) if (tov) { struct hpts_diag diag; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), - __LINE__, &diag); + tcp_hpts_insert(tp, tov, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); } } @@ -16367,7 +16352,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct rack_sendmap *rsm; int32_t prev_state = 0; int no_output = 0; - int slot_remaining = 0; + int time_remaining = 0; #ifdef TCP_ACCOUNTING int ack_val_set = 0xf; #endif @@ -16416,7 +16401,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * could be, if a sack is present, we want to be awoken and * so should process the packets. */ - slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; + time_remaining = rack->r_ctl.rc_last_output_to - us_cts; if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { no_output = 1; } else { @@ -16436,7 +16421,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, (*ts_ptr == TCP_LRO_TS_OPTION))) no_output = 1; } - if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { + if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) { /* * It is unrealistic to think we can pace in less than * the minimum granularity of the pacer (def:250usec). So @@ -16919,10 +16904,10 @@ do_output_now: (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use - * the remaining time (slot_remaining) to restart the timer. + * the remaining time (time_remaining) to restart the timer. 
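The hunks above also change units at the HPTS boundary: tcp_hpts_insert() now takes the timeout in microseconds, where tcp_hpts_insert_diag() took it pre-quantized into slots. A small sketch of the quantization the old path applied, assuming the historical 10-microsecond slot and round-up division (both assumptions, not stated in this diff):

#include <stdint.h>
#include <stdio.h>

#define HPTS_USECS_PER_SLOT	10	/* assumed slot granularity */
#define HPTS_USEC_TO_SLOTS(x)	(((x) + HPTS_USECS_PER_SLOT - 1) / \
				 HPTS_USECS_PER_SLOT)

int
main(void)
{
	uint32_t usecs = 4321;

	printf("old: %u slots, i.e. %u us after quantization\n",
	    HPTS_USEC_TO_SLOTS(usecs),
	    HPTS_USEC_TO_SLOTS(usecs) * HPTS_USECS_PER_SLOT);
	printf("new: %u us passed through unchanged\n", usecs);
	return (0);
}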
*/ - KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); - rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); + KASSERT ((time_remaining != 0), ("time_remaining is zero for rack:%p tp:%p", rack, tp)); + rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0); rack_free_trim(rack); } /* Clear the flag, it may have been cleared by output but we may not have */ @@ -17102,7 +17087,7 @@ check_it: } static void -rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality) { @@ -17125,7 +17110,7 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, } } memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; @@ -17284,25 +17269,25 @@ rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uin } static int32_t -pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) +pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) { uint64_t lentim, fill_bw; rack->r_via_fill_cw = 0; if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) - return (slot); + return (pacing_delay); if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) - return (slot); + return (pacing_delay); if (rack->r_ctl.rc_last_us_rtt == 0) - return (slot); + return (pacing_delay); if (rack->rc_pace_fill_if_rttin_range && (rack->r_ctl.rc_last_us_rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { /* The rtt is huge, N * smallest, lets not fill */ - return (slot); + return (pacing_delay); } if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) - return (slot); + return (pacing_delay); /* * first lets calculate the b/w based on the last us-rtt * and the smallest send window. @@ -17368,7 +17353,7 @@ at_lt_bw: if (non_paced) *rate_wanted = fill_bw; if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) - return (slot); + return (pacing_delay); rack->r_via_fill_cw = 1; if (rack->r_rack_hw_rate_caps && (rack->r_ctl.crte != NULL)) { @@ -17423,19 +17408,19 @@ at_lt_bw: lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; lentim /= fill_bw; *rate_wanted = fill_bw; - if (non_paced || (lentim < slot)) { - rack_log_pacing_delay_calc(rack, len, slot, fill_bw, + if (non_paced || (lentim < pacing_delay)) { + rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw, 0, lentim, 12, __LINE__, NULL, 0); return ((int32_t)lentim); } else - return (slot); + return (pacing_delay); } static int32_t rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) { uint64_t srtt; - int32_t slot = 0; + int32_t pacing_delay = 0; int can_start_hw_pacing = 1; int err; int pace_one; @@ -17483,25 +17468,25 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * cwnd. Which in that case we are just waiting for * an ACK. */ - slot = len / tr_perms; + pacing_delay = len / tr_perms; /* Now do we reduce the time so we don't run dry?
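pace_to_fill_cwnd() above only overrides the caller's delay when sending at the fill bandwidth would finish sooner: lentim is the payload's transmit time at fill_bw, and the smaller of lentim and pacing_delay wins. A toy computation with invented numbers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t fill_bw = 12500000;	/* 100 Mbit/s, in bytes/sec */
	uint64_t len = 64 * 1024;	/* bytes queued to send */
	uint32_t pacing_delay = 7000;	/* usec, from the rate calculation */
	/* lentim = len * HPTS_USEC_IN_SEC / fill_bw in the kernel */
	uint64_t lentim = len * 1000000 / fill_bw;

	printf("lentim=%llu us, using %llu us\n",
	    (unsigned long long)lentim,
	    (unsigned long long)(lentim < pacing_delay ?
	    lentim : pacing_delay));
	return (0);
}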
*/ - if (slot && rack_slot_reduction) { - reduce = (slot / rack_slot_reduction); - if (reduce < slot) { - slot -= reduce; + if (pacing_delay && rack_pacing_delay_reduction) { + reduce = (pacing_delay / rack_pacing_delay_reduction); + if (reduce < pacing_delay) { + pacing_delay -= reduce; } else - slot = 0; + pacing_delay = 0; } else reduce = 0; - slot *= HPTS_USEC_IN_MSEC; + pacing_delay *= HPTS_USEC_IN_MSEC; if (rack->rc_pace_to_cwnd) { uint64_t rate_wanted = 0; - slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); + pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1); rack->rc_ack_can_sendout_data = 1; - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); } else - rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); /*******************************************************/ /* RRS: We insert non-paced call to stats here for len */ /*******************************************************/ @@ -17575,7 +17560,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str segs *= oh; lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; res = lentim / rate_wanted; - slot = (uint32_t)res; + pacing_delay = (uint32_t)res; if (rack_hw_rate_min && (rate_wanted < rack_hw_rate_min)) { can_start_hw_pacing = 0; @@ -17635,7 +17620,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * We want to pace at our rate *or* faster to * fill the cwnd to the max if its not full. */ - slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); + pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0); /* Re-check to make sure we are not exceeding our max b/w */ if ((rack->r_ctl.crte != NULL) && (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { @@ -17786,15 +17771,15 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str srtt = rack->rc_tp->t_srtt; else srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ - if (srtt < (uint64_t)slot) { - rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); - slot = srtt; + if (srtt < (uint64_t)pacing_delay) { + rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); + pacing_delay = srtt; } } /*******************************************************************/ /* RRS: We insert paced call to stats here for len and rate_wanted */ /*******************************************************************/ - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); } if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { /* @@ -17811,9 +17796,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str hw_boost_delay = rack_enobuf_hw_max; else if (hw_boost_delay < rack_enobuf_hw_min) hw_boost_delay = rack_enobuf_hw_min; - slot += hw_boost_delay; + pacing_delay += hw_boost_delay; } - return (slot); + return (pacing_delay); } static void @@ -18482,7 +18467,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, 
struct rack_sendma struct tcpopt to; u_char opt[TCP_MAXOLEN]; uint32_t hdrlen, optlen; - int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; + int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0; uint16_t flags; uint32_t if_hw_tsomaxsegcount = 0, startseq; uint32_t if_hw_tsomaxsegsize; @@ -18688,9 +18673,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } if (rack->r_ctl.crte != NULL) { /* See if we can send via the hw queue */ - slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); + pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); /* If there is nothing in queue (no pacing time) we can send via the hw queue */ - if (slot == 0) + if (pacing_delay == 0) ip_sendflag = 0; } tcp_set_flags(th, flags); @@ -18955,20 +18940,20 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma rack_log_queue_level(tp, rack, len, tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 0x7f) rack->rc_enobuf++; - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) + pacing_delay = 10 * HPTS_USEC_IN_MSEC; if (rack->r_ctl.crte != NULL) { counter_u64_add(rack_saw_enobuf_hw, 1); tcp_rl_log_enobuf(rack->r_ctl.crte); } counter_u64_add(rack_saw_enobuf, 1); } else { - slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); } - rack_start_hpts_timer(rack, tp, cts, slot, len, 0); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19071,7 +19056,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, #ifdef TCP_ACCOUNTING int cnt_thru = 1; #endif - int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; + int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; uint16_t flags; uint32_t s_soff; uint32_t if_hw_tsomaxsegcount = 0, startseq; @@ -19519,8 +19504,8 @@ again: } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); counter_u64_add(rack_fto_send, 1); - slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); - rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); + pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19707,7 +19692,7 @@ rack_output(struct tcpcb *tp) struct rack_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; - int32_t slot = 0; + int32_t pacing_delay = 0; int32_t sup_rack = 0; uint32_t cts, ms_cts, delayed, early; uint32_t add_flag = RACK_SENT_SP; @@ -20070,7 +20055,7 @@ again: if (rsm == NULL) { if (hpts_calling) /* Retry in a ms */ - slot = (1 * HPTS_USEC_IN_MSEC); + pacing_delay = (1 * HPTS_USEC_IN_MSEC); so = inp->inp_socket; sb = &so->so_snd; goto just_return_nolock; @@ -20877,7 +20862,7 @@ just_return_nolock: } if (tot_len_this_send > 0) { rack->r_ctl.fsb.recwin = recwin; - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); if ((error == 0) && rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 
0) && @@ -21060,8 +21045,8 @@ just_return_nolock: /* Yes lets make sure to move to persist before timer-start */ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); } - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); - rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack); + rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use); } #ifdef NETFLIX_SHARED_CWND if ((sbavail(sb) == 0) && @@ -21100,8 +21085,8 @@ send: * we come around to again, the flag will be clear. */ check_done = 1; - slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); - if (slot) { + pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); + if (pacing_delay) { rack->r_ctl.rc_agg_delayed = 0; rack->r_ctl.rc_agg_early = 0; rack->r_early = 0; @@ -22358,11 +22343,11 @@ nomore: rack_log_queue_level(tp, rack, len, &tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 0x7f) rack->rc_enobuf++; - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) + pacing_delay = 10 * HPTS_USEC_IN_MSEC; if (rack->r_ctl.crte != NULL) { counter_u64_add(rack_saw_enobuf_hw, 1); tcp_rl_log_enobuf(rack->r_ctl.crte); @@ -22389,8 +22374,8 @@ nomore: goto again; } } - slot = 10 * HPTS_USEC_IN_MSEC; - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + pacing_delay = 10 * HPTS_USEC_IN_MSEC; + rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22412,8 +22397,8 @@ nomore: } /* FALLTHROUGH */ default: - slot = 10 * HPTS_USEC_IN_MSEC; - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + pacing_delay = 10 * HPTS_USEC_IN_MSEC; + rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22456,18 +22441,18 @@ enobufs: /* * We don't send again after sending a RST. */ - slot = 0; + pacing_delay = 0; sendalot = 0; if (error == 0) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); - } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { + } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) { /* * Get our pacing rate, if an error * occurred in sending (ENOBUF) we would * hit the else if with slot preset. Other * errors return. 
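rack_fast_rsm_output() and rack_output() share the ENOBUFS backoff visible in these hunks: one extra millisecond per consecutive ENOBUFS return, floored at 10 ms so the retry never lands inside the pacer's minimum granularity. A runnable sketch of that schedule; HPTS_USEC_IN_MSEC is assumed to be 1000:

#include <stdint.h>
#include <stdio.h>

#define HPTS_USEC_IN_MSEC	1000	/* assumed, the conventional value */

int
main(void)
{
	for (uint32_t enobuf = 0; enobuf < 13; enobuf++) {
		uint32_t delay = (1 + enobuf) * HPTS_USEC_IN_MSEC;

		if (delay < 10 * HPTS_USEC_IN_MSEC)
			delay = 10 * HPTS_USEC_IN_MSEC;	/* 10 ms floor */
		printf("enobuf=%u -> pacing_delay=%u us\n", enobuf, delay);
	}
	return (0);
}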
*/ - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); } /* We have sent clear the flag */ rack->r_ent_rec_ns = 0; @@ -22499,7 +22484,7 @@ enobufs: */ tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); } - if (slot) { + if (pacing_delay) { /* set the rack tcb into the slot N */ if ((error == 0) && rack_use_rfo && @@ -22564,7 +22549,7 @@ skip_all_send: /* Assure when we leave that snd_nxt will point to top */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount() - ts_val; if (tot_len_this_send) { diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index def6bc886617..be20fb44a820 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1046,6 +1046,8 @@ abort: * * On syncache_socket() success the newly created socket * has its underlying inp locked. + * + * *lsop is updated, if and only if 1 is returned. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, @@ -1094,12 +1096,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_spurcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected " "(syncookies disabled)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } if (sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { @@ -1109,12 +1113,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_spurcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected " "(no syncache entry)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } SCH_UNLOCK(sch); } @@ -1128,11 +1134,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, TCPSTAT_INC(tcps_sc_recvcookie); } else { TCPSTAT_INC(tcps_sc_failcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* If received ACK has MD5 signature, check it. 
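The syncache_expand() rework in the surrounding hunks replaces the shared failed: label with explicit free-and-return paths, and the header comment now pins the contract down: *lsop is written if and only if 1 is returned, 0 asks the caller to send an RST, and -1 drops the segment silently. A toy model of that calling convention; every name in it is hypothetical:

#include <stdio.h>

/* Returns 1 and writes *out on success; 0 (reject, send RST) and
 * -1 (drop, no RST) leave *out untouched. */
static int
expand(int ok, int drop, int **out, int *sock)
{
	if (drop)
		return (-1);
	if (!ok)
		return (0);
	*out = sock;
	return (1);	/* *out updated here, and only here */
}

int
main(void)
{
	int sock = 42, *lso = NULL;

	switch (expand(1, 0, &lso, &sock)) {
	case 1:
		printf("expanded, lso=%d\n", *lso);
		break;
	case 0:
		printf("rejected, send RST\n");
		break;
	default:
		printf("dropped, no RST\n");
	}
	return (0);
}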
*/ @@ -1206,9 +1214,9 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "%s; %s: SEG.TSval %u < TS.Recent %u, " "segment dropped\n", s, __func__, to->to_tsval, sc->sc_tsreflect); - free(s, M_TCPLOG); } SCH_UNLOCK(sch); + free(s, M_TCPLOG); return (-1); /* Do not send RST */ } @@ -1225,7 +1233,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "expected, segment processed normally\n", s, __func__); free(s, M_TCPLOG); - s = NULL; } } @@ -1272,7 +1279,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, syncache_send_challenge_ack(sc, m); SCH_UNLOCK(sch); free(s, M_TCPLOG); - return (-1); /* Do not send RST */; + return (-1); /* Do not send RST */ } /* @@ -1285,7 +1292,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "segment rejected\n", s, __func__, th->th_ack, sc->sc_iss + 1); SCH_UNLOCK(sch); - goto failed; + free(s, M_TCPLOG); + return (0); /* Do send RST, do not free sc. */ } TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); @@ -1311,16 +1319,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if (sc != &scs) syncache_free(sc); return (1); -failed: - if (sc != NULL) { - TCPSTATES_DEC(TCPS_SYN_RECEIVED); - if (sc != &scs) - syncache_free(sc); - } - if (s != NULL) - free(s, M_TCPLOG); - *lsop = NULL; - return (0); } static struct socket * diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 4f756a75fac7..b98703bdfbfe 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -1295,8 +1295,8 @@ in6_addifaddr(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *i */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; - pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, - NULL); + pr0.ndpr_plen = ia->ia_plen = + in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; @@ -1490,16 +1490,16 @@ in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) * positive reference. 
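The in6.c hunks above record the configured prefix length on the address itself (ia->ia_plen) via in6_mask2len(), and in6_unlink_ifa() then uses ia_plen < 128 to keep the "no prefix" notice away from host (/128) addresses. A simplified userspace sketch of the mask-to-length conversion; the real in6_mask2len() additionally reports non-contiguous masks through its second argument:

#include <stdint.h>
#include <stdio.h>

/* Count the leading one bits of a 128-bit prefix mask
 * (contiguous masks only, for illustration). */
static int
mask2len(const uint8_t m[16])
{
	int len = 0;

	for (int i = 0; i < 16; i++) {
		if (m[i] == 0xff) {
			len += 8;
			continue;
		}
		for (uint8_t b = 0x80; b != 0 && (m[i] & b) != 0; b >>= 1)
			len++;
		break;
	}
	return (len);
}

int
main(void)
{
	uint8_t mask[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe0 };

	printf("plen=%d\n", mask2len(mask));	/* /59 */
	return (0);
}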
*/ remove_lle = 0; - if (ia->ia6_ndpr == NULL) { - nd6log((LOG_NOTICE, - "in6_unlink_ifa: autoconf'ed address " - "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); - } else { + if (ia->ia6_ndpr != NULL) { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcount != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; + } else if (ia->ia_plen < 128) { + nd6log((LOG_NOTICE, + "in6_unlink_ifa: autoconf'ed address " + "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } nd6_rem_ifa_lle(ia, remove_lle); diff --git a/sys/netinet6/in6_fib_algo.c b/sys/netinet6/in6_fib_algo.c index 10ffe7ab0265..ef5cfc6d5ef6 100644 --- a/sys/netinet6/in6_fib_algo.c +++ b/sys/netinet6/in6_fib_algo.c @@ -351,7 +351,7 @@ struct fib_lookup_module flm_radix6 = { }; static void -fib6_algo_init(void) +fib6_algo_init(void *dummy __unused) { fib_module_register(&flm_radix6_lockless); diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c index 737d4a50098a..05a01b75e0bb 100644 --- a/sys/netipsec/xform_ipcomp.c +++ b/sys/netipsec/xform_ipcomp.c @@ -750,7 +750,7 @@ static struct xformsw ipcomp_xformsw = { }; static void -ipcomp_attach(void) +ipcomp_attach(void *dummy __unused) { #ifdef INET @@ -763,7 +763,7 @@ ipcomp_attach(void) } static void -ipcomp_detach(void) +ipcomp_detach(void *dummy __unused) { #ifdef INET diff --git a/sys/netlink/netlink_snl.h b/sys/netlink/netlink_snl.h index 6dd8a9cbdb35..57f7e1e29d08 100644 --- a/sys/netlink/netlink_snl.h +++ b/sys/netlink/netlink_snl.h @@ -1068,14 +1068,14 @@ snl_init_writer(struct snl_state *ss, struct snl_writer *nw) { nw->size = SNL_WRITER_BUFFER_SIZE; nw->base = (char *)snl_allocz(ss, nw->size); - if (nw->base == NULL) { + if (__predict_false(nw->base == NULL)) { nw->error = true; nw->size = 0; - } + } else + nw->error = false; nw->offset = 0; nw->hdr = NULL; - nw->error = false; nw->ss = ss; } diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c index 03116cb0641c..3a8de2b2bfee 100644 --- a/sys/netpfil/ipfw/ip_dn_io.c +++ b/sys/netpfil/ipfw/ip_dn_io.c @@ -43,6 +43,7 @@ #include <sys/priv.h> #include <sys/proc.h> #include <sys/rwlock.h> +#include <sys/sdt.h> #include <sys/socket.h> #include <sys/time.h> #include <sys/sysctl.h> @@ -70,6 +71,9 @@ #endif #include <netpfil/ipfw/dn_sched.h> +SDT_PROVIDER_DEFINE(dummynet); +SDT_PROBE_DEFINE2(dummynet, , , drop, "struct mbuf *", "struct dn_queue *"); + /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) @@ -545,6 +549,7 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) drop: V_dn_cfg.io_pkt_drop++; + SDT_PROBE2(dummynet, , , drop, m, q); q->ni.drops++; ni->drops++; FREE_PKT(m); @@ -1001,6 +1006,7 @@ done: dropit: V_dn_cfg.io_pkt_drop++; + SDT_PROBE2(dummynet, , , drop, m, q); DN_BH_WUNLOCK(); if (m) FREE_PKT(m); diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c index b3f52322425f..d522f9da0fbe 100644 --- a/sys/netpfil/ipfw/ip_dummynet.c +++ b/sys/netpfil/ipfw/ip_dummynet.c @@ -1150,7 +1150,7 @@ copy_data_helper(void *_o, void *_arg) return 0; /* not a pipe */ /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { + for (; r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; /* Found a valid entry, copy and we are done */ @@ -1183,7 +1183,7 @@ copy_data_helper(void *_o, void *_arg) if (n >= DN_MAX_ID) return 0; /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { +
for (; r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; if (copy_flowset(a, fs, 0)) diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c index b59d8d08bf80..d15d7760d7f1 100644 --- a/sys/netpfil/ipfw/ip_fw2.c +++ b/sys/netpfil/ipfw/ip_fw2.c @@ -3578,11 +3578,9 @@ sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) /* * Stuff that must be initialised only on boot or module load */ -static int -ipfw_init(void) +static void +ipfw_init(void *dummy __unused) { - int error = 0; - /* * Only print out this stuff the first time around, * when called from the sysinit code. @@ -3627,14 +3625,13 @@ ipfw_init(void) ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); - return (error); } /* * Called for the removal of the last instance only on module unload. */ static void -ipfw_destroy(void) +ipfw_destroy(void *dummy __unused) { ipfw_iface_destroy(); diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c index 1e2ff1bca290..8bd27f6885ab 100644 --- a/sys/netpfil/ipfw/ip_fw_nat.c +++ b/sys/netpfil/ipfw/ip_fw_nat.c @@ -999,9 +999,11 @@ ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; - int i; + int error, i; - sooptcopyin(sopt, &i, sizeof i, sizeof i); + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error != 0) + return (error); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); @@ -1104,7 +1106,7 @@ ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; - int i, size; + int error, i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; @@ -1134,9 +1136,9 @@ ipfw_nat_get_log(struct sockopt *sopt) i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); - sooptcopyout(sopt, data, size); + error = sooptcopyout(sopt, data, size); free(data, M_IPFW); - return(0); + return (error); } static int @@ -1166,7 +1168,7 @@ vnet_ipfw_nat_uninit(const void *arg __unused) } static void -ipfw_nat_init(void) +ipfw_nat_init(void *dummy __unused) { /* init ipfw hooks */ @@ -1183,7 +1185,7 @@ ipfw_nat_init(void) } static void -ipfw_nat_destroy(void) +ipfw_nat_destroy(void *dummy __unused) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index d6fc24a23fe9..fd70fb1c8a36 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -5965,6 +5965,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, ctx.nat_pool = &(ctx.nr->rdr); } + *ctx.rm = &V_pf_default_rule; if (ctx.nr && ctx.nr->natpass) { r = ctx.nr; ruleset = *ctx.rsm; diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c index d58af6e5ec4d..a4557f139ae5 100644 --- a/sys/netpfil/pf/pf_ioctl.c +++ b/sys/netpfil/pf/pf_ioctl.c @@ -259,7 +259,7 @@ static void dehook_pf_eth(void); static void dehook_pf(void); static int shutdown_pf(void); static int pf_load(void); -static void pf_unload(void); +static void pf_unload(void *); static struct cdevsw pf_cdevsw = { .d_ioctl = pfioctl, @@ -7082,7 +7082,7 @@ pf_unload_vnet(void) } static void -pf_unload(void) +pf_unload(void *dummy __unused) { sx_xlock(&pf_end_lock); diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c index fb1b121d0bc0..5d85e16f18e3 100644 --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -216,6 +216,7 @@ pf_match_translation_rule(int rs_num, struct pf_test_ctx *ctx, struct pf_krulese */ ctx->arsm = ctx->aruleset; } + break; } else { ctx->a = r; /* remember anchor */ ctx->aruleset = ruleset; /* and its ruleset */ diff --git a/sys/nfs/nfs_diskless.c 
b/sys/nfs/nfs_diskless.c index 42cfee63d184..0f0cf80feeec 100644 --- a/sys/nfs/nfs_diskless.c +++ b/sys/nfs/nfs_diskless.c @@ -428,7 +428,7 @@ decode_nfshandle(char *ev, u_char *fh, int maxfh) #if !defined(BOOTP_NFSROOT) static void -nfs_rootconf(void) +nfs_rootconf(void *dummy __unused) { nfs_setup_diskless(); diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 796b1719b8ba..01bf4c7e90a8 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -297,7 +297,7 @@ static u_int moea64_clear_bit(vm_page_t, uint64_t); static void moea64_kremove(vm_offset_t); static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); -static void moea64_pmap_init_qpages(void); +static void moea64_pmap_init_qpages(void *); static void moea64_remove_locked(pmap_t, vm_offset_t, vm_offset_t, struct pvo_dlist *); @@ -1284,7 +1284,7 @@ moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) } static void -moea64_pmap_init_qpages(void) +moea64_pmap_init_qpages(void *dummy __unused) { struct pcpu *pc; int i; diff --git a/sys/powerpc/cpufreq/pmcr.c b/sys/powerpc/cpufreq/pmcr.c index dd489b607606..6ae0777a8ac7 100644 --- a/sys/powerpc/cpufreq/pmcr.c +++ b/sys/powerpc/cpufreq/pmcr.c @@ -40,7 +40,8 @@ static int pstate_ids[256]; static int pstate_freqs[256]; static int npstates; -static void parse_pstates(void) +static void +parse_pstates(void *dummy __unused) { phandle_t node; diff --git a/sys/powerpc/powerpc/elf32_machdep.c b/sys/powerpc/powerpc/elf32_machdep.c index e1118713bff0..6b904c02ea15 100644 --- a/sys/powerpc/powerpc/elf32_machdep.c +++ b/sys/powerpc/powerpc/elf32_machdep.c @@ -143,7 +143,7 @@ struct sysentvec elf32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); -static Elf32_Brandinfo freebsd_brand_info = { +static const Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC, .compat_3_brand = "FreeBSD", @@ -158,11 +158,11 @@ static Elf32_Brandinfo freebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); -static Elf32_Brandinfo freebsd_brand_oinfo = { +static const Elf32_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC, .compat_3_brand = "FreeBSD", @@ -173,7 +173,7 @@ static Elf32_Brandinfo freebsd_brand_oinfo = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_oinfo); diff --git a/sys/powerpc/powerpc/elf64_machdep.c b/sys/powerpc/powerpc/elf64_machdep.c index a999f742caeb..0be40bed69cb 100644 --- a/sys/powerpc/powerpc/elf64_machdep.c +++ b/sys/powerpc/powerpc/elf64_machdep.c @@ -154,7 +154,7 @@ static bool ppc64_elfv1_header_match(const struct image_params *params, static bool ppc64_elfv2_header_match(const struct image_params *params, const int32_t *, const uint32_t *); -static Elf64_Brandinfo freebsd_brand_info_elfv1 = { +static const Elf64_Brandinfo freebsd_brand_info_elfv1 = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC64, .compat_3_brand = "FreeBSD", @@ -166,11 +166,11 @@ static Elf64_Brandinfo freebsd_brand_info_elfv1 = { .header_supported = &ppc64_elfv1_header_match }; -SYSINIT(elf64v1, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(elf64v1, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, 
&freebsd_brand_info_elfv1); -static Elf64_Brandinfo freebsd_brand_info_elfv2 = { +static const Elf64_Brandinfo freebsd_brand_info_elfv2 = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC64, .compat_3_brand = "FreeBSD", @@ -182,11 +182,11 @@ static Elf64_Brandinfo freebsd_brand_info_elfv2 = { .header_supported = &ppc64_elfv2_header_match }; -SYSINIT(elf64v2, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(elf64v2, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_info_elfv2); -static Elf64_Brandinfo freebsd_brand_oinfo = { +static const Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC64, .compat_3_brand = "FreeBSD", @@ -198,7 +198,7 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .header_supported = &ppc64_elfv1_header_match }; -SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo); diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h index de7119dd534a..e227dd825966 100644 --- a/sys/riscv/include/vmm.h +++ b/sys/riscv/include/vmm.h @@ -123,10 +123,33 @@ struct vm_eventinfo { int *iptr; /* reqidle cookie */ }; +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (void)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, exception, (void *vcpui, uint64_t scause)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); + int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -212,7 +235,6 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); -struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); diff --git a/sys/riscv/riscv/elf_machdep.c b/sys/riscv/riscv/elf_machdep.c index 67b1fcc4c1a9..5bd4af4c15f8 100644 --- a/sys/riscv/riscv/elf_machdep.c +++ b/sys/riscv/riscv/elf_machdep.c @@ -100,7 +100,7 @@ static struct sysentvec elf64_freebsd_sysvec = { }; INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); -static Elf64_Brandinfo freebsd_brand_info = { +static const Elf64_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_RISCV, .compat_3_brand = "FreeBSD", @@ -110,7 +110,7 @@ static Elf64_Brandinfo 
freebsd_brand_info = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info); static void diff --git a/sys/riscv/vmm/riscv.h b/sys/riscv/vmm/riscv.h index 870d0d6c5cd1..917a333520ed 100644 --- a/sys/riscv/vmm/riscv.h +++ b/sys/riscv/vmm/riscv.h @@ -122,29 +122,6 @@ struct hyptrap { uint64_t htinst; }; -#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - ret_type vmmops_##opname args; - -DEFINE_VMMOPS_IFUNC(int, modinit, (void)) -DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) -DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) -DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) -DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, - struct vm_eventinfo *info)) -DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) -DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, - int vcpu_id)) -DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) -DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t scause)) -DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) -DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) -DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) -DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) -DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, - vm_offset_t max)) -DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) - #define dprintf(fmt, ...) struct hypctx *riscv_get_active_vcpu(void); diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index ec4514f70fa6..4c9b1fa53f7a 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -92,7 +92,6 @@ struct vcpu { struct fpreg *guestfpu; /* (a,i) guest fpu state */ }; -#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) @@ -121,7 +120,6 @@ struct vm { bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ - struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) [m+v] guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ @@ -174,6 +172,7 @@ vcpu_cleanup(struct vcpu *vcpu, bool destroy) vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); + free(vcpu, M_VMM); } } @@ -285,7 +284,7 @@ vm_init(struct vm *vm, bool create) { int i; - vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); @@ -347,9 +346,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -362,7 +361,7 @@ int vm_create(const char *name, struct vm **retvm) { struct vm *vm; - struct vmspace *vmspace; + int error; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -374,14 +373,13 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || 
strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); - vmspace = vmmops_vmspace_alloc(0, 1ul << 39); - if (vmspace == NULL) - return (ENOMEM); - vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, 1ul << 39); + if (error != 0) { + free(vm, M_VMM); + return (error); + } strcpy(vm->name, name); - vm->vmspace = vmspace; - vm_mem_init(&vm->mem); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; @@ -450,11 +448,6 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_mem_destroy(vm); - vmmops_vmspace_free(vm->vmspace); - vm->vmspace = NULL; - - for (i = 0; i < vm->maxcpus; i++) - free(vm->vcpu[i], M_VMM); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } @@ -760,12 +753,6 @@ vcpu_notify_event(struct vcpu *vcpu) vcpu_unlock(vcpu); } -struct vmspace * -vm_vmspace(struct vm *vm) -{ - return (vm->vmspace); -} - struct vm_mem * vm_mem(struct vm *vm) { @@ -1084,7 +1071,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) vm = vcpu->vm; vme = &vcpu->exitinfo; - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); addr = (vme->htval << 2) & ~(PAGE_SIZE - 1); dprintf("%s: %lx\n", __func__, addr); @@ -1107,7 +1094,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) if (pmap_fault(pmap, addr, ftype)) return (0); - map = &vm->vmspace->vm_map; + map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) { printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n", @@ -1189,7 +1176,7 @@ vm_run(struct vcpu *vcpu) if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = NULL; evinfo.sptr = &vm->suspend; diff --git a/sys/rpc/auth.h b/sys/rpc/auth.h index 33c33ffd594d..648fb99a3a27 100644 --- a/sys/rpc/auth.h +++ b/sys/rpc/auth.h @@ -354,6 +354,10 @@ __END_DECLS #define RPCSEC_GSS 6 /* RPCSEC_GSS */ #define AUTH_TLS 7 /* Initiate RPC-over-TLS */ +/* RFC 5531's prescribed limits for variable-length arrays. */ +#define AUTH_SYS_MAX_HOSTNAME 255 +#define AUTH_SYS_MAX_GROUPS 16 /* Supplementary groups. */ + /* * Pseudo auth flavors for RPCSEC_GSS. */ diff --git a/sys/rpc/authunix_prot.c b/sys/rpc/authunix_prot.c index b107d5541c50..ff4c12c3f52e 100644 --- a/sys/rpc/authunix_prot.c +++ b/sys/rpc/authunix_prot.c @@ -30,7 +30,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> /* * authunix_prot.c * XDR for UNIX style authentication parameters for RPC */ @@ -40,8 +39,7 @@ #include <sys/param.h> #include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/systm.h> +#include <sys/libkern.h> #include <sys/ucred.h> #include <rpc/types.h> @@ -50,9 +48,6 @@ #include <rpc/rpc_com.h> -/* gids compose part of a credential; there may not be more than 16 of them */ -#define NGRPS 16 - /* * XDR for unix authentication parameters. */ @@ -60,25 +55,23 @@ bool_t xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) { uint32_t namelen; - uint32_t ngroups, i; + uint32_t supp_ngroups, i; uint32_t junk; char hostbuf[MAXHOSTNAMELEN]; + if (xdrs->x_op == XDR_FREE) + /* This function does not allocate auxiliary memory. */ + return (TRUE); + if (xdrs->x_op == XDR_ENCODE) { - /* - * Restrict name length to 255 according to RFC 1057.
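The new AUTH_SYS_MAX_HOSTNAME and AUTH_SYS_MAX_GROUPS constants above spell out the RFC 5531 limits that the old code carried as a bare 255 and a local NGRPS. A quick size check that a maximal AUTH_SYS credential body fits the 400-byte cap RFC 5531 places on an opaque_auth body; RNDUP here mirrors XDR's four-byte padding:

#include <stdio.h>

#define BYTES_PER_XDR_UNIT	4
#define RNDUP(x)	((((x) + BYTES_PER_XDR_UNIT - 1) / \
			 BYTES_PER_XDR_UNIT) * BYTES_PER_XDR_UNIT)

int
main(void)
{
	int max = 4			/* stamp */
	    + 4 + RNDUP(255)		/* machinename length + padded bytes */
	    + 4 + 4			/* uid + gid */
	    + 4 + 16 * 4;		/* gids length + up to 16 gids */

	printf("max AUTH_SYS body: %d bytes (cap is 400)\n", max); /* 340 */
	return (0);
}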
- */ getcredhostname(NULL, hostbuf, sizeof(hostbuf)); namelen = strlen(hostbuf); - if (namelen > 255) - namelen = 255; - } else { + if (namelen > AUTH_SYS_MAX_HOSTNAME) + namelen = AUTH_SYS_MAX_HOSTNAME; + } else namelen = 0; - } - junk = 0; - if (!xdr_uint32_t(xdrs, time) - || !xdr_uint32_t(xdrs, &namelen)) + if (!xdr_uint32_t(xdrs, time) || !xdr_uint32_t(xdrs, &namelen)) return (FALSE); /* @@ -88,43 +81,65 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) if (!xdr_opaque(xdrs, hostbuf, namelen)) return (FALSE); } else { + if (namelen > AUTH_SYS_MAX_HOSTNAME) + return (FALSE); xdr_setpos(xdrs, xdr_getpos(xdrs) + RNDUP(namelen)); } if (!xdr_uint32_t(xdrs, &cred->cr_uid)) return (FALSE); + + /* + * Safety check: The protocol needs at least one group (access to + * 'cr_gid', decrement of 'cr_ngroups' below). + */ + if (xdrs->x_op == XDR_ENCODE && cred->cr_ngroups == 0) + return (FALSE); if (!xdr_uint32_t(xdrs, &cred->cr_gid)) return (FALSE); if (xdrs->x_op == XDR_ENCODE) { /* - * Note that this is a `struct xucred`, which maintains its - * historical layout of preserving the egid in cr_ngroups and - * cr_groups[0] == egid. + * Note that this is a 'struct xucred', which still has the + * historical layout where the effective GID is in cr_groups[0] + * and is accounted in 'cr_ngroups'. We subtract 1 to obtain + * the number of "supplementary" groups, passed in the AUTH_SYS + * credentials variable-length array called gids[] in RFC 5531. */ - ngroups = cred->cr_ngroups - 1; - if (ngroups > NGRPS) - ngroups = NGRPS; + MPASS(cred->cr_ngroups <= XU_NGROUPS); + supp_ngroups = cred->cr_ngroups - 1; + if (supp_ngroups > AUTH_SYS_MAX_GROUPS) + /* With current values, this should never execute. */ + supp_ngroups = AUTH_SYS_MAX_GROUPS; } - if (!xdr_uint32_t(xdrs, &ngroups)) + if (!xdr_uint32_t(xdrs, &supp_ngroups)) return (FALSE); - for (i = 0; i < ngroups; i++) { - if (i < ngroups_max) { - if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1])) - return (FALSE); - } else { - if (!xdr_uint32_t(xdrs, &junk)) - return (FALSE); - } - } - if (xdrs->x_op == XDR_DECODE) { - if (ngroups > ngroups_max) - cred->cr_ngroups = ngroups_max + 1; - else - cred->cr_ngroups = ngroups + 1; - } + /* + * Because we cannot store more than XU_NGROUPS in total (16 at the time + * of this writing), for now we choose to be strict with respect to RFC + * 5531's maximum number of supplementary groups (AUTH_SYS_MAX_GROUPS). + * That would also be an accidental DoS prevention measure if the + * request handling code didn't try to reassemble it in full without any + * size limits. Although AUTH_SYS_MAX_GROUPS and XU_NGROUPS are equal, + * since the latter includes the "effective" GID, we cannot store the + * last group of a message with exactly AUTH_SYS_MAX_GROUPS + * supplementary groups. We accept such messages so as not to violate + * the protocol, silently dropping the last group on the floor. + */ + + if (xdrs->x_op != XDR_ENCODE && supp_ngroups > AUTH_SYS_MAX_GROUPS) + return (FALSE); + + junk = 0; + for (i = 0; i < supp_ngroups; ++i) + if (!xdr_uint32_t(xdrs, i < XU_NGROUPS - 1 ?
+ &cred->cr_sgroups[i] : &junk)) + return (FALSE); + + if (xdrs->x_op != XDR_ENCODE) + cred->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS); return (TRUE); } diff --git a/sys/rpc/svc_auth_unix.c b/sys/rpc/svc_auth_unix.c index 963f4f272964..aa0fc585865f 100644 --- a/sys/rpc/svc_auth_unix.c +++ b/sys/rpc/svc_auth_unix.c @@ -41,18 +41,12 @@ */ #include <sys/param.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/systm.h> #include <sys/ucred.h> #include <rpc/rpc.h> #include <rpc/rpc_com.h> -#define MAX_MACHINE_NAME 255 -#define NGRPS 16 - /* * Unix longhand authenticator */ @@ -62,11 +56,8 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg) enum auth_stat stat; XDR xdrs; int32_t *buf; - uint32_t time; struct xucred *xcr; - u_int auth_len; - size_t str_len, gid_len; - u_int i; + uint32_t auth_len, time; xcr = rqst->rq_clntcred; auth_len = (u_int)msg->rm_call.cb_cred.oa_length; @@ -74,51 +65,58 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg) XDR_DECODE); buf = XDR_INLINE(&xdrs, auth_len); if (buf != NULL) { + /* 'time', 'str_len', UID, GID and 'supp_ngroups'. */ + const uint32_t min_len = 5 * BYTES_PER_XDR_UNIT; + uint32_t str_len, supp_ngroups; + + if (auth_len < min_len) + goto badcred; time = IXDR_GET_UINT32(buf); - str_len = (size_t)IXDR_GET_UINT32(buf); - if (str_len > MAX_MACHINE_NAME) { - stat = AUTH_BADCRED; - goto done; - } + str_len = IXDR_GET_UINT32(buf); + if (str_len > AUTH_SYS_MAX_HOSTNAME) + goto badcred; str_len = RNDUP(str_len); + /* + * Recheck message length now that we know the value of + * 'str_len' (and that it won't cause an overflow in additions + * below) to protect access to the credentials part. + */ + if (auth_len < min_len + str_len) + goto badcred; buf += str_len / sizeof (int32_t); xcr->cr_uid = IXDR_GET_UINT32(buf); xcr->cr_gid = IXDR_GET_UINT32(buf); - gid_len = (size_t)IXDR_GET_UINT32(buf); - if (gid_len > NGRPS) { - stat = AUTH_BADCRED; - goto done; - } - for (i = 0; i < gid_len; i++) { - /* - * Note that this is a `struct xucred`, which maintains - * its historical layout of preserving the egid in - * cr_ngroups and cr_groups[0] == egid. - */ - if (i + 1 < XU_NGROUPS) - xcr->cr_groups[i + 1] = IXDR_GET_INT32(buf); - else - buf++; - } - if (gid_len + 1 > XU_NGROUPS) - xcr->cr_ngroups = XU_NGROUPS; - else - xcr->cr_ngroups = gid_len + 1; + supp_ngroups = IXDR_GET_UINT32(buf); + /* + * See the herald comment before a similar test at the end of + * xdr_authunix_parms() for why we strictly respect RFC 5531 and + * why we may have to drop the last supplementary group when + * there are AUTH_SYS_MAX_GROUPS of them. + */ + if (supp_ngroups > AUTH_SYS_MAX_GROUPS) + goto badcred; + /* + * Final message length check, as we now know how much we will + * read in total. + */ + if (auth_len < min_len + str_len + + supp_ngroups * BYTES_PER_XDR_UNIT) + goto badcred; /* - * five is the smallest unix credentials structure - - * timestamp, hostname len (0), uid, gid, and gids len (0). + * Note that 'xcr' is a 'struct xucred', which still has the + * historical layout where the effective GID is in cr_groups[0] + * and is accounted in 'cr_ngroups'. */ - if ((5 + gid_len) * BYTES_PER_XDR_UNIT + str_len > auth_len) { - (void) printf("bad auth_len gid %ld str %ld auth %u\n", - (long)gid_len, (long)str_len, auth_len); - stat = AUTH_BADCRED; - goto done; + for (uint32_t i = 0; i < supp_ngroups; ++i) { + if (i < XU_NGROUPS - 1) + xcr->cr_sgroups[i] = IXDR_GET_INT32(buf); + else + buf++; } - } else if (! 
xdr_authunix_parms(&xdrs, &time, xcr)) { - stat = AUTH_BADCRED; - goto done; - } + xcr->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS); + } else if (!xdr_authunix_parms(&xdrs, &time, xcr)) + goto badcred; rqst->rq_verf = _null_auth; stat = AUTH_OK; @@ -126,6 +124,10 @@ done: XDR_DESTROY(&xdrs); return (stat); + +badcred: + stat = AUTH_BADCRED; + goto done; } diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c index 7ec50d990d4e..876776e5f62e 100644 --- a/sys/security/audit/audit.c +++ b/sys/security/audit/audit.c @@ -329,7 +329,7 @@ audit_record_dtor(void *mem, int size, void *arg) * call into the BSM assembly code to initialize it. */ static void -audit_init(void) +audit_init(void *dummy __unused) { audit_trail_enabled = 0; diff --git a/sys/security/audit/audit_pipe.c b/sys/security/audit/audit_pipe.c index fb773fd04297..4d9815467e1a 100644 --- a/sys/security/audit/audit_pipe.c +++ b/sys/security/audit/audit_pipe.c @@ -243,6 +243,7 @@ static const struct filterops audit_pipe_read_filterops = { .f_attach = NULL, .f_detach = audit_pipe_kqdetach, .f_event = audit_pipe_kqread, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/security/mac/mac_framework.c b/sys/security/mac/mac_framework.c index d742b5dcbc3a..b0776160cc74 100644 --- a/sys/security/mac/mac_framework.c +++ b/sys/security/mac/mac_framework.c @@ -320,7 +320,7 @@ mac_policy_xlock_assert(void) * Initialize the MAC subsystem, including appropriate SMP locks. */ static void -mac_init(void) +mac_init(void *dummy __unused) { LIST_INIT(&mac_static_policy_list); @@ -340,7 +340,7 @@ mac_init(void) * kernel, or loaded before the kernel startup. */ static void -mac_late_init(void) +mac_late_init(void *dummy __unused) { mac_late = 1; diff --git a/sys/sys/event.h b/sys/sys/event.h index 084eaafcbdc0..ebbcdb703183 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -228,6 +228,7 @@ struct freebsd11_kevent32 { /* Flags for kqueuex(2) */ #define KQUEUE_CLOEXEC 0x00000001 /* close on exec */ +#define KQUEUE_CPONFORK 0x00000002 /* copy on fork */ struct knote; SLIST_HEAD(klist, knote); @@ -283,6 +284,7 @@ struct filterops { void (*f_touch)(struct knote *kn, struct kevent *kev, u_long type); int (*f_userdump)(struct proc *p, struct knote *kn, struct kinfo_knote *kin); + int (*f_copy)(struct knote *kn, struct proc *p1); }; /* @@ -346,6 +348,7 @@ struct rwlock; void knote(struct knlist *list, long hint, int lockflags); void knote_fork(struct knlist *list, int pid); +int knote_triv_copy(struct knote *kn, struct proc *p1); struct knlist *knlist_alloc(struct mtx *lock); void knlist_detach(struct knlist *knl); void knlist_add(struct knlist *knl, struct knote *kn, int islocked); diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h index c0d9811dd1b9..29a16b393b52 100644 --- a/sys/sys/eventhandler.h +++ b/sys/sys/eventhandler.h @@ -33,6 +33,7 @@ #include <sys/lock.h> #include <sys/ktr.h> #include <sys/mutex.h> +#include <sys/power.h> #include <sys/queue.h> #ifdef VIMAGE @@ -201,7 +202,7 @@ EVENTHANDLER_DECLARE(shutdown_post_sync, shutdown_fn); /* after fs sync */ EVENTHANDLER_DECLARE(shutdown_final, shutdown_fn); /* Power state change events */ -typedef void (*power_change_fn)(void *); +typedef void (*power_change_fn)(void *, enum power_stype stype); EVENTHANDLER_DECLARE(power_resume, power_change_fn); EVENTHANDLER_DECLARE(power_suspend, power_change_fn); EVENTHANDLER_DECLARE(power_suspend_early, power_change_fn); diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h index 7fec444447f9..7cb3269f1fdf 100644 --- 
a/sys/sys/eventvar.h +++ b/sys/sys/eventvar.h @@ -55,12 +55,14 @@ struct kqueue { #define KQ_CLOSING 0x10 #define KQ_TASKSCHED 0x20 /* task scheduled */ #define KQ_TASKDRAIN 0x40 /* waiting for task to drain */ +#define KQ_CPONFORK 0x80 int kq_knlistsize; /* size of knlist */ struct klist *kq_knlist; /* list of knotes */ u_long kq_knhashmask; /* size of knhash */ struct klist *kq_knhash; /* hash table for knotes */ struct task kq_task; struct ucred *kq_cred; + struct kqueue *kq_forksrc; }; #endif /* !_SYS_EVENTVAR_H_ */ diff --git a/sys/sys/file.h b/sys/sys/file.h index c44fd0f28929..e0195c7c6c2a 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -139,6 +139,8 @@ typedef int fo_fspacectl_t(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td); typedef int fo_cmp_t(struct file *fp, struct file *fp1, struct thread *td); +typedef int fo_fork_t(struct filedesc *fdp, struct file *fp, struct file **fp1, + struct proc *p1, struct thread *td); typedef int fo_spare_t(struct file *fp); typedef int fo_flags_t; @@ -163,12 +165,14 @@ struct fileops { fo_fallocate_t *fo_fallocate; fo_fspacectl_t *fo_fspacectl; fo_cmp_t *fo_cmp; + fo_fork_t *fo_fork; fo_spare_t *fo_spares[8]; /* Spare slots */ fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ +#define DFLAG_FORK 0x04 /* copy on fork */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 0a388c90de26..4817855443af 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -265,7 +265,7 @@ int fdcheckstd(struct thread *td); void fdclose(struct thread *td, struct file *fp, int idx); void fdcloseexec(struct thread *td); void fdsetugidsafety(struct thread *td); -struct filedesc *fdcopy(struct filedesc *fdp); +struct filedesc *fdcopy(struct filedesc *fdp, struct proc *p1); void fdunshare(struct thread *td); void fdescfree(struct thread *td); int fdlastfile(struct filedesc *fdp); diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h index 2845a9dbc1e2..25ce7871ba7c 100644 --- a/sys/sys/imgact_elf.h +++ b/sys/sys/imgact_elf.h @@ -86,8 +86,8 @@ typedef struct { struct sysentvec *sysvec; const char *interp_newpath; int flags; - Elf_Brandnote *brand_note; - bool (*header_supported)(const struct image_params *, + const Elf_Brandnote *brand_note; + bool (*const header_supported)(const struct image_params *, const int32_t *, const uint32_t *); /* High 8 bits of flags is private to the ABI */ #define BI_CAN_EXEC_DYN 0x0001 @@ -111,9 +111,9 @@ struct sseg_closure { size_t size; /* Total size of all writable segments. 
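The sys/sys/file.h change above opens a copy-on-fork path: a file type can mark itself DFLAG_FORK and provide fo_fork, and fdcopy() now receives the child process so it can invoke that hook instead of sharing the struct file by reference. A toy model of the dispatch; the struct layout and helper below are invented for illustration:

#include <stdio.h>

#define DFLAG_FORK 0x04		/* same name as the new flag */

struct file {
	int	flags;
	int	(*fo_fork)(struct file *src, struct file **dst);
};

/* Hypothetical per-type fork hook: produce a private duplicate. */
static int
dup_file(struct file *src, struct file **dst)
{
	static struct file copy;

	copy = *src;		/* shallow duplicate suffices for the toy */
	*dst = &copy;
	return (0);
}

int
main(void)
{
	struct file f = { DFLAG_FORK, dup_file };
	struct file *child = NULL;

	if ((f.flags & DFLAG_FORK) && f.fo_fork(&f, &child) == 0)
		printf("copied on fork: %p\n", (void *)child);
	else
		printf("shared by reference, as before\n");
	return (0);
}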
*/ }; -bool __elfN(brand_inuse)(Elf_Brandinfo *entry); -int __elfN(insert_brand_entry)(Elf_Brandinfo *entry); -int __elfN(remove_brand_entry)(Elf_Brandinfo *entry); +bool __elfN(brand_inuse)(const Elf_Brandinfo *entry); +int __elfN(insert_brand_entry)(const Elf_Brandinfo *entry); +int __elfN(remove_brand_entry)(const Elf_Brandinfo *entry); int __elfN(freebsd_fixup)(uintptr_t *, struct image_params *); int __elfN(coredump)(struct thread *, struct coredump_writer *, off_t, int); size_t __elfN(populate_note)(int, void *, void *, size_t, void **); @@ -132,8 +132,8 @@ bool __elfN(parse_notes)(const struct image_params *, const Elf_Note *, void __elfN(dump_thread)(struct thread *, void *, size_t *); extern int __elfN(fallback_brand); -extern Elf_Brandnote __elfN(freebsd_brandnote); -extern Elf_Brandnote __elfN(kfreebsd_brandnote); +extern const Elf_Brandnote __elfN(freebsd_brandnote); +extern const Elf_Brandnote __elfN(kfreebsd_brandnote); #endif /* _KERNEL */ #endif /* !_SYS_IMGACT_ELF_H_ */ diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h index 08d4e2d28b33..4f6b45d78a88 100644 --- a/sys/sys/mutex.h +++ b/sys/sys/mutex.h @@ -68,9 +68,9 @@ */ #define MTX_UNOWNED 0x00000000 /* Cookie for free mutex */ #define MTX_RECURSED 0x00000001 /* lock recursed (for MTX_DEF only) */ -#define MTX_CONTESTED 0x00000002 /* lock contested (for MTX_DEF only) */ +#define MTX_WAITERS 0x00000002 /* lock has waiters (for MTX_DEF only) */ #define MTX_DESTROYED 0x00000004 /* lock destroyed */ -#define MTX_FLAGMASK (MTX_RECURSED | MTX_CONTESTED | MTX_DESTROYED) +#define MTX_FLAGMASK (MTX_RECURSED | MTX_WAITERS | MTX_DESTROYED) /* * Prototypes @@ -217,14 +217,10 @@ void _thread_lock(struct thread *); #define _mtx_obtain_lock_fetch(mp, vp, tid) \ atomic_fcmpset_acq_ptr(&(mp)->mtx_lock, vp, (tid)) -/* Try to release mtx_lock if it is unrecursed and uncontested. */ +/* Try to release mtx_lock if it is unrecursed and without waiters. */ #define _mtx_release_lock(mp, tid) \ atomic_cmpset_rel_ptr(&(mp)->mtx_lock, (tid), MTX_UNOWNED) -/* Release mtx_lock quickly, assuming we own it. */ -#define _mtx_release_lock_quick(mp) \ - atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED) - #define _mtx_release_lock_fetch(mp, vp) \ atomic_fcmpset_rel_ptr(&(mp)->mtx_lock, (vp), MTX_UNOWNED) @@ -246,10 +242,10 @@ void _thread_lock(struct thread *); }) /* - * Lock a spin mutex. For spinlocks, we handle recursion inline (it - * turns out that function calls can be significantly expensive on - * some architectures). Since spin locks are not _too_ common, - * inlining this code is not too big a deal. + * Lock a spin mutex. + * + * FIXME: spinlock_enter is a function call, defeating the point of inlining in + * this. */ #ifdef SMP #define __mtx_lock_spin(mp, tid, opts, file, line) __extension__ ({ \ @@ -317,10 +313,10 @@ void _thread_lock(struct thread *); }) /* - * Unlock a spin mutex. For spinlocks, we can handle everything - * inline, as it's pretty simple and a function call would be too - * expensive (at least on some architectures). Since spin locks are - * not _too_ common, inlining this code is not too big a deal. + * Unlock a spin mutex. + * + * FIXME: spinlock_exit is a function call, defeating the point of inlining in + * this. 
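The mutex.h hunk above renames MTX_CONTESTED to MTX_WAITERS and folds the former _mtx_release_lock_quick() into its one user. The unlock fast path only succeeds while the lock word is exactly the owner's tid; any flag bit, MTX_WAITERS included, makes the compare-and-set fail and routes to the hard unlock path. A userland sketch with C11 atomics, using the flag value from the diff and an invented tid:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MTX_UNOWNED	0x0
#define MTX_WAITERS	0x2	/* value from the diff above */

int
main(void)
{
	uintptr_t tid = 0x1000;	/* stand-in owner thread pointer */
	atomic_uintptr_t mtx_lock = tid | MTX_WAITERS;
	uintptr_t expect = tid;

	/* Mirrors _mtx_release_lock(): cmpset tid -> MTX_UNOWNED. */
	if (atomic_compare_exchange_strong(&mtx_lock, &expect, MTX_UNOWNED))
		printf("fast-path release\n");
	else
		printf("waiters flagged (%#lx), take the hard unlock path\n",
		    (unsigned long)expect);
	return (0);
}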
* * Since we always perform a spinlock_enter() when attempting to acquire a * spin lock, we need to always perform a matching spinlock_exit() when @@ -332,7 +328,7 @@ void _thread_lock(struct thread *); (mp)->mtx_recurse--; \ else { \ LOCKSTAT_PROFILE_RELEASE_SPIN_LOCK(spin__release, mp); \ - _mtx_release_lock_quick((mp)); \ + atomic_store_rel_ptr(&(mp)->mtx_lock, MTX_UNOWNED); \ } \ spinlock_exit(); \ }) diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 9140cee56885..8c0729d3ec66 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -741,7 +741,7 @@ struct proc { reaper which spawned our subtree. */ uint64_t p_elf_flags; /* (x) ELF flags */ - void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for + const void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for non ELF binaries. */ sbintime_t p_umtx_min_timeout; /* End area that is copied on creation. */ diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index b4593f38f592..739723754b7d 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -62,7 +62,7 @@ #include <sys/_sx.h> #include <sys/_task.h> -#define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ +#define SB_MAX (8*1024*1024) /* default for max chars in sockbuf */ struct ktls_session; struct mbuf; diff --git a/sys/sys/socket.h b/sys/sys/socket.h index cdd4fa3b4b89..cf1d95da6168 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -396,6 +396,7 @@ struct sockproto { #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP +#define PF_HYPERV AF_HYPERV #define PF_DIVERT AF_DIVERT #define PF_IPFWLOG AF_IPFWLOG diff --git a/sys/sys/sockopt.h b/sys/sys/sockopt.h index bfe12d8510d7..d2b0ff5ed2c8 100644 --- a/sys/sys/sockopt.h +++ b/sys/sys/sockopt.h @@ -57,8 +57,10 @@ struct sockopt { int sosetopt(struct socket *so, struct sockopt *sopt); int sogetopt(struct socket *so, struct sockopt *sopt); -int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); -int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); +int __result_use_check sooptcopyin(struct sockopt *sopt, void *buf, size_t len, + size_t minlen); +int __result_use_check sooptcopyout(struct sockopt *sopt, const void *buf, + size_t len); int soopt_getm(struct sockopt *sopt, struct mbuf **mp); int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index 8237165b84ce..d32690634059 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -211,7 +211,8 @@ int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout); int kern_kill(struct thread *td, pid_t pid, int signum); -int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps); +int kern_kqueue(struct thread *td, int flags, bool cponfork, + struct filecaps *fcaps); int kern_kldload(struct thread *td, const char *file, int *fileid); int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat); int kern_kldunload(struct thread *td, int fileid, int flags); diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h index 1714fa5a7416..6de391dcc03e 100644 --- a/sys/sys/sysent.h +++ b/sys/sys/sysent.h @@ -343,8 +343,7 @@ void exec_free_abi_mappings(struct proc *p); void exec_onexec_old(struct thread *td); #define INIT_SYSENTVEC(name, sv) \ - SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ - (sysinit_cfunc_t)exec_sysvec_init, sv); + SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, exec_sysvec_init, 
sv) #endif /* _KERNEL */ diff --git a/sys/sys/tree.h b/sys/sys/tree.h index c11bccfb387c..194ad505b038 100644 --- a/sys/sys/tree.h +++ b/sys/sys/tree.h @@ -334,10 +334,13 @@ struct { \ #define _RB_L ((__uintptr_t)1) #define _RB_R ((__uintptr_t)2) #define _RB_LR ((__uintptr_t)3) -#define _RB_BITS(elm) (*(__uintptr_t *)&elm) +#define _RB_BITS(elm) ((__uintptr_t)elm) #define _RB_BITSUP(elm, field) _RB_BITS(_RB_UP(elm, field)) -#define _RB_PTR(elm) (__typeof(elm)) \ - ((__uintptr_t)elm & ~_RB_LR) +#define _RB_PTR_OP(elm, op, dir) ((__typeof(elm)) \ + ((__uintptr_t)(elm) op (dir))) +#define _RB_PTR(elm) _RB_PTR_OP((elm), &, ~_RB_LR) +#define _RB_MOD_OR(elm, dir) ((elm) = _RB_PTR_OP((elm), |, (dir))) +#define _RB_MOD_XOR(elm, dir) ((elm) = _RB_PTR_OP((elm), ^, (dir))) #define RB_PARENT(elm, field) _RB_PTR(_RB_UP(elm, field)) #define RB_LEFT(elm, field) _RB_LINK(elm, _RB_L, field) @@ -346,8 +349,8 @@ struct { \ #define RB_EMPTY(head) (RB_ROOT(head) == NULL) #define RB_SET_PARENT(dst, src, field) do { \ - _RB_BITSUP(dst, field) = (__uintptr_t)src | \ - (_RB_BITSUP(dst, field) & _RB_LR); \ + _RB_UP(dst, field) = (__typeof(src))((__uintptr_t)src | \ + (_RB_BITSUP(dst, field) & _RB_LR)); \ } while (/*CONSTCOND*/ 0) #define RB_SET(elm, parent, field) do { \ @@ -546,12 +549,12 @@ name##_RB_INSERT_COLOR(struct name *head, \ elmdir = RB_RIGHT(parent, field) == elm ? _RB_R : _RB_L; \ if (_RB_BITS(gpar) & elmdir) { \ /* shorten the parent-elm edge to rebalance */ \ - _RB_BITSUP(parent, field) ^= elmdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), elmdir); \ return (NULL); \ } \ sibdir = elmdir ^ _RB_LR; \ /* the other edge must change length */ \ - _RB_BITSUP(parent, field) ^= sibdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), sibdir); \ if ((_RB_BITS(gpar) & _RB_LR) == 0) { \ /* both edges now short, retry from parent */ \ child = elm; \ @@ -583,11 +586,14 @@ name##_RB_INSERT_COLOR(struct name *head, \ RB_ROTATE(elm, child, elmdir, field); \ child_up = _RB_UP(child, field); \ if (_RB_BITS(child_up) & sibdir) \ - _RB_BITSUP(parent, field) ^= elmdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + elmdir); \ if (_RB_BITS(child_up) & elmdir) \ - _RB_BITSUP(elm, field) ^= _RB_LR; \ + _RB_MOD_XOR(_RB_UP(elm, field), \ + _RB_LR); \ else \ - _RB_BITSUP(elm, field) ^= elmdir; \ + _RB_MOD_XOR(_RB_UP(elm, field), \ + elmdir); \ /* if child is a leaf, don't augment elm, \ * since it is restored to be a leaf again. */ \ if ((_RB_BITS(child_up) & _RB_LR) == 0) \ @@ -656,7 +662,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \ /* the rank of the tree rooted at elm shrank */ \ gpar = _RB_UP(parent, field); \ elmdir = RB_RIGHT(parent, field) == elm ? _RB_R : _RB_L; \ - _RB_BITS(gpar) ^= elmdir; \ + _RB_MOD_XOR(gpar, elmdir); \ if (_RB_BITS(gpar) & elmdir) { \ /* lengthen the parent-elm edge to rebalance */ \ _RB_UP(parent, field) = gpar; \ @@ -664,7 +670,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \ } \ if (_RB_BITS(gpar) & _RB_LR) { \ /* shorten other edge, retry from parent */ \ - _RB_BITS(gpar) ^= _RB_LR; \ + _RB_MOD_XOR(gpar, _RB_LR); \ _RB_UP(parent, field) = gpar; \ gpar = _RB_PTR(gpar); \ continue; \ @@ -672,7 +678,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \ sibdir = elmdir ^ _RB_LR; \ sib = _RB_LINK(parent, sibdir, field); \ up = _RB_UP(sib, field); \ - _RB_BITS(up) ^= _RB_LR; \ + _RB_MOD_XOR(up, _RB_LR); \ if ((_RB_BITS(up) & _RB_LR) == 0) { \ /* shorten edges descending from sib, retry */ \ _RB_UP(sib, field) = up; \ @@ -703,24 +709,29 @@ name##_RB_REMOVE_COLOR(struct name *head, \ /* elm is a 1-child. 
First rotate at elm. */ \ RB_ROTATE(sib, elm, sibdir, field); \ up = _RB_UP(elm, field); \ - _RB_BITSUP(parent, field) ^= \ - (_RB_BITS(up) & elmdir) ? _RB_LR : elmdir; \ - _RB_BITSUP(sib, field) ^= \ - (_RB_BITS(up) & sibdir) ? _RB_LR : sibdir; \ - _RB_BITSUP(elm, field) |= _RB_LR; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + (_RB_BITS(up) & elmdir) ? _RB_LR : elmdir); \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + (_RB_BITS(up) & sibdir) ? _RB_LR : sibdir); \ + _RB_MOD_OR(_RB_UP(elm, field), _RB_LR); \ } else { \ if ((_RB_BITS(up) & elmdir) == 0 && \ RB_STRICT_HST && elm != NULL) { \ /* if parent does not become a leaf, \ do not demote parent yet. */ \ - _RB_BITSUP(parent, field) ^= sibdir; \ - _RB_BITSUP(sib, field) ^= _RB_LR; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + sibdir); \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + _RB_LR); \ } else if ((_RB_BITS(up) & elmdir) == 0) { \ /* demote parent. */ \ - _RB_BITSUP(parent, field) ^= elmdir; \ - _RB_BITSUP(sib, field) ^= sibdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + elmdir); \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + sibdir); \ } else \ - _RB_BITSUP(sib, field) ^= sibdir; \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + sibdir); \ elm = sib; \ } \ \ diff --git a/sys/sys/user.h b/sys/sys/user.h index 3183f0792256..1704bc089d85 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -617,7 +617,8 @@ struct kinfo_vmobject { } kvo_type_spec; /* Type-specific union */ uint64_t kvo_me; /* Uniq handle for anon obj */ uint64_t kvo_laundry; /* Number of laundry pages. */ - uint64_t _kvo_qspare[5]; + uint64_t kvo_wired; /* Number of wired pages. */ + uint64_t _kvo_qspare[4]; uint32_t kvo_swapped; /* Number of swapped pages */ uint32_t kvo_flags; uint32_t _kvo_ispare[6]; diff --git a/sys/tests/ktest.h b/sys/tests/ktest.h index c767aa31e8e5..75d7a75e2fff 100644 --- a/sys/tests/ktest.h +++ b/sys/tests/ktest.h @@ -57,6 +57,8 @@ struct ktest_test_info { ktest_parse_t parse; }; +#define KTEST_FUNC(X) static int __ktest_##X(struct ktest_test_context *ctx) + struct ktest_module_info { const char *name; const struct ktest_test_info *tests; @@ -64,6 +66,8 @@ struct ktest_module_info { void *module_ptr; }; +#define KTEST_INFO(X) { "test_" #X, "Test " #X, __ktest_##X, NULL } + int ktest_default_modevent(module_t mod, int type, void *arg); bool ktest_start_msg(struct ktest_test_context *ctx); @@ -84,6 +88,9 @@ void ktest_end_msg(struct ktest_test_context *ctx); #define KTEST_LOG(_ctx, _fmt, ...) \ KTEST_LOG_LEVEL(_ctx, LOG_DEBUG, _fmt, ## __VA_ARGS__) +#define KTEST_ERR(_ctx, _fmt, ...) \ + KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__) + #define KTEST_MAX_BUF 512 #define KTEST_MODULE_DECLARE(_n, _t) \ @@ -104,6 +111,9 @@ MODULE_VERSION(ktest_##_n, 1); \ MODULE_DEPEND(ktest_##_n, ktestmod, 1, 1, 1); \ MODULE_DEPEND(ktest_##_n, netlink, 1, 1, 1); \ +#define KTEST_MODULE_DEPEND(_n, _d) \ +MODULE_DEPEND(ktest_##_n, _d, 1, 1, 1); \ + #endif /* _KERNEL */ /* genetlink definitions */ diff --git a/sys/tools/gdb/README.txt b/sys/tools/gdb/README.txt new file mode 100644 index 000000000000..8c31565ddc42 --- /dev/null +++ b/sys/tools/gdb/README.txt @@ -0,0 +1,21 @@ +This directory contains Python scripts that can be loaded by GDB to help debug +FreeBSD kernel crashes. + +Add new commands and functions in their own files. Functions with general +utility should be added to freebsd.py. sys/tools/kernel-gdb.py is installed +into the kernel debug directory (typically /usr/lib/debug/boot/kernel). 
It will +be automatically loaded by kgdb when opening a vmcore, so if you add new GDB +commands or functions, that script should be updated to import them, and you +should document them here. + +To provide some rudimentary testing, selftest.py tries to exercise all of the +commands and functions defined here. To use it, run selftest.sh to panic the +system. Then, create a kernel dump or attach to the panicked kernel, and invoke +the script with "python import selftest" in (k)gdb. + +Commands: +acttrace Display a backtrace for all on-CPU threads + +Functions: +$PCPU(<field>[, <cpuid>]) Display the value of a PCPU/DPCPU field +$V(<variable>[, <vnet>]) Display the value of a VNET variable diff --git a/sys/tools/gdb/acttrace.py b/sys/tools/gdb/acttrace.py new file mode 100644 index 000000000000..147effbbddf1 --- /dev/null +++ b/sys/tools/gdb/acttrace.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2022 The FreeBSD Foundation +# +# This software was developed by Mark Johnston under sponsorship from the +# FreeBSD Foundation. +# +# SPDX-License-Identifier: BSD-2-Clause +# + +import gdb +from freebsd import * +from pcpu import * + +class acttrace(gdb.Command): + """ + Register an acttrace command with gdb. + + When run, acttrace prints the stack trace of all threads that were on-CPU + at the time of the panic. + """ + def __init__(self): + super(acttrace, self).__init__("acttrace", gdb.COMMAND_USER) + + def invoke(self, arg, from_tty): + # Save the current thread so that we can switch back to it afterwards. + curthread = gdb.selected_thread() + + for pcpu in pcpu_foreach(): + td = pcpu['pc_curthread'] + tid = td['td_tid'] + + gdb_thread = tid_to_gdb_thread(tid) + if gdb_thread is None: + raise gdb.error(f"failed to find GDB thread with TID {tid}") + else: + gdb_thread.switch() + + p = td['td_proc'] + print("Tracing command {} pid {} tid {} (CPU {})".format( + p['p_comm'], p['p_pid'], td['td_tid'], pcpu['pc_cpuid'])) + gdb.execute("bt") + print() + + curthread.switch() + + +# Instantiating the class is what registers the command with gdb; the +# object itself is not otherwise used.
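+# An illustrative kgdb session (the names and numbers are invented, but the
+# output shape follows the format string above):
+#
+#   (kgdb) acttrace
+#   Tracing command sh pid 52734 tid 100154 (CPU 3)
+#   #0  ...
+#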
+acttrace() diff --git a/sys/tools/gdb/freebsd.py b/sys/tools/gdb/freebsd.py new file mode 100644 index 000000000000..81ea60373348 --- /dev/null +++ b/sys/tools/gdb/freebsd.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org> +# +# SPDX-License-Identifier: BSD-2-Clause +# + +import gdb + +def symval(name): + sym = gdb.lookup_global_symbol(name) + if sym is None: + sym = gdb.lookup_static_symbol(name) + if sym is None: + raise gdb.GdbError(f"Symbol '{name}' not found") + return sym.value() + + +def _queue_foreach(head, field, headf, nextf): + elm = head[headf] + while elm != 0: + yield elm + elm = elm[field][nextf] + + +def list_foreach(head, field): + """sys/queue.h-style iterator.""" + return _queue_foreach(head, field, "lh_first", "le_next") + + +def tailq_foreach(head, field): + """sys/queue.h-style iterator.""" + return _queue_foreach(head, field, "tqh_first", "tqe_next") + + +def linker_file_foreach(): + """Iterate over loaded linker files.""" + return tailq_foreach(symval("linker_files"), "link") + + +def pcpu_foreach(): + mp_maxid = symval("mp_maxid") + cpuid_to_pcpu = symval("cpuid_to_pcpu") + + cpu = 0 + while cpu <= mp_maxid: + pcpu = cpuid_to_pcpu[cpu] + if pcpu: + yield pcpu + cpu = cpu + 1 + + +def tid_to_gdb_thread(tid): + """Convert a FreeBSD kernel thread ID to a gdb inferior thread.""" + for thread in gdb.inferiors()[0].threads(): + if thread.ptid[2] == tid: + return thread + else: + return None + + +def tdfind(tid, pid=-1): + """Convert a FreeBSD kernel thread ID to a struct thread pointer.""" + td = tdfind.cached_threads.get(int(tid)) + if td: + return td + + for p in list_foreach(symval("allproc"), "p_list"): + if pid != -1 and pid != p['p_pid']: + continue + for td in tailq_foreach(p['p_threads'], "td_plist"): + ntid = td['td_tid'] + tdfind.cached_threads[int(ntid)] = td + if ntid == tid: + return td +tdfind.cached_threads = dict() diff --git a/sys/tools/gdb/pcpu.py b/sys/tools/gdb/pcpu.py new file mode 100644 index 000000000000..aadc4b2d42df --- /dev/null +++ b/sys/tools/gdb/pcpu.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org> +# +# SPDX-License-Identifier: BSD-2-Clause +# + +import gdb +from freebsd import * + +class pcpu(gdb.Function): + """ + Register a function to lookup PCPU and DPCPU variables by name. + + To look up the value of the PCPU field foo on CPU n, use + $PCPU("foo", n). This works for DPCPU fields too. If the CPU ID is + omitted, and the currently selected thread is on-CPU, that CPU is + used, otherwise an error is raised. + """ + def __init__(self): + super(pcpu, self).__init__("PCPU") + + def invoke(self, field, cpuid=-1): + if cpuid == -1: + cpuid = tdfind(gdb.selected_thread().ptid[2])['td_oncpu'] + if cpuid == -1: + raise gdb.error("Currently selected thread is off-CPU") + if cpuid < 0 or cpuid > symval("mp_maxid"): + raise gdb.error(f"Currently selected on invalid CPU {cpuid}") + pcpu = symval("cpuid_to_pcpu")[cpuid] + + # Are we dealing with a PCPU or DPCPU field? + field = field.string() + for f in gdb.lookup_type("struct pcpu").fields(): + if f.name == "pc_" + field: + return pcpu["pc_" + field] + + def uintptr_t(val): + return val.cast(gdb.lookup_type("uintptr_t")) + + # We're dealing with a DPCPU field. This is handled similarly + # to VNET symbols, see vnet.py for comments. 
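+        # The arithmetic below mirrors the kernel linker's load-time
+        # relocation: the per-CPU copy of a DPCPU variable lives at
+        #   pcpu['pc_dynamic'] + (pcpu_entry_addr - start) + base
+        # where start and base describe the module's "set_pcpu" linker
+        # set, located by the loop that follows.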
+ pcpu_base = pcpu['pc_dynamic'] + pcpu_entry = symval("pcpu_entry_" + field) + pcpu_entry_addr = uintptr_t(pcpu_entry.address) + + for lf in linker_file_foreach(): + block = gdb.block_for_pc(lf['ops']['cls']['methods'][0]['func']) + elf_file_t = gdb.lookup_type("elf_file_t", block).target() + ef = lf.cast(elf_file_t) + + file_type = lf['ops']['cls']['name'].string() + if file_type == "elf64": + start = uintptr_t(ef['pcpu_start']) + if start == 0: + continue + end = uintptr_t(ef['pcpu_stop']) + base = uintptr_t(ef['pcpu_base']) + elif file_type == "elf64_obj": + for i in range(ef['nprogtab']): + pe = ef['progtab'][i] + if pe['name'].string() == "set_pcpu": + start = uintptr_t(pe['origaddr']) + end = start + uintptr_t(pe['size']) + base = uintptr_t(pe['addr']) + break + else: + continue + else: + path = lf['pathname'].string() + raise gdb.error(f"{path} has unexpected linker file type {file_type}") + + if pcpu_entry_addr >= start and pcpu_entry_addr < end: + obj = gdb.Value(pcpu_base + pcpu_entry_addr - start + base) + return obj.cast(pcpu_entry.type.pointer()).dereference() + +# Register with gdb. +pcpu() diff --git a/sys/tools/gdb/selftest.py b/sys/tools/gdb/selftest.py new file mode 100644 index 000000000000..41e9211c4bb3 --- /dev/null +++ b/sys/tools/gdb/selftest.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org> +# +# SPDX-License-Identifier: BSD-2-Clause +# + +import gdb + +cmds = ["acttrace", + "p $V(\"tcbinfo\")", + "p $V(\"tcbinfo\", vnet0)", + "p $V(\"pf_status\")", + "p $V(\"pf_status\", \"gdbselftest\")", + "p $PCPU(\"curthread\")", + "p $PCPU(\"curthread\", 0)", + "p/x $PCPU(\"hardclocktime\", 1)", + "p $PCPU(\"pqbatch\")[0][0]", + "p $PCPU(\"ss\", 1)", + ] + +for cmd in cmds: + try: + print(f"Running command: '{cmd}'") + gdb.execute(cmd) + except gdb.error as e: + print(f"Command '{cmd}' failed: {e}") + break + +# We didn't hit any unexpected errors. This isn't as good as actually +# verifying the output, but it's better than nothing. +print("Everything seems OK") diff --git a/sys/tools/gdb/selftest.sh b/sys/tools/gdb/selftest.sh new file mode 100644 index 000000000000..252fae14af17 --- /dev/null +++ b/sys/tools/gdb/selftest.sh @@ -0,0 +1,23 @@ +# +# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org> +# +# SPDX-License-Identifier: BSD-2-Clause +# + +set -e + +n=$(sysctl -n hw.ncpu) +if [ $n -lt 2 ]; then + echo "This test requires at least 2 CPUs" + exit 1 +fi + +# Set up some things expected by selftest.py. +kldload -n pf siftr +pfctl -e || true +jail -c name=gdbselftest vnet persist + +echo "I'm about to panic your system, ctrl-C now if that's not what you want." +sleep 10 +sysctl debug.debugger_on_panic=0 +sysctl debug.kdb.panic=1 diff --git a/sys/tools/gdb/vnet.py b/sys/tools/gdb/vnet.py new file mode 100644 index 000000000000..36b4d512a3eb --- /dev/null +++ b/sys/tools/gdb/vnet.py @@ -0,0 +1,100 @@ +# +# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org> +# +# SPDX-License-Identifier: BSD-2-Clause +# + +import gdb +import traceback +from freebsd import * + +class vnet(gdb.Function): + """ + Register a function to look up VNET variables by name. + + To look at the value of a VNET variable V_foo, print $V("foo"). The + currently selected thread's VNET is used by default, but can be optionally + specified as a second parameter, e.g., $V("foo", <vnet>), where <vnet> is a + pointer to a struct vnet (e.g., vnet0 or allprison.tqh_first->pr_vnet) or a + string naming a jail. 
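+
+    Examples, as exercised by selftest.py: $V("tcbinfo"), $V("tcbinfo",
+    vnet0), $V("pf_status", "gdbselftest").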
+ """ + def __init__(self): + super(vnet, self).__init__("V") + + def invoke(self, sym, vnet=None): + sym = sym.string() + if sym.startswith("V_"): + sym = sym[len("V_"):] + if gdb.lookup_symbol("sysctl___kern_features_vimage")[0] is None: + return symval(sym) + + # Look up the VNET's base address. + if vnet is None: + vnet = tdfind(gdb.selected_thread().ptid[2])['td_vnet'] + if not vnet: + # If curthread->td_vnet == NULL, vnet0 is the current vnet. + vnet = symval("vnet0") + elif vnet.type.is_string_like: + vnet = vnet.string() + for prison in tailq_foreach(symval("allprison"), "pr_list"): + if prison['pr_name'].string() == vnet: + vnet = prison['pr_vnet'] + break + else: + raise gdb.error(f"No prison named {vnet}") + + def uintptr_t(val): + return val.cast(gdb.lookup_type("uintptr_t")) + + # Now the tricky part: compute the address of the symbol relative + # to the selected VNET. In the compiled kernel this is done at + # load time by applying a magic transformation to relocations + # against symbols in the vnet linker set. Here we have to apply + # the transformation manually. + vnet_data_base = vnet['vnet_data_base'] + vnet_entry = symval("vnet_entry_" + sym) + vnet_entry_addr = uintptr_t(vnet_entry.address) + + # First, which kernel module does the symbol belong to? + for lf in linker_file_foreach(): + # Find the bounds of this linker file's VNET linker set. The + # struct containing the bounds depends on the type of the linker + # file, and unfortunately both are called elf_file_t. So we use a + # PC value from the compilation unit (either link_elf.c or + # link_elf_obj.c) to disambiguate. + block = gdb.block_for_pc(lf['ops']['cls']['methods'][0]['func']) + elf_file_t = gdb.lookup_type("elf_file_t", block).target() + ef = lf.cast(elf_file_t) + + file_type = lf['ops']['cls']['name'].string() + if file_type == "elf64": + start = uintptr_t(ef['vnet_start']) + if start == 0: + # This linker file doesn't have a VNET linker set. + continue + end = uintptr_t(ef['vnet_stop']) + base = uintptr_t(ef['vnet_base']) + elif file_type == "elf64_obj": + for i in range(ef['nprogtab']): + pe = ef['progtab'][i] + if pe['name'].string() == "set_vnet": + start = uintptr_t(pe['origaddr']) + end = start + uintptr_t(pe['size']) + base = uintptr_t(pe['addr']) + break + else: + # This linker file doesn't have a VNET linker set. + continue + else: + path = lf['pathname'].string() + raise gdb.error(f"{path} has unexpected linker file type {file_type}") + + if vnet_entry_addr >= start and vnet_entry_addr < end: + # The symbol belongs to this linker file, so compute the final + # address. + obj = gdb.Value(vnet_data_base + vnet_entry_addr - start + base) + return obj.cast(vnet_entry.type.pointer()).dereference() + + +# Register with gdb. +vnet() diff --git a/sys/tools/kernel-gdb.py b/sys/tools/kernel-gdb.py new file mode 100644 index 000000000000..8a41ef6efab1 --- /dev/null +++ b/sys/tools/kernel-gdb.py @@ -0,0 +1,15 @@ +# +# Copyright (c) 2025 Mark Johnston <markj@FreeBSD.org> +# +# SPDX-License-Identifier: BSD-2-Clause +# + +import os +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), "gdb")) + +# Import FreeBSD kernel debugging commands and modules below. 
+import acttrace +import pcpu +import vnet diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 970536a13aa5..f47cfd08f75a 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -653,8 +653,8 @@ done: for (i = 0; i < UFS_NDADDR; i++) if (newblks[i] != DIP(ip, i_db[i])) panic("ffs_truncate2: blkno %d newblks %jd != i_db %jd", - i, (intmax_t)newblks[UFS_NDADDR + level], - (intmax_t)DIP(ip, i_ib[level])); + i, (intmax_t)newblks[i], + (intmax_t)DIP(ip, i_db[i])); BO_LOCK(bo); if (length == 0 && (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 679b2e20e88b..b80b5cc781f7 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -4009,21 +4009,15 @@ restart: /* * Use the keg's policy if upper layers haven't already specified a * domain (as happens with first-touch zones). - * - * To avoid races we run the iterator with the keg lock held, but that - * means that we cannot allow the vm_domainset layer to sleep. Thus, - * clear M_WAITOK and handle low memory conditions locally. */ rr = rdomain == UMA_ANYDOMAIN; + aflags = flags; if (rr) { - aflags = (flags & ~M_WAITOK) | M_NOWAIT; if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags) != 0) return (NULL); - } else { - aflags = flags; + } else domain = rdomain; - } for (;;) { slab = keg_fetch_free_slab(keg, domain, rr, flags); @@ -4053,13 +4047,8 @@ restart: if ((flags & M_WAITOK) == 0) break; vm_wait_domain(domain); - } else if (vm_domainset_iter_policy(&di, &domain) != 0) { - if ((flags & M_WAITOK) != 0) { - vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); - goto restart; - } + } else if (vm_domainset_iter_policy(&di, &domain) != 0) break; - } } /* @@ -5245,7 +5234,7 @@ uma_prealloc(uma_zone_t zone, int items) KEG_GET(zone, keg); slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { - aflags = M_NOWAIT; + aflags = M_WAITOK; if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags) != 0) panic("%s: Domainset is empty", __func__); @@ -5266,7 +5255,8 @@ uma_prealloc(uma_zone_t zone, int items) break; } if (vm_domainset_iter_policy(&di, &domain) != 0) - vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); + panic("%s: Cannot allocate from any domain", + __func__); } } } diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index 9fa17da954f7..c25ed0cc2267 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -113,7 +113,6 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain) int d; d = di->di_offset % di->di_domain->ds_cnt; - *di->di_iter = d; *domain = di->di_domain->ds_order[d]; } @@ -260,9 +259,14 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, * are immutable and unsynchronized. Updates can race but pointer * loads are assumed to be atomic. */ - if (obj != NULL && obj->domain.dr_policy != NULL) + if (obj != NULL && obj->domain.dr_policy != NULL) { + /* + * This write lock protects non-atomic increments of the + * iterator index in vm_domainset_iter_rr(). + */ + VM_OBJECT_ASSERT_WLOCKED(obj); dr = &obj->domain; - else + } else dr = &curthread->td_domain; vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex); diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index e0f1807a1b32..18d789c59281 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -441,19 +441,16 @@ vm_thread_kstack_arena_release(void *arena, vmem_addr_t addr, vmem_size_t size) * Create the kernel stack for a new thread. 
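+ * The ds policy selects the memory domain(s) used for the stack pages; flags
+ * are malloc-style (e.g. M_WAITOK or M_NOWAIT) and are translated below via
+ * malloc2vm_flags() into VM allocation flags.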
*/ static vm_offset_t -vm_thread_stack_create(struct domainset *ds, int pages) +vm_thread_stack_create(struct domainset *ds, int pages, int flags) { vm_page_t ma[KSTACK_MAX_PAGES]; struct vm_domainset_iter di; - int req = VM_ALLOC_NORMAL; - vm_object_t obj; + int req; vm_offset_t ks; int domain, i; - obj = vm_thread_kstack_size_to_obj(pages); - if (vm_ndomains > 1) - obj->domain.dr_policy = ds; - vm_domainset_iter_page_init(&di, obj, 0, &domain, &req); + vm_domainset_iter_policy_init(&di, ds, &domain, &flags); + req = malloc2vm_flags(flags); do { /* * Get a kernel virtual address for this thread's kstack. @@ -480,7 +477,7 @@ vm_thread_stack_create(struct domainset *ds, int pages) vm_page_valid(ma[i]); pmap_qenter(ks, ma, pages); return (ks); - } while (vm_domainset_iter_page(&di, obj, &domain, NULL) == 0); + } while (vm_domainset_iter_policy(&di, &domain) == 0); return (0); } @@ -532,15 +529,9 @@ vm_thread_new(struct thread *td, int pages) ks = 0; if (pages == kstack_pages && kstack_cache != NULL) ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT); - - /* - * Ensure that kstack objects can draw pages from any memory - * domain. Otherwise a local memory shortage can block a process - * swap-in. - */ if (ks == 0) ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)), - pages); + pages, M_NOWAIT); if (ks == 0) return (0); @@ -660,7 +651,8 @@ kstack_import(void *arg, void **store, int cnt, int domain, int flags) ds = DOMAINSET_PREF(domain); for (i = 0; i < cnt; i++) { - store[i] = (void *)vm_thread_stack_create(ds, kstack_pages); + store[i] = (void *)vm_thread_stack_create(ds, kstack_pages, + flags); if (store[i] == NULL) break; } diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index fef28bb883e4..fee50f49c844 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -96,7 +96,7 @@ struct vmmeter __read_mostly vm_cnt = { u_long __exclusive_cache_line vm_user_wire_count; static void -vmcounter_startup(void) +vmcounter_startup(void *dummy __unused) { counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 6d9ea8bf9d93..5b4517d2bf0c 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -2522,15 +2522,13 @@ vm_object_list_handler(struct sysctl_req *req, bool swap_only) continue; } mtx_unlock(&vm_object_list_mtx); + + memset(kvo, 0, sizeof(*kvo)); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = atomic_load_int(&obj->shadow_count); kvo->kvo_memattr = obj->memattr; - kvo->kvo_active = 0; - kvo->kvo_inactive = 0; - kvo->kvo_laundry = 0; - kvo->kvo_flags = 0; if (!swap_only) { vm_page_iter_init(&pages, obj); VM_RADIX_FOREACH(m, &pages) { @@ -2549,12 +2547,12 @@ vm_object_list_handler(struct sysctl_req *req, bool swap_only) kvo->kvo_inactive++; else if (vm_page_in_laundry(m)) kvo->kvo_laundry++; + + if (vm_page_wired(m)) + kvo->kvo_wired++; } } - kvo->kvo_vn_fileid = 0; - kvo->kvo_vn_fsid = 0; - kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; vp = NULL; diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 3f1be78342c9..418a9cff8abf 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -120,7 +120,7 @@ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); -static void vm_pageout_init(void); +static void vm_pageout_init(void *); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int 
page_shortage, @@ -2333,7 +2333,7 @@ vm_pageout_init_domain(int domain) } static void -vm_pageout_init(void) +vm_pageout_init(void *dummy __unused) { u_long freecount; int i; diff --git a/sys/x86/acpica/acpi_apm.c b/sys/x86/acpica/acpi_apm.c index 8e5785cf0ed6..919f76949dd4 100644 --- a/sys/x86/acpica/acpi_apm.c +++ b/sys/x86/acpica/acpi_apm.c @@ -64,6 +64,7 @@ static const struct filterops apm_readfiltops = { .f_isfd = 1, .f_detach = apmreadfiltdetach, .f_event = apmreadfilt, + .f_copy = knote_triv_copy, }; static struct cdevsw apm_cdevsw = { diff --git a/sys/x86/include/mca.h b/sys/x86/include/mca.h index 183480625f6d..553b5d765f17 100644 --- a/sys/x86/include/mca.h +++ b/sys/x86/include/mca.h @@ -44,6 +44,31 @@ struct mca_record { int mr_cpu; }; +enum mca_stat_types { + MCA_T_NONE = 0, + MCA_T_UNCLASSIFIED, + MCA_T_UCODE_ROM_PARITY, + MCA_T_EXTERNAL, + MCA_T_FRC, + MCA_T_INTERNAL_PARITY, + MCA_T_SMM_HANDLER, + MCA_T_INTERNAL_TIMER, + MCA_T_GENERIC_IO, + MCA_T_INTERNAL, + MCA_T_MEMORY, + MCA_T_TLB, + MCA_T_MEMCONTROLLER_GEN, + MCA_T_MEMCONTROLLER_RD, + MCA_T_MEMCONTROLLER_WR, + MCA_T_MEMCONTROLLER_AC, + MCA_T_MEMCONTROLLER_MS, + MCA_T_MEMCONTROLLER_OTHER, + MCA_T_CACHE, + MCA_T_BUS, + MCA_T_UNKNOWN, + MCA_T_COUNT /* Must stay last */ +}; + #ifdef _KERNEL void cmc_intr(void); diff --git a/sys/x86/x86/mca.c b/sys/x86/x86/mca.c index 4ba49469d3a2..735efe307215 100644 --- a/sys/x86/x86/mca.c +++ b/sys/x86/x86/mca.c @@ -46,9 +46,11 @@ #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/sbuf.h> #include <sys/sched.h> #include <sys/smp.h> #include <sys/sysctl.h> +#include <sys/syslog.h> #include <sys/systm.h> #include <sys/taskqueue.h> #include <machine/intr_machdep.h> @@ -124,6 +126,22 @@ SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN, &workaround_erratum383, 0, "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); +#ifdef DIAGNOSTIC +static uint64_t fake_status; +SYSCTL_U64(_hw_mca, OID_AUTO, fake_status, CTLFLAG_RW, + &fake_status, 0, + "Insert artificial MCA with given status (testing purpose only)"); +static int fake_bank; +SYSCTL_INT(_hw_mca, OID_AUTO, fake_bank, CTLFLAG_RW, + &fake_bank, 0, + "Bank to use for artificial MCAs (testing purpose only)"); +#endif + +static bool mca_uselog = false; +SYSCTL_BOOL(_hw_mca, OID_AUTO, uselog, CTLFLAG_RWTUN, &mca_uselog, 0, + "Should the system send non-fatal machine check errors to the log " + "(instead of the console)?"); + static STAILQ_HEAD(, mca_internal) mca_freelist; static int mca_freecount; static STAILQ_HEAD(, mca_internal) mca_records; @@ -131,8 +149,44 @@ static STAILQ_HEAD(, mca_internal) mca_pending; static int mca_ticks = 300; static struct taskqueue *mca_tq; static struct task mca_resize_task; +static struct task mca_postscan_task; static struct timeout_task mca_scan_task; static struct mtx mca_lock; +static bool mca_startup_done = false; + +/* Static buffer to compose messages while in an interrupt context. */ +static char mca_msg_buf[1024]; +static struct mtx mca_msg_buf_lock; + +/* Statistics on number of MCA events by type, updated with the mca_lock. */ +static uint64_t mca_stats[MCA_T_COUNT]; +SYSCTL_OPAQUE(_hw_mca, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_SKIP, + mca_stats, MCA_T_COUNT * sizeof(mca_stats[0]), + "S", "Array of MCA events by type"); + +/* Variables to track and control message rate limiting. 
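+ * A usage sketch: setting the hw.mca.log_interval sysctl defined below to 60
+ * permits at most one correctable-error log per minute once more than 50
+ * events of a given type have been recorded; skipped events are counted and
+ * reported with the next message that does get emitted.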
*/ +static struct timeval mca_last_log_time; +static struct timeval mca_log_interval; +static int mca_log_skipped; + +static int +sysctl_mca_log_interval(SYSCTL_HANDLER_ARGS) +{ + int error; + u_int val; + + val = mca_log_interval.tv_sec; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + mca_log_interval.tv_sec = val; + return (0); +} +SYSCTL_PROC(_hw_mca, OID_AUTO, log_interval, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_log_interval, 0, + sysctl_mca_log_interval, "IU", + "Minimum number of seconds between logging correctable MCAs" + " (0 = no limit)"); static unsigned int mca_ia32_ctl_reg(int bank) @@ -356,21 +410,27 @@ mca_error_request(uint16_t mca_error) } static const char * -mca_error_mmtype(uint16_t mca_error) +mca_error_mmtype(uint16_t mca_error, enum mca_stat_types *event_type) { switch ((mca_error & 0x70) >> 4) { case 0x0: + *event_type = MCA_T_MEMCONTROLLER_GEN; return ("GEN"); case 0x1: + *event_type = MCA_T_MEMCONTROLLER_RD; return ("RD"); case 0x2: + *event_type = MCA_T_MEMCONTROLLER_WR; return ("WR"); case 0x3: + *event_type = MCA_T_MEMCONTROLLER_AC; return ("AC"); case 0x4: + *event_type = MCA_T_MEMCONTROLLER_MS; return ("MS"); } + *event_type = MCA_T_MEMCONTROLLER_OTHER; return ("???"); } @@ -423,87 +483,111 @@ mca_mute(const struct mca_record *rec) /* Dump details about a single machine check. */ static void -mca_log(const struct mca_record *rec) +mca_log(enum scan_mode mode, const struct mca_record *rec, bool fatal) { + int error, numskipped; uint16_t mca_error; + enum mca_stat_types event_type; + struct sbuf sb; + bool uncor, using_shared_buf; if (mca_mute(rec)) return; - if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 && - (!tes_supported(rec->mr_mcg_cap) || + uncor = (rec->mr_status & MC_STATUS_UC) != 0; + + if (!log_corrected && !uncor && (!tes_supported(rec->mr_mcg_cap) || ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2)) return; - printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, + /* Try to use an allocated buffer when not in an interrupt context. 
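+ * (POLLED scans run from the taskqueue rather than from an interrupt
+ * handler, so a dynamically allocated sbuf may be used there and an
+ * allocation failure is tolerated; all other paths fall back to the static
+ * mca_msg_buf protected by mca_msg_buf_lock.)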
*/ + if (mode == POLLED && sbuf_new(&sb, NULL, 512, SBUF_AUTOEXTEND) != NULL) + using_shared_buf = false; + else { + using_shared_buf = true; + mtx_lock_spin(&mca_msg_buf_lock); + sbuf_new(&sb, mca_msg_buf, sizeof(mca_msg_buf), SBUF_FIXEDLEN); + } + + sbuf_printf(&sb, "MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); - printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", + sbuf_printf(&sb, "MCA: Global Cap 0x%016llx, Status 0x%016llx\n", (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); - printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, - rec->mr_cpu_id, rec->mr_apic_id); - printf("MCA: CPU %d ", rec->mr_cpu); + sbuf_printf(&sb, "MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", + cpu_vendor, rec->mr_cpu_id, rec->mr_apic_id); + sbuf_printf(&sb, "MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) - printf("UNCOR "); + sbuf_printf(&sb, "UNCOR "); else { - printf("COR "); + sbuf_printf(&sb, "COR "); if (cmci_supported(rec->mr_mcg_cap)) - printf("(%lld) ", ((long long)rec->mr_status & + sbuf_printf(&sb, "(%lld) ", ((long long)rec->mr_status & MC_STATUS_COR_COUNT) >> 38); if (tes_supported(rec->mr_mcg_cap)) { switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) { case 0x1: - printf("(Green) "); + sbuf_printf(&sb, "(Green) "); break; case 0x2: - printf("(Yellow) "); + sbuf_printf(&sb, "(Yellow) "); break; } } } if (rec->mr_status & MC_STATUS_EN) - printf("EN "); + sbuf_printf(&sb, "EN "); if (rec->mr_status & MC_STATUS_PCC) - printf("PCC "); + sbuf_printf(&sb, "PCC "); if (ser_supported(rec->mr_mcg_cap)) { if (rec->mr_status & MC_STATUS_S) - printf("S "); + sbuf_printf(&sb, "S "); if (rec->mr_status & MC_STATUS_AR) - printf("AR "); + sbuf_printf(&sb, "AR "); } if (rec->mr_status & MC_STATUS_OVER) - printf("OVER "); + sbuf_printf(&sb, "OVER "); mca_error = rec->mr_status & MC_STATUS_MCA_ERROR; + event_type = MCA_T_COUNT; switch (mca_error) { /* Simple error codes. 
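+ * (In addition to formatting the message, each case now records the event
+ * in an MCA_T_* bucket of the hw.mca.stats array.)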
*/ case 0x0000: - printf("no error"); + sbuf_printf(&sb, "no error"); + event_type = MCA_T_NONE; break; case 0x0001: - printf("unclassified error"); + sbuf_printf(&sb, "unclassified error"); + event_type = MCA_T_UNCLASSIFIED; break; case 0x0002: - printf("ucode ROM parity error"); + sbuf_printf(&sb, "ucode ROM parity error"); + event_type = MCA_T_UCODE_ROM_PARITY; break; case 0x0003: - printf("external error"); + sbuf_printf(&sb, "external error"); + event_type = MCA_T_EXTERNAL; break; case 0x0004: - printf("FRC error"); + sbuf_printf(&sb, "FRC error"); + event_type = MCA_T_FRC; break; case 0x0005: - printf("internal parity error"); + sbuf_printf(&sb, "internal parity error"); + event_type = MCA_T_INTERNAL_PARITY; break; case 0x0006: - printf("SMM handler code access violation"); + sbuf_printf(&sb, "SMM handler code access violation"); + event_type = MCA_T_SMM_HANDLER; break; case 0x0400: - printf("internal timer error"); + sbuf_printf(&sb, "internal timer error"); + event_type = MCA_T_INTERNAL_TIMER; break; case 0x0e0b: - printf("generic I/O error"); + sbuf_printf(&sb, "generic I/O error"); + event_type = MCA_T_GENERIC_IO; if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL && (rec->mr_status & MC_STATUS_MISCV)) { - printf(" (pci%d:%d:%d:%d)", + sbuf_printf(&sb, " (pci%d:%d:%d:%d)", (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32), (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24), (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19), @@ -512,7 +596,9 @@ mca_log(const struct mca_record *rec) break; default: if ((mca_error & 0xfc00) == 0x0400) { - printf("internal error %x", mca_error & 0x03ff); + sbuf_printf(&sb, "internal error %x", + mca_error & 0x03ff); + event_type = MCA_T_INTERNAL; break; } @@ -520,101 +606,168 @@ mca_log(const struct mca_record *rec) /* Memory hierarchy error. */ if ((mca_error & 0xeffc) == 0x000c) { - printf("%s memory error", mca_error_level(mca_error)); + sbuf_printf(&sb, "%s memory error", + mca_error_level(mca_error)); + event_type = MCA_T_MEMORY; break; } /* TLB error. */ if ((mca_error & 0xeff0) == 0x0010) { - printf("%sTLB %s error", mca_error_ttype(mca_error), + sbuf_printf(&sb, "%sTLB %s error", + mca_error_ttype(mca_error), mca_error_level(mca_error)); + event_type = MCA_T_TLB; break; } /* Memory controller error. */ if ((mca_error & 0xef80) == 0x0080) { - printf("%s channel ", mca_error_mmtype(mca_error)); + sbuf_printf(&sb, "%s channel ", + mca_error_mmtype(mca_error, &event_type)); if ((mca_error & 0x000f) != 0x000f) - printf("%d", mca_error & 0x000f); + sbuf_printf(&sb, "%d", mca_error & 0x000f); else - printf("??"); - printf(" memory error"); + sbuf_printf(&sb, "??"); + sbuf_printf(&sb, " memory error"); break; } /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { - printf("%sCACHE %s %s error", + sbuf_printf(&sb, "%sCACHE %s %s error", mca_error_ttype(mca_error), mca_error_level(mca_error), mca_error_request(mca_error)); + event_type = MCA_T_CACHE; break; } /* Extended memory error. */ if ((mca_error & 0xef80) == 0x0280) { - printf("%s channel ", mca_error_mmtype(mca_error)); + sbuf_printf(&sb, "%s channel ", + mca_error_mmtype(mca_error, &event_type)); if ((mca_error & 0x000f) != 0x000f) - printf("%d", mca_error & 0x000f); + sbuf_printf(&sb, "%d", mca_error & 0x000f); else - printf("??"); - printf(" extended memory error"); + sbuf_printf(&sb, "??"); + sbuf_printf(&sb, " extended memory error"); break; } /* Bus and/or Interconnect error. 
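+ * (Decoded below into participation (Source/Responder/Observer), request
+ * type, memory vs. I/O, and an optional "timed out" flag.)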
*/ if ((mca_error & 0xe800) == 0x0800) { - printf("BUS%s ", mca_error_level(mca_error)); + sbuf_printf(&sb, "BUS%s ", mca_error_level(mca_error)); + event_type = MCA_T_BUS; switch ((mca_error & 0x0600) >> 9) { case 0: - printf("Source"); + sbuf_printf(&sb, "Source"); break; case 1: - printf("Responder"); + sbuf_printf(&sb, "Responder"); break; case 2: - printf("Observer"); + sbuf_printf(&sb, "Observer"); break; default: - printf("???"); + sbuf_printf(&sb, "???"); break; } - printf(" %s ", mca_error_request(mca_error)); + sbuf_printf(&sb, " %s ", mca_error_request(mca_error)); switch ((mca_error & 0x000c) >> 2) { case 0: - printf("Memory"); + sbuf_printf(&sb, "Memory"); break; case 2: - printf("I/O"); + sbuf_printf(&sb, "I/O"); break; case 3: - printf("Other"); + sbuf_printf(&sb, "Other"); break; default: - printf("???"); + sbuf_printf(&sb, "???"); break; } if (mca_error & 0x0100) - printf(" timed out"); + sbuf_printf(&sb, " timed out"); break; } - printf("unknown error %x", mca_error); + sbuf_printf(&sb, "unknown error %x", mca_error); + event_type = MCA_T_UNKNOWN; break; } - printf("\n"); + sbuf_printf(&sb, "\n"); if (rec->mr_status & MC_STATUS_ADDRV) { - printf("MCA: Address 0x%llx", (long long)rec->mr_addr); + sbuf_printf(&sb, "MCA: Address 0x%llx", + (long long)rec->mr_addr); if (ser_supported(rec->mr_mcg_cap) && (rec->mr_status & MC_STATUS_MISCV)) { - printf(" (Mode: %s, LSB: %d)", + sbuf_printf(&sb, " (Mode: %s, LSB: %d)", mca_addres_mode(rec->mr_misc), (int)(rec->mr_misc & MC_MISC_RA_LSB)); } - printf("\n"); + sbuf_printf(&sb, "\n"); } if (rec->mr_status & MC_STATUS_MISCV) - printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc); + sbuf_printf(&sb, "MCA: Misc 0x%llx\n", (long long)rec->mr_misc); + + if (event_type < 0 || event_type >= MCA_T_COUNT) { + KASSERT(0, ("%s: invalid event type (%d)", __func__, + event_type)); + event_type = MCA_T_UNKNOWN; + } + numskipped = 0; + if (!fatal && !uncor) { + /* + * Update statistics and check the rate limit for + * correctable errors. The rate limit is only applied + * after the system records a reasonable number of errors + * of the same type. The goal is to reduce the impact of + * the system seeing and attempting to log a burst of + * similar errors, which (especially when printed to the + * console) can be expensive. + */ + mtx_lock_spin(&mca_lock); + mca_stats[event_type]++; + if (mca_log_interval.tv_sec > 0 && mca_stats[event_type] > 50 && + ratecheck(&mca_last_log_time, &mca_log_interval) == 0) { + mca_log_skipped++; + mtx_unlock_spin(&mca_lock); + goto done; + } + numskipped = mca_log_skipped; + mca_log_skipped = 0; + mtx_unlock_spin(&mca_lock); + } + + error = sbuf_finish(&sb); + if (fatal || !mca_uselog) { + if (numskipped > 0) + printf("MCA: %d events skipped due to rate limit\n", + numskipped); + if (error) + printf("MCA: error logging message (sbuf error %d)\n", + error); + else + sbuf_putbuf(&sb); + } else { + if (numskipped > 0) + log(LOG_ERR, + "MCA: %d events skipped due to rate limit\n", + numskipped); + if (error) + log(LOG_ERR, + "MCA: error logging message (sbuf error %d)\n", + error); + else + log(uncor ? 
LOG_CRIT : LOG_ERR, "%s", sbuf_data(&sb)); + } + +done: + sbuf_delete(&sb); + if (using_shared_buf) + mtx_unlock_spin(&mca_msg_buf_lock); } static bool @@ -662,8 +815,24 @@ mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank, bool mce, recover; status = rdmsr(mca_msr_ops.status(bank)); - if (!(status & MC_STATUS_VAL)) + if (!(status & MC_STATUS_VAL)) { +#ifdef DIAGNOSTIC + /* + * Check if we have a pending artificial event to generate. + * Note that this is potentially racy with the sysctl. The + * tradeoff is deemed acceptable given the test nature + * of the code. + */ + if (fake_status && bank == fake_bank) { + status = fake_status; + fake_status = 0; + } + if (!(status & MC_STATUS_VAL)) + return (0); +#else return (0); +#endif + } recover = *recoverablep; mce = mca_is_mce(mcg_cap, status, &recover); @@ -757,9 +926,9 @@ mca_record_entry(enum scan_mode mode, const struct mca_record *record) mtx_lock_spin(&mca_lock); rec = STAILQ_FIRST(&mca_freelist); if (rec == NULL) { - printf("MCA: Unable to allocate space for an event.\n"); - mca_log(record); mtx_unlock_spin(&mca_lock); + printf("MCA: Unable to allocate space for an event.\n"); + mca_log(mode, record, false); return; } STAILQ_REMOVE_HEAD(&mca_freelist, link); @@ -916,7 +1085,7 @@ mca_scan(enum scan_mode mode, bool *recoverablep) if (*recoverablep) mca_record_entry(mode, &rec); else - mca_log(&rec); + mca_log(mode, &rec, true); } #ifdef DEV_APIC @@ -978,18 +1147,49 @@ static void mca_process_records(enum scan_mode mode) { struct mca_internal *mca; + STAILQ_HEAD(, mca_internal) tmplist; + + /* + * If in an interrupt context, defer the post-scan activities to a + * task queue. + */ + if (mode != POLLED) { + if (mca_startup_done) + taskqueue_enqueue(mca_tq, &mca_postscan_task); + return; + } + + /* + * Copy the pending list to the stack so we can drop the spin lock + * while we are emitting logs. + */ + STAILQ_INIT(&tmplist); + mtx_lock_spin(&mca_lock); + STAILQ_SWAP(&mca_pending, &tmplist, mca_internal); + mtx_unlock_spin(&mca_lock); + + STAILQ_FOREACH(mca, &tmplist, link) + mca_log(mode, &mca->rec, false); mtx_lock_spin(&mca_lock); - while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) { - STAILQ_REMOVE_HEAD(&mca_pending, link); - mca_log(&mca->rec); + while ((mca = STAILQ_FIRST(&tmplist)) != NULL) { + STAILQ_REMOVE_HEAD(&tmplist, link); mca_store_record(mca); } mtx_unlock_spin(&mca_lock); - if (mode == POLLED) - mca_resize_freelist(); - else if (!cold) - taskqueue_enqueue(mca_tq, &mca_resize_task); + mca_resize_freelist(); +} + +/* + * Emit log entries and resize the free list. This is intended to be called + * from a task queue to handle work which does not need to be done (or cannot + * be done) in an interrupt context. + */ +static void +mca_postscan(void *context __unused, int pending __unused) +{ + + mca_process_records(POLLED); } /* @@ -1060,7 +1260,7 @@ sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS) doresize = true; } mtx_unlock_spin(&mca_lock); - if (doresize && !cold) + if (doresize && mca_startup_done) taskqueue_enqueue(mca_tq, &mca_resize_task); return (error); } @@ -1072,12 +1272,16 @@ mca_startup(void *dummy) if (mca_banks <= 0) return; - /* CMCIs during boot may have claimed items from the freelist. */ - mca_resize_freelist(); - taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq"); taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, mca_ticks * SBT_1S, 0, C_PREL(1)); + mca_startup_done = true; + + /* + * CMCIs during boot may have recorded entries. Conduct the post-scan + * activities now. 
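+ * (mca_startup_done is now set, so subsequent interrupt-time scans will
+ * defer this work by enqueueing mca_postscan_task instead.)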
+ */ + mca_postscan(NULL, 0); } SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL); @@ -1130,6 +1334,7 @@ mca_setup(uint64_t mcg_cap) mca_banks = mcg_cap & MCG_CAP_COUNT; mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); + mtx_init(&mca_msg_buf_lock, "mca_msg_buf", NULL, MTX_SPIN); STAILQ_INIT(&mca_records); STAILQ_INIT(&mca_pending); mca_tq = taskqueue_create_fast("mca", M_WAITOK, @@ -1137,6 +1342,7 @@ mca_setup(uint64_t mcg_cap) TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL); STAILQ_INIT(&mca_freelist); TASK_INIT(&mca_resize_task, 0, mca_resize, NULL); + TASK_INIT(&mca_postscan_task, 0, mca_postscan, NULL); mca_resize_freelist(); SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0, @@ -1540,6 +1746,9 @@ mca_intr(void) panic("Unrecoverable machine check exception"); } + if (count) + mca_process_records(MCE); + /* Clear MCIP. */ wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP); } diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c index a1a5d8140b14..3b873d9dae73 100644 --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -650,7 +650,7 @@ retry: #endif /* SMP */ static void -init_TSC_tc(void) +init_TSC_tc(void *dummy __unused) { uint64_t max_freq; int shift; diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c index 994dc3e0804c..43a253cc2860 100644 --- a/sys/x86/xen/xen_apic.c +++ b/sys/x86/xen/xen_apic.c @@ -330,7 +330,7 @@ xen_cpu_ipi_init(int cpu) } static void -xen_setup_cpus(void) +xen_setup_cpus(void *dummy __unused) { uint32_t regs[4]; int i;