Diffstat (limited to 'sys/kern')
38 files changed, 1574 insertions, 402 deletions
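The first hunks below (sys/kern/imgact_elf.c) replace the repeated __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) and __CONCAT(_kern_elf, __ELF_WORD_SIZE) constructs with the new ELF_ABI_NAME, ELF_ABI_ID, and ELF_NODE_OID macros. As a minimal illustrative sketch (not part of the patch), this is how those macros expand when __ELF_WORD_SIZE is 64; __CONCAT and __XSTRING are the token-pasting and stringizing helpers from <sys/cdefs.h>:

#include <sys/cdefs.h>

#define __ELF_WORD_SIZE 64	/* normally provided by the build; fixed here only for the sketch */

#define ELF_ABI_NAME	__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))	/* -> "ELF64" */
#define ELF_ABI_ID	__CONCAT(elf, __ELF_WORD_SIZE)			/* -> elf64 */
#define ELF_NODE_OID	__CONCAT(_kern_, ELF_ABI_ID)			/* -> _kern_elf64 */

/*
 * So a declaration such as
 *	SYSCTL_INT(ELF_NODE_OID, OID_AUTO, nxstack, ...)
 * names the same sysctl node as the old
 *	SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, nxstack, ...)
 * and ELF_ABI_NAME " brand of last resort" still yields "ELF64 brand of last resort".
 */
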
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 2690ad3b2679..1bc2491a1a12 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -84,6 +84,13 @@ #define ELF_NOTE_ROUNDSIZE 4 #define OLD_EI_BRAND 8 +/* + * ELF_ABI_NAME is a string name of the ELF ABI. ELF_ABI_ID is used + * to build variable names. + */ +#define ELF_ABI_NAME __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) +#define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE) + static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); @@ -104,14 +111,15 @@ static Elf_Word __elfN(untrans_prot)(vm_prot_t); static size_t __elfN(prepare_register_notes)(struct thread *td, struct note_info_list *list, struct thread *target_td); -SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, +SYSCTL_NODE(_kern, OID_AUTO, ELF_ABI_ID, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); +#define ELF_NODE_OID __CONCAT(_kern_, ELF_ABI_ID) + int __elfN(fallback_brand) = -1; -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); + ELF_ABI_NAME " brand of last resort"); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, @@ -126,22 +134,22 @@ int __elfN(nxstack) = #else 0; #endif -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": support PT_GNU_STACK for non-executable stack control"); + ELF_ABI_NAME ": support PT_GNU_STACK for non-executable stack control"); #if defined(__amd64__) static int __elfN(vdso) = 1; -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, vdso, CTLFLAG_RWTUN, &__elfN(vdso), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable vdso preloading"); + ELF_ABI_NAME ": enable vdso preloading"); #else static int __elfN(vdso) = 0; #endif #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__)) int i386_read_exec = 0; -SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif @@ -161,15 +169,15 @@ sysctl_pie_base(SYSCTL_HANDLER_ARGS) __elfN(pie_base) = val; return (0); } -SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base, +SYSCTL_PROC(ELF_NODE_OID, OID_AUTO, pie_base, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_pie_base, "LU", "PIE load base without randomization"); -SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, +SYSCTL_NODE(ELF_NODE_OID, OID_AUTO, aslr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); -#define ASLR_NODE_OID __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr) +#define ASLR_NODE_OID __CONCAT(ELF_NODE_OID, _aslr) /* * Enable ASLR by default for 64-bit non-PIE binaries. 
32-bit architectures @@ -179,8 +187,7 @@ SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, static int __elfN(aslr_enabled) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN, &__elfN(aslr_enabled), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable address map randomization"); + ELF_ABI_NAME ": enable address map randomization"); /* * Enable ASLR by default for 64-bit PIE binaries. @@ -188,8 +195,7 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN, static int __elfN(pie_aslr_enabled) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN, &__elfN(pie_aslr_enabled), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable address map randomization for PIE binaries"); + ELF_ABI_NAME ": enable address map randomization for PIE binaries"); /* * Sbrk is deprecated and it can be assumed that in most cases it will not be @@ -199,27 +205,25 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN, static int __elfN(aslr_honor_sbrk) = 0; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW, &__elfN(aslr_honor_sbrk), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used"); + ELF_ABI_NAME ": assume sbrk is used"); static int __elfN(aslr_stack) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack, CTLFLAG_RWTUN, &__elfN(aslr_stack), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable stack address randomization"); + ELF_ABI_NAME ": enable stack address randomization"); static int __elfN(aslr_shared_page) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, shared_page, CTLFLAG_RWTUN, &__elfN(aslr_shared_page), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable shared page address randomization"); + ELF_ABI_NAME ": enable shared page address randomization"); static int __elfN(sigfastblock) = 1; -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, sigfastblock, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, sigfastblock, CTLFLAG_RWTUN, &__elfN(sigfastblock), 0, "enable sigfastblock for new processes"); static bool __elfN(allow_wx) = true; -SYSCTL_BOOL(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, allow_wx, +SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx, CTLFLAG_RWTUN, &__elfN(allow_wx), 0, "Allow pages to be mapped simultaneously writable and executable"); @@ -2606,11 +2610,13 @@ note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) int structsize; p = arg; - size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); + size = sizeof(structsize) + + (1 + p->p_ucred->cr_ngroups) * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); + sbuf_bcat(sb, &p->p_ucred->cr_gid, sizeof(gid_t)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } @@ -2951,9 +2957,9 @@ __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, */ static struct execsw __elfN(execsw) = { .ex_imgact = __CONCAT(exec_, __elfN(imgact)), - .ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) + .ex_name = ELF_ABI_NAME }; -EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); +EXEC_SET(ELF_ABI_ID, __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 36ce44b988be..87ffdb8dbf07 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -145,13 +145,6 @@ FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif 
/* - * This ensures that there is at least one entry so that the sysinit_set - * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never - * executed. - */ -SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); - -/* * The sysinit linker set compiled into the kernel. These are placed onto the * sysinit list by mi_startup; sysinit_add can add (e.g., from klds) additional * sysinits to the linked list but the linker set here does not change. @@ -296,7 +289,7 @@ mi_startup(void) BOOTTRACE_INIT("sysinit 0x%7x", sip->subsystem); #if defined(VERBOSE_SYSINIT) - if (sip->subsystem > last && verbose_sysinit != 0) { + if (sip->subsystem != last && verbose_sysinit != 0) { verbose = 1; printf("subsystem %x\n", sip->subsystem); } diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index fcd232cde21e..e42e7dcf8b44 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -663,4 +663,6 @@ struct sysent sysent[] = { { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */ { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ + { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ + { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ }; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index a27ab33b34da..19118eb7f275 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -658,6 +658,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = EBADF; break; } + fsetfl_lock(fp); do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; @@ -665,26 +666,34 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); got_set = tmp & ~flg; got_cleared = flg & ~tmp; - tmp = fp->f_flag & FNONBLOCK; - error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); - if (error != 0) - goto revert_f_setfl; - tmp = fp->f_flag & FASYNC; - error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); - if (error == 0) { - fdrop(fp, td); - break; + if (((got_set | got_cleared) & FNONBLOCK) != 0) { + tmp = fp->f_flag & FNONBLOCK; + error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + if (error != 0) + goto revert_flags; + } + if (((got_set | got_cleared) & FASYNC) != 0) { + tmp = fp->f_flag & FASYNC; + error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); + if (error != 0) + goto revert_nonblock; + } + fsetfl_unlock(fp); + fdrop(fp, td); + break; +revert_nonblock: + if (((got_set | got_cleared) & FNONBLOCK) != 0) { + tmp = ~fp->f_flag & FNONBLOCK; + (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); } - atomic_clear_int(&fp->f_flag, FNONBLOCK); - tmp = 0; - (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); -revert_f_setfl: +revert_flags: do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= got_cleared; tmp &= ~got_set; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); + fsetfl_unlock(fp); fdrop(fp, 
td); break; @@ -5250,6 +5259,8 @@ file_type_to_name(short type) return ("eventfd"); case DTYPE_TIMERFD: return ("timerfd"); + case DTYPE_JAILDESC: + return ("jail"); default: return ("unkn"); } diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c index 0cb0f566a839..7c0654769581 100644 --- a/sys/kern/kern_environment.c +++ b/sys/kern/kern_environment.c @@ -1098,65 +1098,65 @@ kernenv_next(char *cp) } void -tunable_int_init(void *data) +tunable_int_init(const void *data) { - struct tunable_int *d = (struct tunable_int *)data; + const struct tunable_int *d = data; TUNABLE_INT_FETCH(d->path, d->var); } void -tunable_long_init(void *data) +tunable_long_init(const void *data) { - struct tunable_long *d = (struct tunable_long *)data; + const struct tunable_long *d = data; TUNABLE_LONG_FETCH(d->path, d->var); } void -tunable_ulong_init(void *data) +tunable_ulong_init(const void *data) { - struct tunable_ulong *d = (struct tunable_ulong *)data; + const struct tunable_ulong *d = data; TUNABLE_ULONG_FETCH(d->path, d->var); } void -tunable_int64_init(void *data) +tunable_int64_init(const void *data) { - struct tunable_int64 *d = (struct tunable_int64 *)data; + const struct tunable_int64 *d = data; TUNABLE_INT64_FETCH(d->path, d->var); } void -tunable_uint64_init(void *data) +tunable_uint64_init(const void *data) { - struct tunable_uint64 *d = (struct tunable_uint64 *)data; + const struct tunable_uint64 *d = data; TUNABLE_UINT64_FETCH(d->path, d->var); } void -tunable_quad_init(void *data) +tunable_quad_init(const void *data) { - struct tunable_quad *d = (struct tunable_quad *)data; + const struct tunable_quad *d = data; TUNABLE_QUAD_FETCH(d->path, d->var); } void -tunable_bool_init(void *data) +tunable_bool_init(const void *data) { - struct tunable_bool *d = (struct tunable_bool *)data; + const struct tunable_bool *d = data; TUNABLE_BOOL_FETCH(d->path, d->var); } void -tunable_str_init(void *data) +tunable_str_init(const void *data) { - struct tunable_str *d = (struct tunable_str *)data; + const struct tunable_str *d = data; TUNABLE_STR_FETCH(d->path, d->var, d->size); } diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index eb77a5064113..23d8dc9cf54a 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -50,6 +50,8 @@ #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/fcntl.h> +#include <sys/jail.h> +#include <sys/jaildesc.h> #include <sys/kthread.h> #include <sys/selinfo.h> #include <sys/queue.h> @@ -163,6 +165,9 @@ static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); +static int filt_jailattach(struct knote *kn); +static void filt_jaildetach(struct knote *kn); +static int filt_jail(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); @@ -195,6 +200,12 @@ static const struct filterops proc_filtops = { .f_detach = filt_procdetach, .f_event = filt_proc, }; +static const struct filterops jail_filtops = { + .f_isfd = 0, + .f_attach = filt_jailattach, + .f_detach = filt_jaildetach, + .f_event = filt_jail, +}; static const struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, @@ -365,6 +376,8 @@ static struct { [~EVFILT_USER] = { &user_filtops, 1 }, [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, + [~EVFILT_JAIL] = { 
&jail_filtops, 1 }, + [~EVFILT_JAILDESC] = { &file_filtops, 1 }, }; /* @@ -614,6 +627,86 @@ knote_fork(struct knlist *list, int pid) } } +int +filt_jailattach(struct knote *kn) +{ + struct prison *pr; + + if (kn->kn_id == 0) { + /* Let jid=0 watch the current prison (including prison0). */ + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + } else { + sx_slock(&allprison_lock); + pr = prison_find_child(curthread->td_ucred->cr_prison, + kn->kn_id); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENOENT); + if (!prison_isalive(pr)) { + mtx_unlock(&pr->pr_mtx); + return (ENOENT); + } + } + kn->kn_ptr.p_prison = pr; + kn->kn_flags |= EV_CLEAR; + knlist_add(pr->pr_klist, kn, 1); + mtx_unlock(&pr->pr_mtx); + return (0); +} + +void +filt_jaildetach(struct knote *kn) +{ + if (kn->kn_ptr.p_prison != NULL) { + knlist_remove(kn->kn_knlist, kn, 0); + kn->kn_ptr.p_prison = NULL; + } else + kn->kn_status |= KN_DETACHED; +} + +int +filt_jail(struct knote *kn, long hint) +{ + struct prison *pr; + u_int event; + + pr = kn->kn_ptr.p_prison; + if (pr == NULL) /* already activated, from attach filter */ + return (0); + + /* + * Mask off extra data. In the NOTE_JAIL_CHILD case, that's + * everything except the NOTE_JAIL_CHILD bit itself, since a + * JID is any positive integer. + */ + event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD : + (u_int)hint & NOTE_JAIL_CTRLMASK; + + /* If the user is interested in this event, record it. */ + if (kn->kn_sfflags & event) { + kn->kn_fflags |= event; + /* Report the created jail id or attached process id. */ + if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_MULTI; + kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U : + (u_int)hint & ~event; + } + } + + /* Prison is gone, so flag the event as finished. */ + if (event == NOTE_JAIL_REMOVE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + kn->kn_ptr.p_prison = NULL; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; + return (1); + } + + return (kn->kn_fflags != 0); +} + /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. 
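The filt_jailattach()/filt_jail() handlers above implement the kernel side of the new EVFILT_JAIL filter: a knote with ident 0 attaches to the caller's current prison, and NOTE_JAIL_CHILD/NOTE_JAIL_ATTACH events carry the new jail id or attaching pid in kn_data. A hedged userspace sketch of how a monitor might consume the filter, assuming EVFILT_JAIL and the NOTE_JAIL_* constants are exported through <sys/event.h> as part of this change:

#include <sys/types.h>
#include <sys/event.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent ev;
	int kq;

	kq = kqueue();
	if (kq == -1)
		err(1, "kqueue");
	/* ident 0 means "the current prison", per filt_jailattach() above. */
	EV_SET(&ev, 0, EVFILT_JAIL, EV_ADD | EV_CLEAR,
	    NOTE_JAIL_CHILD | NOTE_JAIL_ATTACH | NOTE_JAIL_REMOVE, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
			err(1, "kevent wait");
		/* data holds the created jid or attached pid (0 once NOTE_JAIL_MULTI is set). */
		printf("jail event: fflags %#x data %jd\n",
		    ev.fflags, (intmax_t)ev.data);
		if (ev.flags & EV_EOF)	/* NOTE_JAIL_REMOVE: the prison is gone */
			break;
	}
	return (0);
}
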
@@ -1771,7 +1864,7 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp) kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) - return (EBADF); + return (EINVAL); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { @@ -2800,6 +2893,7 @@ knote_init(void) knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue); + prison0.pr_klist = knlist_alloc(&prison0.pr_mtx); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); @@ -3033,7 +3127,7 @@ sysctl_kern_proc_kqueue(SYSCTL_HANDLER_ARGS) return (error); td = curthread; -#ifdef FREEBSD_COMPAT32 +#ifdef COMPAT_FREEBSD32 compat32 = SV_CURPROC_FLAG(SV_ILP32); #else compat32 = false; diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 7c9a15ae18f3..3697d95fe0e5 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -39,15 +39,18 @@ #include <sys/kernel.h> #include <sys/systm.h> #include <sys/errno.h> +#include <sys/file.h> #include <sys/sysproto.h> #include <sys/malloc.h> #include <sys/osd.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/epoch.h> +#include <sys/event.h> #include <sys/taskqueue.h> #include <sys/fcntl.h> #include <sys/jail.h> +#include <sys/jaildesc.h> #include <sys/linker.h> #include <sys/lock.h> #include <sys/mman.h> @@ -154,7 +157,8 @@ static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); -static void prison_cleanup(struct prison *pr); +static void prison_cleanup_locked(struct prison *pr); +static void prison_cleanup_unlocked(struct prison *pr); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_proc_relink(struct prison *opr, struct prison *npr, @@ -167,6 +171,7 @@ static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif +static void prison_knote(struct prison *pr, long hint); /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ @@ -238,6 +243,9 @@ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.unprivileged_parent_tampering", "allow.nounprivileged_parent_tampering", PR_ALLOW_UNPRIV_PARENT_TAMPER}, +#ifdef AUDIT + {"allow.setaudit", "allow.nosetaudit", PR_ALLOW_SETAUDIT}, +#endif }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); @@ -985,6 +993,7 @@ prison_ip_cnt(const struct prison *pr, const pr_family_t af) int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { + struct file *jfp_out; struct nameidata nd; #ifdef INET struct prison_ip *ip4; @@ -995,6 +1004,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; + struct ucred *jdcred; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; @@ -1008,7 +1018,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; - int deadid, jid, jsys, len, level; + int deadid, jfd_in, jfd_out, jfd_pos, jid, 
jsys, len, level; int childmax, osreldt, rsnum, slevel; #ifdef INET int ip4s; @@ -1018,22 +1028,32 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) int ip6s; bool redo_ip6; #endif + bool maybe_changed; uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; - error = priv_check(td, PRIV_JAIL_SET); - if (!error && (flags & JAIL_ATTACH)) - error = priv_check(td, PRIV_JAIL_ATTACH); - if (error) - return (error); mypr = td->td_ucred->cr_prison; - if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) + if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) && + mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); + if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) == + (JAIL_USE_DESC | JAIL_AT_DESC)) + return (EINVAL); + prison_hold(mypr); +#ifdef INET + ip4 = NULL; +#endif +#ifdef INET6 + ip6 = NULL; +#endif + g_path = NULL; + jfp_out = NULL; + jfd_out = -1; /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this @@ -1046,14 +1066,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) */ error = vfs_buildopts(optuio, &opts); if (error) - return (error); -#ifdef INET - ip4 = NULL; -#endif -#ifdef INET6 - ip6 = NULL; -#endif - g_path = NULL; + goto done_free; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { @@ -1062,6 +1075,61 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_errmsg; } + error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); + if (error == ENOENT) { + if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC)) { + vfs_opterror(opts, "missing desc"); + goto done_errmsg; + } + jfd_in = -1; + } else if (error != 0) + goto done_free; + else { + if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC))) { + vfs_opterror(opts, "unexpected desc"); + goto done_errmsg; + } + if (flags & JAIL_AT_DESC) { + /* + * Look up and create jails based on the + * descriptor's prison. + */ + prison_free(mypr); + error = jaildesc_find(td, jfd_in, &mypr, NULL); + if (error != 0) { + vfs_opterror(opts, error == ENOENT ? + "descriptor to dead jail" : + "not a jail descriptor"); + goto done_errmsg; + } + if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) { + error = EPERM; + goto done_free; + } + } + if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { + /* Allocate a jail descriptor to return later. */ + error = jaildesc_alloc(td, &jfp_out, &jfd_out, + flags & JAIL_OWN_DESC); + if (error) + goto done_free; + } + } + + /* + * Delay the permission check if using a jail descriptor, + * until we get the descriptor's credentials. + */ + if (!(flags & JAIL_USE_DESC)) { + error = priv_check(td, PRIV_JAIL_SET); + if (error == 0 && (flags & JAIL_ATTACH)) + error = priv_check(td, PRIV_JAIL_ATTACH); + if (error) + goto done_free; + } + error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; @@ -1422,6 +1490,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) pr = NULL; inspr = NULL; deadpr = NULL; + maybe_changed = false; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); @@ -1436,7 +1505,45 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) error = EAGAIN; goto done_deref; } - if (jid != 0) { + if (flags & JAIL_USE_DESC) { + /* Get the jail from its descriptor. 
*/ + error = jaildesc_find(td, jfd_in, &pr, &jdcred); + if (error) { + vfs_opterror(opts, error == ENOENT ? + "descriptor to dead jail" : + "not a jail descriptor"); + goto done_deref; + } + drflags |= PD_DEREF; + error = priv_check_cred(jdcred, PRIV_JAIL_SET); + if (error == 0 && (flags & JAIL_ATTACH)) + error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); + crfree(jdcred); + if (error) + goto done_deref; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + if (cuflags == JAIL_CREATE) { + error = EEXIST; + vfs_opterror(opts, "jail %d already exists", + pr->pr_id); + goto done_deref; + } + if (!prison_isalive(pr)) { + /* While a jid can be resurrected, the prison + * itself cannot. + */ + error = ENOENT; + vfs_opterror(opts, "jail %d is dying", pr->pr_id); + goto done_deref; + } + if (jid != 0 && jid != pr->pr_id) { + error = EINVAL; + vfs_opterror(opts, "cannot change jid"); + goto done_deref; + } + jid = pr->pr_id; + } else if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); @@ -1570,7 +1677,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } } } - /* Update: must provide a jid or name. */ + /* Update: must provide a desc, jid, or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); @@ -1643,6 +1750,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; + pr->pr_klist = knlist_alloc(&pr->pr_mtx); /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) @@ -1722,8 +1830,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ - prison_hold(pr); - drflags |= PD_DEREF; + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { @@ -1880,6 +1990,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_deref; } } + maybe_changed = true; /* Set the parameters of the prison. */ #ifdef INET @@ -2112,7 +2223,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * reference via persistence, or is about to gain one via attachment. */ if (created) { - drflags = prison_lock_xlock(pr, drflags); + sx_assert(&allprison_lock, SX_XLOCKED); + prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id); + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; pr->pr_state = PRISON_STATE_ALIVE; } @@ -2146,10 +2260,37 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) printf("Warning jail jid=%d: mountd/nfsd requires a separate" " file system\n", pr->pr_id); + /* + * Now that the prison is fully created without error, set the + * jail descriptor if one was requested. This is the only + * parameter that is returned to the caller (except the error + * message). 
+ */ + if (jfd_out >= 0) { + if (!(drflags & PD_LOCKED)) { + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + } + jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1; + if (optuio->uio_segflg == UIO_SYSSPACE) + *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out; + else + (void)copyout(&jfd_out, + optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out)); + jaildesc_set_prison(jfp_out, pr); + } + drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; done_deref: + /* + * Report changes to kevent. This can happen even if the + * system call fails, as changes might have been made before + * the failure. + */ + if (maybe_changed && !created) + prison_knote(pr, NOTE_JAIL_SET); /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); @@ -2176,15 +2317,21 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } } done_free: + /* Clean up other resources. */ #ifdef INET prison_ip_free(ip4); #endif #ifdef INET6 prison_ip_free(ip6); #endif + if (jfp_out != NULL) + fdrop(jfp_out, td); + if (error && jfd_out >= 0) + (void)kern_close(td, jfd_out); if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); + prison_free(mypr); return (error); } @@ -2329,16 +2476,21 @@ int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; + struct file *jfp_out; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; + int jfd_in, jfd_out; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); + if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) == + (JAIL_USE_DESC | JAIL_AT_DESC)) + return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); @@ -2346,13 +2498,70 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; + prison_hold(mypr); pr = NULL; + jfp_out = NULL; + jfd_out = -1; /* - * Find the prison specified by one of: lastjid, jid, name. + * Find the prison specified by one of: desc, lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; + + error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); + if (error == ENOENT) { + if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { + vfs_opterror(opts, "missing desc"); + goto done; + } + } else if (error == 0) { + if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC))) { + vfs_opterror(opts, "unexpected desc"); + goto done; + } + if (flags & JAIL_USE_DESC) { + /* Get the jail from its descriptor. */ + error = jaildesc_find(td, jfd_in, &pr, NULL); + if (error) { + vfs_opterror(opts, error == ENOENT ? + "descriptor to dead jail" : + "not a jail descriptor"); + goto done; + } + drflags |= PD_DEREF; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { + error = ENOENT; + vfs_opterror(opts, "jail %d is dying", + pr->pr_id); + goto done; + } + goto found_prison; + } + if (flags & JAIL_AT_DESC) { + /* Look up jails based on the descriptor's prison. */ + prison_free(mypr); + error = jaildesc_find(td, jfd_in, &mypr, NULL); + if (error != 0) { + vfs_opterror(opts, error == ENOENT ? + "descriptor to dead jail" : + "not a jail descriptor"); + goto done; + } + } + if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { + /* Allocate a jail descriptor to return later. 
*/ + error = jaildesc_alloc(td, &jfp_out, &jfd_out, + flags & JAIL_OWN_DESC); + if (error) + goto done; + } + } else + goto done; + error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { @@ -2421,9 +2630,17 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) found_prison: /* Get the parameters of the prison. */ - prison_hold(pr); - drflags |= PD_DEREF; + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } td->td_retval[0] = pr->pr_id; + if (jfd_out >= 0) { + error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out)); + if (error != 0 && error != ENOENT) + goto done; + jaildesc_set_prison(jfp_out, pr); + } error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; @@ -2603,6 +2820,13 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); + else if (drflags & PD_LIST_XLOCKED) + sx_xunlock(&allprison_lock); + /* Clean up other resources. */ + if (jfp_out != NULL) + (void)fdrop(jfp_out, td); + if (error && jfd_out >= 0) + (void)kern_close(td, jfd_out); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); @@ -2619,6 +2843,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) } } vfs_freeopts(opts); + prison_free(mypr); return (error); } @@ -2643,14 +2868,54 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap) sx_xunlock(&allprison_lock); return (EINVAL); } + prison_hold(pr); + prison_remove(pr); + return (0); +} + +/* + * struct jail_remove_jd_args { + * int fd; + * }; + */ +int +sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap) +{ + struct prison *pr; + struct ucred *jdcred; + int error; + + error = jaildesc_find(td, uap->fd, &pr, &jdcred); + if (error) + return (error); + error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE); + crfree(jdcred); + if (error) { + prison_free(pr); + return (error); + } + sx_xlock(&allprison_lock); + mtx_lock(&pr->pr_mtx); + prison_remove(pr); + return (0); +} + +/* + * Begin the removal process for a prison. The allprison lock should + * be held exclusively, and the prison should be both locked and held. + */ +void +prison_remove(struct prison *pr) +{ + sx_assert(&allprison_lock, SA_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_OWNED); if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); - return (0); + return; } - prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); - return (0); + prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); } /* @@ -2685,6 +2950,44 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap) return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } +/* + * struct jail_attach_jd_args { + * int fd; + * }; + */ +int +sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap) +{ + struct prison *pr; + struct ucred *jdcred; + int drflags, error; + + sx_slock(&allprison_lock); + drflags = PD_LIST_SLOCKED; + error = jaildesc_find(td, uap->fd, &pr, &jdcred); + if (error) + goto fail; + drflags |= PD_DEREF; + error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); + crfree(jdcred); + if (error) + goto fail; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + + /* Do not allow a process to attach to a prison that is not alive. 
*/ + if (!prison_isalive(pr)) { + error = EINVAL; + goto fail; + } + + return (do_jail_attach(td, pr, drflags)); + + fail: + prison_deref(pr, drflags); + return (error); +} + static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { @@ -2703,9 +3006,12 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags) * a process root from one prison, but attached to the jail * of another. */ - prison_hold(pr); + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } refcount_acquire(&pr->pr_uref); - drflags |= PD_DEREF | PD_DEUREF; + drflags |= PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; @@ -2755,6 +3061,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags) prison_proc_relink(oldcred->cr_prison, pr, p); prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); + prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); /* * If the prison was killed while changing credentials, die along @@ -3182,9 +3489,10 @@ prison_deref(struct prison *pr, int flags) refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; + prison_cleanup_locked(pr); mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; - prison_cleanup(pr); + prison_cleanup_unlocked(pr); } } } @@ -3327,8 +3635,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) } if (!(cpr->pr_flags & PR_REMOVE)) continue; - prison_cleanup(cpr); + prison_cleanup_unlocked(cpr); mtx_lock(&cpr->pr_mtx); + prison_cleanup_locked(cpr); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; @@ -3363,8 +3672,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); - prison_cleanup(pr); + prison_cleanup_unlocked(pr); mtx_lock(&pr->pr_mtx); + prison_cleanup_locked(pr); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); @@ -3411,10 +3721,22 @@ prison_lock_xlock(struct prison *pr, int flags) /* * Release a prison's resources when it starts dying (when the last user - * reference is dropped, or when it is killed). + * reference is dropped, or when it is killed). Two functions are called, + * for work that requires a locked prison or an unlocked one. */ static void -prison_cleanup(struct prison *pr) +prison_cleanup_locked(struct prison *pr) +{ + sx_assert(&allprison_lock, SA_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_OWNED); + prison_knote(pr, NOTE_JAIL_REMOVE); + knlist_detach(pr->pr_klist); + jaildesc_prison_cleanup(pr); + pr->pr_klist = NULL; +} + +static void +prison_cleanup_unlocked(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_NOTOWNED); @@ -3970,7 +4292,6 @@ prison_priv_check(struct ucred *cred, int priv) */ case PRIV_KTRACE: -#if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). 
In the future we may @@ -3979,6 +4300,11 @@ prison_priv_check(struct ucred *cred, int priv) */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: + if (cred->cr_prison->pr_allow & PR_ALLOW_SETAUDIT) + return (0); + else + return (EPERM); +#if 0 case PRIV_AUDIT_SUBMIT: #endif @@ -4715,6 +5041,10 @@ SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set system time"); SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may modify routing table"); +#ifdef AUDIT +SYSCTL_JAIL_PARAM(_allow, setaudit, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may set and get audit session state"); +#endif SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, @@ -5039,6 +5369,23 @@ prison_racct_detach(struct prison *pr) } #endif /* RACCT */ +/* + * Submit a knote for a prison, locking if necessary. + */ +static void +prison_knote(struct prison *pr, long hint) +{ + int locked; + + locked = mtx_owned(&pr->pr_mtx); + if (!locked) + mtx_lock(&pr->pr_mtx); + KNOTE_LOCKED(pr->pr_klist, hint); + jaildesc_knote(pr, hint); + if (!locked) + mtx_unlock(&pr->pr_mtx); +} + #ifdef DDB static void diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c new file mode 100644 index 000000000000..3f322b271400 --- /dev/null +++ b/sys/kern/kern_jaildesc.c @@ -0,0 +1,412 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 James Gritton. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/jail.h> +#include <sys/jaildesc.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/poll.h> +#include <sys/priv.h> +#include <sys/stat.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/ucred.h> +#include <sys/user.h> +#include <sys/vnode.h> + +MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors"); + +static fo_poll_t jaildesc_poll; +static fo_kqfilter_t jaildesc_kqfilter; +static fo_stat_t jaildesc_stat; +static fo_close_t jaildesc_close; +static fo_fill_kinfo_t jaildesc_fill_kinfo; +static fo_cmp_t jaildesc_cmp; + +static struct fileops jaildesc_ops = { + .fo_read = invfo_rdwr, + .fo_write = invfo_rdwr, + .fo_truncate = invfo_truncate, + .fo_ioctl = invfo_ioctl, + .fo_poll = jaildesc_poll, + .fo_kqfilter = jaildesc_kqfilter, + .fo_stat = jaildesc_stat, + .fo_close = jaildesc_close, + .fo_chmod = invfo_chmod, + .fo_chown = invfo_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = jaildesc_fill_kinfo, + .fo_cmp = jaildesc_cmp, + .fo_flags = DFLAG_PASSABLE, +}; + +/* + * Given a jail descriptor number, return its prison and/or its + * credential. They are returned held, and will need to be released + * by the caller. + */ +int +jaildesc_find(struct thread *td, int fd, struct prison **prp, + struct ucred **ucredp) +{ + struct file *fp; + struct jaildesc *jd; + struct prison *pr; + int error; + + error = fget(td, fd, &cap_no_rights, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_JAILDESC) { + error = EINVAL; + goto out; + } + jd = fp->f_data; + JAILDESC_LOCK(jd); + pr = jd->jd_prison; + if (pr == NULL || !prison_isvalid(pr)) { + error = ENOENT; + JAILDESC_UNLOCK(jd); + goto out; + } + if (prp != NULL) { + prison_hold(pr); + *prp = pr; + } + JAILDESC_UNLOCK(jd); + if (ucredp != NULL) + *ucredp = crhold(fp->f_cred); + out: + fdrop(fp, td); + return (error); +} + +/* + * Allocate a new jail decriptor, not yet associated with a prison. + * Return the file pointer (with a reference held) and the descriptor + * number. + */ +int +jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning) +{ + struct file *fp; + struct jaildesc *jd; + int error; + + if (owning) { + error = priv_check(td, PRIV_JAIL_REMOVE); + if (error != 0) + return (error); + } + jd = malloc(sizeof(*jd), M_JAILDESC, M_WAITOK | M_ZERO); + error = falloc_caps(td, &fp, fdp, 0, NULL); + if (error != 0) { + free(jd, M_JAILDESC); + return (error); + } + finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 ? + FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops); + JAILDESC_LOCK_INIT(jd); + knlist_init_mtx(&jd->jd_selinfo.si_note, &jd->jd_lock); + if (owning) + jd->jd_flags |= JDF_OWNING; + *fpp = fp; + return (0); +} + +/* + * Assocate a jail descriptor with its prison. + */ +void +jaildesc_set_prison(struct file *fp, struct prison *pr) +{ + struct jaildesc *jd; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + jd = fp->f_data; + JAILDESC_LOCK(jd); + jd->jd_prison = pr; + LIST_INSERT_HEAD(&pr->pr_descs, jd, jd_list); + prison_hold(pr); + JAILDESC_UNLOCK(jd); +} + +/* + * Detach all the jail descriptors from a prison. 
+ */ +void +jaildesc_prison_cleanup(struct prison *pr) +{ + struct jaildesc *jd; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + while ((jd = LIST_FIRST(&pr->pr_descs))) { + JAILDESC_LOCK(jd); + LIST_REMOVE(jd, jd_list); + jd->jd_prison = NULL; + JAILDESC_UNLOCK(jd); + prison_free(pr); + } +} + +/* + * Pass a note to all listening kqueues. + */ +void +jaildesc_knote(struct prison *pr, long hint) +{ + struct jaildesc *jd; + int prison_locked; + + if (!LIST_EMPTY(&pr->pr_descs)) { + prison_locked = mtx_owned(&pr->pr_mtx); + if (!prison_locked) + prison_lock(pr); + LIST_FOREACH(jd, &pr->pr_descs, jd_list) { + JAILDESC_LOCK(jd); + if (hint == NOTE_JAIL_REMOVE) { + jd->jd_flags |= JDF_REMOVED; + if (jd->jd_flags & JDF_SELECTED) { + jd->jd_flags &= ~JDF_SELECTED; + selwakeup(&jd->jd_selinfo); + } + } + KNOTE_LOCKED(&jd->jd_selinfo.si_note, hint); + JAILDESC_UNLOCK(jd); + } + if (!prison_locked) + prison_unlock(pr); + } +} + +static int +jaildesc_close(struct file *fp, struct thread *td) +{ + struct jaildesc *jd; + struct prison *pr; + + jd = fp->f_data; + fp->f_data = NULL; + if (jd != NULL) { + JAILDESC_LOCK(jd); + pr = jd->jd_prison; + if (pr != NULL) { + /* + * Free or remove the associated prison. + * This requires a second check after re- + * ordering locks. This jaildesc can remain + * unlocked once we have a prison reference, + * because that prison is the only place that + * still points back to it. + */ + prison_hold(pr); + JAILDESC_UNLOCK(jd); + if (jd->jd_flags & JDF_OWNING) { + sx_xlock(&allprison_lock); + prison_lock(pr); + if (jd->jd_prison != NULL) { + /* + * Unlink the prison, but don't free + * it; that will be done as part of + * of prison_remove. + */ + LIST_REMOVE(jd, jd_list); + prison_remove(pr); + } else { + prison_unlock(pr); + sx_xunlock(&allprison_lock); + } + } else { + prison_lock(pr); + if (jd->jd_prison != NULL) { + LIST_REMOVE(jd, jd_list); + prison_free(pr); + } + prison_unlock(pr); + } + prison_free(pr); + } + knlist_destroy(&jd->jd_selinfo.si_note); + JAILDESC_LOCK_DESTROY(jd); + free(jd, M_JAILDESC); + } + return (0); +} + +static int +jaildesc_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int revents; + + revents = 0; + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (jd->jd_flags & JDF_REMOVED) + revents |= POLLHUP; + if (revents == 0) { + selrecord(td, &jd->jd_selinfo); + jd->jd_flags |= JDF_SELECTED; + } + JAILDESC_UNLOCK(jd); + return (revents); +} + +static void +jaildesc_kqops_detach(struct knote *kn) +{ + struct jaildesc *jd; + + jd = kn->kn_fp->f_data; + knlist_remove(&jd->jd_selinfo.si_note, kn, 0); +} + +static int +jaildesc_kqops_event(struct knote *kn, long hint) +{ + struct jaildesc *jd; + u_int event; + + jd = kn->kn_fp->f_data; + if (hint == 0) { + /* + * Initial test after registration. Generate a + * NOTE_JAIL_REMOVE in case the prison already died + * before registration. + */ + event = jd->jd_flags & JDF_REMOVED ? NOTE_JAIL_REMOVE : 0; + } else { + /* + * Mask off extra data. In the NOTE_JAIL_CHILD case, + * that's everything except the NOTE_JAIL_CHILD bit + * itself, since a JID is any positive integer. + */ + event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD : + (u_int)hint & NOTE_JAIL_CTRLMASK; + } + + /* If the user is interested in this event, record it. */ + if (kn->kn_sfflags & event) { + kn->kn_fflags |= event; + /* Report the created jail id or attached process id. 
*/ + if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_MULTI; + kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U : + (u_int)hint & ~event; + } + } + + /* Prison is gone, so flag the event as finished. */ + if (event == NOTE_JAIL_REMOVE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; + return (1); + } + + return (kn->kn_fflags != 0); +} + +static const struct filterops jaildesc_kqops = { + .f_isfd = 1, + .f_detach = jaildesc_kqops_detach, + .f_event = jaildesc_kqops_event, +}; + +static int +jaildesc_kqfilter(struct file *fp, struct knote *kn) +{ + struct jaildesc *jd; + + jd = fp->f_data; + switch (kn->kn_filter) { + case EVFILT_JAILDESC: + kn->kn_fop = &jaildesc_kqops; + kn->kn_flags |= EV_CLEAR; + knlist_add(&jd->jd_selinfo.si_note, kn, 0); + return (0); + default: + return (EINVAL); + } +} + +static int +jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) +{ + struct jaildesc *jd; + + bzero(sb, sizeof(struct stat)); + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (jd->jd_prison != NULL) { + sb->st_ino = jd->jd_prison->pr_id; + sb->st_mode = S_IFREG | S_IRWXU; + } else + sb->st_mode = S_IFREG; + JAILDESC_UNLOCK(jd); + return (0); +} + +static int +jaildesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, + struct filedesc *fdp) +{ + struct jaildesc *jd; + + jd = fp->f_data; + kif->kf_type = KF_TYPE_JAILDESC; + kif->kf_un.kf_jail.kf_jid = jd->jd_prison ? jd->jd_prison->pr_id : 0; + return (0); +} + +static int +jaildesc_cmp(struct file *fp1, struct file *fp2, struct thread *td) +{ + struct jaildesc *jd1, *jd2; + int jid1, jid2; + + if (fp2->f_type != DTYPE_JAILDESC) + return (3); + jd1 = fp1->f_data; + JAILDESC_LOCK(jd1); + jid1 = jd1->jd_prison ? (uintptr_t)jd1->jd_prison->pr_id : 0; + JAILDESC_UNLOCK(jd1); + jd2 = fp2->f_data; + JAILDESC_LOCK(jd2); + jid2 = jd2->jd_prison ? (uintptr_t)jd2->jd_prison->pr_id : 0; + JAILDESC_UNLOCK(jd2); + return (kcmp_cmp(jid1, jid2)); +} diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 879220be050b..653ce1ee556b 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -751,11 +751,14 @@ malloc_domainset(size_t size, struct malloc_type *mtp, struct domainset *ds, return (malloc_large(size, mtp, DOMAINSET_RR(), flags DEBUG_REDZONE_ARG)); - vm_domainset_iter_policy_init(&di, ds, &domain, &flags); - do { - va = malloc_domain(&size, &indx, mtp, domain, flags); - } while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0); + indx = -1; + va = NULL; + if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) == 0) + do { + va = malloc_domain(&size, &indx, mtp, domain, flags); + } while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0); malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); + if (__predict_false(va == NULL)) { KASSERT((flags & M_WAITOK) == 0, ("malloc(M_WAITOK) returned NULL")); diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index f952b3fc8805..8b5908f5219a 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -1136,9 +1136,9 @@ __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) * General init routine used by the MTX_SYSINIT() macro. 
*/ void -mtx_sysinit(void *arg) +mtx_sysinit(const void *arg) { - struct mtx_args *margs = arg; + const struct mtx_args *margs = arg; mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 379fbda619c0..6e56664d12ce 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -1112,13 +1112,14 @@ fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) if (cred->cr_flags & CRED_FLAG_CAPMODE) kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE; /* XXX bde doesn't like KI_NGROUPS */ - if (cred->cr_ngroups > KI_NGROUPS) { + if (1 + cred->cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else - kp->ki_ngroups = cred->cr_ngroups; - bcopy(cred->cr_groups, kp->ki_groups, - kp->ki_ngroups * sizeof(gid_t)); + kp->ki_ngroups = 1 + cred->cr_ngroups; + kp->ki_groups[0] = cred->cr_gid; + bcopy(cred->cr_groups, kp->ki_groups + 1, + (kp->ki_ngroups - 1) * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ @@ -2943,8 +2944,11 @@ sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS) cred = crhold(p->p_ucred); PROC_UNLOCK(p); - error = SYSCTL_OUT(req, cred->cr_groups, - cred->cr_ngroups * sizeof(gid_t)); + error = SYSCTL_OUT(req, &cred->cr_gid, sizeof(gid_t)); + if (error == 0) + error = SYSCTL_OUT(req, cred->cr_groups, + cred->cr_ngroups * sizeof(gid_t)); + crfree(cred); return (error); } diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 0ca42d640767..a4c5bcc52529 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -291,11 +291,6 @@ sys_getgid(struct thread *td, struct getgid_args *uap) return (0); } -/* - * Get effective group ID. The "egid" is groups[0], and could be obtained - * via getgroups. This syscall exists because it is somewhat painful to do - * correctly in a library function. - */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; @@ -1803,12 +1798,6 @@ groupmember(gid_t gid, const struct ucred *cred) bool realgroupmember(gid_t gid, const struct ucred *cred) { - /* - * Although the equality test on 'cr_rgid' below doesn't access - * 'cr_groups', we check for the latter's length here as we assume that, - * if 'cr_ngroups' is 0, the passed 'struct ucred' is invalid, and - * 'cr_rgid' may not have been filled. - */ groups_check_positive_len(cred->cr_ngroups); if (gid == cred->cr_rgid) @@ -1896,19 +1885,22 @@ SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, static int cr_canseeothergids(struct ucred *u1, struct ucred *u2) { - if (!see_other_gids) { - if (realgroupmember(u1->cr_rgid, u2)) - return (0); + if (see_other_gids) + return (0); - for (int i = 1; i < u1->cr_ngroups; i++) - if (realgroupmember(u1->cr_groups[i], u2)) - return (0); + /* Restriction in force. 
*/ - if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) != 0) - return (ESRCH); - } + if (realgroupmember(u1->cr_rgid, u2)) + return (0); - return (0); + for (int i = 0; i < u1->cr_ngroups; i++) + if (realgroupmember(u1->cr_groups[i], u2)) + return (0); + + if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) == 0) + return (0); + + return (ESRCH); } /* @@ -2276,6 +2268,7 @@ cr_xids_subset(struct ucred *active_cred, struct ucred *obj_cred) } } grpsubset = grpsubset && + groupmember(obj_cred->cr_gid, active_cred) && groupmember(obj_cred->cr_rgid, active_cred) && groupmember(obj_cred->cr_svgid, active_cred); @@ -2921,8 +2914,8 @@ crextend(struct ucred *cr, int n) * Normalizes a set of groups to be applied to a 'struct ucred'. * * Normalization ensures that the supplementary groups are sorted in ascending - * order and do not contain duplicates. This allows group_is_supplementary - * to do a binary search. + * order and do not contain duplicates. This allows group_is_supplementary() to + * do a binary search. */ static void groups_normalize(int *ngrp, gid_t *groups) @@ -2985,9 +2978,9 @@ crsetgroups_internal(struct ucred *cr, int ngrp, const gid_t *groups) * Copy groups in to a credential after expanding it if required. * * May sleep in order to allocate memory (except if, e.g., crextend() was called - * before with 'ngrp' or greater). Truncates the list to ngroups_max if + * before with 'ngrp' or greater). Truncates the list to 'ngroups_max' if * it is too large. Array 'groups' doesn't need to be sorted. 'ngrp' must be - * strictly positive. + * positive. */ void crsetgroups(struct ucred *cr, int ngrp, const gid_t *groups) @@ -3018,8 +3011,8 @@ crsetgroups(struct ucred *cr, int ngrp, const gid_t *groups) * Same as crsetgroups() but sets the effective GID as well. * * This function ensures that an effective GID is always present in credentials. - * An empty array will only set the effective GID to the default_egid, while a - * non-empty array will peel off groups[0] to set as the effective GID and use + * An empty array will only set the effective GID to 'default_egid', while + * a non-empty array will peel off groups[0] to set as the effective GID and use * the remainder, if any, as supplementary groups. 
*/ void diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c index c1633dd19de2..7206572ffc02 100644 --- a/sys/kern/kern_rmlock.c +++ b/sys/kern/kern_rmlock.c @@ -337,9 +337,9 @@ rm_wowned(const struct rmlock *rm) } void -rm_sysinit(void *arg) +rm_sysinit(const void *arg) { - struct rm_args *args; + const struct rm_args *args; args = arg; rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags); diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c index e182d1fe9baf..84a3a890be63 100644 --- a/sys/kern/kern_rwlock.c +++ b/sys/kern/kern_rwlock.c @@ -266,9 +266,9 @@ _rw_destroy(volatile uintptr_t *c) } void -rw_sysinit(void *arg) +rw_sysinit(const void *arg) { - struct rw_args *args; + const struct rw_args *args; args = arg; rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc, diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index da0efac0598d..8efc0886988b 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -2656,9 +2656,11 @@ ptrace_coredumpreq(struct thread *td, struct proc *p, return; } + memset(&wctx, 0, sizeof(wctx)); wctx.vp = tcq->tc_vp; wctx.fcred = NOCRED; + memset(&cdw, 0, sizeof(wctx)); cdw.ctx = &wctx; cdw.write_fn = core_vn_write; cdw.extend_fn = core_vn_extend; diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c index accea5d288eb..c005e112d3b9 100644 --- a/sys/kern/kern_sx.c +++ b/sys/kern/kern_sx.c @@ -222,9 +222,9 @@ owner_sx(const struct lock_object *lock, struct thread **owner) #endif void -sx_sysinit(void *arg) +sx_sysinit(const void *arg) { - struct sx_args *sargs = arg; + const struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 0e8c2b9f362e..4329959a2ef4 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -347,6 +347,17 @@ kern_thr_exit(struct thread *td) p = td->td_proc; /* + * Clear kernel ASTs in advance of selecting the last exiting + * thread and acquiring schedulers locks. It is fine to + * clear the ASTs here even if we are not going to exit after + * all. On the other hand, leaving them pending could trigger + * execution in subsystems in a context where they are not + * prepared to handle top kernel actions, even in execution of + * an unrelated thread. + */ + ast_kclear(td); + + /* * If all of the threads in a process call this routine to * exit (e.g. 
all threads call pthread_exit()), exactly one * thread should return to the caller to terminate the process diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 50b040132396..3180c66cb42b 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -1694,8 +1694,10 @@ thread_single_end(struct proc *p, int mode) thread_unlock(td); } } - KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, - ("inconsistent boundary count %d", p->p_boundary_count)); + KASSERT(mode != SINGLE_BOUNDARY || P_SHOULDSTOP(p) || + p->p_boundary_count == 0, + ("pid %d proc %p flags %#x inconsistent boundary count %d", + p->p_pid, p, p->p_flag, p->p_boundary_count)); PROC_SUNLOCK(p); wakeup(&p->p_flag); } diff --git a/sys/kern/kern_tslog.c b/sys/kern/kern_tslog.c index fbf81d423b95..09070eea284f 100644 --- a/sys/kern/kern_tslog.c +++ b/sys/kern/kern_tslog.c @@ -220,3 +220,13 @@ SYSCTL_PROC(_debug, OID_AUTO, tslog_user, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_SKIP, 0, 0, sysctl_debug_tslog_user, "", "Dump recorded userland event timestamps"); + +void +sysinit_tslog_shim(const void *data) +{ + const struct sysinit_tslog *x = data; + + tslog(curthread, TS_ENTER, "SYSINIT", x->name); + (x->func)(x->data); + tslog(curthread, TS_EXIT, "SYSINIT", x->name); +} diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c index 464efda1e91a..fee6c1a844e2 100644 --- a/sys/kern/subr_asan.c +++ b/sys/kern/subr_asan.c @@ -835,6 +835,7 @@ ASAN_ATOMIC_FUNC_TESTANDSET(32, uint32_t); ASAN_ATOMIC_FUNC_TESTANDSET(64, uint64_t); ASAN_ATOMIC_FUNC_TESTANDSET(int, u_int); ASAN_ATOMIC_FUNC_TESTANDSET(long, u_long); +ASAN_ATOMIC_FUNC_TESTANDSET(acq_long, u_long); ASAN_ATOMIC_FUNC_TESTANDSET(ptr, uintptr_t); ASAN_ATOMIC_FUNC_SWAP(32, uint32_t); diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index 62a3da964c37..bf5bda7e058d 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -280,6 +280,9 @@ device_sysctl_handler(SYSCTL_HANDLER_ARGS) struct sbuf sb; device_t dev = (device_t)arg1; device_t iommu; +#ifdef IOMMU + device_t requester; +#endif int error; uint16_t rid; const char *c; @@ -314,9 +317,15 @@ device_sysctl_handler(SYSCTL_HANDLER_ARGS) } rid = 0; #ifdef IOMMU - iommu_get_requester(dev, &rid); + error = iommu_get_requester(dev, &requester, &rid); + /* + * Do not return requester error from sysctl, iommu + * unit might be assigned by other means. + */ +#else + error = ENXIO; #endif - if (rid != 0) + if (error == 0) sbuf_printf(&sb, "%srid=%#x", c, rid); break; default: diff --git a/sys/kern/subr_msan.c b/sys/kern/subr_msan.c index a3238b61482b..883dbd2b7604 100644 --- a/sys/kern/subr_msan.c +++ b/sys/kern/subr_msan.c @@ -1301,6 +1301,7 @@ MSAN_ATOMIC_FUNC_TESTANDSET(32, uint32_t); MSAN_ATOMIC_FUNC_TESTANDSET(64, uint64_t); MSAN_ATOMIC_FUNC_TESTANDSET(int, u_int); MSAN_ATOMIC_FUNC_TESTANDSET(long, u_long); +MSAN_ATOMIC_FUNC_TESTANDSET(acq_long, u_long); MSAN_ATOMIC_FUNC_TESTANDSET(ptr, uintptr_t); MSAN_ATOMIC_FUNC_SWAP(32, uint32_t); diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 471640c290a7..a67e5fa6cbff 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -235,14 +235,11 @@ init_param1(void) * specification for <limits.h>, paragraph "Runtime Increasable * Values"). * - * On the other hand, INT_MAX would result in an overflow for the common - * 'ngroups_max + 1' computation (to obtain the size of the internal - * groups array, its first element being reserved for the effective - * GID). 
Also, the number of allocated bytes for the group array must - * not overflow on 32-bit machines. For all these reasons, we limit the - * number of supplementary groups to some very high number that we - * expect will never be reached in all practical uses and ensures we - * avoid the problems just exposed, even if 'gid_t' was to be enlarged + * On the other hand, a too high value would result in an overflow when + * computing the number of bytes to allocate for the groups array. We + * thus limit the number of supplementary groups to some very high + * number that we expect will never be reached in all practical uses, + * avoiding the problem just exposed even if 'gid_t' were to be enlarged * by a magnitude. */ ngroups_max = NGROUPS_MAX; diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c index db0e7bf5b0e3..eb5bd03f5018 100644 --- a/sys/kern/subr_power.c +++ b/sys/kern/subr_power.c @@ -3,6 +3,10 @@ * * Copyright (c) 2001 Mitsuru IWASAKI * All rights reserved. + * Copyright (c) 2025 The FreeBSD Foundation + * + * Portions of this software were developed by Aymeric Wibo + * <obiwac@freebsd.org> under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,20 +34,83 @@ #include <sys/eventhandler.h> #include <sys/power.h> #include <sys/proc.h> +#include <sys/sbuf.h> +#include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> +enum power_stype power_standby_stype = POWER_STYPE_STANDBY; +enum power_stype power_suspend_stype = POWER_STYPE_SUSPEND_TO_IDLE; +enum power_stype power_hibernate_stype = POWER_STYPE_HIBERNATE; + static u_int power_pm_type = POWER_PM_TYPE_NONE; static power_pm_fn_t power_pm_fn = NULL; static void *power_pm_arg = NULL; static struct task power_pm_task; +enum power_stype +power_name_to_stype(const char *name) +{ + enum power_stype stype; + + for (stype = 0; stype < POWER_STYPE_COUNT; stype++) { + if (strcasecmp(name, power_stype_names[stype]) == 0) + return (stype); + } + return (POWER_STYPE_UNKNOWN); +} + +const char * +power_stype_to_name(enum power_stype stype) +{ + if (stype == POWER_STYPE_UNKNOWN) + return ("NONE"); + if (stype < POWER_STYPE_AWAKE || stype >= POWER_STYPE_COUNT) + return (NULL); + return (power_stype_names[stype]); +} + +static int +power_sysctl_stype(SYSCTL_HANDLER_ARGS) +{ + char name[10]; + int err; + enum power_stype new_stype, old_stype; + + old_stype = *(enum power_stype *)oidp->oid_arg1; + strlcpy(name, power_stype_to_name(old_stype), sizeof(name)); + err = sysctl_handle_string(oidp, name, sizeof(name), req); + if (err != 0 || req->newptr == NULL) + return (err); + + new_stype = power_name_to_stype(name); + if (new_stype == POWER_STYPE_UNKNOWN) + return (EINVAL); + /* TODO Check to see if the new stype is supported. 
*/ + if (new_stype != old_stype) + *(enum power_stype *)oidp->oid_arg1 = new_stype; + return (0); +} + +static SYSCTL_NODE(_kern, OID_AUTO, power, CTLFLAG_RW, 0, + "Generic power management related sysctls"); + +SYSCTL_PROC(_kern_power, OID_AUTO, standby, CTLTYPE_STRING | CTLFLAG_RW, + &power_standby_stype, 0, power_sysctl_stype, "A", + "Sleep type to enter on standby"); +SYSCTL_PROC(_kern_power, OID_AUTO, suspend, CTLTYPE_STRING | CTLFLAG_RW, + &power_suspend_stype, 0, power_sysctl_stype, "A", + "Sleep type to enter on suspend"); +SYSCTL_PROC(_kern_power, OID_AUTO, hibernate, CTLTYPE_STRING | CTLFLAG_RW, + &power_hibernate_stype, 0, power_sysctl_stype, "A", + "Sleep type to enter on hibernate"); + static void power_pm_deferred_fn(void *arg, int pending) { - int state = (intptr_t)arg; + enum power_stype stype = (intptr_t)arg; - power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state); + power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, stype); } int @@ -75,14 +142,27 @@ power_pm_get_type(void) void power_pm_suspend(int state) { + enum power_stype stype; + if (power_pm_fn == NULL) return; - if (state != POWER_SLEEP_STATE_STANDBY && - state != POWER_SLEEP_STATE_SUSPEND && - state != POWER_SLEEP_STATE_HIBERNATE) + switch (state) { + case POWER_SLEEP_STATE_STANDBY: + stype = power_standby_stype; + break; + case POWER_SLEEP_STATE_SUSPEND: + stype = power_suspend_stype; + break; + case POWER_SLEEP_STATE_HIBERNATE: + stype = power_hibernate_stype; + break; + default: + printf("%s: unknown sleep state %d\n", __func__, state); return; - power_pm_task.ta_context = (void *)(intptr_t)state; + } + + power_pm_task.ta_context = (void *)(intptr_t)stype; taskqueue_enqueue(taskqueue_thread, &power_pm_task); } diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index ab47b6ad29a3..c937f6a82757 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -57,7 +57,7 @@ * b : public affirmation by word or example of usually * religious faith or conviction <the heroic witness to divine * life -- Pilot> - * 6 capitalized : a member of the Jehovah's Witnesses + * 6 capitalized : a member of the Jehovah's Witnesses */ /* @@ -131,7 +131,7 @@ #define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. */ #ifndef WITNESS_COUNT -#define WITNESS_COUNT 1536 +#define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) @@ -158,20 +158,18 @@ * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ -#define WITNESS_UNRELATED 0x00 /* No lock order relation. */ -#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ -#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ -#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ -#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ -#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) -#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) -#define WITNESS_RELATED_MASK \ - (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) -#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been - * observed. */ -#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ -#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ -#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ +#define WITNESS_UNRELATED 0x00 /* No lock order relation. */ +#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. 
*/ +#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ +#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ +#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ +#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) +#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) +#define WITNESS_RELATED_MASK (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) +#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been observed. */ +#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ +#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ +#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) @@ -218,20 +216,18 @@ struct lock_list_entry { * (for example, "vnode interlock"). */ struct witness { - char w_name[MAX_W_NAME]; - uint32_t w_index; /* Index in the relationship matrix */ + char w_name[MAX_W_NAME]; + uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; - STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ - STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ - struct witness *w_hash_next; /* Linked list in hash buckets. */ - const char *w_file; /* File where last acquired */ - uint32_t w_line; /* Line where last acquired */ - uint32_t w_refcount; - uint16_t w_num_ancestors; /* direct/indirect - * ancestor count */ - uint16_t w_num_descendants; /* direct/indirect - * descendant count */ - int16_t w_ddb_level; + STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ + STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ + struct witness *w_hash_next; /* Linked list in hash buckets. */ + const char *w_file; /* File where last acquired */ + uint32_t w_line; /* Line where last acquired */ + uint32_t w_refcount; + uint16_t w_num_ancestors; /* direct/indirect ancestor count */ + uint16_t w_num_descendants; /* direct/indirect descendant count */ + int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; @@ -265,7 +261,7 @@ struct witness_lock_order_data { /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects - * (struct witness_lock_order_data). + * (struct witness_lock_order_data). 
*/ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; @@ -295,7 +291,6 @@ struct witness_order_list_entry { static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { - return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } @@ -304,7 +299,6 @@ static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { - return (a->from == b->from && a->to == b->to); } @@ -415,7 +409,7 @@ SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, int badstack_sbuf_size; int witness_count = WITNESS_COUNT; -SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, +SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* @@ -760,7 +754,6 @@ static int witness_spin_warn = 0; static const char * fixup_filename(const char *file) { - if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) @@ -835,7 +828,7 @@ witness_startup(void *mem) w_free_cnt--; for (i = 0; i < witness_count; i++) { - memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * + memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } @@ -989,16 +982,16 @@ witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), { int i; - for (i = 0; i < indent; i++) - prnt(" "); + for (i = 0; i < indent; i++) + prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); - if (w->w_displayed) { - prnt(" -- (already displayed)\n"); - return; - } - w->w_displayed = 1; + if (w->w_displayed) { + prnt(" -- (already displayed)\n"); + return; + } + w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); @@ -1079,7 +1072,6 @@ witness_ddb_display(int(*prnt)(const char *fmt, ...)) int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { - if (witness_watch == -1 || KERNEL_PANICKED()) return (0); @@ -1257,7 +1249,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( - "acquiring duplicate lock of same type: \"%s\"\n", + "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); @@ -1523,6 +1515,10 @@ witness_lock(struct lock_object *lock, int flags, const char *file, int line) else lock_list = PCPU_PTR(spinlocks); + /* Update per-witness last file and line acquire. */ + w->w_file = file; + w->w_line = line; + /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { @@ -1530,15 +1526,9 @@ witness_lock(struct lock_object *lock, int flags, const char *file, int line) CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); - instance->li_file = file; - instance->li_line = line; return; } - /* Update per-witness last file and line acquire. */ - w->w_file = file; - w->w_line = line; - /* Find the next open lock instance in the list and fill it. 
*/ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { @@ -1743,7 +1733,7 @@ found: /* * In order to reduce contention on w_mtx, we want to keep always an - * head object into lists so that frequent allocation from the + * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have @@ -1781,7 +1771,7 @@ witness_thread_exit(struct thread *td) n++; witness_list_lock(&lle->ll_children[i], witness_output); - + } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); @@ -1948,7 +1938,6 @@ found: static void depart(struct witness *w) { - MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; @@ -1999,18 +1988,18 @@ adopt(struct witness *parent, struct witness *child) child->w_num_ancestors++; } - /* - * Find each ancestor of 'pi'. Note that 'pi' itself is counted as + /* + * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { - if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && + if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { - /* + /* * Skip children that are already marked as * descendants of 'i'. */ @@ -2021,7 +2010,7 @@ adopt(struct witness *parent, struct witness *child) * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ - if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && + if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; @@ -2029,16 +2018,16 @@ adopt(struct witness *parent, struct witness *child) w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; - /* + /* * Make sure we aren't marking a node as both an - * ancestor and descendant. We should have caught + * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", - i, j, w_rmatrix[i][j]); + i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; @@ -2047,7 +2036,7 @@ adopt(struct witness *parent, struct witness *child) (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! 
[%d][%d]=%d " "both ancestor and descendant\n", - j, i, w_rmatrix[j][i]); + j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; @@ -2124,7 +2113,6 @@ _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) static int isitmychild(struct witness *parent, struct witness *child) { - return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } @@ -2134,7 +2122,6 @@ isitmychild(struct witness *parent, struct witness *child) static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { - return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } @@ -2182,7 +2169,7 @@ witness_get(void) STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; - MPASS(index > 0 && index == w_max_used_index+1 && + MPASS(index > 0 && index == w_max_used_index + 1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; @@ -2194,7 +2181,6 @@ witness_get(void) static void witness_free(struct witness *w) { - STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } @@ -2219,11 +2205,10 @@ witness_lock_list_get(void) bzero(lle, sizeof(*lle)); return (lle); } - + static void witness_lock_list_free(struct lock_list_entry *lle) { - mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; @@ -2297,7 +2282,6 @@ witness_voutput(const char *fmt, va_list ap) static int witness_thread_has_locks(struct thread *td) { - if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); @@ -2573,14 +2557,12 @@ witness_setflag(struct lock_object *lock, int flag, int set) void witness_norelease(struct lock_object *lock) { - witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { - witness_setflag(lock, LI_NORELEASE, 0); } @@ -2588,7 +2570,6 @@ witness_releaseok(struct lock_object *lock) static void witness_ddb_list(struct thread *td) { - KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); @@ -2653,7 +2634,6 @@ DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE); DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE) { - witness_ddb_display(db_printf); } #endif @@ -2673,9 +2653,9 @@ sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) /* Allocate and init temporary storage space. 
*/ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); - tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, + tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); - tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, + tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); @@ -2750,12 +2730,12 @@ restart: sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", - tmp_w1->w_name, tmp_w1->w_class->lc_name, + tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", - tmp_w1->w_name, tmp_w1->w_class->lc_name, + tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_putc(sb, '\n'); @@ -2763,7 +2743,7 @@ restart: if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", - tmp_w2->w_name, tmp_w2->w_class->lc_name, + tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_putc(sb, '\n'); @@ -2823,7 +2803,6 @@ sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { - return (db_printf("%.*s", len, data)); } @@ -3068,7 +3047,7 @@ witness_lock_order_get(struct witness *parent, struct witness *child) & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; - hash = witness_hash_djb2((const char*)&key, + hash = witness_hash_djb2((const char *)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { @@ -3089,7 +3068,6 @@ out: static int witness_lock_order_check(struct witness *parent, struct witness *child) { - if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && @@ -3115,7 +3093,7 @@ witness_lock_order_add(struct witness *parent, struct witness *child) & WITNESS_LOCK_ORDER_KNOWN) return (1); - hash = witness_hash_djb2((const char*)&key, + hash = witness_hash_djb2((const char *)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; @@ -3134,7 +3112,6 @@ witness_lock_order_add(struct witness *parent, struct witness *child) static void witness_increment_graph_generation(void) { - if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; @@ -3143,7 +3120,6 @@ witness_increment_graph_generation(void) static int witness_output_drain(void *arg __unused, const char *data, int len) { - witness_output("%.*s", len, data); return (len); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index ed651da96b14..30527fdd4fd0 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -234,6 +234,7 @@ static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, bool backing); +static void pipe_destroy(struct pipe *pipe); static int pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, bool catch); static __inline void pipeunlock(struct pipe *cpipe); @@ -399,16 +400,7 @@ pipe_paircreate(struct thread *td, struct pipepair **p_pp) goto 
fail; error = pipe_create(wpipe, false); if (error != 0) { - /* - * This cleanup leaves the pipe inode number for rpipe - * still allocated, but never used. We do not free - * inode numbers for opened pipes, which is required - * for correctness because numbers must be unique. - * But also it avoids any memory use by the unr - * allocator, so stashing away the transient inode - * number is reasonable. - */ - pipe_free_kmem(rpipe); + pipe_destroy(rpipe); goto fail; } @@ -743,6 +735,16 @@ pipe_create(struct pipe *pipe, bool large_backing) return (error); } +static void +pipe_destroy(struct pipe *pipe) +{ + pipe_free_kmem(pipe); + /* + * Note: we "leak" pipe_ino -- by design the alloc_unr64 mechanism does + * not undo allocations. + */ +} + /* ARGSUSED */ static int pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c index 11bd1b6f30e1..acaf1241cb2e 100644 --- a/sys/kern/sys_procdesc.c +++ b/sys/kern/sys_procdesc.c @@ -129,7 +129,7 @@ procdesc_find(struct thread *td, int fd, const cap_rights_t *rightsp, if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { - error = EBADF; + error = EINVAL; goto out; } pd = fp->f_data; diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 4122f9261871..4cef89cd5219 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -602,4 +602,6 @@ const char *syscallnames[] = { "inotify_rm_watch", /* 594 = inotify_rm_watch */ "getgroups", /* 595 = getgroups */ "setgroups", /* 596 = setgroups */ + "jail_attach_jd", /* 597 = jail_attach_jd */ + "jail_remove_jd", /* 598 = jail_remove_jd */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index fa64597d14a5..967af1f5313c 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -552,13 +552,13 @@ _Out_writes_bytes_(len/PAGE_SIZE) char *vec ); } -79 AUE_GETGROUPS STD|CAPENABLED|COMPAT14 { +79 AUE_GETGROUPS COMPAT14|CAPENABLED { int getgroups( int gidsetsize, _Out_writes_opt_(gidsetsize) gid_t *gidset ); } -80 AUE_SETGROUPS STD|COMPAT14 { +80 AUE_SETGROUPS COMPAT14 { int setgroups( int gidsetsize, _In_reads_(gidsetsize) const gid_t *gidset @@ -3383,5 +3383,15 @@ _In_reads_(gidsetsize) const gid_t *gidset ); } +597 AUE_JAIL_ATTACH STD { + int jail_attach_jd( + int fd + ); + } +598 AUE_JAIL_REMOVE STD { + int jail_remove_jd( + int fd + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 2b1ea9eed8d4..e28fef931ea8 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3500,6 +3500,20 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 2; break; } + /* jail_attach_jd */ + case 597: { + struct jail_attach_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } + /* jail_remove_jd */ + case 598: { + struct jail_remove_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } default: *n_args = 0; break; @@ -9367,6 +9381,26 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* jail_attach_jd */ + case 597: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; + /* jail_remove_jd */ + case 598: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11365,6 +11399,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* jail_attach_jd */ + case 597: + if (ndx == 0 || ndx 
== 1) + p = "int"; + break; + /* jail_remove_jd */ + case 598: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 0056dac65c7d..340d84666459 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -154,15 +154,12 @@ static struct task unp_defer_task; * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ -#ifndef PIPSIZ -#define PIPSIZ 8192 -#endif -static u_long unpst_sendspace = PIPSIZ; -static u_long unpst_recvspace = PIPSIZ; +static u_long unpst_sendspace = 64*1024; +static u_long unpst_recvspace = 64*1024; static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */ static u_long unpdg_recvspace = 16*1024; -static u_long unpsp_sendspace = PIPSIZ; -static u_long unpsp_recvspace = PIPSIZ; +static u_long unpsp_sendspace = 64*1024; +static u_long unpsp_recvspace = 64*1024; static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Local domain"); @@ -1810,9 +1807,7 @@ uipc_filt_sowrite(struct knote *kn, long hint) kn->kn_data = uipc_stream_sbspace(&so2->so_rcv); if (so2->so_rcv.sb_state & SBS_CANTRCVMORE) { - /* - * XXXGL: maybe kn->kn_flags |= EV_EOF ? - */ + kn->kn_flags |= EV_EOF; return (1); } else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); @@ -3672,11 +3667,14 @@ unp_internalize(struct mbuf *control, struct mchain *mc, struct thread *td) cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; - cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, + _Static_assert(CMGROUP_MAX >= 1, + "Room needed for the effective GID."); + cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups + 1, CMGROUP_MAX); - for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[0] = td->td_ucred->cr_gid; + for (i = 1; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = - td->td_ucred->cr_groups[i]; + td->td_ucred->cr_groups[i - 1]; break; case SCM_RIGHTS: diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index fa655c43d155..19c39e42bafa 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -5170,7 +5170,7 @@ bufstrategy(struct bufobj *bo, struct buf *bp) vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); - KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, + KASSERT(!VN_ISDEV(vp), ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 89c1d779f04c..13abb9171234 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -86,7 +86,7 @@ * * This fundamental choice needs to be revisited. In the meantime, the current * state is described below. Significance of all notable routines is explained - * in comments placed above their implementation. Scattered thoroughout the + * in comments placed above their implementation. Scattered throughout the * file are TODO comments indicating shortcomings which can be fixed without * reworking everything (most of the fixes will likely be reusable). Various * details are omitted from this explanation to not clutter the overview, they @@ -109,18 +109,19 @@ * The (directory vnode; name) tuple reliably determines the target entry if * it exists. 
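To make the (directory vnode; name) keying concrete, below is a minimal, self-contained sketch of folding such a tuple into a bucket index, each bucket being guarded by one entry of a lock array as the comment goes on to describe. The constant NC_NBUCKETS and the nc_bucket() helper are illustrative only and are not the kernel's actual hashing; the real code lives in vfs_cache.c itself.

#include <stddef.h>
#include <stdint.h>

#define NC_NBUCKETS	1024	/* illustrative only; must be a power of two */

/*
 * Illustrative bucket selection for a (directory vnode, name) tuple.
 * It only demonstrates why one lock per bucket (a "lock array") is
 * enough to serialize the entries that collide in that bucket.
 */
static inline uint32_t
nc_bucket(const void *dvp, const char *name, size_t len)
{
	uint32_t h = 2166136261u;		/* FNV-1a offset basis */
	size_t i;

	for (i = 0; i < len; i++) {
		h ^= (unsigned char)name[i];
		h *= 16777619u;			/* FNV-1a prime */
	}
	/* Mix in the directory identity so the same name under different
	 * directories usually lands in different buckets. */
	h ^= (uint32_t)(uintptr_t)dvp;
	return (h & (NC_NBUCKETS - 1));
}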
* - * Since there are no small locks at this time (all are 32 bytes in size on - * LP64), the code works around the problem by introducing lock arrays to - * protect hash buckets and vnode lists. + * Since there were no small locks at the time of writing this comment (all are + * 32 bytes in size on LP64), the code works around the problem by introducing + * lock arrays to protect hash buckets and vnode lists. * * II. Filesystem integration * * Filesystems participating in name caching do the following: * - set vop_lookup routine to vfs_cache_lookup - * - set vop_cachedlookup to whatever can perform the lookup if the above fails - * - if they support lockless lookup (see below), vop_fplookup_vexec and - * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the - * mount point + * - set vop_cachedlookup to a routine which can perform the lookup if the + * above fails + * - if they support lockless lookup (see below), they set vop_fplookup_vexec + * and vop_fplookup_symlink along with the MNTK_FPLOOKUP flag on the mount + * point * - call cache_purge or cache_vop_* routines to eliminate stale entries as * applicable * - call cache_enter to add entries depending on the MAKEENTRY flag @@ -134,11 +135,15 @@ * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter * + * You may notice a degree of CPU waste in this callchain. + * * III. Performance considerations * * For lockless case forward lookup avoids any writes to shared areas apart * from the terminal path component. In other words non-modifying lookups of - * different files don't suffer any scalability problems in the namecache. + * different files don't suffer any scalability problems in the namecache + * itself. + * * Looking up the same file is limited by VFS and goes beyond the scope of this * file. * @@ -158,8 +163,10 @@ * * IV. Observability * - * Note not everything has an explicit dtrace probe nor it should have, thus - * some of the one-liners below depend on implementation details. + * Several statistics are collected in the vfs.cache sysctl tree. + * + * Some of the state can be checked for with explicit dtrace probes, must of it + * depends on implementation details. * * Examples: * @@ -167,7 +174,7 @@ * # line number, column 2 is status code (see cache_fpl_status) * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' * - * # Lengths of names added by binary name + * # Histogram of lengths of names added, aggregated by which programs are doing it * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' * * # Same as above but only those which exceed 64 characters @@ -195,6 +202,11 @@ * - vnodes are subject to being recycled even if target inode is left in memory, * which loses the name cache entries when it perhaps should not. in case of tmpfs * names get duplicated -- kept by filesystem itself and namecache separately + * - vnode reclamation (see vnlru in kern/vfs_subr.c) defaults to skipping + * directories for this very reason, which arguably further reducing quality + * of vnode LRU. Per the above this is done to avoid breaking vnode -> path + * resolution (it becomes expensive for directories and impossible for the rest) + * This would not be a factor if namecache entries could persist without vnodes. * - struct namecache has a fixed size and comes in 2 variants, often wasting * space. 
now hard to replace with malloc due to dependence on SMR, which * requires UMA zones to opt in @@ -207,7 +219,8 @@ * performance left on the table, most notably from single-threaded standpoint. * Below is a woefully incomplete list of changes which can help. Ideas are * mostly sketched out, no claim is made all kinks or prerequisites are laid - * out. + * out. The name of the game is eliding branches altogether and hopefully some + * of memory accesses. * * Note there is performance lost all over VFS. * @@ -223,13 +236,6 @@ * the vnode to hang around for the long haul, but would work for aforementioned * stat(2) but also access(2), readlink(2), realpathat(2) and probably more. * - * === hotpatching for sdt probes - * - * They result in *tons* of branches all over with rather regrettable codegen - * at times. Removing sdt probes altogether gives over 2% boost in lookup rate. - * Reworking the code to patch itself at runtime with asm goto would solve it. - * asm goto is fully supported by gcc and clang. - * * === copyinstr * * On all architectures it operates one byte at a time, while it could be @@ -251,10 +257,12 @@ * things worked out locklessly. Instead the lockless lookup could be the * actual entry point which calls what is currently namei as a fallback. * + * It could be hotpatched if lockless lookup is disabled. + * * === avoidable branches in cache_can_fplookup * * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if - * this is off, none of fplookup code should execute). + * this is off, none of fplookup code should execute, see above). * * Both audit and capsicum branches can be combined into one, but it requires * paying off a lot of tech debt first. @@ -277,8 +285,18 @@ * * === inactive on v_usecount reaching 0 * - * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such - * processing with a bit in usecount. + * VOP_NEED_INACTIVE should not exist. Filesystems can indicate need for such + * processing with a bit in usecount and adding a hold count. Then vput fast path + * would become as simple as (ACHTUNG: locking ignored): + * + * ref = atomic_fetchadd_int(&vp->v_count, -1) - 1; + * if ((ref & MAGIC_BIT) == 0) // common case + * return; + * if (ref != 0) // the bit is set but this was not the last user + * return; + * // do inactive here + * + * Also see below. * * === v_holdcnt * @@ -287,7 +305,8 @@ * vnlru et al would consider the vnode not-freeable if has either hold or * usecount on it. * - * This would eliminate 2 atomics. + * This would eliminate 2 atomics in the common case of securing a vnode and + * undoing it. */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, @@ -4632,7 +4651,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, } /* - * The target vnode is not supported, prepare for the slow path to take over. + * Prepare fallback to the locked lookup while trying to retain the progress. */ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) @@ -6289,53 +6308,90 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria - * outlined below. - * - * Traditional vnode lookup conceptually looks like this: + * outlined at the end. 
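As a rough illustration of what that opt-in involves for a filesystem, the sketch below shows the two pieces: setting MNTK_FPLOOKUP on the mount and providing an SMR-safe VOP_FPLOOKUP_VEXEC implementation. The filesystem "myfs", its node structure, and its fields are hypothetical; MNTK_FPLOOKUP, vaccess_vexec_smr() and the "return EAGAIN to fall back to the locked lookup" convention are the existing kernel interfaces as this sketch understands them.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/ucred.h>
#include <sys/vnode.h>
#include <machine/atomic.h>

/* Hypothetical per-vnode data for an imaginary filesystem "myfs". */
struct myfs_node {
	uid_t	mn_uid;
	gid_t	mn_gid;
	mode_t	mn_mode;
};

/*
 * Lockless execute-permission check: read the node under SMR protection
 * and return EAGAIN to force the locked fallback if it is going away.
 */
static int
myfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
{
	struct myfs_node *np;

	np = atomic_load_ptr(&v->a_vp->v_data);
	if (__predict_false(np == NULL))
		return (EAGAIN);
	return (vaccess_vexec_smr(np->mn_mode, np->mn_uid, np->mn_gid,
	    v->a_cred));
}

/*
 * Mount-time opt-in, done once the mount is otherwise set up;
 * vop_fplookup_vexec and vop_fplookup_symlink must also be set in the
 * filesystem's vop vector.
 */
static void
myfs_enable_fplookup(struct mount *mp)
{
	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
	MNT_IUNLOCK(mp);
}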
* - * vn_lock(current); - * for (;;) { - * next = find(); - * vn_lock(next); - * vn_unlock(current); - * current = next; - * if (last) - * break; - * } - * return (current); + * Traversing from one vnode to another requires atomicity with regard to + * permissions, mount points and of course their relative placement (if you are + * looking up "bar" in "foo" and you found it, it better be in that directory + * at the time). * - * Each jump to the next vnode is safe memory-wise and atomic with respect to - * any modifications thanks to holding respective locks. + * Normally this is accomplished with locking, but it comes with a significant + * performance hit and is untenable as a fast path even in a moderate core + * count environment (at the time of writing this comment this would be a + * little south of 100). * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as - * we made the jump. This includes things like permissions, mount points etc. - * Counter modification is provided by enclosing relevant places in - * vn_seqc_write_begin()/end() calls. + * we made the jump. * - * Thus this translates to: + * See places which issue vn_seqc_write_begin()/vn_seqc_write_end() for + * operations affected. + * + * Suppose the variable "cnp" contains lookup metadata (the path etc.), then + * locked lookup conceptually looks like this: + * + * // lock the current directory + * vn_lock(dvp); + * for (;;) { + * // permission check + * if (!canlookup(dvp, cnp)) + * abort(); + * // look for the target name inside dvp + * tvp = findnext(dvp, cnp); + * vn_lock(tvp); + * // tvp is still guaranteed to be inside of dvp because of the lock on dvp + * vn_unlock(dvp); + * // dvp is unlocked. its state is now arbitrary, but that's fine as we + * // made the jump while everything relevant was correct, continue with tvp + * // as the directory to look up names in + * tvp = dvp; + * if (last) + * break; + * // if not last loop back and continue until done + * } + * vget(tvp); + * return (tvp); + * + * Lockless lookup replaces locking with sequence counter checks: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); - * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode + * // fail if someone is altering the directory vnode + * if (seqc_in_modify(dvp_seqc)) * abort(); * for (;;) { - * tvp = find(); + * // permission check. note it can race, but we will validate the outcome + * // with a seqc + * if (!canlookup_smr(dvp, cnp)) { + * // has dvp changed from under us? 
if so, the denial may be invalid + * if (!seqc_consistent(dvp, dvp_seqc) + * fallback_to_locked(); + * // nothing changed, lookup denial is valid + * fail(); + * } + * // look for the target name inside dvp + * tvp = findnext(dvp, cnp); * tvp_seqc = seqc_read_any(tvp); - * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode - * abort(); - * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode - * abort(); - * dvp = tvp; // we know nothing of importance has changed - * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration + * // bail if someone is altering the target vnode + * if (seqc_in_modify(tvp_seqc)) + * fallback_to_locked(); + * // bail if someone is altering the directory vnode + * if (!seqc_consistent(dvp, dvp_seqc) + * fallback_to_locked(); + * // we confirmed neither dvp nor tvp changed while we were making the + * // jump to the next component, thus the result is the same as if we + * // held the lock on dvp and tvp the entire time, continue with tvp + * // as the directory to look up names in + * dvp = tvp; + * dvp_seqc = tvp_seqc; * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check - * abort(); + * fallback_to_locked(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c index cd30d5cfae47..ceda770cb714 100644 --- a/sys/kern/vfs_init.c +++ b/sys/kern/vfs_init.c @@ -103,6 +103,16 @@ struct vattr va_null; * Routines having to do with the management of the vnode table. */ +void +vfs_unref_vfsconf(struct vfsconf *vfsp) +{ + vfsconf_lock(); + KASSERT(vfsp->vfc_refcount > 0, + ("vfs %p refcount underflow %d", vfsp, vfsp->vfc_refcount)); + vfsp->vfc_refcount--; + vfsconf_unlock(); +} + static struct vfsconf * vfs_byname_locked(const char *name) { @@ -123,9 +133,11 @@ vfs_byname(const char *name) { struct vfsconf *vfsp; - vfsconf_slock(); + vfsconf_lock(); vfsp = vfs_byname_locked(name); - vfsconf_sunlock(); + if (vfsp != NULL) + vfsp->vfc_refcount++; + vfsconf_unlock(); return (vfsp); } @@ -387,7 +399,7 @@ vfs_register(struct vfsconf *vfc) static int once; struct vfsconf *tvfc; uint32_t hashval; - int secondpass; + int error, prevmaxconf, secondpass; if (!once) { vattr_null(&va_null); @@ -405,6 +417,7 @@ vfs_register(struct vfsconf *vfc) return (EEXIST); } + prevmaxconf = maxvfsconf; if (vfs_typenumhash != 0) { /* * Calculate a hash on vfc_name to use for vfc_typenum. Unless @@ -497,16 +510,24 @@ vfs_register(struct vfsconf *vfc) vfc->vfc_vfsops = &vfsops_sigdefer; } - if (vfc->vfc_flags & VFCF_JAIL) - prison_add_vfs(vfc); - /* * Call init function for this VFS... 
*/ if ((vfc->vfc_flags & VFCF_SBDRY) != 0) - vfc->vfc_vfsops_sd->vfs_init(vfc); + error = vfc->vfc_vfsops_sd->vfs_init(vfc); else - vfc->vfc_vfsops->vfs_init(vfc); + error = vfc->vfc_vfsops->vfs_init(vfc); + + if (error != 0) { + maxvfsconf = prevmaxconf; + TAILQ_REMOVE(&vfsconf, vfc, vfc_list); + vfsconf_unlock(); + return (error); + } + + if ((vfc->vfc_flags & VFCF_JAIL) != 0) + prison_add_vfs(vfc); + vfsconf_unlock(); /* diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 8e64a7fe966b..13403acacc08 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -683,7 +683,6 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, MPASSERT(mp->mnt_vfs_ops == 1, mp, ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops)); (void) vfs_busy(mp, MBF_NOWAIT); - atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; @@ -731,7 +730,6 @@ vfs_mount_destroy(struct mount *mp) __FILE__, __LINE__)); MPPASS(mp->mnt_writeopcount == 0, mp); MPPASS(mp->mnt_secondary_writes == 0, mp); - atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; @@ -769,6 +767,9 @@ vfs_mount_destroy(struct mount *mp) vfs_free_addrlist(mp->mnt_export); free(mp->mnt_export, M_MOUNT); } + vfsconf_lock(); + mp->mnt_vfc->vfc_refcount--; + vfsconf_unlock(); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } @@ -1133,6 +1134,7 @@ vfs_domount_first( if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred, vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) { vput(vp); + vfs_unref_vfsconf(vfsp); return (EPERM); } @@ -1169,6 +1171,7 @@ vfs_domount_first( } if (error != 0) { vput(vp); + vfs_unref_vfsconf(vfsp); return (error); } vn_seqc_write_begin(vp); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a6e38be89291..fe299ecc9c56 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2186,6 +2186,8 @@ freevnode(struct vnode *vp) { struct bufobj *bo; + ASSERT_VOP_UNLOCKED(vp, __func__); + /* * The vnode has been marked for destruction, so free it. * @@ -2222,12 +2224,16 @@ freevnode(struct vnode *vp) mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { + int error __diagused; + /* * Use LK_NOWAIT to shut up witness about the lock. We may get * here while having another vnode locked when trying to * satisfy a lookup and needing to recycle. */ - VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); + error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); + VNASSERT(error == 0, vp, + ("freevnode: cannot lock vp %p for pollinfo destroy", vp)); destroy_vpollinfo(vp->v_pollinfo); VOP_UNLOCK(vp); vp->v_pollinfo = NULL; @@ -3561,11 +3567,6 @@ enum vput_op { VRELE, VPUT, VUNREF }; * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. - * - * XXX Some filesystems pass in an exclusively locked vnode and strongly depend - * on the lock being held all the way until VOP_INACTIVE. This in particular - * happens with UFS which adds half-constructed vnodes to the hash, where they - * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) @@ -4501,6 +4502,17 @@ vgonel(struct vnode *vp) /* * Done with purge, reset to the standard lock and invalidate * the vnode. + * + * FIXME: this is buggy for vnode ops with custom locking primitives. 
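Stepping back to the vfsconf reference counting introduced in the vfs_init.c and vfs_mount.c hunks above: vfs_byname() now returns with vfc_refcount bumped under vfsconf_lock, so a caller that does not hand the reference off to a mount has to release it with vfs_unref_vfsconf() on every exit path. A minimal sketch of that caller pattern follows; the function and its policy check are hypothetical, only vfs_byname()/vfs_unref_vfsconf() are from the patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>

static int
example_lookup_fstype(const char *fstype, struct vfsconf **vfspp)
{
	struct vfsconf *vfsp;

	vfsp = vfs_byname(fstype);	/* returns with a reference held */
	if (vfsp == NULL)
		return (ENODEV);
	if ((vfsp->vfc_flags & VFCF_JAIL) == 0) {
		/* Example policy failure: the reference must be dropped. */
		vfs_unref_vfsconf(vfsp);
		return (EPERM);
	}
	/*
	 * Success: the caller now owns the reference and drops it later,
	 * for instance when the mount using it is destroyed.
	 */
	*vfspp = vfsp;
	return (0);
}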
+ * + * vget used to be gated with a special flag serializing it against vgone, + * which got lost in the process of SMP-ifying the VFS layer. + * + * Suppose a custom locking routine references ->v_data. + * + * Since now it is possible to start executing it as vgone is + * progressing, this very well may crash as ->v_data gets invalidated + * and memory used to back it is freed. */ vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index b805e147bd62..bf3ed9d515dc 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -2839,7 +2839,7 @@ setfflags(struct thread *td, struct vnode *vp, u_long flags) * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ - if (vp->v_type == VCHR || vp->v_type == VBLK) { + if (VN_ISDEV(vp)) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 93f87ddae4de..3d4567b6ab1e 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -798,58 +798,82 @@ vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, } #if OFF_MAX <= LONG_MAX -off_t -foffset_lock(struct file *fp, int flags) +static void +file_v_lock(struct file *fp, short lock_bit, short lock_wait_bit) { - volatile short *flagsp; - off_t res; + short *flagsp; short state; - KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); - - if ((flags & FOF_NOLOCK) != 0) - return (atomic_load_long(&fp->f_offset)); - - /* - * According to McKusick the vn lock was protecting f_offset here. - * It is now protected by the FOFFSET_LOCKED flag. - */ - flagsp = &fp->f_vnread_flags; - if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED)) - return (atomic_load_long(&fp->f_offset)); + flagsp = &fp->f_vflags; + state = atomic_load_16(flagsp); + if ((state & lock_bit) == 0 && + atomic_cmpset_acq_16(flagsp, state, state | lock_bit)) + return; - sleepq_lock(&fp->f_vnread_flags); + sleepq_lock(flagsp); state = atomic_load_16(flagsp); for (;;) { - if ((state & FOFFSET_LOCKED) == 0) { + if ((state & lock_bit) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, - FOFFSET_LOCKED)) + state | lock_bit)) continue; break; } - if ((state & FOFFSET_LOCK_WAITING) == 0) { + if ((state & lock_wait_bit) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, - state | FOFFSET_LOCK_WAITING)) + state | lock_wait_bit)) continue; } DROP_GIANT(); - sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); - sleepq_wait(&fp->f_vnread_flags, PRI_MAX_KERN); + sleepq_add(flagsp, NULL, "vofflock", 0, 0); + sleepq_wait(flagsp, PRI_MAX_KERN); PICKUP_GIANT(); - sleepq_lock(&fp->f_vnread_flags); + sleepq_lock(flagsp); state = atomic_load_16(flagsp); } - res = atomic_load_long(&fp->f_offset); - sleepq_release(&fp->f_vnread_flags); - return (res); + sleepq_release(flagsp); } -void -foffset_unlock(struct file *fp, off_t val, int flags) +static void +file_v_unlock(struct file *fp, short lock_bit, short lock_wait_bit) { - volatile short *flagsp; + short *flagsp; short state; + flagsp = &fp->f_vflags; + state = atomic_load_16(flagsp); + if ((state & lock_wait_bit) == 0 && + atomic_cmpset_rel_16(flagsp, state, state & ~lock_bit)) + return; + + sleepq_lock(flagsp); + MPASS((*flagsp & lock_bit) != 0); + MPASS((*flagsp & lock_wait_bit) != 0); + atomic_clear_16(flagsp, lock_bit | lock_wait_bit); + sleepq_broadcast(flagsp, SLEEPQ_SLEEP, 0, 0); + sleepq_release(flagsp); +} + +off_t +foffset_lock(struct file *fp, int flags) +{ + 
KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); + + if ((flags & FOF_NOLOCK) == 0) { + file_v_lock(fp, FILE_V_FOFFSET_LOCKED, + FILE_V_FOFFSET_LOCK_WAITING); + } + + /* + * According to McKusick the vn lock was protecting f_offset here. + * It is now protected by the FOFFSET_LOCKED flag. + */ + return (atomic_load_long(&fp->f_offset)); +} + +void +foffset_unlock(struct file *fp, off_t val, int flags) +{ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOUPDATE) == 0) @@ -859,21 +883,10 @@ foffset_unlock(struct file *fp, off_t val, int flags) if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; - if ((flags & FOF_NOLOCK) != 0) - return; - - flagsp = &fp->f_vnread_flags; - state = atomic_load_16(flagsp); - if ((state & FOFFSET_LOCK_WAITING) == 0 && - atomic_cmpset_rel_16(flagsp, state, 0)) - return; - - sleepq_lock(&fp->f_vnread_flags); - MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0); - MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0); - fp->f_vnread_flags = 0; - sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0); - sleepq_release(&fp->f_vnread_flags); + if ((flags & FOF_NOLOCK) == 0) { + file_v_unlock(fp, FILE_V_FOFFSET_LOCKED, + FILE_V_FOFFSET_LOCK_WAITING); + } } static off_t @@ -882,7 +895,47 @@ foffset_read(struct file *fp) return (atomic_load_long(&fp->f_offset)); } -#else + +void +fsetfl_lock(struct file *fp) +{ + file_v_lock(fp, FILE_V_SETFL_LOCKED, FILE_V_SETFL_LOCK_WAITING); +} + +void +fsetfl_unlock(struct file *fp) +{ + file_v_unlock(fp, FILE_V_SETFL_LOCKED, FILE_V_SETFL_LOCK_WAITING); +} + +#else /* OFF_MAX <= LONG_MAX */ + +static void +file_v_lock_mtxp(struct file *fp, struct mtx *mtxp, short lock_bit, + short lock_wait_bit) +{ + mtx_assert(mtxp, MA_OWNED); + + while ((fp->f_vflags & lock_bit) != 0) { + fp->f_vflags |= lock_wait_bit; + msleep(&fp->f_vflags, mtxp, PRI_MAX_KERN, + "vofflock", 0); + } + fp->f_vflags |= lock_bit; +} + +static void +file_v_unlock_mtxp(struct file *fp, struct mtx *mtxp, short lock_bit, + short lock_wait_bit) +{ + mtx_assert(mtxp, MA_OWNED); + + KASSERT((fp->f_vflags & lock_bit) != 0, ("Lost lock_bit")); + if ((fp->f_vflags & lock_wait_bit) != 0) + wakeup(&fp->f_vflags); + fp->f_vflags &= ~(lock_bit | lock_wait_bit); +} + off_t foffset_lock(struct file *fp, int flags) { @@ -894,12 +947,8 @@ foffset_lock(struct file *fp, int flags) mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { - while (fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PRI_MAX_KERN, - "vofflock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; + file_v_lock_mtxp(fp, mtxp, FILE_V_FOFFSET_LOCKED, + FILE_V_FOFFSET_LOCK_WAITING); } res = fp->f_offset; mtx_unlock(mtxp); @@ -922,11 +971,8 @@ foffset_unlock(struct file *fp, off_t val, int flags) if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) == 0) { - KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, - ("Lost FOFFSET_LOCKED")); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; + file_v_unlock_mtxp(fp, mtxp, FILE_V_FOFFSET_LOCKED, + FILE_V_FOFFSET_LOCK_WAITING); } mtx_unlock(mtxp); } @@ -937,6 +983,30 @@ foffset_read(struct file *fp) return (foffset_lock(fp, FOF_NOLOCK)); } + +void +fsetfl_lock(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + file_v_lock_mtxp(fp, mtxp, FILE_V_SETFL_LOCKED, + 
FILE_V_SETFL_LOCK_WAITING); + mtx_unlock(mtxp); +} + +void +fsetfl_unlock(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + file_v_unlock_mtxp(fp, mtxp, FILE_V_SETFL_LOCKED, + FILE_V_SETFL_LOCK_WAITING); + mtx_unlock(mtxp); +} #endif void @@ -3444,7 +3514,7 @@ vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, dat = NULL; if ((flags & COPY_FILE_RANGE_CLONE) != 0) { - error = ENOSYS; + error = EOPNOTSUPP; goto out; } |
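The foffset and F_SETFL serialization above share one generic bit-lock on fp->f_vflags: a locked bit plus a waiters bit, with sleepqueues (or a pool mutex and msleep on 32-bit off_t platforms) handling contention. Below is a minimal sketch of how a caller might use the new fsetfl lock to make a read-modify-write of fp->f_flag atomic against concurrent updates. The helper and its set/clear interface are hypothetical; only fsetfl_lock() and fsetfl_unlock() come from the patch.

#include <sys/param.h>
#include <sys/file.h>

static void
example_update_fflag(struct file *fp, int set, int clear)
{
	fsetfl_lock(fp);
	/* f_flag is stable here: other updaters block in fsetfl_lock(). */
	fp->f_flag = (fp->f_flag | set) & ~clear;
	fsetfl_unlock(fp);
}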