diff options
Diffstat (limited to 'sys/kern')
-rw-r--r-- | sys/kern/device_if.m | 10 | ||||
-rw-r--r-- | sys/kern/imgact_elf.c | 54 | ||||
-rw-r--r-- | sys/kern/init_sysent.c | 8 | ||||
-rw-r--r-- | sys/kern/kern_exit.c | 2 | ||||
-rw-r--r-- | sys/kern/kern_jail.c | 15 | ||||
-rw-r--r-- | sys/kern/kern_prot.c | 185 | ||||
-rw-r--r-- | sys/kern/kern_racct.c | 307 | ||||
-rw-r--r-- | sys/kern/kern_time.c | 5 | ||||
-rw-r--r-- | sys/kern/subr_witness.c | 134 | ||||
-rw-r--r-- | sys/kern/sys_timerfd.c | 1 | ||||
-rw-r--r-- | sys/kern/syscalls.c | 8 | ||||
-rw-r--r-- | sys/kern/syscalls.conf | 1 | ||||
-rw-r--r-- | sys/kern/syscalls.master | 21 | ||||
-rw-r--r-- | sys/kern/systrace_args.c | 112 | ||||
-rw-r--r-- | sys/kern/uipc_shm.c | 3 | ||||
-rw-r--r-- | sys/kern/uipc_usrreq.c | 11 | ||||
-rw-r--r-- | sys/kern/vfs_bio.c | 2 | ||||
-rw-r--r-- | sys/kern/vfs_default.c | 1 | ||||
-rw-r--r-- | sys/kern/vfs_init.c | 16 | ||||
-rw-r--r-- | sys/kern/vfs_inotify.c | 1 | ||||
-rw-r--r-- | sys/kern/vfs_mount.c | 7 | ||||
-rw-r--r-- | sys/kern/vfs_syscalls.c | 21 | ||||
-rw-r--r-- | sys/kern/vfs_vnops.c | 5 |
23 files changed, 527 insertions, 403 deletions
diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m index c02e5a46f326..ed94b2ccbe1b 100644 --- a/sys/kern/device_if.m +++ b/sys/kern/device_if.m @@ -49,25 +49,25 @@ HEADER { CODE { static int null_shutdown(device_t dev) { - return 0; + return (0); } static int null_suspend(device_t dev) { - return 0; + return (0); } static int null_resume(device_t dev) { - return 0; + return (0); } static int null_quiesce(device_t dev) { - return 0; + return (0); } - static void * null_register(device_t dev) + static void *null_register(device_t dev) { return NULL; } diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 2690ad3b2679..5a53fac50f2c 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -84,6 +84,13 @@ #define ELF_NOTE_ROUNDSIZE 4 #define OLD_EI_BRAND 8 +/* + * ELF_ABI_NAME is a string name of the ELF ABI. ELF_ABI_ID is used + * to build variable names. + */ +#define ELF_ABI_NAME __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) +#define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE) + static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); @@ -104,14 +111,15 @@ static Elf_Word __elfN(untrans_prot)(vm_prot_t); static size_t __elfN(prepare_register_notes)(struct thread *td, struct note_info_list *list, struct thread *target_td); -SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, +SYSCTL_NODE(_kern, OID_AUTO, ELF_ABI_ID, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); +#define ELF_NODE_OID __CONCAT(_kern_, ELF_ABI_ID) + int __elfN(fallback_brand) = -1; -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); + ELF_ABI_NAME " brand of last resort"); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, @@ -126,22 +134,22 @@ int __elfN(nxstack) = #else 0; #endif -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": support PT_GNU_STACK for non-executable stack control"); + ELF_ABI_NAME ": support PT_GNU_STACK for non-executable stack control"); #if defined(__amd64__) static int __elfN(vdso) = 1; -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, vdso, CTLFLAG_RWTUN, &__elfN(vdso), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable vdso preloading"); + ELF_ABI_NAME ": enable vdso preloading"); #else static int __elfN(vdso) = 0; #endif #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__)) int i386_read_exec = 0; -SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif @@ -161,15 +169,15 @@ sysctl_pie_base(SYSCTL_HANDLER_ARGS) __elfN(pie_base) = val; return (0); } -SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base, +SYSCTL_PROC(ELF_NODE_OID, OID_AUTO, pie_base, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_pie_base, "LU", "PIE load base without randomization"); -SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, +SYSCTL_NODE(ELF_NODE_OID, OID_AUTO, aslr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); -#define ASLR_NODE_OID __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr) +#define ASLR_NODE_OID __CONCAT(ELF_NODE_OID, _aslr) /* * Enable ASLR by default for 64-bit non-PIE binaries. 32-bit architectures @@ -179,8 +187,7 @@ SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, static int __elfN(aslr_enabled) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN, &__elfN(aslr_enabled), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable address map randomization"); + ELF_ABI_NAME ": enable address map randomization"); /* * Enable ASLR by default for 64-bit PIE binaries. @@ -188,8 +195,7 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN, static int __elfN(pie_aslr_enabled) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN, &__elfN(pie_aslr_enabled), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable address map randomization for PIE binaries"); + ELF_ABI_NAME ": enable address map randomization for PIE binaries"); /* * Sbrk is deprecated and it can be assumed that in most cases it will not be @@ -199,27 +205,25 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN, static int __elfN(aslr_honor_sbrk) = 0; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW, &__elfN(aslr_honor_sbrk), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used"); + ELF_ABI_NAME ": assume sbrk is used"); static int __elfN(aslr_stack) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack, CTLFLAG_RWTUN, &__elfN(aslr_stack), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable stack address randomization"); + ELF_ABI_NAME ": enable stack address randomization"); static int __elfN(aslr_shared_page) = __ELF_WORD_SIZE == 64; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, shared_page, CTLFLAG_RWTUN, &__elfN(aslr_shared_page), 0, - __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) - ": enable shared page address randomization"); + ELF_ABI_NAME ": enable shared page address randomization"); static int __elfN(sigfastblock) = 1; -SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, sigfastblock, +SYSCTL_INT(ELF_NODE_OID, OID_AUTO, sigfastblock, CTLFLAG_RWTUN, &__elfN(sigfastblock), 0, "enable sigfastblock for new processes"); static bool __elfN(allow_wx) = true; -SYSCTL_BOOL(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, allow_wx, +SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx, CTLFLAG_RWTUN, &__elfN(allow_wx), 0, "Allow pages to be mapped simultaneously writable and executable"); @@ -2951,9 +2955,9 @@ __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, */ static struct execsw __elfN(execsw) = { .ex_imgact = __CONCAT(exec_, __elfN(imgact)), - .ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) + .ex_name = ELF_ABI_NAME }; -EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); +EXEC_SET(ELF_ABI_ID, __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 91792430d24c..fcd232cde21e 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -67,7 +67,7 @@ /* The casts are bogus but will do for now. */ struct sysent sysent[] = { { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 0 = syscall */ - { .sy_narg = AS(exit_args), .sy_call = (sy_call_t *)sys_exit, .sy_auevent = AUE_EXIT, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 1 = exit */ + { .sy_narg = AS(_exit_args), .sy_call = (sy_call_t *)sys__exit, .sy_auevent = AUE_EXIT, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 1 = _exit */ { .sy_narg = 0, .sy_call = (sy_call_t *)sys_fork, .sy_auevent = AUE_FORK, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 2 = fork */ { .sy_narg = AS(read_args), .sy_call = (sy_call_t *)sys_read, .sy_auevent = AUE_READ, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 3 = read */ { .sy_narg = AS(write_args), .sy_call = (sy_call_t *)sys_write, .sy_auevent = AUE_WRITE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 4 = write */ @@ -145,8 +145,8 @@ struct sysent sysent[] = { { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 76 = obsolete vhangup */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 77 = obsolete vlimit */ { .sy_narg = AS(mincore_args), .sy_call = (sy_call_t *)sys_mincore, .sy_auevent = AUE_MINCORE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 78 = mincore */ - { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 79 = getgroups */ - { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 80 = setgroups */ + { compat14(AS(freebsd14_getgroups_args),getgroups), .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 79 = freebsd14 getgroups */ + { compat14(AS(freebsd14_setgroups_args),setgroups), .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 80 = freebsd14 setgroups */ { .sy_narg = 0, .sy_call = (sy_call_t *)sys_getpgrp, .sy_auevent = AUE_GETPGRP, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 81 = getpgrp */ { .sy_narg = AS(setpgid_args), .sy_call = (sy_call_t *)sys_setpgid, .sy_auevent = AUE_SETPGRP, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 82 = setpgid */ { .sy_narg = AS(setitimer_args), .sy_call = (sy_call_t *)sys_setitimer, .sy_auevent = AUE_SETITIMER, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 83 = setitimer */ @@ -661,4 +661,6 @@ struct sysent sysent[] = { { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */ { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ + { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */ + { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ }; diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 54e3044ab093..a32b5a1b3354 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -202,7 +202,7 @@ exit_onexit(struct proc *p) * exit -- death of process. */ int -sys_exit(struct thread *td, struct exit_args *uap) +sys__exit(struct thread *td, struct _exit_args *uap) { exit1(td, uap->rval, 0); diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 7ef1d19f0ea8..7c9a15ae18f3 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -115,8 +115,11 @@ struct prison prison0 = { #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif - .pr_allow = PR_ALLOW_ALL_STATIC, + .pr_allow = PR_ALLOW_PRISON0, }; +_Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0, + "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved"); + MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); struct bool_flags { @@ -232,6 +235,9 @@ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME}, {"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME}, {"allow.routing", "allow.norouting", PR_ALLOW_ROUTING}, + {"allow.unprivileged_parent_tampering", + "allow.nounprivileged_parent_tampering", + PR_ALLOW_UNPRIV_PARENT_TAMPER}, }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); @@ -4006,6 +4012,7 @@ prison_priv_check(struct ucred *cred, int priv) case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: + case PRIV_DEBUG_DIFFJAIL: /* * Allow jail to set various resource limits and login @@ -4043,8 +4050,10 @@ prison_priv_check(struct ucred *cred, int priv) */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: + case PRIV_SCHED_DIFFJAIL: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: + case PRIV_SIGNAL_DIFFJAIL: /* * Allow jailed processes to write to sysctls marked as jail @@ -4688,6 +4697,10 @@ SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may read the kernel message buffer"); SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged processes may use process debugging facilities"); +SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering, + CTLTYPE_INT | CTLFLAG_RW, "B", + "Unprivileged parent jail processes may tamper with same-uid processes" + " (signal/debug/cpuset)"); SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, "B", "Processes in jail with uid 0 have privilege"); #ifdef VIMAGE diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index bbb622547598..0ca42d640767 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -310,6 +310,39 @@ sys_getegid(struct thread *td, struct getegid_args *uap) return (0); } +#ifdef COMPAT_FREEBSD14 +int +freebsd14_getgroups(struct thread *td, struct freebsd14_getgroups_args *uap) +{ + struct ucred *cred; + int ngrp, error; + + cred = td->td_ucred; + + /* + * For FreeBSD < 15.0, we account for the egid being placed at the + * beginning of the group list prior to all supplementary groups. + */ + ngrp = cred->cr_ngroups + 1; + if (uap->gidsetsize == 0) { + error = 0; + goto out; + } else if (uap->gidsetsize < ngrp) { + return (EINVAL); + } + + error = copyout(&cred->cr_gid, uap->gidset, sizeof(gid_t)); + if (error == 0) + error = copyout(cred->cr_groups, uap->gidset + 1, + (ngrp - 1) * sizeof(gid_t)); + +out: + td->td_retval[0] = ngrp; + return (error); + +} +#endif /* COMPAT_FREEBSD14 */ + #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { int gidsetsize; @@ -320,18 +353,11 @@ int sys_getgroups(struct thread *td, struct getgroups_args *uap) { struct ucred *cred; - gid_t *ugidset; int ngrp, error; cred = td->td_ucred; - /* - * cr_gid has been moved out of cr_groups, but we'll continue exporting - * the egid as groups[0] for the time being until we audit userland for - * any surprises. - */ - ngrp = cred->cr_ngroups + 1; - + ngrp = cred->cr_ngroups; if (uap->gidsetsize == 0) { error = 0; goto out; @@ -339,14 +365,7 @@ sys_getgroups(struct thread *td, struct getgroups_args *uap) if (uap->gidsetsize < ngrp) return (EINVAL); - ugidset = uap->gidset; - error = copyout(&cred->cr_gid, ugidset, sizeof(*ugidset)); - if (error != 0) - goto out; - - if (ngrp > 1) - error = copyout(cred->cr_groups, ugidset + 1, - (ngrp - 1) * sizeof(*ugidset)); + error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); @@ -1186,6 +1205,44 @@ fail: return (error); } +#ifdef COMPAT_FREEBSD14 +int +freebsd14_setgroups(struct thread *td, struct freebsd14_setgroups_args *uap) +{ + gid_t smallgroups[CRED_SMALLGROUPS_NB]; + gid_t *groups; + int gidsetsize, error; + + /* + * Before FreeBSD 15.0, we allow one more group to be supplied to + * account for the egid appearing before the supplementary groups. This + * may technically allow one more supplementary group for systems that + * did use the default NGROUPS_MAX if we round it back up to 1024. + */ + gidsetsize = uap->gidsetsize; + if (gidsetsize > ngroups_max + 1 || gidsetsize < 0) + return (EINVAL); + + if (gidsetsize > CRED_SMALLGROUPS_NB) + groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); + else + groups = smallgroups; + + error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); + if (error == 0) { + int ngroups = gidsetsize > 0 ? gidsetsize - 1 /* egid */ : 0; + + error = kern_setgroups(td, &ngroups, groups + 1); + if (error == 0 && gidsetsize > 0) + td->td_proc->p_ucred->cr_gid = groups[0]; + } + + if (groups != smallgroups) + free(groups, M_TEMP); + return (error); +} +#endif /* COMPAT_FREEBSD14 */ + #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { int gidsetsize; @@ -1210,8 +1267,7 @@ sys_setgroups(struct thread *td, struct setgroups_args *uap) * setgroups() differ. */ gidsetsize = uap->gidsetsize; - /* XXXKE Limit to ngroups_max when we change the userland interface. */ - if (gidsetsize > ngroups_max + 1 || gidsetsize < 0) + if (gidsetsize > ngroups_max || gidsetsize < 0) return (EINVAL); if (gidsetsize > CRED_SMALLGROUPS_NB) @@ -1238,35 +1294,17 @@ kern_setgroups(struct thread *td, int *ngrpp, gid_t *groups) struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int ngrp, error; - gid_t egid; ngrp = *ngrpp; /* Sanity check size. */ - /* XXXKE Limit to ngroups_max when we change the userland interface. */ - if (ngrp < 0 || ngrp > ngroups_max + 1) + if (ngrp < 0 || ngrp > ngroups_max) return (EINVAL); AUDIT_ARG_GROUPSET(groups, ngrp); - /* - * setgroups(0, NULL) is a legitimate way of clearing the groups vector - * on non-BSD systems (which generally do not have the egid in the - * groups[0]). We risk security holes when running non-BSD software if - * we do not do the same. So we allow and treat 0 for 'ngrp' specially - * below (twice). - */ - if (ngrp != 0) { - /* - * To maintain userland compat for now, we use the first group - * as our egid and we'll use the rest as our supplemental - * groups. - */ - egid = groups[0]; - ngrp--; - groups++; - groups_normalize(&ngrp, groups); - *ngrpp = ngrp; - } + groups_normalize(&ngrp, groups); + *ngrpp = ngrp; + newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); @@ -1289,15 +1327,7 @@ kern_setgroups(struct thread *td, int *ngrpp, gid_t *groups) if (error) goto fail; - /* - * If some groups were passed, the first one is currently the desired - * egid. This code is to be removed (along with some commented block - * above) when setgroups() is changed to take only supplementary groups. - */ - if (ngrp != 0) - newcred->cr_gid = egid; crsetgroups_internal(newcred, ngrp, groups); - setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); @@ -1914,6 +1944,38 @@ cr_canseejailproc(struct ucred *u1, struct ucred *u2) } /* + * Determine if u1 can tamper with the subject specified by u2, if they are in + * different jails and 'unprivileged_parent_tampering' jail policy allows it. + * + * May be called if u1 and u2 are in the same jail, but it is expected that the + * caller has already done a prison_check() prior to calling it. + * + * Returns: 0 for permitted, EPERM otherwise + */ +static int +cr_can_tamper_with_subjail(struct ucred *u1, struct ucred *u2, int priv) +{ + + MPASS(prison_check(u1, u2) == 0); + if (u1->cr_prison == u2->cr_prison) + return (0); + + if (priv_check_cred(u1, priv) == 0) + return (0); + + /* + * Jails do not maintain a distinct UID space, so process visibility is + * all that would control an unprivileged process' ability to tamper + * with a process in a subjail by default if we did not have the + * allow.unprivileged_parent_tampering knob to restrict it by default. + */ + if (prison_allow(u2, PR_ALLOW_UNPRIV_PARENT_TAMPER)) + return (0); + + return (EPERM); +} + +/* * Helper for cr_cansee*() functions to abide by system-wide security.bsd.see_* * policies. Determines if u1 "can see" u2 according to these policies. * Returns: 0 for permitted, ESRCH otherwise @@ -2062,6 +2124,19 @@ cr_cansignal(struct ucred *cred, struct proc *proc, int signum) return (error); } + /* + * At this point, the target may be in a different jail than the + * subject -- the subject must be in a parent jail to the target, + * whether it is prison0 or a subordinate of prison0 that has + * children. Additional privileges are required to allow this, as + * whether the creds are truly equivalent or not must be determined on + * a case-by-case basis. + */ + error = cr_can_tamper_with_subjail(cred, proc->p_ucred, + PRIV_SIGNAL_DIFFJAIL); + if (error) + return (error); + return (0); } @@ -2138,6 +2213,12 @@ p_cansched(struct thread *td, struct proc *p) if (error) return (error); } + + error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred, + PRIV_SCHED_DIFFJAIL); + if (error) + return (error); + return (0); } @@ -2258,6 +2339,11 @@ p_candebug(struct thread *td, struct proc *p) return (error); } + error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred, + PRIV_DEBUG_DIFFJAIL); + if (error) + return (error); + /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); @@ -2835,7 +2921,8 @@ crextend(struct ucred *cr, int n) * Normalizes a set of groups to be applied to a 'struct ucred'. * * Normalization ensures that the supplementary groups are sorted in ascending - * order and do not contain duplicates. + * order and do not contain duplicates. This allows group_is_supplementary + * to do a binary search. */ static void groups_normalize(int *ngrp, gid_t *groups) diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 7ee3b9e2048a..7351e9cb6313 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -96,6 +96,13 @@ static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); +static int racct_set_locked(struct proc *p, int resource, uint64_t amount, + int force); +static void racct_updatepcpu_locked(struct proc *p); +static void racct_updatepcpu_racct_locked(struct racct *racct); +static void racct_updatepcpu_containers(void); +static void racct_settime_locked(struct proc *p, bool exit); +static void racct_zeropcpu_locked(struct proc *p); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, , rusage, add, @@ -308,68 +315,6 @@ fixpt_t ccpu_exp[] = { #define CCPU_EXP_MAX 110 -/* - * This function is analogical to the getpcpu() function in the ps(1) command. - * They should both calculate in the same way so that the racct %cpu - * calculations are consistent with the values shown by the ps(1) tool. - * The calculations are more complex in the 4BSD scheduler because of the value - * of the ccpu variable. In ULE it is defined to be zero which saves us some - * work. - */ -static uint64_t -racct_getpcpu(struct proc *p, u_int pcpu) -{ - u_int swtime; -#ifdef SCHED_4BSD - fixpt_t pctcpu, pctcpu_next; -#endif - fixpt_t p_pctcpu; - struct thread *td; - - ASSERT_RACCT_ENABLED(); - KASSERT((p->p_flag & P_IDLEPROC) == 0, - ("racct_getpcpu: idle process %p", p)); - - swtime = (ticks - p->p_swtick) / hz; - - /* - * For short-lived processes, the sched_pctcpu() returns small - * values even for cpu intensive processes. Therefore we use - * our own estimate in this case. - */ - if (swtime < RACCT_PCPU_SECS) - return (pcpu); - - p_pctcpu = 0; - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); -#ifdef SCHED_4BSD - pctcpu = sched_pctcpu(td); - /* Count also the yet unfinished second. */ - pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; - pctcpu_next += sched_pctcpu_delta(td); - p_pctcpu += max(pctcpu, pctcpu_next); -#else - /* - * In ULE the %cpu statistics are updated on every - * sched_pctcpu() call. So special calculations to - * account for the latest (unfinished) second are - * not needed. - */ - p_pctcpu += sched_pctcpu(td); -#endif - thread_unlock(td); - } - -#ifdef SCHED_4BSD - if (swtime <= CCPU_EXP_MAX) - return ((100 * (uint64_t)p_pctcpu * 1000000) / - (FSCALE - ccpu_exp[swtime])); -#endif - - return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); -} - static void racct_add_racct(struct racct *dest, const struct racct *src) { @@ -499,19 +444,6 @@ racct_adjust_resource(struct racct *racct, int resource, ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } - - /* - * There are some cases where the racct %cpu resource would grow - * beyond 100% per core. For example in racct_proc_exit() we add - * the process %cpu usage to the ucred racct containers. If too - * many processes terminated in a short time span, the ucred %cpu - * resource could grow too much. Also, the 4BSD scheduler sometimes - * returns for a thread more than 100% cpu usage. So we set a sane - * boundary here to 100% * the maximum number of CPUs. - */ - if ((resource == RACCT_PCTCPU) && - (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) - racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int @@ -635,10 +567,44 @@ racct_add_buf(struct proc *p, const struct buf *bp, int is_write) RACCT_UNLOCK(); } +static void +racct_settime_locked(struct proc *p, bool exit) +{ + struct thread *td; + struct timeval wallclock; + uint64_t runtime; + + ASSERT_RACCT_ENABLED(); + RACCT_LOCK_ASSERT(); + PROC_LOCK_ASSERT(p, MA_OWNED); + + if (exit) { + /* + * proc_reap() has already calculated rux + * and added crux to rux. + */ + runtime = cputick2usec(p->p_rux.rux_runtime - + p->p_crux.rux_runtime); + } else { + PROC_STATLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) + ruxagg(p, td); + PROC_STATUNLOCK(p); + runtime = cputick2usec(p->p_rux.rux_runtime); + } + microuptime(&wallclock); + timevalsub(&wallclock, &p->p_stats->p_start); + + racct_set_locked(p, RACCT_CPU, runtime, 0); + racct_set_locked(p, RACCT_WALLCLOCK, + (uint64_t)wallclock.tv_sec * 1000000 + + wallclock.tv_usec, 0); +} + static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { - int64_t old_amount, decayed_amount, diff_proc, diff_cred; + int64_t old_amount, diff_proc, diff_cred; #ifdef RCTL int error; #endif @@ -655,17 +621,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) * The diffs may be negative. */ diff_proc = amount - old_amount; - if (resource == RACCT_PCTCPU) { - /* - * Resources in per-credential racct containers may decay. - * If this is the case, we need to calculate the difference - * between the new amount and the proportional value of the - * old amount that has decayed in the ucred racct containers. - */ - decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; - diff_cred = amount - decayed_amount; - } else - diff_cred = diff_proc; + diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, @@ -908,8 +864,6 @@ racct_proc_fork(struct proc *parent, struct proc *child) goto out; #endif - /* Init process cpu time. */ - child->p_prev_runtime = 0; child->p_throttled = 0; /* @@ -964,37 +918,16 @@ racct_proc_fork_done(struct proc *child) void racct_proc_exit(struct proc *p) { - struct timeval wallclock; - uint64_t pct_estimate, pct, runtime; int i; if (!racct_enable) return; PROC_LOCK(p); - /* - * We don't need to calculate rux, proc_reap() has already done this. - */ - runtime = cputick2usec(p->p_rux.rux_runtime); -#ifdef notyet - KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); -#else - if (runtime < p->p_prev_runtime) - runtime = p->p_prev_runtime; -#endif - microuptime(&wallclock); - timevalsub(&wallclock, &p->p_stats->p_start); - if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { - pct_estimate = (1000000 * runtime * 100) / - ((uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec); - } else - pct_estimate = 0; - pct = racct_getpcpu(p, pct_estimate); - RACCT_LOCK(); - racct_set_locked(p, RACCT_CPU, runtime, 0); - racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); + + racct_settime_locked(p, true); + racct_zeropcpu_locked(p); KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, ("process reaped with %ju allocated for RSS\n", @@ -1068,6 +1001,10 @@ racct_move(struct racct *dest, struct racct *src) RACCT_LOCK(); racct_add_racct(dest, src); racct_sub_racct(src, src); + dest->r_runtime = src->r_runtime; + dest->r_time = src->r_time; + src->r_runtime = 0; + timevalsub(&src->r_time, &src->r_time); RACCT_UNLOCK(); } @@ -1170,8 +1107,6 @@ racct_proc_wakeup(struct proc *p) static void racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { - int64_t r_old, r_new; - ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); @@ -1181,15 +1116,6 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) rctl_throttle_decay(racct, RACCT_READIOPS); rctl_throttle_decay(racct, RACCT_WRITEIOPS); #endif - - r_old = racct->r_resources[RACCT_PCTCPU]; - - /* If there is nothing to decay, just exit. */ - if (r_old <= 0) - return; - - r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; - racct->r_resources[RACCT_PCTCPU] = r_new; } static void @@ -1221,15 +1147,105 @@ racct_decay(void) } static void +racct_updatepcpu_racct_locked(struct racct *racct) +{ + struct timeval diff; + uint64_t elapsed; + uint64_t runtime; + uint64_t newpcpu; + uint64_t oldpcpu; + + ASSERT_RACCT_ENABLED(); + RACCT_LOCK_ASSERT(); + + /* Difference between now and previously-recorded time. */ + microuptime(&diff); + timevalsub(&diff, &racct->r_time); + elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec; + + /* Difference between current and previously-recorded runtime. */ + runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime; + + newpcpu = runtime * 100 * 1000000 / elapsed; + oldpcpu = racct->r_resources[RACCT_PCTCPU]; + /* + * This calculation is equivalent to + * (1 - 0.3) * newpcpu + 0.3 * oldpcpu + * where RACCT_DECAY_FACTOR = 0.3 * FSCALE. + */ + racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) * + newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE; + if (racct->r_resources[RACCT_PCTCPU] > + 100 * 1000000 * (uint64_t)mp_ncpus) + racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * + (uint64_t)mp_ncpus; + + /* Record current times. */ + racct->r_runtime = racct->r_resources[RACCT_CPU]; + timevaladd(&racct->r_time, &diff); +} + +static void +racct_zeropcpu_locked(struct proc *p) +{ + ASSERT_RACCT_ENABLED(); + PROC_LOCK_ASSERT(p, MA_OWNED); + + p->p_racct->r_resources[RACCT_PCTCPU] = 0; +} + +static void +racct_updatepcpu_locked(struct proc *p) +{ + ASSERT_RACCT_ENABLED(); + PROC_LOCK_ASSERT(p, MA_OWNED); + + racct_updatepcpu_racct_locked(p->p_racct); +} + +static void +racct_updatepcpu_pre(void) +{ + + RACCT_LOCK(); +} + +static void +racct_updatepcpu_post(void) +{ + + RACCT_UNLOCK(); +} + +static void +racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void *dummy2) +{ + racct_updatepcpu_racct_locked(racct); +} + +static void +racct_updatepcpu_containers(void) +{ + ASSERT_RACCT_ENABLED(); + + ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre, + racct_updatepcpu_post, NULL, NULL); + loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre, + racct_updatepcpu_post, NULL, NULL); + prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre, + racct_updatepcpu_post, NULL, NULL); +} + +static void racctd(void) { - struct thread *td; struct proc *p; - struct timeval wallclock; - uint64_t pct, pct_estimate, runtime; + struct proc *idle; ASSERT_RACCT_ENABLED(); + idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc; + for (;;) { racct_decay(); @@ -1237,36 +1253,16 @@ racctd(void) FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); + if (p == idle) { + PROC_UNLOCK(p); + continue; + } if (p->p_state != PRS_NORMAL || (p->p_flag & P_IDLEPROC) != 0) { - if (p->p_state == PRS_ZOMBIE) - racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); continue; } - microuptime(&wallclock); - timevalsub(&wallclock, &p->p_stats->p_start); - PROC_STATLOCK(p); - FOREACH_THREAD_IN_PROC(p, td) - ruxagg(p, td); - runtime = cputick2usec(p->p_rux.rux_runtime); - PROC_STATUNLOCK(p); -#ifdef notyet - KASSERT(runtime >= p->p_prev_runtime, - ("runtime < p_prev_runtime")); -#else - if (runtime < p->p_prev_runtime) - runtime = p->p_prev_runtime; -#endif - p->p_prev_runtime = runtime; - if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { - pct_estimate = (1000000 * runtime * 100) / - ((uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec); - } else - pct_estimate = 0; - pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); #ifdef RCTL rctl_throttle_decay(p->p_racct, RACCT_READBPS); @@ -1274,11 +1270,8 @@ racctd(void) rctl_throttle_decay(p->p_racct, RACCT_READIOPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif - racct_set_locked(p, RACCT_PCTCPU, pct, 1); - racct_set_locked(p, RACCT_CPU, runtime, 0); - racct_set_locked(p, RACCT_WALLCLOCK, - (uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec, 0); + racct_settime_locked(p, false); + racct_updatepcpu_locked(p); RACCT_UNLOCK(); PROC_UNLOCK(p); } @@ -1306,6 +1299,8 @@ racctd(void) PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); + + racct_updatepcpu_containers(); pause("-", hz); } } diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 9830e5093a3a..2a6f0989f6aa 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -571,7 +571,10 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = kern_clock_gettime(td, clock_id, &now); - KASSERT(error == 0, ("kern_clock_gettime: %d", error)); + if (error != 0) { + td->td_rtcgen = 0; + return (error); + } timespecsub(&ts, &now, &ts); } if (ts.tv_sec < 0 || (ts.tv_sec == 0 && ts.tv_nsec == 0)) { diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index ab47b6ad29a3..a65c3ca128d9 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -57,7 +57,7 @@ * b : public affirmation by word or example of usually * religious faith or conviction <the heroic witness to divine * life -- Pilot> - * 6 capitalized : a member of the Jehovah's Witnesses + * 6 capitalized : a member of the Jehovah's Witnesses */ /* @@ -131,7 +131,7 @@ #define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. */ #ifndef WITNESS_COUNT -#define WITNESS_COUNT 1536 +#define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) @@ -158,20 +158,18 @@ * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ -#define WITNESS_UNRELATED 0x00 /* No lock order relation. */ -#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ -#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ -#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ -#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ -#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) -#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) -#define WITNESS_RELATED_MASK \ - (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) -#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been - * observed. */ -#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ -#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ -#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ +#define WITNESS_UNRELATED 0x00 /* No lock order relation. */ +#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ +#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ +#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ +#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ +#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) +#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) +#define WITNESS_RELATED_MASK (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) +#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been observed. */ +#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ +#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ +#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) @@ -218,20 +216,18 @@ struct lock_list_entry { * (for example, "vnode interlock"). */ struct witness { - char w_name[MAX_W_NAME]; - uint32_t w_index; /* Index in the relationship matrix */ + char w_name[MAX_W_NAME]; + uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; - STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ - STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ - struct witness *w_hash_next; /* Linked list in hash buckets. */ - const char *w_file; /* File where last acquired */ - uint32_t w_line; /* Line where last acquired */ - uint32_t w_refcount; - uint16_t w_num_ancestors; /* direct/indirect - * ancestor count */ - uint16_t w_num_descendants; /* direct/indirect - * descendant count */ - int16_t w_ddb_level; + STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ + STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ + struct witness *w_hash_next; /* Linked list in hash buckets. */ + const char *w_file; /* File where last acquired */ + uint32_t w_line; /* Line where last acquired */ + uint32_t w_refcount; + uint16_t w_num_ancestors; /* direct/indirect ancestor count */ + uint16_t w_num_descendants; /* direct/indirect descendant count */ + int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; @@ -265,7 +261,7 @@ struct witness_lock_order_data { /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects - * (struct witness_lock_order_data). + * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; @@ -295,7 +291,6 @@ struct witness_order_list_entry { static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { - return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } @@ -304,7 +299,6 @@ static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { - return (a->from == b->from && a->to == b->to); } @@ -415,7 +409,7 @@ SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, int badstack_sbuf_size; int witness_count = WITNESS_COUNT; -SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, +SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* @@ -760,7 +754,6 @@ static int witness_spin_warn = 0; static const char * fixup_filename(const char *file) { - if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) @@ -835,7 +828,7 @@ witness_startup(void *mem) w_free_cnt--; for (i = 0; i < witness_count; i++) { - memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * + memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } @@ -989,16 +982,16 @@ witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), { int i; - for (i = 0; i < indent; i++) - prnt(" "); + for (i = 0; i < indent; i++) + prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); - if (w->w_displayed) { - prnt(" -- (already displayed)\n"); - return; - } - w->w_displayed = 1; + if (w->w_displayed) { + prnt(" -- (already displayed)\n"); + return; + } + w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); @@ -1079,7 +1072,6 @@ witness_ddb_display(int(*prnt)(const char *fmt, ...)) int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { - if (witness_watch == -1 || KERNEL_PANICKED()) return (0); @@ -1257,7 +1249,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( - "acquiring duplicate lock of same type: \"%s\"\n", + "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); @@ -1743,7 +1735,7 @@ found: /* * In order to reduce contention on w_mtx, we want to keep always an - * head object into lists so that frequent allocation from the + * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have @@ -1781,7 +1773,7 @@ witness_thread_exit(struct thread *td) n++; witness_list_lock(&lle->ll_children[i], witness_output); - + } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); @@ -1948,7 +1940,6 @@ found: static void depart(struct witness *w) { - MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; @@ -1999,18 +1990,18 @@ adopt(struct witness *parent, struct witness *child) child->w_num_ancestors++; } - /* - * Find each ancestor of 'pi'. Note that 'pi' itself is counted as + /* + * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { - if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && + if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { - /* + /* * Skip children that are already marked as * descendants of 'i'. */ @@ -2021,7 +2012,7 @@ adopt(struct witness *parent, struct witness *child) * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ - if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && + if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; @@ -2029,16 +2020,16 @@ adopt(struct witness *parent, struct witness *child) w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; - /* + /* * Make sure we aren't marking a node as both an - * ancestor and descendant. We should have caught + * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", - i, j, w_rmatrix[i][j]); + i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; @@ -2047,7 +2038,7 @@ adopt(struct witness *parent, struct witness *child) (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", - j, i, w_rmatrix[j][i]); + j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; @@ -2124,7 +2115,6 @@ _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) static int isitmychild(struct witness *parent, struct witness *child) { - return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } @@ -2134,7 +2124,6 @@ isitmychild(struct witness *parent, struct witness *child) static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { - return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } @@ -2182,7 +2171,7 @@ witness_get(void) STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; - MPASS(index > 0 && index == w_max_used_index+1 && + MPASS(index > 0 && index == w_max_used_index + 1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; @@ -2194,7 +2183,6 @@ witness_get(void) static void witness_free(struct witness *w) { - STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } @@ -2219,11 +2207,10 @@ witness_lock_list_get(void) bzero(lle, sizeof(*lle)); return (lle); } - + static void witness_lock_list_free(struct lock_list_entry *lle) { - mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; @@ -2297,7 +2284,6 @@ witness_voutput(const char *fmt, va_list ap) static int witness_thread_has_locks(struct thread *td) { - if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); @@ -2573,14 +2559,12 @@ witness_setflag(struct lock_object *lock, int flag, int set) void witness_norelease(struct lock_object *lock) { - witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { - witness_setflag(lock, LI_NORELEASE, 0); } @@ -2588,7 +2572,6 @@ witness_releaseok(struct lock_object *lock) static void witness_ddb_list(struct thread *td) { - KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); @@ -2653,7 +2636,6 @@ DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE); DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE) { - witness_ddb_display(db_printf); } #endif @@ -2673,9 +2655,9 @@ sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); - tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, + tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); - tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, + tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); @@ -2750,12 +2732,12 @@ restart: sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", - tmp_w1->w_name, tmp_w1->w_class->lc_name, + tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", - tmp_w1->w_name, tmp_w1->w_class->lc_name, + tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_putc(sb, '\n'); @@ -2763,7 +2745,7 @@ restart: if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", - tmp_w2->w_name, tmp_w2->w_class->lc_name, + tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_putc(sb, '\n'); @@ -2823,7 +2805,6 @@ sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { - return (db_printf("%.*s", len, data)); } @@ -3068,7 +3049,7 @@ witness_lock_order_get(struct witness *parent, struct witness *child) & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; - hash = witness_hash_djb2((const char*)&key, + hash = witness_hash_djb2((const char *)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { @@ -3089,7 +3070,6 @@ out: static int witness_lock_order_check(struct witness *parent, struct witness *child) { - if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && @@ -3115,7 +3095,7 @@ witness_lock_order_add(struct witness *parent, struct witness *child) & WITNESS_LOCK_ORDER_KNOWN) return (1); - hash = witness_hash_djb2((const char*)&key, + hash = witness_hash_djb2((const char *)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; @@ -3134,7 +3114,6 @@ witness_lock_order_add(struct witness *parent, struct witness *child) static void witness_increment_graph_generation(void) { - if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; @@ -3143,7 +3122,6 @@ witness_increment_graph_generation(void) static int witness_output_drain(void *arg __unused, const char *data, int len) { - witness_output("%.*s", len, data); return (len); } diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c index ab7e048a2ab1..565ab3ad6ee6 100644 --- a/sys/kern/sys_timerfd.c +++ b/sys/kern/sys_timerfd.c @@ -206,7 +206,6 @@ retry: mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } - td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "tfdrd", 0); if (error == 0) { diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 90a4f3a7dad8..4122f9261871 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -6,7 +6,7 @@ const char *syscallnames[] = { "syscall", /* 0 = syscall */ - "exit", /* 1 = exit */ + "_exit", /* 1 = _exit */ "fork", /* 2 = fork */ "read", /* 3 = read */ "write", /* 4 = write */ @@ -84,8 +84,8 @@ const char *syscallnames[] = { "obs_vhangup", /* 76 = obsolete vhangup */ "obs_vlimit", /* 77 = obsolete vlimit */ "mincore", /* 78 = mincore */ - "getgroups", /* 79 = getgroups */ - "setgroups", /* 80 = setgroups */ + "compat14.getgroups", /* 79 = freebsd14 getgroups */ + "compat14.setgroups", /* 80 = freebsd14 setgroups */ "getpgrp", /* 81 = getpgrp */ "setpgid", /* 82 = setpgid */ "setitimer", /* 83 = setitimer */ @@ -600,4 +600,6 @@ const char *syscallnames[] = { "exterrctl", /* 592 = exterrctl */ "inotify_add_watch_at", /* 593 = inotify_add_watch_at */ "inotify_rm_watch", /* 594 = inotify_rm_watch */ + "getgroups", /* 595 = getgroups */ + "setgroups", /* 596 = setgroups */ }; diff --git a/sys/kern/syscalls.conf b/sys/kern/syscalls.conf index a98d52659832..ae7bd1f87612 100644 --- a/sys/kern/syscalls.conf +++ b/sys/kern/syscalls.conf @@ -1,3 +1,4 @@ libsysmap="../../lib/libsys/syscalls.map" libsys_h="../../lib/libsys/_libsys.h" sysmk="../sys/syscall.mk" +syshdr_extra="#define SYS_exit SYS__exit" diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index a8815afee866..fa64597d14a5 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -51,6 +51,7 @@ ; SYSMUX syscall multiplexer. No prototype, argument struct, or ; handler is declared or used. Handled in MD syscall code. ; CAPENABLED syscall is allowed in capability mode +; NORETURN the syscall does not return ; ; To support programmatic generation of both the default ABI and 32-bit compat ; (freebsd32) we impose a number of restrictions on the types of system calls. @@ -124,8 +125,8 @@ ... ); } -1 AUE_EXIT STD|CAPENABLED { - void exit( +1 AUE_EXIT STD|CAPENABLED|NORETURN { + void _exit( int rval ); } @@ -551,13 +552,13 @@ _Out_writes_bytes_(len/PAGE_SIZE) char *vec ); } -79 AUE_GETGROUPS STD|CAPENABLED { +79 AUE_GETGROUPS STD|CAPENABLED|COMPAT14 { int getgroups( int gidsetsize, _Out_writes_opt_(gidsetsize) gid_t *gidset ); } -80 AUE_SETGROUPS STD { +80 AUE_SETGROUPS STD|COMPAT14 { int setgroups( int gidsetsize, _In_reads_(gidsetsize) const gid_t *gidset @@ -3370,5 +3371,17 @@ int wd ); } +595 AUE_GETGROUPS STD|CAPENABLED { + int getgroups( + int gidsetsize, + _Out_writes_opt_(gidsetsize) gid_t *gidset + ); + } +596 AUE_SETGROUPS STD { + int setgroups( + int gidsetsize, + _In_reads_(gidsetsize) const gid_t *gidset + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 467caa71f20d..2b1ea9eed8d4 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -17,9 +17,9 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 0; break; } - /* exit */ + /* _exit */ case 1: { - struct exit_args *p = params; + struct _exit_args *p = params; iarg[a++] = p->rval; /* int */ *n_args = 1; break; @@ -454,22 +454,6 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 3; break; } - /* getgroups */ - case 79: { - struct getgroups_args *p = params; - iarg[a++] = p->gidsetsize; /* int */ - uarg[a++] = (intptr_t)p->gidset; /* gid_t * */ - *n_args = 2; - break; - } - /* setgroups */ - case 80: { - struct setgroups_args *p = params; - iarg[a++] = p->gidsetsize; /* int */ - uarg[a++] = (intptr_t)p->gidset; /* const gid_t * */ - *n_args = 2; - break; - } /* getpgrp */ case 81: { *n_args = 0; @@ -3500,6 +3484,22 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 2; break; } + /* getgroups */ + case 595: { + struct getgroups_args *p = params; + iarg[a++] = p->gidsetsize; /* int */ + uarg[a++] = (intptr_t)p->gidset; /* gid_t * */ + *n_args = 2; + break; + } + /* setgroups */ + case 596: { + struct setgroups_args *p = params; + iarg[a++] = p->gidsetsize; /* int */ + uarg[a++] = (intptr_t)p->gidset; /* const gid_t * */ + *n_args = 2; + break; + } default: *n_args = 0; break; @@ -3513,7 +3513,7 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) /* syscall */ case 0: break; - /* exit */ + /* _exit */ case 1: switch (ndx) { case 0: @@ -4199,32 +4199,6 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; - /* getgroups */ - case 79: - switch (ndx) { - case 0: - p = "int"; - break; - case 1: - p = "userland gid_t *"; - break; - default: - break; - }; - break; - /* setgroups */ - case 80: - switch (ndx) { - case 0: - p = "int"; - break; - case 1: - p = "userland const gid_t *"; - break; - default: - break; - }; - break; /* getpgrp */ case 81: break; @@ -9367,6 +9341,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* getgroups */ + case 595: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "userland gid_t *"; + break; + default: + break; + }; + break; + /* setgroups */ + case 596: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "userland const gid_t *"; + break; + default: + break; + }; + break; default: break; }; @@ -9380,7 +9380,7 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) switch (sysnum) { /* syscall */ case 0: - /* exit */ + /* _exit */ case 1: if (ndx == 0 || ndx == 1) p = "void"; @@ -9633,16 +9633,6 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; - /* getgroups */ - case 79: - if (ndx == 0 || ndx == 1) - p = "int"; - break; - /* setgroups */ - case 80: - if (ndx == 0 || ndx == 1) - p = "int"; - break; /* getpgrp */ case 81: /* setpgid */ @@ -11365,6 +11355,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* getgroups */ + case 595: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* setgroups */ + case 596: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 85fe48ddd466..eb1327f7f2de 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1160,7 +1160,8 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); - if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) + if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC | + O_CLOFORK)) != 0) return (EINVAL); largepage = (shmflags & SHM_LARGEPAGE) != 0; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 0056dac65c7d..19870e989437 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -154,15 +154,12 @@ static struct task unp_defer_task; * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ -#ifndef PIPSIZ -#define PIPSIZ 8192 -#endif -static u_long unpst_sendspace = PIPSIZ; -static u_long unpst_recvspace = PIPSIZ; +static u_long unpst_sendspace = 64*1024; +static u_long unpst_recvspace = 64*1024; static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */ static u_long unpdg_recvspace = 16*1024; -static u_long unpsp_sendspace = PIPSIZ; -static u_long unpsp_recvspace = PIPSIZ; +static u_long unpsp_sendspace = 64*1024; +static u_long unpsp_recvspace = 64*1024; static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Local domain"); diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index fa655c43d155..19c39e42bafa 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -5170,7 +5170,7 @@ bufstrategy(struct bufobj *bo, struct buf *bp) vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); - KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, + KASSERT(!VN_ISDEV(vp), ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index fd6202a1424c..85f67731e1cc 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -457,6 +457,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap) case _PC_NAMEDATTR_ENABLED: case _PC_HAS_NAMEDATTR: case _PC_HAS_HIDDENSYSTEM: + case _PC_CLONE_BLKSIZE: *ap->a_retval = 0; return (0); default: diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c index cd30d5cfae47..93ac001af8ad 100644 --- a/sys/kern/vfs_init.c +++ b/sys/kern/vfs_init.c @@ -103,6 +103,16 @@ struct vattr va_null; * Routines having to do with the management of the vnode table. */ +void +vfs_unref_vfsconf(struct vfsconf *vfsp) +{ + vfsconf_lock(); + KASSERT(vfsp->vfc_refcount > 0, + ("vfs %p refcount underflow %d", vfsp, vfsp->vfc_refcount)); + vfsp->vfc_refcount--; + vfsconf_unlock(); +} + static struct vfsconf * vfs_byname_locked(const char *name) { @@ -123,9 +133,11 @@ vfs_byname(const char *name) { struct vfsconf *vfsp; - vfsconf_slock(); + vfsconf_lock(); vfsp = vfs_byname_locked(name); - vfsconf_sunlock(); + if (vfsp != NULL) + vfsp->vfc_refcount++; + vfsconf_unlock(); return (vfsp); } diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c index 746a5a39208e..b265a5ff3a62 100644 --- a/sys/kern/vfs_inotify.c +++ b/sys/kern/vfs_inotify.c @@ -801,6 +801,7 @@ vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask, vn_lock(vp, LK_SHARED | LK_RETRY); if (error != 0) break; + NDFREE_PNBUF(&nd); vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT); vrele(nd.ni_vp); } diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 8e64a7fe966b..13403acacc08 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -683,7 +683,6 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, MPASSERT(mp->mnt_vfs_ops == 1, mp, ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops)); (void) vfs_busy(mp, MBF_NOWAIT); - atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; @@ -731,7 +730,6 @@ vfs_mount_destroy(struct mount *mp) __FILE__, __LINE__)); MPPASS(mp->mnt_writeopcount == 0, mp); MPPASS(mp->mnt_secondary_writes == 0, mp); - atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; @@ -769,6 +767,9 @@ vfs_mount_destroy(struct mount *mp) vfs_free_addrlist(mp->mnt_export); free(mp->mnt_export, M_MOUNT); } + vfsconf_lock(); + mp->mnt_vfc->vfc_refcount--; + vfsconf_unlock(); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } @@ -1133,6 +1134,7 @@ vfs_domount_first( if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred, vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) { vput(vp); + vfs_unref_vfsconf(vfsp); return (EPERM); } @@ -1169,6 +1171,7 @@ vfs_domount_first( } if (error != 0) { vput(vp); + vfs_unref_vfsconf(vfsp); return (error); } vn_seqc_write_begin(vp); diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 9704e9c160a8..bf3ed9d515dc 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -2839,7 +2839,7 @@ setfflags(struct thread *td, struct vnode *vp, u_long flags) * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ - if (vp->v_type == VCHR || vp->v_type == VBLK) { + if (VN_ISDEV(vp)) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); @@ -5050,15 +5050,16 @@ kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, size_t retlen; void *rl_rcookie, *rl_wcookie; off_t inoff, outoff, savinoff, savoutoff; - bool foffsets_locked; + bool foffsets_locked, foffsets_set; infp = outfp = NULL; rl_rcookie = rl_wcookie = NULL; foffsets_locked = false; + foffsets_set = false; error = 0; retlen = 0; - if (flags != 0) { + if ((flags & ~COPY_FILE_RANGE_USERFLAGS) != 0) { error = EINVAL; goto out; } @@ -5122,6 +5123,8 @@ kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, } foffset_lock_pair(infp1, &inoff, outfp1, &outoff, 0); foffsets_locked = true; + } else { + foffsets_set = true; } savinoff = inoff; savoutoff = outoff; @@ -5180,11 +5183,12 @@ out: vn_rangelock_unlock(invp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(outvp, rl_wcookie); + if ((foffsets_locked || foffsets_set) && + (error == EINTR || error == ERESTART)) { + inoff = savinoff; + outoff = savoutoff; + } if (foffsets_locked) { - if (error == EINTR || error == ERESTART) { - inoff = savinoff; - outoff = savoutoff; - } if (inoffp == NULL) foffset_unlock(infp, inoff, 0); else @@ -5193,6 +5197,9 @@ out: foffset_unlock(outfp, outoff, 0); else *outoffp = outoff; + } else if (foffsets_set) { + *inoffp = inoff; + *outoffp = outoff; } if (outfp != NULL) fdrop(outfp, td); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 6451c9e07a60..a4f41192f684 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -3443,6 +3443,11 @@ vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, interrupted = 0; dat = NULL; + if ((flags & COPY_FILE_RANGE_CLONE) != 0) { + error = EOPNOTSUPP; + goto out; + } + error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; |