aboutsummaryrefslogtreecommitdiff
path: root/sys/kern
diff options
context:
space:
mode:
Diffstat (limited to 'sys/kern')
-rw-r--r--sys/kern/device_if.m10
-rw-r--r--sys/kern/imgact_elf.c54
-rw-r--r--sys/kern/init_sysent.c8
-rw-r--r--sys/kern/kern_exit.c2
-rw-r--r--sys/kern/kern_jail.c15
-rw-r--r--sys/kern/kern_prot.c185
-rw-r--r--sys/kern/kern_racct.c307
-rw-r--r--sys/kern/kern_time.c5
-rw-r--r--sys/kern/sys_timerfd.c1
-rw-r--r--sys/kern/syscalls.c8
-rw-r--r--sys/kern/syscalls.conf1
-rw-r--r--sys/kern/syscalls.master21
-rw-r--r--sys/kern/systrace_args.c112
-rw-r--r--sys/kern/uipc_shm.c3
-rw-r--r--sys/kern/uipc_usrreq.c11
-rw-r--r--sys/kern/vfs_bio.c2
-rw-r--r--sys/kern/vfs_default.c1
-rw-r--r--sys/kern/vfs_inotify.c1
-rw-r--r--sys/kern/vfs_syscalls.c21
-rw-r--r--sys/kern/vfs_vnops.c5
20 files changed, 452 insertions, 321 deletions
diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m
index c02e5a46f326..ed94b2ccbe1b 100644
--- a/sys/kern/device_if.m
+++ b/sys/kern/device_if.m
@@ -49,25 +49,25 @@ HEADER {
CODE {
static int null_shutdown(device_t dev)
{
- return 0;
+ return (0);
}
static int null_suspend(device_t dev)
{
- return 0;
+ return (0);
}
static int null_resume(device_t dev)
{
- return 0;
+ return (0);
}
static int null_quiesce(device_t dev)
{
- return 0;
+ return (0);
}
- static void * null_register(device_t dev)
+ static void *null_register(device_t dev)
{
return NULL;
}
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 2690ad3b2679..5a53fac50f2c 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -84,6 +84,13 @@
#define ELF_NOTE_ROUNDSIZE 4
#define OLD_EI_BRAND 8
+/*
+ * ELF_ABI_NAME is a string name of the ELF ABI. ELF_ABI_ID is used
+ * to build variable names.
+ */
+#define ELF_ABI_NAME __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
+#define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE)
+
static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
const char *interp, int32_t *osrel, uint32_t *fctl0);
@@ -104,14 +111,15 @@ static Elf_Word __elfN(untrans_prot)(vm_prot_t);
static size_t __elfN(prepare_register_notes)(struct thread *td,
struct note_info_list *list, struct thread *target_td);
-SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE),
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+SYSCTL_NODE(_kern, OID_AUTO, ELF_ABI_ID, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"");
+#define ELF_NODE_OID __CONCAT(_kern_, ELF_ABI_ID)
+
int __elfN(fallback_brand) = -1;
-SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+SYSCTL_INT(ELF_NODE_OID, OID_AUTO,
fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
+ ELF_ABI_NAME " brand of last resort");
static int elf_legacy_coredump = 0;
SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
@@ -126,22 +134,22 @@ int __elfN(nxstack) =
#else
0;
#endif
-SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+SYSCTL_INT(ELF_NODE_OID, OID_AUTO,
nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": support PT_GNU_STACK for non-executable stack control");
+ ELF_ABI_NAME ": support PT_GNU_STACK for non-executable stack control");
#if defined(__amd64__)
static int __elfN(vdso) = 1;
-SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+SYSCTL_INT(ELF_NODE_OID, OID_AUTO,
vdso, CTLFLAG_RWTUN, &__elfN(vdso), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable vdso preloading");
+ ELF_ABI_NAME ": enable vdso preloading");
#else
static int __elfN(vdso) = 0;
#endif
#if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__))
int i386_read_exec = 0;
-SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
+SYSCTL_INT(ELF_NODE_OID, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
"enable execution from readable segments");
#endif
@@ -161,15 +169,15 @@ sysctl_pie_base(SYSCTL_HANDLER_ARGS)
__elfN(pie_base) = val;
return (0);
}
-SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base,
+SYSCTL_PROC(ELF_NODE_OID, OID_AUTO, pie_base,
CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
sysctl_pie_base, "LU",
"PIE load base without randomization");
-SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr,
+SYSCTL_NODE(ELF_NODE_OID, OID_AUTO, aslr,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"");
-#define ASLR_NODE_OID __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr)
+#define ASLR_NODE_OID __CONCAT(ELF_NODE_OID, _aslr)
/*
* Enable ASLR by default for 64-bit non-PIE binaries. 32-bit architectures
@@ -179,8 +187,7 @@ SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr,
static int __elfN(aslr_enabled) = __ELF_WORD_SIZE == 64;
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN,
&__elfN(aslr_enabled), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
- ": enable address map randomization");
+ ELF_ABI_NAME ": enable address map randomization");
/*
* Enable ASLR by default for 64-bit PIE binaries.
@@ -188,8 +195,7 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN,
static int __elfN(pie_aslr_enabled) = __ELF_WORD_SIZE == 64;
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN,
&__elfN(pie_aslr_enabled), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
- ": enable address map randomization for PIE binaries");
+ ELF_ABI_NAME ": enable address map randomization for PIE binaries");
/*
* Sbrk is deprecated and it can be assumed that in most cases it will not be
@@ -199,27 +205,25 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN,
static int __elfN(aslr_honor_sbrk) = 0;
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW,
&__elfN(aslr_honor_sbrk), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used");
+ ELF_ABI_NAME ": assume sbrk is used");
static int __elfN(aslr_stack) = __ELF_WORD_SIZE == 64;
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack, CTLFLAG_RWTUN,
&__elfN(aslr_stack), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
- ": enable stack address randomization");
+ ELF_ABI_NAME ": enable stack address randomization");
static int __elfN(aslr_shared_page) = __ELF_WORD_SIZE == 64;
SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, shared_page, CTLFLAG_RWTUN,
&__elfN(aslr_shared_page), 0,
- __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
- ": enable shared page address randomization");
+ ELF_ABI_NAME ": enable shared page address randomization");
static int __elfN(sigfastblock) = 1;
-SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, sigfastblock,
+SYSCTL_INT(ELF_NODE_OID, OID_AUTO, sigfastblock,
CTLFLAG_RWTUN, &__elfN(sigfastblock), 0,
"enable sigfastblock for new processes");
static bool __elfN(allow_wx) = true;
-SYSCTL_BOOL(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, allow_wx,
+SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx,
CTLFLAG_RWTUN, &__elfN(allow_wx), 0,
"Allow pages to be mapped simultaneously writable and executable");
@@ -2951,9 +2955,9 @@ __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
*/
static struct execsw __elfN(execsw) = {
.ex_imgact = __CONCAT(exec_, __elfN(imgact)),
- .ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
+ .ex_name = ELF_ABI_NAME
};
-EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
+EXEC_SET(ELF_ABI_ID, __elfN(execsw));
static vm_prot_t
__elfN(trans_prot)(Elf_Word flags)
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 91792430d24c..fcd232cde21e 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -67,7 +67,7 @@
/* The casts are bogus but will do for now. */
struct sysent sysent[] = {
{ .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 0 = syscall */
- { .sy_narg = AS(exit_args), .sy_call = (sy_call_t *)sys_exit, .sy_auevent = AUE_EXIT, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 1 = exit */
+ { .sy_narg = AS(_exit_args), .sy_call = (sy_call_t *)sys__exit, .sy_auevent = AUE_EXIT, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 1 = _exit */
{ .sy_narg = 0, .sy_call = (sy_call_t *)sys_fork, .sy_auevent = AUE_FORK, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 2 = fork */
{ .sy_narg = AS(read_args), .sy_call = (sy_call_t *)sys_read, .sy_auevent = AUE_READ, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 3 = read */
{ .sy_narg = AS(write_args), .sy_call = (sy_call_t *)sys_write, .sy_auevent = AUE_WRITE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 4 = write */
@@ -145,8 +145,8 @@ struct sysent sysent[] = {
{ .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 76 = obsolete vhangup */
{ .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 77 = obsolete vlimit */
{ .sy_narg = AS(mincore_args), .sy_call = (sy_call_t *)sys_mincore, .sy_auevent = AUE_MINCORE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 78 = mincore */
- { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 79 = getgroups */
- { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 80 = setgroups */
+ { compat14(AS(freebsd14_getgroups_args),getgroups), .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 79 = freebsd14 getgroups */
+ { compat14(AS(freebsd14_setgroups_args),setgroups), .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 80 = freebsd14 setgroups */
{ .sy_narg = 0, .sy_call = (sy_call_t *)sys_getpgrp, .sy_auevent = AUE_GETPGRP, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 81 = getpgrp */
{ .sy_narg = AS(setpgid_args), .sy_call = (sy_call_t *)sys_setpgid, .sy_auevent = AUE_SETPGRP, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 82 = setpgid */
{ .sy_narg = AS(setitimer_args), .sy_call = (sy_call_t *)sys_setitimer, .sy_auevent = AUE_SETITIMER, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 83 = setitimer */
@@ -661,4 +661,6 @@ struct sysent sysent[] = {
{ .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
{ .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */
{ .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */
+ { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */
+ { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */
};
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 54e3044ab093..a32b5a1b3354 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -202,7 +202,7 @@ exit_onexit(struct proc *p)
* exit -- death of process.
*/
int
-sys_exit(struct thread *td, struct exit_args *uap)
+sys__exit(struct thread *td, struct _exit_args *uap)
{
exit1(td, uap->rval, 0);
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 7ef1d19f0ea8..7c9a15ae18f3 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -115,8 +115,11 @@ struct prison prison0 = {
#else
.pr_flags = PR_HOST|_PR_IP_SADDRSEL,
#endif
- .pr_allow = PR_ALLOW_ALL_STATIC,
+ .pr_allow = PR_ALLOW_PRISON0,
};
+_Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0,
+ "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved");
+
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
struct bool_flags {
@@ -232,6 +235,9 @@ static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
{"allow.routing", "allow.norouting", PR_ALLOW_ROUTING},
+ {"allow.unprivileged_parent_tampering",
+ "allow.nounprivileged_parent_tampering",
+ PR_ALLOW_UNPRIV_PARENT_TAMPER},
};
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
@@ -4006,6 +4012,7 @@ prison_priv_check(struct ucred *cred, int priv)
case PRIV_DEBUG_DIFFCRED:
case PRIV_DEBUG_SUGID:
case PRIV_DEBUG_UNPRIV:
+ case PRIV_DEBUG_DIFFJAIL:
/*
* Allow jail to set various resource limits and login
@@ -4043,8 +4050,10 @@ prison_priv_check(struct ucred *cred, int priv)
*/
case PRIV_SCHED_DIFFCRED:
case PRIV_SCHED_CPUSET:
+ case PRIV_SCHED_DIFFJAIL:
case PRIV_SIGNAL_DIFFCRED:
case PRIV_SIGNAL_SUGID:
+ case PRIV_SIGNAL_DIFFJAIL:
/*
* Allow jailed processes to write to sysctls marked as jail
@@ -4688,6 +4697,10 @@ SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may read the kernel message buffer");
SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
"B", "Unprivileged processes may use process debugging facilities");
+SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering,
+ CTLTYPE_INT | CTLFLAG_RW, "B",
+ "Unprivileged parent jail processes may tamper with same-uid processes"
+ " (signal/debug/cpuset)");
SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
"B", "Processes in jail with uid 0 have privilege");
#ifdef VIMAGE
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index bbb622547598..0ca42d640767 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -310,6 +310,39 @@ sys_getegid(struct thread *td, struct getegid_args *uap)
return (0);
}
+#ifdef COMPAT_FREEBSD14
+int
+freebsd14_getgroups(struct thread *td, struct freebsd14_getgroups_args *uap)
+{
+ struct ucred *cred;
+ int ngrp, error;
+
+ cred = td->td_ucred;
+
+ /*
+ * For FreeBSD < 15.0, we account for the egid being placed at the
+ * beginning of the group list prior to all supplementary groups.
+ */
+ ngrp = cred->cr_ngroups + 1;
+ if (uap->gidsetsize == 0) {
+ error = 0;
+ goto out;
+ } else if (uap->gidsetsize < ngrp) {
+ return (EINVAL);
+ }
+
+ error = copyout(&cred->cr_gid, uap->gidset, sizeof(gid_t));
+ if (error == 0)
+ error = copyout(cred->cr_groups, uap->gidset + 1,
+ (ngrp - 1) * sizeof(gid_t));
+
+out:
+ td->td_retval[0] = ngrp;
+ return (error);
+
+}
+#endif /* COMPAT_FREEBSD14 */
+
#ifndef _SYS_SYSPROTO_H_
struct getgroups_args {
int gidsetsize;
@@ -320,18 +353,11 @@ int
sys_getgroups(struct thread *td, struct getgroups_args *uap)
{
struct ucred *cred;
- gid_t *ugidset;
int ngrp, error;
cred = td->td_ucred;
- /*
- * cr_gid has been moved out of cr_groups, but we'll continue exporting
- * the egid as groups[0] for the time being until we audit userland for
- * any surprises.
- */
- ngrp = cred->cr_ngroups + 1;
-
+ ngrp = cred->cr_ngroups;
if (uap->gidsetsize == 0) {
error = 0;
goto out;
@@ -339,14 +365,7 @@ sys_getgroups(struct thread *td, struct getgroups_args *uap)
if (uap->gidsetsize < ngrp)
return (EINVAL);
- ugidset = uap->gidset;
- error = copyout(&cred->cr_gid, ugidset, sizeof(*ugidset));
- if (error != 0)
- goto out;
-
- if (ngrp > 1)
- error = copyout(cred->cr_groups, ugidset + 1,
- (ngrp - 1) * sizeof(*ugidset));
+ error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t));
out:
td->td_retval[0] = ngrp;
return (error);
@@ -1186,6 +1205,44 @@ fail:
return (error);
}
+#ifdef COMPAT_FREEBSD14
+int
+freebsd14_setgroups(struct thread *td, struct freebsd14_setgroups_args *uap)
+{
+ gid_t smallgroups[CRED_SMALLGROUPS_NB];
+ gid_t *groups;
+ int gidsetsize, error;
+
+ /*
+ * Before FreeBSD 15.0, we allow one more group to be supplied to
+ * account for the egid appearing before the supplementary groups. This
+ * may technically allow one more supplementary group for systems that
+ * did use the default NGROUPS_MAX if we round it back up to 1024.
+ */
+ gidsetsize = uap->gidsetsize;
+ if (gidsetsize > ngroups_max + 1 || gidsetsize < 0)
+ return (EINVAL);
+
+ if (gidsetsize > CRED_SMALLGROUPS_NB)
+ groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK);
+ else
+ groups = smallgroups;
+
+ error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t));
+ if (error == 0) {
+ int ngroups = gidsetsize > 0 ? gidsetsize - 1 /* egid */ : 0;
+
+ error = kern_setgroups(td, &ngroups, groups + 1);
+ if (error == 0 && gidsetsize > 0)
+ td->td_proc->p_ucred->cr_gid = groups[0];
+ }
+
+ if (groups != smallgroups)
+ free(groups, M_TEMP);
+ return (error);
+}
+#endif /* COMPAT_FREEBSD14 */
+
#ifndef _SYS_SYSPROTO_H_
struct setgroups_args {
int gidsetsize;
@@ -1210,8 +1267,7 @@ sys_setgroups(struct thread *td, struct setgroups_args *uap)
* setgroups() differ.
*/
gidsetsize = uap->gidsetsize;
- /* XXXKE Limit to ngroups_max when we change the userland interface. */
- if (gidsetsize > ngroups_max + 1 || gidsetsize < 0)
+ if (gidsetsize > ngroups_max || gidsetsize < 0)
return (EINVAL);
if (gidsetsize > CRED_SMALLGROUPS_NB)
@@ -1238,35 +1294,17 @@ kern_setgroups(struct thread *td, int *ngrpp, gid_t *groups)
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
int ngrp, error;
- gid_t egid;
ngrp = *ngrpp;
/* Sanity check size. */
- /* XXXKE Limit to ngroups_max when we change the userland interface. */
- if (ngrp < 0 || ngrp > ngroups_max + 1)
+ if (ngrp < 0 || ngrp > ngroups_max)
return (EINVAL);
AUDIT_ARG_GROUPSET(groups, ngrp);
- /*
- * setgroups(0, NULL) is a legitimate way of clearing the groups vector
- * on non-BSD systems (which generally do not have the egid in the
- * groups[0]). We risk security holes when running non-BSD software if
- * we do not do the same. So we allow and treat 0 for 'ngrp' specially
- * below (twice).
- */
- if (ngrp != 0) {
- /*
- * To maintain userland compat for now, we use the first group
- * as our egid and we'll use the rest as our supplemental
- * groups.
- */
- egid = groups[0];
- ngrp--;
- groups++;
- groups_normalize(&ngrp, groups);
- *ngrpp = ngrp;
- }
+ groups_normalize(&ngrp, groups);
+ *ngrpp = ngrp;
+
newcred = crget();
crextend(newcred, ngrp);
PROC_LOCK(p);
@@ -1289,15 +1327,7 @@ kern_setgroups(struct thread *td, int *ngrpp, gid_t *groups)
if (error)
goto fail;
- /*
- * If some groups were passed, the first one is currently the desired
- * egid. This code is to be removed (along with some commented block
- * above) when setgroups() is changed to take only supplementary groups.
- */
- if (ngrp != 0)
- newcred->cr_gid = egid;
crsetgroups_internal(newcred, ngrp, groups);
-
setsugid(p);
proc_set_cred(p, newcred);
PROC_UNLOCK(p);
@@ -1914,6 +1944,38 @@ cr_canseejailproc(struct ucred *u1, struct ucred *u2)
}
/*
+ * Determine if u1 can tamper with the subject specified by u2, if they are in
+ * different jails and 'unprivileged_parent_tampering' jail policy allows it.
+ *
+ * May be called if u1 and u2 are in the same jail, but it is expected that the
+ * caller has already done a prison_check() prior to calling it.
+ *
+ * Returns: 0 for permitted, EPERM otherwise
+ */
+static int
+cr_can_tamper_with_subjail(struct ucred *u1, struct ucred *u2, int priv)
+{
+
+ MPASS(prison_check(u1, u2) == 0);
+ if (u1->cr_prison == u2->cr_prison)
+ return (0);
+
+ if (priv_check_cred(u1, priv) == 0)
+ return (0);
+
+ /*
+ * Jails do not maintain a distinct UID space, so process visibility is
+ * all that would control an unprivileged process' ability to tamper
+ * with a process in a subjail by default if we did not have the
+ * allow.unprivileged_parent_tampering knob to restrict it by default.
+ */
+ if (prison_allow(u2, PR_ALLOW_UNPRIV_PARENT_TAMPER))
+ return (0);
+
+ return (EPERM);
+}
+
+/*
* Helper for cr_cansee*() functions to abide by system-wide security.bsd.see_*
* policies. Determines if u1 "can see" u2 according to these policies.
* Returns: 0 for permitted, ESRCH otherwise
@@ -2062,6 +2124,19 @@ cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
return (error);
}
+ /*
+ * At this point, the target may be in a different jail than the
+ * subject -- the subject must be in a parent jail to the target,
+ * whether it is prison0 or a subordinate of prison0 that has
+ * children. Additional privileges are required to allow this, as
+ * whether the creds are truly equivalent or not must be determined on
+ * a case-by-case basis.
+ */
+ error = cr_can_tamper_with_subjail(cred, proc->p_ucred,
+ PRIV_SIGNAL_DIFFJAIL);
+ if (error)
+ return (error);
+
return (0);
}
@@ -2138,6 +2213,12 @@ p_cansched(struct thread *td, struct proc *p)
if (error)
return (error);
}
+
+ error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred,
+ PRIV_SCHED_DIFFJAIL);
+ if (error)
+ return (error);
+
return (0);
}
@@ -2258,6 +2339,11 @@ p_candebug(struct thread *td, struct proc *p)
return (error);
}
+ error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred,
+ PRIV_DEBUG_DIFFJAIL);
+ if (error)
+ return (error);
+
/* Can't trace init when securelevel > 0. */
if (p == initproc) {
error = securelevel_gt(td->td_ucred, 0);
@@ -2835,7 +2921,8 @@ crextend(struct ucred *cr, int n)
* Normalizes a set of groups to be applied to a 'struct ucred'.
*
* Normalization ensures that the supplementary groups are sorted in ascending
- * order and do not contain duplicates.
+ * order and do not contain duplicates. This allows group_is_supplementary
+ * to do a binary search.
*/
static void
groups_normalize(int *ngrp, gid_t *groups)
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7ee3b9e2048a..7351e9cb6313 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -96,6 +96,13 @@ static void racct_sub_cred_locked(struct ucred *cred, int resource,
uint64_t amount);
static void racct_add_cred_locked(struct ucred *cred, int resource,
uint64_t amount);
+static int racct_set_locked(struct proc *p, int resource, uint64_t amount,
+ int force);
+static void racct_updatepcpu_locked(struct proc *p);
+static void racct_updatepcpu_racct_locked(struct racct *racct);
+static void racct_updatepcpu_containers(void);
+static void racct_settime_locked(struct proc *p, bool exit);
+static void racct_zeropcpu_locked(struct proc *p);
SDT_PROVIDER_DEFINE(racct);
SDT_PROBE_DEFINE3(racct, , rusage, add,
@@ -308,68 +315,6 @@ fixpt_t ccpu_exp[] = {
#define CCPU_EXP_MAX 110
-/*
- * This function is analogical to the getpcpu() function in the ps(1) command.
- * They should both calculate in the same way so that the racct %cpu
- * calculations are consistent with the values shown by the ps(1) tool.
- * The calculations are more complex in the 4BSD scheduler because of the value
- * of the ccpu variable. In ULE it is defined to be zero which saves us some
- * work.
- */
-static uint64_t
-racct_getpcpu(struct proc *p, u_int pcpu)
-{
- u_int swtime;
-#ifdef SCHED_4BSD
- fixpt_t pctcpu, pctcpu_next;
-#endif
- fixpt_t p_pctcpu;
- struct thread *td;
-
- ASSERT_RACCT_ENABLED();
- KASSERT((p->p_flag & P_IDLEPROC) == 0,
- ("racct_getpcpu: idle process %p", p));
-
- swtime = (ticks - p->p_swtick) / hz;
-
- /*
- * For short-lived processes, the sched_pctcpu() returns small
- * values even for cpu intensive processes. Therefore we use
- * our own estimate in this case.
- */
- if (swtime < RACCT_PCPU_SECS)
- return (pcpu);
-
- p_pctcpu = 0;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
-#ifdef SCHED_4BSD
- pctcpu = sched_pctcpu(td);
- /* Count also the yet unfinished second. */
- pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
- pctcpu_next += sched_pctcpu_delta(td);
- p_pctcpu += max(pctcpu, pctcpu_next);
-#else
- /*
- * In ULE the %cpu statistics are updated on every
- * sched_pctcpu() call. So special calculations to
- * account for the latest (unfinished) second are
- * not needed.
- */
- p_pctcpu += sched_pctcpu(td);
-#endif
- thread_unlock(td);
- }
-
-#ifdef SCHED_4BSD
- if (swtime <= CCPU_EXP_MAX)
- return ((100 * (uint64_t)p_pctcpu * 1000000) /
- (FSCALE - ccpu_exp[swtime]));
-#endif
-
- return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
-}
-
static void
racct_add_racct(struct racct *dest, const struct racct *src)
{
@@ -499,19 +444,6 @@ racct_adjust_resource(struct racct *racct, int resource,
("%s: resource %d usage < 0", __func__, resource));
racct->r_resources[resource] = 0;
}
-
- /*
- * There are some cases where the racct %cpu resource would grow
- * beyond 100% per core. For example in racct_proc_exit() we add
- * the process %cpu usage to the ucred racct containers. If too
- * many processes terminated in a short time span, the ucred %cpu
- * resource could grow too much. Also, the 4BSD scheduler sometimes
- * returns for a thread more than 100% cpu usage. So we set a sane
- * boundary here to 100% * the maximum number of CPUs.
- */
- if ((resource == RACCT_PCTCPU) &&
- (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
- racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
}
static int
@@ -635,10 +567,44 @@ racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
RACCT_UNLOCK();
}
+static void
+racct_settime_locked(struct proc *p, bool exit)
+{
+ struct thread *td;
+ struct timeval wallclock;
+ uint64_t runtime;
+
+ ASSERT_RACCT_ENABLED();
+ RACCT_LOCK_ASSERT();
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (exit) {
+ /*
+ * proc_reap() has already calculated rux
+ * and added crux to rux.
+ */
+ runtime = cputick2usec(p->p_rux.rux_runtime -
+ p->p_crux.rux_runtime);
+ } else {
+ PROC_STATLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ ruxagg(p, td);
+ PROC_STATUNLOCK(p);
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+ }
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+
+ racct_set_locked(p, RACCT_CPU, runtime, 0);
+ racct_set_locked(p, RACCT_WALLCLOCK,
+ (uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec, 0);
+}
+
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
{
- int64_t old_amount, decayed_amount, diff_proc, diff_cred;
+ int64_t old_amount, diff_proc, diff_cred;
#ifdef RCTL
int error;
#endif
@@ -655,17 +621,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (resource == RACCT_PCTCPU) {
- /*
- * Resources in per-credential racct containers may decay.
- * If this is the case, we need to calculate the difference
- * between the new amount and the proportional value of the
- * old amount that has decayed in the ucred racct containers.
- */
- decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
- diff_cred = amount - decayed_amount;
- } else
- diff_cred = diff_proc;
+ diff_cred = diff_proc;
#ifdef notyet
KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
("%s: usage of non-droppable resource %d dropping", __func__,
@@ -908,8 +864,6 @@ racct_proc_fork(struct proc *parent, struct proc *child)
goto out;
#endif
- /* Init process cpu time. */
- child->p_prev_runtime = 0;
child->p_throttled = 0;
/*
@@ -964,37 +918,16 @@ racct_proc_fork_done(struct proc *child)
void
racct_proc_exit(struct proc *p)
{
- struct timeval wallclock;
- uint64_t pct_estimate, pct, runtime;
int i;
if (!racct_enable)
return;
PROC_LOCK(p);
- /*
- * We don't need to calculate rux, proc_reap() has already done this.
- */
- runtime = cputick2usec(p->p_rux.rux_runtime);
-#ifdef notyet
- KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
-#else
- if (runtime < p->p_prev_runtime)
- runtime = p->p_prev_runtime;
-#endif
- microuptime(&wallclock);
- timevalsub(&wallclock, &p->p_stats->p_start);
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
-
RACCT_LOCK();
- racct_set_locked(p, RACCT_CPU, runtime, 0);
- racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
+
+ racct_settime_locked(p, true);
+ racct_zeropcpu_locked(p);
KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
("process reaped with %ju allocated for RSS\n",
@@ -1068,6 +1001,10 @@ racct_move(struct racct *dest, struct racct *src)
RACCT_LOCK();
racct_add_racct(dest, src);
racct_sub_racct(src, src);
+ dest->r_runtime = src->r_runtime;
+ dest->r_time = src->r_time;
+ src->r_runtime = 0;
+ timevalsub(&src->r_time, &src->r_time);
RACCT_UNLOCK();
}
@@ -1170,8 +1107,6 @@ racct_proc_wakeup(struct proc *p)
static void
racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
{
- int64_t r_old, r_new;
-
ASSERT_RACCT_ENABLED();
RACCT_LOCK_ASSERT();
@@ -1181,15 +1116,6 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
rctl_throttle_decay(racct, RACCT_READIOPS);
rctl_throttle_decay(racct, RACCT_WRITEIOPS);
#endif
-
- r_old = racct->r_resources[RACCT_PCTCPU];
-
- /* If there is nothing to decay, just exit. */
- if (r_old <= 0)
- return;
-
- r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
- racct->r_resources[RACCT_PCTCPU] = r_new;
}
static void
@@ -1221,15 +1147,105 @@ racct_decay(void)
}
static void
+racct_updatepcpu_racct_locked(struct racct *racct)
+{
+ struct timeval diff;
+ uint64_t elapsed;
+ uint64_t runtime;
+ uint64_t newpcpu;
+ uint64_t oldpcpu;
+
+ ASSERT_RACCT_ENABLED();
+ RACCT_LOCK_ASSERT();
+
+ /* Difference between now and previously-recorded time. */
+ microuptime(&diff);
+ timevalsub(&diff, &racct->r_time);
+ elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec;
+
+ /* Difference between current and previously-recorded runtime. */
+ runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime;
+
+ newpcpu = runtime * 100 * 1000000 / elapsed;
+ oldpcpu = racct->r_resources[RACCT_PCTCPU];
+ /*
+ * This calculation is equivalent to
+ * (1 - 0.3) * newpcpu + 0.3 * oldpcpu
+ * where RACCT_DECAY_FACTOR = 0.3 * FSCALE.
+ */
+ racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) *
+ newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;
+ if (racct->r_resources[RACCT_PCTCPU] >
+ 100 * 1000000 * (uint64_t)mp_ncpus)
+ racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 *
+ (uint64_t)mp_ncpus;
+
+ /* Record current times. */
+ racct->r_runtime = racct->r_resources[RACCT_CPU];
+ timevaladd(&racct->r_time, &diff);
+}
+
+static void
+racct_zeropcpu_locked(struct proc *p)
+{
+ ASSERT_RACCT_ENABLED();
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ p->p_racct->r_resources[RACCT_PCTCPU] = 0;
+}
+
+static void
+racct_updatepcpu_locked(struct proc *p)
+{
+ ASSERT_RACCT_ENABLED();
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ racct_updatepcpu_racct_locked(p->p_racct);
+}
+
+static void
+racct_updatepcpu_pre(void)
+{
+
+ RACCT_LOCK();
+}
+
+static void
+racct_updatepcpu_post(void)
+{
+
+ RACCT_UNLOCK();
+}
+
+static void
+racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void *dummy2)
+{
+ racct_updatepcpu_racct_locked(racct);
+}
+
+static void
+racct_updatepcpu_containers(void)
+{
+ ASSERT_RACCT_ENABLED();
+
+ ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+ racct_updatepcpu_post, NULL, NULL);
+ loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+ racct_updatepcpu_post, NULL, NULL);
+ prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+ racct_updatepcpu_post, NULL, NULL);
+}
+
+static void
racctd(void)
{
- struct thread *td;
struct proc *p;
- struct timeval wallclock;
- uint64_t pct, pct_estimate, runtime;
+ struct proc *idle;
ASSERT_RACCT_ENABLED();
+ idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc;
+
for (;;) {
racct_decay();
@@ -1237,36 +1253,16 @@ racctd(void)
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
+ if (p == idle) {
+ PROC_UNLOCK(p);
+ continue;
+ }
if (p->p_state != PRS_NORMAL ||
(p->p_flag & P_IDLEPROC) != 0) {
- if (p->p_state == PRS_ZOMBIE)
- racct_set(p, RACCT_PCTCPU, 0);
PROC_UNLOCK(p);
continue;
}
- microuptime(&wallclock);
- timevalsub(&wallclock, &p->p_stats->p_start);
- PROC_STATLOCK(p);
- FOREACH_THREAD_IN_PROC(p, td)
- ruxagg(p, td);
- runtime = cputick2usec(p->p_rux.rux_runtime);
- PROC_STATUNLOCK(p);
-#ifdef notyet
- KASSERT(runtime >= p->p_prev_runtime,
- ("runtime < p_prev_runtime"));
-#else
- if (runtime < p->p_prev_runtime)
- runtime = p->p_prev_runtime;
-#endif
- p->p_prev_runtime = runtime;
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
RACCT_LOCK();
#ifdef RCTL
rctl_throttle_decay(p->p_racct, RACCT_READBPS);
@@ -1274,11 +1270,8 @@ racctd(void)
rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
#endif
- racct_set_locked(p, RACCT_PCTCPU, pct, 1);
- racct_set_locked(p, RACCT_CPU, runtime, 0);
- racct_set_locked(p, RACCT_WALLCLOCK,
- (uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec, 0);
+ racct_settime_locked(p, false);
+ racct_updatepcpu_locked(p);
RACCT_UNLOCK();
PROC_UNLOCK(p);
}
@@ -1306,6 +1299,8 @@ racctd(void)
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
+
+ racct_updatepcpu_containers();
pause("-", hz);
}
}
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index 9830e5093a3a..2a6f0989f6aa 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -571,7 +571,10 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
td->td_rtcgen =
atomic_load_acq_int(&rtc_generation);
error = kern_clock_gettime(td, clock_id, &now);
- KASSERT(error == 0, ("kern_clock_gettime: %d", error));
+ if (error != 0) {
+ td->td_rtcgen = 0;
+ return (error);
+ }
timespecsub(&ts, &now, &ts);
}
if (ts.tv_sec < 0 || (ts.tv_sec == 0 && ts.tv_nsec == 0)) {
diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c
index ab7e048a2ab1..565ab3ad6ee6 100644
--- a/sys/kern/sys_timerfd.c
+++ b/sys/kern/sys_timerfd.c
@@ -206,7 +206,6 @@ retry:
mtx_unlock(&tfd->tfd_lock);
return (EAGAIN);
}
- td->td_rtcgen = atomic_load_acq_int(&rtc_generation);
error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock,
PCATCH, "tfdrd", 0);
if (error == 0) {
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 90a4f3a7dad8..4122f9261871 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -6,7 +6,7 @@
const char *syscallnames[] = {
"syscall", /* 0 = syscall */
- "exit", /* 1 = exit */
+ "_exit", /* 1 = _exit */
"fork", /* 2 = fork */
"read", /* 3 = read */
"write", /* 4 = write */
@@ -84,8 +84,8 @@ const char *syscallnames[] = {
"obs_vhangup", /* 76 = obsolete vhangup */
"obs_vlimit", /* 77 = obsolete vlimit */
"mincore", /* 78 = mincore */
- "getgroups", /* 79 = getgroups */
- "setgroups", /* 80 = setgroups */
+ "compat14.getgroups", /* 79 = freebsd14 getgroups */
+ "compat14.setgroups", /* 80 = freebsd14 setgroups */
"getpgrp", /* 81 = getpgrp */
"setpgid", /* 82 = setpgid */
"setitimer", /* 83 = setitimer */
@@ -600,4 +600,6 @@ const char *syscallnames[] = {
"exterrctl", /* 592 = exterrctl */
"inotify_add_watch_at", /* 593 = inotify_add_watch_at */
"inotify_rm_watch", /* 594 = inotify_rm_watch */
+ "getgroups", /* 595 = getgroups */
+ "setgroups", /* 596 = setgroups */
};
diff --git a/sys/kern/syscalls.conf b/sys/kern/syscalls.conf
index a98d52659832..ae7bd1f87612 100644
--- a/sys/kern/syscalls.conf
+++ b/sys/kern/syscalls.conf
@@ -1,3 +1,4 @@
libsysmap="../../lib/libsys/syscalls.map"
libsys_h="../../lib/libsys/_libsys.h"
sysmk="../sys/syscall.mk"
+syshdr_extra="#define SYS_exit SYS__exit"
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index a8815afee866..fa64597d14a5 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -51,6 +51,7 @@
; SYSMUX syscall multiplexer. No prototype, argument struct, or
; handler is declared or used. Handled in MD syscall code.
; CAPENABLED syscall is allowed in capability mode
+; NORETURN the syscall does not return
;
; To support programmatic generation of both the default ABI and 32-bit compat
; (freebsd32) we impose a number of restrictions on the types of system calls.
@@ -124,8 +125,8 @@
...
);
}
-1 AUE_EXIT STD|CAPENABLED {
- void exit(
+1 AUE_EXIT STD|CAPENABLED|NORETURN {
+ void _exit(
int rval
);
}
@@ -551,13 +552,13 @@
_Out_writes_bytes_(len/PAGE_SIZE) char *vec
);
}
-79 AUE_GETGROUPS STD|CAPENABLED {
+79 AUE_GETGROUPS STD|CAPENABLED|COMPAT14 {
int getgroups(
int gidsetsize,
_Out_writes_opt_(gidsetsize) gid_t *gidset
);
}
-80 AUE_SETGROUPS STD {
+80 AUE_SETGROUPS STD|COMPAT14 {
int setgroups(
int gidsetsize,
_In_reads_(gidsetsize) const gid_t *gidset
@@ -3370,5 +3371,17 @@
int wd
);
}
+595 AUE_GETGROUPS STD|CAPENABLED {
+ int getgroups(
+ int gidsetsize,
+ _Out_writes_opt_(gidsetsize) gid_t *gidset
+ );
+ }
+596 AUE_SETGROUPS STD {
+ int setgroups(
+ int gidsetsize,
+ _In_reads_(gidsetsize) const gid_t *gidset
+ );
+ }
; vim: syntax=off
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 467caa71f20d..2b1ea9eed8d4 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -17,9 +17,9 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 0;
break;
}
- /* exit */
+ /* _exit */
case 1: {
- struct exit_args *p = params;
+ struct _exit_args *p = params;
iarg[a++] = p->rval; /* int */
*n_args = 1;
break;
@@ -454,22 +454,6 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 3;
break;
}
- /* getgroups */
- case 79: {
- struct getgroups_args *p = params;
- iarg[a++] = p->gidsetsize; /* int */
- uarg[a++] = (intptr_t)p->gidset; /* gid_t * */
- *n_args = 2;
- break;
- }
- /* setgroups */
- case 80: {
- struct setgroups_args *p = params;
- iarg[a++] = p->gidsetsize; /* int */
- uarg[a++] = (intptr_t)p->gidset; /* const gid_t * */
- *n_args = 2;
- break;
- }
/* getpgrp */
case 81: {
*n_args = 0;
@@ -3500,6 +3484,22 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 2;
break;
}
+ /* getgroups */
+ case 595: {
+ struct getgroups_args *p = params;
+ iarg[a++] = p->gidsetsize; /* int */
+ uarg[a++] = (intptr_t)p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* setgroups */
+ case 596: {
+ struct setgroups_args *p = params;
+ iarg[a++] = p->gidsetsize; /* int */
+ uarg[a++] = (intptr_t)p->gidset; /* const gid_t * */
+ *n_args = 2;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -3513,7 +3513,7 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
/* syscall */
case 0:
break;
- /* exit */
+ /* _exit */
case 1:
switch (ndx) {
case 0:
@@ -4199,32 +4199,6 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
- /* getgroups */
- case 79:
- switch (ndx) {
- case 0:
- p = "int";
- break;
- case 1:
- p = "userland gid_t *";
- break;
- default:
- break;
- };
- break;
- /* setgroups */
- case 80:
- switch (ndx) {
- case 0:
- p = "int";
- break;
- case 1:
- p = "userland const gid_t *";
- break;
- default:
- break;
- };
- break;
/* getpgrp */
case 81:
break;
@@ -9367,6 +9341,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* getgroups */
+ case 595:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "userland gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setgroups */
+ case 596:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "userland const gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -9380,7 +9380,7 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
switch (sysnum) {
/* syscall */
case 0:
- /* exit */
+ /* _exit */
case 1:
if (ndx == 0 || ndx == 1)
p = "void";
@@ -9633,16 +9633,6 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
- /* getgroups */
- case 79:
- if (ndx == 0 || ndx == 1)
- p = "int";
- break;
- /* setgroups */
- case 80:
- if (ndx == 0 || ndx == 1)
- p = "int";
- break;
/* getpgrp */
case 81:
/* setpgid */
@@ -11365,6 +11355,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* getgroups */
+ case 595:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setgroups */
+ case 596:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 85fe48ddd466..eb1327f7f2de 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -1160,7 +1160,8 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
return (EINVAL);
- if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
+ if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC |
+ O_CLOFORK)) != 0)
return (EINVAL);
largepage = (shmflags & SHM_LARGEPAGE) != 0;
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 0056dac65c7d..19870e989437 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -154,15 +154,12 @@ static struct task unp_defer_task;
* and don't really want to reserve the sendspace. Their recvspace should be
* large enough for at least one max-size datagram plus address.
*/
-#ifndef PIPSIZ
-#define PIPSIZ 8192
-#endif
-static u_long unpst_sendspace = PIPSIZ;
-static u_long unpst_recvspace = PIPSIZ;
+static u_long unpst_sendspace = 64*1024;
+static u_long unpst_recvspace = 64*1024;
static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */
static u_long unpdg_recvspace = 16*1024;
-static u_long unpsp_sendspace = PIPSIZ;
-static u_long unpsp_recvspace = PIPSIZ;
+static u_long unpsp_sendspace = 64*1024;
+static u_long unpsp_recvspace = 64*1024;
static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Local domain");
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index fa655c43d155..19c39e42bafa 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -5170,7 +5170,7 @@ bufstrategy(struct bufobj *bo, struct buf *bp)
vp = bp->b_vp;
KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
- KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
+ KASSERT(!VN_ISDEV(vp),
("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
i = VOP_STRATEGY(vp, bp);
KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index fd6202a1424c..85f67731e1cc 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -457,6 +457,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap)
case _PC_NAMEDATTR_ENABLED:
case _PC_HAS_NAMEDATTR:
case _PC_HAS_HIDDENSYSTEM:
+ case _PC_CLONE_BLKSIZE:
*ap->a_retval = 0;
return (0);
default:
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
index 746a5a39208e..b265a5ff3a62 100644
--- a/sys/kern/vfs_inotify.c
+++ b/sys/kern/vfs_inotify.c
@@ -801,6 +801,7 @@ vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
vn_lock(vp, LK_SHARED | LK_RETRY);
if (error != 0)
break;
+ NDFREE_PNBUF(&nd);
vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
vrele(nd.ni_vp);
}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 9704e9c160a8..bf3ed9d515dc 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -2839,7 +2839,7 @@ setfflags(struct thread *td, struct vnode *vp, u_long flags)
* if they are allowed to set flags and programs assume that
* chown can't fail when done as root.
*/
- if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ if (VN_ISDEV(vp)) {
error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
if (error != 0)
return (error);
@@ -5050,15 +5050,16 @@ kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
size_t retlen;
void *rl_rcookie, *rl_wcookie;
off_t inoff, outoff, savinoff, savoutoff;
- bool foffsets_locked;
+ bool foffsets_locked, foffsets_set;
infp = outfp = NULL;
rl_rcookie = rl_wcookie = NULL;
foffsets_locked = false;
+ foffsets_set = false;
error = 0;
retlen = 0;
- if (flags != 0) {
+ if ((flags & ~COPY_FILE_RANGE_USERFLAGS) != 0) {
error = EINVAL;
goto out;
}
@@ -5122,6 +5123,8 @@ kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
}
foffset_lock_pair(infp1, &inoff, outfp1, &outoff, 0);
foffsets_locked = true;
+ } else {
+ foffsets_set = true;
}
savinoff = inoff;
savoutoff = outoff;
@@ -5180,11 +5183,12 @@ out:
vn_rangelock_unlock(invp, rl_rcookie);
if (rl_wcookie != NULL)
vn_rangelock_unlock(outvp, rl_wcookie);
+ if ((foffsets_locked || foffsets_set) &&
+ (error == EINTR || error == ERESTART)) {
+ inoff = savinoff;
+ outoff = savoutoff;
+ }
if (foffsets_locked) {
- if (error == EINTR || error == ERESTART) {
- inoff = savinoff;
- outoff = savoutoff;
- }
if (inoffp == NULL)
foffset_unlock(infp, inoff, 0);
else
@@ -5193,6 +5197,9 @@ out:
foffset_unlock(outfp, outoff, 0);
else
*outoffp = outoff;
+ } else if (foffsets_set) {
+ *inoffp = inoff;
+ *outoffp = outoff;
}
if (outfp != NULL)
fdrop(outfp, td);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 6451c9e07a60..a4f41192f684 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -3443,6 +3443,11 @@ vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
interrupted = 0;
dat = NULL;
+ if ((flags & COPY_FILE_RANGE_CLONE) != 0) {
+ error = EOPNOTSUPP;
+ goto out;
+ }
+
error = vn_lock(invp, LK_SHARED);
if (error != 0)
goto out;