Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/init_main.c          9
-rw-r--r--  sys/kern/init_sysent.c        2
-rw-r--r--  sys/kern/kern_descrip.c       2
-rw-r--r--  sys/kern/kern_environment.c  32
-rw-r--r--  sys/kern/kern_event.c       139
-rw-r--r--  sys/kern/kern_jail.c        460
-rw-r--r--  sys/kern/kern_jaildesc.c    336
-rw-r--r--  sys/kern/kern_mutex.c         4
-rw-r--r--  sys/kern/kern_rmlock.c        4
-rw-r--r--  sys/kern/kern_rwlock.c        4
-rw-r--r--  sys/kern/kern_sx.c            4
-rw-r--r--  sys/kern/kern_thread.c        6
-rw-r--r--  sys/kern/subr_witness.c     134
-rw-r--r--  sys/kern/syscalls.c           2
-rw-r--r--  sys/kern/syscalls.master     10
-rw-r--r--  sys/kern/systrace_args.c     44
-rw-r--r--  sys/kern/uipc_usrreq.c        4
-rw-r--r--  sys/kern/vfs_init.c          37
-rw-r--r--  sys/kern/vfs_mount.c          7
-rw-r--r--  sys/kern/vfs_subr.c           8
20 files changed, 1085 insertions, 163 deletions
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 36ce44b988be..87ffdb8dbf07 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -145,13 +145,6 @@ FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance");
#endif
/*
- * This ensures that there is at least one entry so that the sysinit_set
- * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never
- * executed.
- */
-SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
-
-/*
* The sysinit linker set compiled into the kernel. These are placed onto the
* sysinit list by mi_startup; sysinit_add can add (e.g., from klds) additional
* sysinits to the linked list but the linker set here does not change.
@@ -296,7 +289,7 @@ mi_startup(void)
BOOTTRACE_INIT("sysinit 0x%7x", sip->subsystem);
#if defined(VERBOSE_SYSINIT)
- if (sip->subsystem > last && verbose_sysinit != 0) {
+ if (sip->subsystem != last && verbose_sysinit != 0) {
verbose = 1;
printf("subsystem %x\n", sip->subsystem);
}
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index fcd232cde21e..e42e7dcf8b44 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -663,4 +663,6 @@ struct sysent sysent[] = {
{ .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */
{ .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */
{ .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */
+ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */
+ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */
};
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index a27ab33b34da..057235574eb5 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -5250,6 +5250,8 @@ file_type_to_name(short type)
return ("eventfd");
case DTYPE_TIMERFD:
return ("timerfd");
+ case DTYPE_JAILDESC:
+ return ("jail");
default:
return ("unkn");
}
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
index 0cb0f566a839..7c0654769581 100644
--- a/sys/kern/kern_environment.c
+++ b/sys/kern/kern_environment.c
@@ -1098,65 +1098,65 @@ kernenv_next(char *cp)
}
void
-tunable_int_init(void *data)
+tunable_int_init(const void *data)
{
- struct tunable_int *d = (struct tunable_int *)data;
+ const struct tunable_int *d = data;
TUNABLE_INT_FETCH(d->path, d->var);
}
void
-tunable_long_init(void *data)
+tunable_long_init(const void *data)
{
- struct tunable_long *d = (struct tunable_long *)data;
+ const struct tunable_long *d = data;
TUNABLE_LONG_FETCH(d->path, d->var);
}
void
-tunable_ulong_init(void *data)
+tunable_ulong_init(const void *data)
{
- struct tunable_ulong *d = (struct tunable_ulong *)data;
+ const struct tunable_ulong *d = data;
TUNABLE_ULONG_FETCH(d->path, d->var);
}
void
-tunable_int64_init(void *data)
+tunable_int64_init(const void *data)
{
- struct tunable_int64 *d = (struct tunable_int64 *)data;
+ const struct tunable_int64 *d = data;
TUNABLE_INT64_FETCH(d->path, d->var);
}
void
-tunable_uint64_init(void *data)
+tunable_uint64_init(const void *data)
{
- struct tunable_uint64 *d = (struct tunable_uint64 *)data;
+ const struct tunable_uint64 *d = data;
TUNABLE_UINT64_FETCH(d->path, d->var);
}
void
-tunable_quad_init(void *data)
+tunable_quad_init(const void *data)
{
- struct tunable_quad *d = (struct tunable_quad *)data;
+ const struct tunable_quad *d = data;
TUNABLE_QUAD_FETCH(d->path, d->var);
}
void
-tunable_bool_init(void *data)
+tunable_bool_init(const void *data)
{
- struct tunable_bool *d = (struct tunable_bool *)data;
+ const struct tunable_bool *d = data;
TUNABLE_BOOL_FETCH(d->path, d->var);
}
void
-tunable_str_init(void *data)
+tunable_str_init(const void *data)
{
- struct tunable_str *d = (struct tunable_str *)data;
+ const struct tunable_str *d = data;
TUNABLE_STR_FETCH(d->path, d->var, d->size);
}
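
For context, these handlers are invoked through the SYSINIT linker set built by the TUNABLE_*() macros; the change here only constifies the pointer they receive. A minimal, hypothetical declaration that ends up calling tunable_int_init() might look like the following sketch (names invented for illustration, not part of the diff):

/*
 * Illustrative only: TUNABLE_INT() registers a sysinit whose argument is the
 * static tunable record, matching the const pointer these handlers now take.
 */
#include <sys/param.h>
#include <sys/kernel.h>

static int example_limit = 16;			/* default if the tunable is not set */
TUNABLE_INT("kern.example.limit", &example_limit);	/* fetched at boot from the kernel environment */
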
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index eb77a5064113..501adc151d44 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -50,6 +50,7 @@
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
+#include <sys/jail.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
@@ -163,6 +164,9 @@ static int filt_kqueue(struct knote *kn, long hint);
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
+static int filt_jailattach(struct knote *kn);
+static void filt_jaildetach(struct knote *kn);
+static int filt_jail(struct knote *kn, long hint);
static int filt_fileattach(struct knote *kn);
static void filt_timerexpire(void *knx);
static void filt_timerexpire_l(struct knote *kn, bool proc_locked);
@@ -195,6 +199,12 @@ static const struct filterops proc_filtops = {
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
+static const struct filterops jail_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_jailattach,
+ .f_detach = filt_jaildetach,
+ .f_event = filt_jail,
+};
static const struct filterops timer_filtops = {
.f_isfd = 0,
.f_attach = filt_timerattach,
@@ -365,6 +375,7 @@ static struct {
[~EVFILT_USER] = { &user_filtops, 1 },
[~EVFILT_SENDFILE] = { &null_filtops },
[~EVFILT_EMPTY] = { &file_filtops, 1 },
+ [~EVFILT_JAIL] = { &jail_filtops, 1 },
};
/*
@@ -528,7 +539,8 @@ filt_proc(struct knote *kn, long hint)
* process forked. Additionally, for each knote attached to the
* parent, check whether user wants to track the new process. If so
* attach a new knote to it, and immediately report an event with the
- * child's pid.
+ * child's pid. This is also called on jail creation, which is treated
+ * the same way by jail events.
*/
void
knote_fork(struct knlist *list, int pid)
@@ -555,6 +567,8 @@ knote_fork(struct knlist *list, int pid)
/*
* The same as knote(), activate the event.
*/
+ _Static_assert(NOTE_JAIL_CHILD == NOTE_FORK,
+ "NOTE_JAIL_CHILD should be the same as NOTE_FORK");
if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
if (kn->kn_fop->f_event(kn, NOTE_FORK))
KNOTE_ACTIVATE(kn, 1);
@@ -614,6 +628,124 @@ knote_fork(struct knlist *list, int pid)
}
}
+int
+filt_jailattach(struct knote *kn)
+{
+ struct prison *pr;
+ bool immediate;
+
+ immediate = false;
+ if (kn->kn_id == 0) {
+ /* Let jid=0 watch the current prison (including prison0). */
+ pr = curthread->td_ucred->cr_prison;
+ mtx_lock(&pr->pr_mtx);
+ } else if (kn->kn_flags & (EV_FLAG1 | EV_FLAG2)) {
+ /*
+ * The kernel registers prisons before they are valid,
+ * so prison_find_child will fail.
+ */
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ if (pr->pr_id < kn->kn_id)
+ continue;
+ if (pr->pr_id > kn->kn_id) {
+ pr = NULL;
+ break;
+ }
+ mtx_lock(&pr->pr_mtx);
+ break;
+ }
+ if (pr == NULL)
+ return (ENOENT);
+ } else {
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(curthread->td_ucred->cr_prison,
+ kn->kn_id);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENOENT);
+ if (!prison_isalive(pr)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (ENOENT);
+ }
+ }
+ kn->kn_ptr.p_prison = pr;
+ kn->kn_flags |= EV_CLEAR;
+
+ /*
+ * Internal flag indicating registration done by kernel for the
+ * purposes of getting a NOTE_CHILD notification.
+ */
+ if (kn->kn_flags & EV_FLAG2) {
+ kn->kn_flags &= ~EV_FLAG2;
+ kn->kn_data = kn->kn_sdata; /* parent id */
+ kn->kn_fflags = NOTE_CHILD;
+ kn->kn_sfflags &= ~NOTE_JAIL_CTRLMASK;
+ immediate = true; /* Force immediate activation of child note. */
+ }
+ /*
+ * Internal flag indicating registration done by kernel (for other than
+ * NOTE_CHILD).
+ */
+ if (kn->kn_flags & EV_FLAG1) {
+ kn->kn_flags &= ~EV_FLAG1;
+ }
+
+ knlist_add(pr->pr_klist, kn, 1);
+
+ /* Immediately activate any child notes. */
+ if (immediate)
+ KNOTE_ACTIVATE(kn, 0);
+
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+void
+filt_jaildetach(struct knote *kn)
+{
+ if (kn->kn_ptr.p_prison != NULL) {
+ knlist_remove(kn->kn_knlist, kn, 0);
+ kn->kn_ptr.p_prison = NULL;
+ } else
+ kn->kn_status |= KN_DETACHED;
+}
+
+int
+filt_jail(struct knote *kn, long hint)
+{
+ struct prison *pr;
+ u_int event;
+
+ pr = kn->kn_ptr.p_prison;
+ if (pr == NULL) /* already activated, from attach filter */
+ return (0);
+
+ /* Mask off extra data. */
+ event = (u_int)hint & NOTE_JAIL_CTRLMASK;
+
+ /* If the user is interested in this event, record it. */
+ if (kn->kn_sfflags & event)
+ kn->kn_fflags |= event;
+
+ /* Report the attached process id. */
+ if (event == NOTE_JAIL_ATTACH) {
+ if (kn->kn_data != 0)
+ kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI;
+ kn->kn_data = hint & NOTE_JAIL_DATAMASK;
+ }
+
+ /* Prison is gone, so flag the event as finished. */
+ if (event == NOTE_JAIL_REMOVE) {
+ kn->kn_flags |= EV_EOF | EV_ONESHOT;
+ kn->kn_ptr.p_prison = NULL;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
+ return (1);
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
/*
* XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
* interval timer support code.
@@ -1597,8 +1729,8 @@ findkn:
/*
* If possible, find an existing knote to use for this kevent.
*/
- if (kev->filter == EVFILT_PROC &&
- (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+ if ((kev->filter == EVFILT_PROC || kev->filter == EVFILT_JAIL)
+ && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
/* This is an internal creation of a process tracking
* note. Don't attempt to coalesce this with an
* existing note.
@@ -2800,6 +2932,7 @@ knote_init(void)
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
+ prison0.pr_klist = knlist_alloc(&prison0.pr_mtx);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
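
For context, the new EVFILT_JAIL filter above can be driven from userspace much like EVFILT_PROC. The sketch below is illustrative only and not part of the diff; it assumes the EVFILT_JAIL and NOTE_JAIL_* definitions added to sys/event.h elsewhere in this patch series.

/*
 * Hypothetical userspace sketch: watch a jail by jid for attach and remove
 * events.  As in filt_jail() above, an attach reports the attaching pid in
 * kev.data, and removal flags the event EV_EOF.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int jid = 1;			/* jail to watch (assumed to exist) */
	int kq = kqueue();

	if (kq == -1)
		err(1, "kqueue");
	EV_SET(&kev, jid, EVFILT_JAIL, EV_ADD,
	    NOTE_JAIL_ATTACH | NOTE_JAIL_REMOVE, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent wait");
		if (kev.fflags & NOTE_JAIL_ATTACH)
			printf("pid %jd attached\n", (intmax_t)kev.data);
		if (kev.fflags & NOTE_JAIL_REMOVE) {
			printf("jail removed\n");
			break;
		}
	}
	return (0);
}
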
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 7c9a15ae18f3..5a1fbe23ddeb 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -39,15 +39,18 @@
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/errno.h>
+#include <sys/file.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/osd.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/epoch.h>
+#include <sys/event.h>
#include <sys/taskqueue.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
+#include <sys/jaildesc.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/mman.h>
@@ -154,7 +157,8 @@ static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
-static void prison_cleanup(struct prison *pr);
+static void prison_cleanup_locked(struct prison *pr);
+static void prison_cleanup_unlocked(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
@@ -167,6 +171,7 @@ static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
+static void prison_knote(struct prison *pr, long hint);
/* Flags for prison_deref */
#define PD_DEREF 0x01 /* Decrement pr_ref */
@@ -985,6 +990,8 @@ prison_ip_cnt(const struct prison *pr, const pr_family_t af)
int
kern_jail_set(struct thread *td, struct uio *optuio, int flags)
{
+ struct file *jfp_out;
+ struct jaildesc *desc_in;
struct nameidata nd;
#ifdef INET
struct prison_ip *ip4;
@@ -995,6 +1002,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
struct vfsopt *opt;
struct vfsoptlist *opts;
struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
+ struct ucred *jdcred;
struct vnode *root;
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
char *g_path, *osrelstr;
@@ -1008,7 +1016,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
int created, cuflags, descend, drflags, enforce;
int error, errmsg_len, errmsg_pos;
int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
- int deadid, jid, jsys, len, level;
+ int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level;
int childmax, osreldt, rsnum, slevel;
#ifdef INET
int ip4s;
@@ -1018,22 +1026,32 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
int ip6s;
bool redo_ip6;
#endif
+ bool maybe_changed;
uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
uint64_t pr_allow_diff;
unsigned tallow;
char numbuf[12];
- error = priv_check(td, PRIV_JAIL_SET);
- if (!error && (flags & JAIL_ATTACH))
- error = priv_check(td, PRIV_JAIL_ATTACH);
- if (error)
- return (error);
mypr = td->td_ucred->cr_prison;
- if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
+ if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE)
+ && mypr->pr_childmax == 0)
return (EPERM);
if (flags & ~JAIL_SET_MASK)
return (EINVAL);
+ if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC))
+ == (JAIL_USE_DESC | JAIL_AT_DESC))
+ return (EINVAL);
+ prison_hold(mypr);
+#ifdef INET
+ ip4 = NULL;
+#endif
+#ifdef INET6
+ ip6 = NULL;
+#endif
+ g_path = NULL;
+ jfp_out = NULL;
+ jfd_out = -1;
/*
* Check all the parameters before committing to anything. Not all
* errors can be caught early, but we may as well try. Also, this
@@ -1046,14 +1064,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
*/
error = vfs_buildopts(optuio, &opts);
if (error)
- return (error);
-#ifdef INET
- ip4 = NULL;
-#endif
-#ifdef INET6
- ip6 = NULL;
-#endif
- g_path = NULL;
+ goto done_free;
cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
if (!cuflags) {
@@ -1062,6 +1073,72 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
goto done_errmsg;
}
+ error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
+ if (error == ENOENT) {
+ if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
+ JAIL_OWN_DESC)) {
+ vfs_opterror(opts, "missing desc");
+ goto done_errmsg;
+ }
+ jfd_in = -1;
+ } else if (error != 0)
+ goto done_free;
+ else {
+ if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
+ JAIL_OWN_DESC))) {
+ vfs_opterror(opts, "unexpected desc");
+ goto done_errmsg;
+ }
+ if (flags & JAIL_AT_DESC) {
+ /*
+ * Look up and create jails based on the
+ * descriptor's prison.
+ */
+ prison_free(mypr);
+ error = jaildesc_find(td, jfd_in, &desc_in, &mypr,
+ NULL);
+ if (error != 0) {
+ vfs_opterror(opts, error == ENOENT
+ ? "descriptor to dead jail"
+ : "not a jail descriptor");
+ goto done_errmsg;
+ }
+ /*
+ * Check file permissions using the current
+ * credentials, and operation permissions
+ * using the descriptor's credentials.
+ */
+ error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid,
+ desc_in->jd_gid, VEXEC, td->td_ucred);
+ JAILDESC_UNLOCK(desc_in);
+ if (error != 0)
+ goto done_free;
+ if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) {
+ error = EPERM;
+ goto done_free;
+ }
+ }
+ if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
+ /* Allocate a jail descriptor to return later. */
+ error = jaildesc_alloc(td, &jfp_out, &jfd_out,
+ flags & JAIL_OWN_DESC);
+ if (error)
+ goto done_free;
+ }
+ }
+
+ /*
+ * Delay the permission check if using a jail descriptor,
+ * until we get the descriptor's credentials.
+ */
+ if (!(flags & JAIL_USE_DESC)) {
+ error = priv_check(td, PRIV_JAIL_SET);
+ if (error == 0 && (flags & JAIL_ATTACH))
+ error = priv_check(td, PRIV_JAIL_ATTACH);
+ if (error)
+ goto done_free;
+ }
+
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == ENOENT)
jid = 0;
@@ -1422,6 +1499,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
pr = NULL;
inspr = NULL;
deadpr = NULL;
+ maybe_changed = false;
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
namelc = strrchr(name, '.');
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
@@ -1436,7 +1514,57 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
error = EAGAIN;
goto done_deref;
}
- if (jid != 0) {
+ if (flags & JAIL_USE_DESC) {
+ /* Get the jail from its descriptor. */
+ error = jaildesc_find(td, jfd_in, &desc_in, &pr, &jdcred);
+ if (error) {
+ vfs_opterror(opts, error == ENOENT
+ ? "descriptor to dead jail"
+ : "not a jail descriptor");
+ goto done_deref;
+ }
+ drflags |= PD_DEREF;
+ /*
+ * Check file permissions using the current credentials,
+ * and operation permissions using the descriptor's
+ * credentials.
+ */
+ error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid,
+ desc_in->jd_gid, VWRITE, td->td_ucred);
+ if (error == 0 && (flags & JAIL_ATTACH))
+ error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid,
+ desc_in->jd_gid, VEXEC, td->td_ucred);
+ JAILDESC_UNLOCK(desc_in);
+ if (error == 0)
+ error = priv_check_cred(jdcred, PRIV_JAIL_SET);
+ if (error == 0 && (flags & JAIL_ATTACH))
+ error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
+ crfree(jdcred);
+ if (error)
+ goto done_deref;
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+ if (cuflags == JAIL_CREATE) {
+ error = EEXIST;
+ vfs_opterror(opts, "jail %d already exists",
+ pr->pr_id);
+ goto done_deref;
+ }
+ if (!prison_isalive(pr)) {
+ /* While a jid can be resurrected, the prison
+ * itself cannot.
+ */
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying", pr->pr_id);
+ goto done_deref;
+ }
+ if (jid != 0 && jid != pr->pr_id) {
+ error = EINVAL;
+ vfs_opterror(opts, "cannot change jid");
+ goto done_deref;
+ }
+ jid = pr->pr_id;
+ } else if (jid != 0) {
if (jid < 0) {
error = EINVAL;
vfs_opterror(opts, "negative jid");
@@ -1570,7 +1698,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
}
}
}
- /* Update: must provide a jid or name. */
+ /* Update: must provide a desc, jid, or name. */
else if (cuflags == JAIL_UPDATE && pr == NULL) {
error = ENOENT;
vfs_opterror(opts, "update specified no jail");
@@ -1643,6 +1771,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount++;
+ pr->pr_klist = knlist_alloc(&pr->pr_mtx);
/* Set some default values, and inherit some from the parent. */
if (namelc == NULL)
@@ -1722,8 +1851,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
* Grab a reference for existing prisons, to ensure they
* continue to exist for the duration of the call.
*/
- prison_hold(pr);
- drflags |= PD_DEREF;
+ if (!(drflags & PD_DEREF)) {
+ prison_hold(pr);
+ drflags |= PD_DEREF;
+ }
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
if ((pr->pr_flags & PR_VNET) &&
(ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
@@ -1880,6 +2011,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
goto done_deref;
}
}
+ maybe_changed = true;
/* Set the parameters of the prison. */
#ifdef INET
@@ -2112,7 +2244,12 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
* reference via persistence, or is about to gain one via attachment.
*/
if (created) {
- drflags = prison_lock_xlock(pr, drflags);
+ sx_assert(&allprison_lock, SX_XLOCKED);
+ mtx_lock(&ppr->pr_mtx);
+ knote_fork(ppr->pr_klist, pr->pr_id);
+ mtx_unlock(&ppr->pr_mtx);
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
pr->pr_state = PRISON_STATE_ALIVE;
}
@@ -2146,10 +2283,37 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
printf("Warning jail jid=%d: mountd/nfsd requires a separate"
" file system\n", pr->pr_id);
+ /*
+ * Now that the prison is fully created without error, set the
+ * jail descriptor if one was requested. This is the only
+ * parameter that is returned to the caller (except the error
+ * message).
+ */
+ if (jfd_out >= 0) {
+ if (!(drflags & PD_LOCKED)) {
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+ }
+ jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1;
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out;
+ else
+ (void)copyout(&jfd_out,
+ optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out));
+ jaildesc_set_prison(jfp_out, pr);
+ }
+
drflags &= ~PD_KILL;
td->td_retval[0] = pr->pr_id;
done_deref:
+ /*
+ * Report changes to kevent. This can happen even if the
+ * system call fails, as changes might have been made before
+ * the failure.
+ */
+ if (maybe_changed && !created)
+ prison_knote(pr, NOTE_JAIL_SET);
/* Release any temporary prison holds and/or locks. */
if (pr != NULL)
prison_deref(pr, drflags);
@@ -2176,15 +2340,21 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
}
}
done_free:
+ /* Clean up other resources. */
#ifdef INET
prison_ip_free(ip4);
#endif
#ifdef INET6
prison_ip_free(ip6);
#endif
+ if (jfp_out != NULL)
+ fdrop(jfp_out, td);
+ if (error && jfd_out >= 0)
+ (void)kern_close(td, jfd_out);
if (g_path != NULL)
free(g_path, M_TEMP);
vfs_freeopts(opts);
+ prison_free(mypr);
return (error);
}
@@ -2329,16 +2499,22 @@ int
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
{
struct bool_flags *bf;
+ struct file *jfp_out;
+ struct jaildesc *desc_in;
struct jailsys_flags *jsf;
struct prison *pr, *mypr;
struct vfsopt *opt;
struct vfsoptlist *opts;
char *errmsg, *name;
int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
+ int jfd_in, jfd_out;
unsigned f;
if (flags & ~JAIL_GET_MASK)
return (EINVAL);
+ if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC))
+ == (JAIL_USE_DESC | JAIL_AT_DESC))
+ return (EINVAL);
/* Get the parameter list. */
error = vfs_buildopts(optuio, &opts);
@@ -2346,13 +2522,81 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
return (error);
errmsg_pos = vfs_getopt_pos(opts, "errmsg");
mypr = td->td_ucred->cr_prison;
+ prison_hold(mypr);
pr = NULL;
+ jfp_out = NULL;
+ jfd_out = -1;
/*
- * Find the prison specified by one of: lastjid, jid, name.
+ * Find the prison specified by one of: desc, lastjid, jid, name.
*/
sx_slock(&allprison_lock);
drflags = PD_LIST_SLOCKED;
+
+ error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
+ if (error == ENOENT) {
+ if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) {
+ vfs_opterror(opts, "missing desc");
+ goto done;
+ }
+ } else if (error == 0) {
+ if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
+ JAIL_OWN_DESC))) {
+ vfs_opterror(opts, "unexpected desc");
+ goto done;
+ }
+ if (flags & JAIL_USE_DESC) {
+ /* Get the jail from its descriptor. */
+ error = jaildesc_find(td, jfd_in, &desc_in, &pr, NULL);
+ if (error) {
+ vfs_opterror(opts, error == ENOENT
+ ? "descriptor to dead jail"
+ : "not a jail descriptor");
+ goto done;
+ }
+ drflags |= PD_DEREF;
+ error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid,
+ desc_in->jd_gid, VREAD, td->td_ucred);
+ JAILDESC_UNLOCK(desc_in);
+ if (error != 0)
+ goto done;
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+ if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying",
+ pr->pr_id);
+ goto done;
+ }
+ goto found_prison;
+ }
+ if (flags & JAIL_AT_DESC) {
+ /* Look up jails based on the descriptor's prison. */
+ prison_free(mypr);
+ error = jaildesc_find(td, jfd_in, &desc_in, &mypr,
+ NULL);
+ if (error != 0) {
+ vfs_opterror(opts, error == ENOENT
+ ? "descriptor to dead jail"
+ : "not a jail descriptor");
+ goto done;
+ }
+ error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid,
+ desc_in->jd_gid, VEXEC, td->td_ucred);
+ JAILDESC_UNLOCK(desc_in);
+ if (error != 0)
+ goto done;
+ }
+ if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
+ /* Allocate a jail descriptor to return later. */
+ error = jaildesc_alloc(td, &jfp_out, &jfd_out,
+ flags & JAIL_OWN_DESC);
+ if (error)
+ goto done;
+ }
+ } else
+ goto done;
+
error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
if (error == 0) {
TAILQ_FOREACH(pr, &allprison, pr_list) {
@@ -2421,9 +2665,17 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
found_prison:
/* Get the parameters of the prison. */
- prison_hold(pr);
- drflags |= PD_DEREF;
+ if (!(drflags & PD_DEREF)) {
+ prison_hold(pr);
+ drflags |= PD_DEREF;
+ }
td->td_retval[0] = pr->pr_id;
+ if (jfd_out >= 0) {
+ error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out));
+ if (error != 0 && error != ENOENT)
+ goto done;
+ jaildesc_set_prison(jfp_out, pr);
+ }
error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
if (error != 0 && error != ENOENT)
goto done;
@@ -2603,6 +2855,13 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
prison_deref(pr, drflags);
else if (drflags & PD_LIST_SLOCKED)
sx_sunlock(&allprison_lock);
+ else if (drflags & PD_LIST_XLOCKED)
+ sx_xunlock(&allprison_lock);
+ /* Clean up other resources. */
+ if (jfp_out != NULL)
+ (void)fdrop(jfp_out, td);
+ if (error && jfd_out >= 0)
+ (void)kern_close(td, jfd_out);
if (error && errmsg_pos >= 0) {
/* Write the error message back to userspace. */
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
@@ -2619,6 +2878,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
}
}
vfs_freeopts(opts);
+ prison_free(mypr);
return (error);
}
@@ -2643,14 +2903,63 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
sx_xunlock(&allprison_lock);
return (EINVAL);
}
+ prison_hold(pr);
+ prison_remove(pr);
+ return (0);
+}
+
+/*
+ * struct jail_remove_jd_args {
+ * int fd;
+ * };
+ */
+int
+sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap)
+{
+ struct jaildesc *jd;
+ struct prison *pr;
+ struct ucred *jdcred;
+ int error;
+
+ error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred);
+ if (error)
+ return (error);
+ /*
+ * Check file permissions using the current credentials, and
+ * operation permissions using the descriptor's credentials.
+ */
+ error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VWRITE,
+ td->td_ucred);
+ JAILDESC_UNLOCK(jd);
+ if (error == 0)
+ error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE);
+ crfree(jdcred);
+ if (error) {
+ prison_free(pr);
+ return (error);
+ }
+ sx_xlock(&allprison_lock);
+ mtx_lock(&pr->pr_mtx);
+ prison_remove(pr);
+ return (0);
+}
+
+/*
+ * Begin the removal process for a prison. The allprison lock should
+ * be held exclusively, and the prison should be both locked and held.
+ */
+void
+prison_remove(struct prison *pr)
+{
+ sx_assert(&allprison_lock, SA_XLOCKED);
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
if (!prison_isalive(pr)) {
/* Silently ignore already-dying prisons. */
mtx_unlock(&pr->pr_mtx);
sx_xunlock(&allprison_lock);
- return (0);
+ return;
}
- prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
- return (0);
+ prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
}
/*
@@ -2685,6 +2994,53 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
}
+/*
+ * struct jail_attach_jd_args {
+ * int fd;
+ * };
+ */
+int
+sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap)
+{
+ struct jaildesc *jd;
+ struct prison *pr;
+ struct ucred *jdcred;
+ int drflags, error;
+
+ sx_slock(&allprison_lock);
+ drflags = PD_LIST_SLOCKED;
+ error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred);
+ if (error)
+ goto fail;
+ drflags |= PD_DEREF;
+ /*
+ * Check file permissions using the current credentials, and
+ * operation permissions using the descriptor's credentials.
+ */
+ error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VEXEC,
+ td->td_ucred);
+ JAILDESC_UNLOCK(jd);
+ if (error == 0)
+ error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
+ crfree(jdcred);
+ if (error)
+ goto fail;
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+
+ /* Do not allow a process to attach to a prison that is not alive. */
+ if (!prison_isalive(pr)) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ return (do_jail_attach(td, pr, drflags));
+
+ fail:
+ prison_deref(pr, drflags);
+ return (error);
+}
+
static int
do_jail_attach(struct thread *td, struct prison *pr, int drflags)
{
@@ -2703,9 +3059,12 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
* a process root from one prison, but attached to the jail
* of another.
*/
- prison_hold(pr);
+ if (!(drflags & PD_DEREF)) {
+ prison_hold(pr);
+ drflags |= PD_DEREF;
+ }
refcount_acquire(&pr->pr_uref);
- drflags |= PD_DEREF | PD_DEUREF;
+ drflags |= PD_DEUREF;
mtx_unlock(&pr->pr_mtx);
drflags &= ~PD_LOCKED;
@@ -2755,6 +3114,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
prison_proc_relink(oldcred->cr_prison, pr, p);
prison_deref(oldcred->cr_prison, drflags);
crfree(oldcred);
+ prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
/*
* If the prison was killed while changing credentials, die along
@@ -3182,9 +3542,10 @@ prison_deref(struct prison *pr, int flags)
refcount_load(&prison0.pr_uref) > 0,
("prison0 pr_uref=0"));
pr->pr_state = PRISON_STATE_DYING;
+ prison_cleanup_locked(pr);
mtx_unlock(&pr->pr_mtx);
flags &= ~PD_LOCKED;
- prison_cleanup(pr);
+ prison_cleanup_unlocked(pr);
}
}
}
@@ -3327,8 +3688,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
}
if (!(cpr->pr_flags & PR_REMOVE))
continue;
- prison_cleanup(cpr);
+ prison_cleanup_unlocked(cpr);
mtx_lock(&cpr->pr_mtx);
+ prison_cleanup_locked(cpr);
cpr->pr_flags &= ~PR_REMOVE;
if (cpr->pr_flags & PR_PERSIST) {
cpr->pr_flags &= ~PR_PERSIST;
@@ -3363,8 +3725,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
if (rpr != NULL)
LIST_REMOVE(rpr, pr_sibling);
- prison_cleanup(pr);
+ prison_cleanup_unlocked(pr);
mtx_lock(&pr->pr_mtx);
+ prison_cleanup_locked(pr);
if (pr->pr_flags & PR_PERSIST) {
pr->pr_flags &= ~PR_PERSIST;
prison_proc_free_not_last(pr);
@@ -3411,10 +3774,22 @@ prison_lock_xlock(struct prison *pr, int flags)
/*
* Release a prison's resources when it starts dying (when the last user
- * reference is dropped, or when it is killed).
+ * reference is dropped, or when it is killed). Two functions are called,
+ * for work that requires a locked prison or an unlocked one.
*/
static void
-prison_cleanup(struct prison *pr)
+prison_cleanup_locked(struct prison *pr)
+{
+ sx_assert(&allprison_lock, SA_XLOCKED);
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ prison_knote(pr, NOTE_JAIL_REMOVE);
+ knlist_detach(pr->pr_klist);
+ jaildesc_prison_cleanup(pr);
+ pr->pr_klist = NULL;
+}
+
+static void
+prison_cleanup_unlocked(struct prison *pr)
{
sx_assert(&allprison_lock, SA_XLOCKED);
mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
@@ -4616,6 +4991,7 @@ sysctl_jail_param(SYSCTL_HANDLER_ARGS)
* jail creation time but cannot be changed in an existing jail.
*/
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
+SYSCTL_JAIL_PARAM(, desc, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail descriptor");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
@@ -5039,6 +5415,22 @@ prison_racct_detach(struct prison *pr)
}
#endif /* RACCT */
+/*
+ * Submit a knote for a prison, locking if necessary.
+ */
+static void
+prison_knote(struct prison *pr, long hint)
+{
+ int locked;
+
+ locked = mtx_owned(&pr->pr_mtx);
+ if (!locked)
+ mtx_lock(&pr->pr_mtx);
+ KNOTE_LOCKED(pr->pr_klist, hint);
+ if (!locked)
+ mtx_unlock(&pr->pr_mtx);
+}
+
#ifdef DDB
static void
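
For context, the "desc" option handled in kern_jail_set() above is how userspace receives a jail descriptor: the new fd is copied back into the option's value iovec. The sketch below is illustrative only and not part of the diff; JAIL_GET_DESC and the "desc" parameter are assumptions taken from this patch series, and error handling is minimal.

/*
 * Hypothetical userspace sketch: create a persistent jail and ask for a
 * jail descriptor to be returned in fd.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct iovec iov[6];
	int fd = -1, jid;

	iov[0].iov_base = "name";	iov[0].iov_len = sizeof("name");
	iov[1].iov_base = "example";	iov[1].iov_len = sizeof("example");
	iov[2].iov_base = "persist";	iov[2].iov_len = sizeof("persist");
	iov[3].iov_base = NULL;		iov[3].iov_len = 0;
	iov[4].iov_base = "desc";	iov[4].iov_len = sizeof(fd);
	iov[5].iov_base = &fd;		iov[5].iov_len = sizeof(fd);
	jid = jail_set(iov, 6, JAIL_CREATE | JAIL_GET_DESC);
	if (jid == -1)
		err(1, "jail_set");
	printf("created jid %d, descriptor %d\n", jid, fd);
	return (0);
}
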
diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c
new file mode 100644
index 000000000000..fec9b5719b2e
--- /dev/null
+++ b/sys/kern/kern_jaildesc.c
@@ -0,0 +1,336 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 James Gritton.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/jail.h>
+#include <sys/jaildesc.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/stat.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+#include <sys/vnode.h>
+
+MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors");
+
+static fo_stat_t jaildesc_stat;
+static fo_close_t jaildesc_close;
+static fo_chmod_t jaildesc_chmod;
+static fo_chown_t jaildesc_chown;
+static fo_fill_kinfo_t jaildesc_fill_kinfo;
+static fo_cmp_t jaildesc_cmp;
+
+static struct fileops jaildesc_ops = {
+ .fo_read = invfo_rdwr,
+ .fo_write = invfo_rdwr,
+ .fo_truncate = invfo_truncate,
+ .fo_ioctl = invfo_ioctl,
+ .fo_poll = invfo_poll,
+ .fo_kqfilter = invfo_kqfilter,
+ .fo_stat = jaildesc_stat,
+ .fo_close = jaildesc_close,
+ .fo_chmod = jaildesc_chmod,
+ .fo_chown = jaildesc_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = jaildesc_fill_kinfo,
+ .fo_cmp = jaildesc_cmp,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+/*
+ * Given a jail descriptor number, return the jaildesc, its prison,
+ * and its credential. The jaildesc will be returned locked, and
+ * prison and the credential will be returned held.
+ */
+int
+jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp,
+ struct prison **prp, struct ucred **ucredp)
+{
+ struct file *fp;
+ struct jaildesc *jd;
+ struct prison *pr;
+ int error;
+
+ error = fget(td, fd, &cap_no_rights, &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_JAILDESC) {
+ error = EBADF;
+ goto out;
+ }
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ pr = jd->jd_prison;
+ if (pr == NULL || !prison_isvalid(pr)) {
+ error = ENOENT;
+ JAILDESC_UNLOCK(jd);
+ goto out;
+ }
+ prison_hold(pr);
+ *prp = pr;
+ if (jdp != NULL)
+ *jdp = jd;
+ else
+ JAILDESC_UNLOCK(jd);
+ if (ucredp != NULL)
+ *ucredp = crhold(fp->f_cred);
+ out:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Allocate a new jail descriptor, not yet associated with a prison.
+ * Return the file pointer (with a reference held) and the descriptor
+ * number.
+ */
+int
+jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning)
+{
+ struct file *fp;
+ struct jaildesc *jd;
+ int error;
+ mode_t mode;
+
+ if (owning) {
+ error = priv_check(td, PRIV_JAIL_REMOVE);
+ if (error != 0)
+ return (error);
+ mode = S_ISTXT;
+ } else
+ mode = 0;
+ jd = malloc(sizeof(*jd), M_JAILDESC, M_WAITOK | M_ZERO);
+ error = falloc_caps(td, &fp, fdp, 0, NULL);
+ if (error != 0) {
+ free(jd, M_JAILDESC);
+ return (error);
+ }
+ finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0
+ ? FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops);
+ JAILDESC_LOCK_INIT(jd);
+ jd->jd_uid = fp->f_cred->cr_uid;
+ jd->jd_gid = fp->f_cred->cr_gid;
+ jd->jd_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH | mode
+ | (priv_check(td, PRIV_JAIL_SET) == 0 ? S_IWUSR | S_IXUSR : 0)
+ | (priv_check(td, PRIV_JAIL_ATTACH) == 0 ? S_IXUSR : 0);
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * Associate a jail descriptor with its prison.
+ */
+void
+jaildesc_set_prison(struct file *fp, struct prison *pr)
+{
+ struct jaildesc *jd;
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ jd->jd_prison = pr;
+ LIST_INSERT_HEAD(&pr->pr_descs, jd, jd_list);
+ prison_hold(pr);
+ JAILDESC_UNLOCK(jd);
+}
+
+/*
+ * Detach the all jail descriptors from a prison.
+ */
+void
+jaildesc_prison_cleanup(struct prison *pr)
+{
+ struct jaildesc *jd;
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ while ((jd = LIST_FIRST(&pr->pr_descs))) {
+ JAILDESC_LOCK(jd);
+ LIST_REMOVE(jd, jd_list);
+ jd->jd_prison = NULL;
+ JAILDESC_UNLOCK(jd);
+ prison_free(pr);
+ }
+}
+
+static int
+jaildesc_close(struct file *fp, struct thread *td)
+{
+ struct jaildesc *jd;
+ struct prison *pr;
+
+ jd = fp->f_data;
+ fp->f_data = NULL;
+ if (jd != NULL) {
+ JAILDESC_LOCK(jd);
+ pr = jd->jd_prison;
+ if (pr != NULL) {
+ /*
+ * Free or remove the associated prison.
+ * This requires a second check after re-
+ * ordering locks. This jaildesc can remain
+ * unlocked once we have a prison reference,
+ * because that prison is the only place that
+ * still points back to it.
+ */
+ prison_hold(pr);
+ JAILDESC_UNLOCK(jd);
+ if (jd->jd_mode & S_ISTXT) {
+ sx_xlock(&allprison_lock);
+ prison_lock(pr);
+ if (jd->jd_prison != NULL) {
+ /*
+ * Unlink the prison, but don't free
+ * it; that will be done as part of
+ * prison_remove.
+ */
+ LIST_REMOVE(jd, jd_list);
+ prison_remove(pr);
+ } else {
+ prison_unlock(pr);
+ sx_xunlock(&allprison_lock);
+ }
+ } else {
+ prison_lock(pr);
+ if (jd->jd_prison != NULL) {
+ LIST_REMOVE(jd, jd_list);
+ prison_free(pr);
+ }
+ prison_unlock(pr);
+ }
+ prison_free(pr);
+ }
+ JAILDESC_LOCK_DESTROY(jd);
+ free(jd, M_JAILDESC);
+ }
+ return (0);
+}
+
+static int
+jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
+{
+ struct jaildesc *jd;
+
+ bzero(sb, sizeof(struct stat));
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ if (jd->jd_prison != NULL) {
+ sb->st_ino = jd->jd_prison ? jd->jd_prison->pr_id : 0;
+ sb->st_uid = jd->jd_uid;
+ sb->st_gid = jd->jd_gid;
+ sb->st_mode = jd->jd_mode;
+ } else
+ sb->st_mode = S_IFREG;
+ JAILDESC_UNLOCK(jd);
+ return (0);
+}
+
+static int
+jaildesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct jaildesc *jd;
+ int error;
+
+ /* Reject permissions that the creator doesn't have. */
+ if (((mode & (S_IWUSR | S_IWGRP | S_IWOTH))
+ && priv_check_cred(fp->f_cred, PRIV_JAIL_SET) != 0)
+ || ((mode & (S_IXUSR | S_IXGRP | S_IXOTH))
+ && priv_check_cred(fp->f_cred, PRIV_JAIL_ATTACH) != 0
+ && priv_check_cred(fp->f_cred, PRIV_JAIL_SET) != 0)
+ || ((mode & S_ISTXT)
+ && priv_check_cred(fp->f_cred, PRIV_JAIL_REMOVE) != 0))
+ return (EPERM);
+ if (mode & (S_ISUID | S_ISGID))
+ return (EINVAL);
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VADMIN,
+ active_cred);
+ if (error == 0)
+ jd->jd_mode = S_IFREG | (mode & ALLPERMS);
+ JAILDESC_UNLOCK(jd);
+ return (error);
+}
+
+static int
+jaildesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct jaildesc *jd;
+ int error;
+
+ error = 0;
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ if (uid == (uid_t)-1)
+ uid = jd->jd_uid;
+ if (gid == (gid_t)-1)
+ gid = jd->jd_gid;
+ if ((uid != jd->jd_uid && uid != active_cred->cr_uid) ||
+ (gid != jd->jd_gid && !groupmember(gid, active_cred)))
+ error = priv_check_cred(active_cred, PRIV_VFS_CHOWN);
+ if (error == 0) {
+ jd->jd_uid = uid;
+ jd->jd_gid = gid;
+ }
+ JAILDESC_UNLOCK(jd);
+ return (error);
+}
+
+static int
+jaildesc_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+ struct filedesc *fdp)
+{
+ return (EINVAL);
+}
+
+static int
+jaildesc_cmp(struct file *fp1, struct file *fp2, struct thread *td)
+{
+ struct jaildesc *jd1, *jd2;
+ int jid1, jid2;
+
+ if (fp2->f_type != DTYPE_JAILDESC)
+ return (3);
+ jd1 = fp1->f_data;
+ JAILDESC_LOCK(jd1);
+ jid1 = jd1->jd_prison ? (uintptr_t)jd1->jd_prison->pr_id : 0;
+ JAILDESC_UNLOCK(jd1);
+ jd2 = fp2->f_data;
+ JAILDESC_LOCK(jd2);
+ jid2 = jd2->jd_prison ? (uintptr_t)jd2->jd_prison->pr_id : 0;
+ JAILDESC_UNLOCK(jd2);
+ return (kcmp_cmp(jid1, jid2));
+}
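
For context, the sketch below shows the calling convention that kern_jail.c relies on when using jaildesc_find(): the descriptor comes back locked, and the prison and credential come back held. It is illustrative only and not part of the diff; the function name is invented.

/*
 * Illustrative kernel-side caller of jaildesc_find().
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/jaildesc.h>
#include <sys/proc.h>
#include <sys/ucred.h>

static int
example_use_jaildesc(struct thread *td, int fd)
{
	struct jaildesc *jd;
	struct prison *pr;
	struct ucred *jdcred;
	int error;

	error = jaildesc_find(td, fd, &jd, &pr, &jdcred);
	if (error != 0)
		return (error);
	/* ... inspect jd->jd_mode / jd_uid / jd_gid while jd is locked ... */
	JAILDESC_UNLOCK(jd);		/* caller drops the descriptor lock */
	/* ... use pr and jdcred ... */
	crfree(jdcred);			/* caller releases the held credential */
	prison_free(pr);		/* caller releases the held prison */
	return (0);
}
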
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index f952b3fc8805..8b5908f5219a 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -1136,9 +1136,9 @@ __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line)
* General init routine used by the MTX_SYSINIT() macro.
*/
void
-mtx_sysinit(void *arg)
+mtx_sysinit(const void *arg)
{
- struct mtx_args *margs = arg;
+ const struct mtx_args *margs = arg;
mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL,
margs->ma_opts);
diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
index c1633dd19de2..7206572ffc02 100644
--- a/sys/kern/kern_rmlock.c
+++ b/sys/kern/kern_rmlock.c
@@ -337,9 +337,9 @@ rm_wowned(const struct rmlock *rm)
}
void
-rm_sysinit(void *arg)
+rm_sysinit(const void *arg)
{
- struct rm_args *args;
+ const struct rm_args *args;
args = arg;
rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags);
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index e182d1fe9baf..84a3a890be63 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -266,9 +266,9 @@ _rw_destroy(volatile uintptr_t *c)
}
void
-rw_sysinit(void *arg)
+rw_sysinit(const void *arg)
{
- struct rw_args *args;
+ const struct rw_args *args;
args = arg;
rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
index accea5d288eb..c005e112d3b9 100644
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@@ -222,9 +222,9 @@ owner_sx(const struct lock_object *lock, struct thread **owner)
#endif
void
-sx_sysinit(void *arg)
+sx_sysinit(const void *arg)
{
- struct sx_args *sargs = arg;
+ const struct sx_args *sargs = arg;
sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags);
}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 50b040132396..3180c66cb42b 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -1694,8 +1694,10 @@ thread_single_end(struct proc *p, int mode)
thread_unlock(td);
}
}
- KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
- ("inconsistent boundary count %d", p->p_boundary_count));
+ KASSERT(mode != SINGLE_BOUNDARY || P_SHOULDSTOP(p) ||
+ p->p_boundary_count == 0,
+ ("pid %d proc %p flags %#x inconsistent boundary count %d",
+ p->p_pid, p, p->p_flag, p->p_boundary_count));
PROC_SUNLOCK(p);
wakeup(&p->p_flag);
}
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index ab47b6ad29a3..a65c3ca128d9 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -57,7 +57,7 @@
* b : public affirmation by word or example of usually
* religious faith or conviction <the heroic witness to divine
* life -- Pilot>
- * 6 capitalized : a member of the Jehovah's Witnesses
+ * 6 capitalized : a member of the Jehovah's Witnesses
*/
/*
@@ -131,7 +131,7 @@
#define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. */
#ifndef WITNESS_COUNT
-#define WITNESS_COUNT 1536
+#define WITNESS_COUNT 1536
#endif
#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
#define WITNESS_PENDLIST (512 + (MAXCPU * 4))
@@ -158,20 +158,18 @@
* These flags go in the witness relationship matrix and describe the
* relationship between any two struct witness objects.
*/
-#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
-#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
-#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
-#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
-#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
-#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
-#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
-#define WITNESS_RELATED_MASK \
- (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
-#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
- * observed. */
-#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
-#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
-#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
+#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
+#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
+#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
+#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
+#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
+#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
+#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
+#define WITNESS_RELATED_MASK (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
+#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been observed. */
+#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
+#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
+#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
/* Descendant to ancestor flags */
#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
@@ -218,20 +216,18 @@ struct lock_list_entry {
* (for example, "vnode interlock").
*/
struct witness {
- char w_name[MAX_W_NAME];
- uint32_t w_index; /* Index in the relationship matrix */
+ char w_name[MAX_W_NAME];
+ uint32_t w_index; /* Index in the relationship matrix */
struct lock_class *w_class;
- STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
- STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
- struct witness *w_hash_next; /* Linked list in hash buckets. */
- const char *w_file; /* File where last acquired */
- uint32_t w_line; /* Line where last acquired */
- uint32_t w_refcount;
- uint16_t w_num_ancestors; /* direct/indirect
- * ancestor count */
- uint16_t w_num_descendants; /* direct/indirect
- * descendant count */
- int16_t w_ddb_level;
+ STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
+ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
+ struct witness *w_hash_next; /* Linked list in hash buckets. */
+ const char *w_file; /* File where last acquired */
+ uint32_t w_line; /* Line where last acquired */
+ uint32_t w_refcount;
+ uint16_t w_num_ancestors; /* direct/indirect ancestor count */
+ uint16_t w_num_descendants; /* direct/indirect descendant count */
+ int16_t w_ddb_level;
unsigned w_displayed:1;
unsigned w_reversed:1;
};
@@ -265,7 +261,7 @@ struct witness_lock_order_data {
/*
* The witness lock order data hash table. Keys are witness index tuples
* (struct witness_lock_order_key), elements are lock order data objects
- * (struct witness_lock_order_data).
+ * (struct witness_lock_order_data).
*/
struct witness_lock_order_hash {
struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
@@ -295,7 +291,6 @@ struct witness_order_list_entry {
static __inline int
witness_lock_type_equal(struct witness *w1, struct witness *w2)
{
-
return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
}
@@ -304,7 +299,6 @@ static __inline int
witness_lock_order_key_equal(const struct witness_lock_order_key *a,
const struct witness_lock_order_key *b)
{
-
return (a->from == b->from && a->to == b->to);
}
@@ -415,7 +409,7 @@ SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin,
int badstack_sbuf_size;
int witness_count = WITNESS_COUNT;
-SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN,
+SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN,
&witness_count, 0, "");
/*
@@ -760,7 +754,6 @@ static int witness_spin_warn = 0;
static const char *
fixup_filename(const char *file)
{
-
if (file == NULL)
return (NULL);
while (strncmp(file, "../", 3) == 0)
@@ -835,7 +828,7 @@ witness_startup(void *mem)
w_free_cnt--;
for (i = 0; i < witness_count; i++) {
- memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
+ memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
(witness_count + 1));
}
@@ -989,16 +982,16 @@ witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
{
int i;
- for (i = 0; i < indent; i++)
- prnt(" ");
+ for (i = 0; i < indent; i++)
+ prnt(" ");
prnt("%s (type: %s, depth: %d, active refs: %d)",
w->w_name, w->w_class->lc_name,
w->w_ddb_level, w->w_refcount);
- if (w->w_displayed) {
- prnt(" -- (already displayed)\n");
- return;
- }
- w->w_displayed = 1;
+ if (w->w_displayed) {
+ prnt(" -- (already displayed)\n");
+ return;
+ }
+ w->w_displayed = 1;
if (w->w_file != NULL && w->w_line != 0)
prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
w->w_line);
@@ -1079,7 +1072,6 @@ witness_ddb_display(int(*prnt)(const char *fmt, ...))
int
witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
{
-
if (witness_watch == -1 || KERNEL_PANICKED())
return (0);
@@ -1257,7 +1249,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file,
w->w_reversed = 1;
mtx_unlock_spin(&w_mtx);
witness_output(
- "acquiring duplicate lock of same type: \"%s\"\n",
+ "acquiring duplicate lock of same type: \"%s\"\n",
w->w_name);
witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
fixup_filename(plock->li_file), plock->li_line);
@@ -1743,7 +1735,7 @@ found:
/*
* In order to reduce contention on w_mtx, we want to keep always an
- * head object into lists so that frequent allocation from the
+ * head object into lists so that frequent allocation from the
* free witness pool (and subsequent locking) is avoided.
* In order to maintain the current code simple, when the head
* object is totally unloaded it means also that we do not have
@@ -1781,7 +1773,7 @@ witness_thread_exit(struct thread *td)
n++;
witness_list_lock(&lle->ll_children[i],
witness_output);
-
+
}
kassert_panic(
"Thread %p cannot exit while holding sleeplocks\n", td);
@@ -1948,7 +1940,6 @@ found:
static void
depart(struct witness *w)
{
-
MPASS(w->w_refcount == 0);
if (w->w_class->lc_flags & LC_SLEEPLOCK) {
w_sleep_cnt--;
@@ -1999,18 +1990,18 @@ adopt(struct witness *parent, struct witness *child)
child->w_num_ancestors++;
}
- /*
- * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
+ /*
+ * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
* an ancestor of 'pi' during this loop.
*/
for (i = 1; i <= w_max_used_index; i++) {
- if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
+ if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
(i != pi))
continue;
/* Find each descendant of 'i' and mark it as a descendant. */
for (j = 1; j <= w_max_used_index; j++) {
- /*
+ /*
* Skip children that are already marked as
* descendants of 'i'.
*/
@@ -2021,7 +2012,7 @@ adopt(struct witness *parent, struct witness *child)
* We are only interested in descendants of 'ci'. Note
* that 'ci' itself is counted as a descendant of 'ci'.
*/
- if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
+ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
(j != ci))
continue;
w_rmatrix[i][j] |= WITNESS_ANCESTOR;
@@ -2029,16 +2020,16 @@ adopt(struct witness *parent, struct witness *child)
w_data[i].w_num_descendants++;
w_data[j].w_num_ancestors++;
- /*
+ /*
* Make sure we aren't marking a node as both an
- * ancestor and descendant. We should have caught
+ * ancestor and descendant. We should have caught
* this as a lock order reversal earlier.
*/
if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
- i, j, w_rmatrix[i][j]);
+ i, j, w_rmatrix[i][j]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
@@ -2047,7 +2038,7 @@ adopt(struct witness *parent, struct witness *child)
(w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
- j, i, w_rmatrix[j][i]);
+ j, i, w_rmatrix[j][i]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
@@ -2124,7 +2115,6 @@ _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
static int
isitmychild(struct witness *parent, struct witness *child)
{
-
return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
}
@@ -2134,7 +2124,6 @@ isitmychild(struct witness *parent, struct witness *child)
static int
isitmydescendant(struct witness *ancestor, struct witness *descendant)
{
-
return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
__func__));
}
@@ -2182,7 +2171,7 @@ witness_get(void)
STAILQ_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
index = w->w_index;
- MPASS(index > 0 && index == w_max_used_index+1 &&
+ MPASS(index > 0 && index == w_max_used_index + 1 &&
index < witness_count);
bzero(w, sizeof(*w));
w->w_index = index;
@@ -2194,7 +2183,6 @@ witness_get(void)
static void
witness_free(struct witness *w)
{
-
STAILQ_INSERT_HEAD(&w_free, w, w_list);
w_free_cnt++;
}
@@ -2219,11 +2207,10 @@ witness_lock_list_get(void)
bzero(lle, sizeof(*lle));
return (lle);
}
-
+
static void
witness_lock_list_free(struct lock_list_entry *lle)
{
-
mtx_lock_spin(&w_mtx);
lle->ll_next = w_lock_list_free;
w_lock_list_free = lle;
@@ -2297,7 +2284,6 @@ witness_voutput(const char *fmt, va_list ap)
static int
witness_thread_has_locks(struct thread *td)
{
-
if (td->td_sleeplocks == NULL)
return (0);
return (td->td_sleeplocks->ll_count != 0);
@@ -2573,14 +2559,12 @@ witness_setflag(struct lock_object *lock, int flag, int set)
void
witness_norelease(struct lock_object *lock)
{
-
witness_setflag(lock, LI_NORELEASE, 1);
}
void
witness_releaseok(struct lock_object *lock)
{
-
witness_setflag(lock, LI_NORELEASE, 0);
}
@@ -2588,7 +2572,6 @@ witness_releaseok(struct lock_object *lock)
static void
witness_ddb_list(struct thread *td)
{
-
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
KASSERT(kdb_active, ("%s: not in the debugger", __func__));
@@ -2653,7 +2636,6 @@ DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE);
DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE)
{
-
witness_ddb_display(db_printf);
}
#endif
@@ -2673,9 +2655,9 @@ sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx)
/* Allocate and init temporary storage space. */
tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
- tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
- tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
stack_zero(&tmp_data1->wlod_stack);
stack_zero(&tmp_data2->wlod_stack);
@@ -2750,12 +2732,12 @@ restart:
sbuf_printf(sb,
"\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
- tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
if (data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
- tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data1->wlod_stack);
sbuf_putc(sb, '\n');
@@ -2763,7 +2745,7 @@ restart:
if (data2 && data2 != data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
- tmp_w2->w_name, tmp_w2->w_class->lc_name,
+ tmp_w2->w_name, tmp_w2->w_class->lc_name,
tmp_w1->w_name, tmp_w1->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data2->wlod_stack);
sbuf_putc(sb, '\n');
@@ -2823,7 +2805,6 @@ sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
static int
sbuf_db_printf_drain(void *arg __unused, const char *data, int len)
{
-
return (db_printf("%.*s", len, data));
}
@@ -3068,7 +3049,7 @@ witness_lock_order_get(struct witness *parent, struct witness *child)
& WITNESS_LOCK_ORDER_KNOWN) == 0)
goto out;
- hash = witness_hash_djb2((const char*)&key,
+ hash = witness_hash_djb2((const char *)&key,
sizeof(key)) % w_lohash.wloh_size;
data = w_lohash.wloh_array[hash];
while (data != NULL) {
@@ -3089,7 +3070,6 @@ out:
static int
witness_lock_order_check(struct witness *parent, struct witness *child)
{
-
if (parent != child &&
w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN &&
@@ -3115,7 +3095,7 @@ witness_lock_order_add(struct witness *parent, struct witness *child)
& WITNESS_LOCK_ORDER_KNOWN)
return (1);
- hash = witness_hash_djb2((const char*)&key,
+ hash = witness_hash_djb2((const char *)&key,
sizeof(key)) % w_lohash.wloh_size;
w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
data = w_lofree;
@@ -3134,7 +3114,6 @@ witness_lock_order_add(struct witness *parent, struct witness *child)
static void
witness_increment_graph_generation(void)
{
-
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
w_generation++;
@@ -3143,7 +3122,6 @@ witness_increment_graph_generation(void)
static int
witness_output_drain(void *arg __unused, const char *data, int len)
{
-
witness_output("%.*s", len, data);
return (len);
}
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 4122f9261871..4cef89cd5219 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -602,4 +602,6 @@ const char *syscallnames[] = {
"inotify_rm_watch", /* 594 = inotify_rm_watch */
"getgroups", /* 595 = getgroups */
"setgroups", /* 596 = setgroups */
+ "jail_attach_jd", /* 597 = jail_attach_jd */
+ "jail_remove_jd", /* 598 = jail_remove_jd */
};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index fa64597d14a5..911f9093824b 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3383,5 +3383,15 @@
_In_reads_(gidsetsize) const gid_t *gidset
);
}
+597 AUE_JAIL_ATTACH STD {
+ int jail_attach_jd(
+ int fd
+ );
+ }
+598 AUE_JAIL_REMOVE STD {
+ int jail_remove_jd(
+ int fd
+ );
+ }
; vim: syntax=off
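
A minimal userspace sketch of how the two descriptor-based syscalls declared above might be exercised. It assumes a jail descriptor has already been obtained (that side of the change lives in kern_jaildesc.c, not in these hunks), and it invokes the raw syscall numbers 597/598 through syscall(2) rather than assuming any particular libc wrapper; the helper names are hypothetical.

	#include <err.h>
	#include <unistd.h>

	/* Attach the calling process to the jail behind descriptor "fd". */
	static void
	attach_via_descriptor(int fd)
	{
		if (syscall(597, fd) == -1)	/* 597 = jail_attach_jd(int fd) */
			err(1, "jail_attach_jd");
	}

	/* Remove the jail behind descriptor "fd" (independent of the above). */
	static void
	remove_via_descriptor(int fd)
	{
		if (syscall(598, fd) == -1)	/* 598 = jail_remove_jd(int fd) */
			err(1, "jail_remove_jd");
	}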
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 2b1ea9eed8d4..e28fef931ea8 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3500,6 +3500,20 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 2;
break;
}
+ /* jail_attach_jd */
+ case 597: {
+ struct jail_attach_jd_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* jail_remove_jd */
+ case 598: {
+ struct jail_remove_jd_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9367,6 +9381,26 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* jail_attach_jd */
+ case 597:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_remove_jd */
+ case 598:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11365,6 +11399,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* jail_attach_jd */
+ case 597:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_remove_jd */
+ case 598:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 19870e989437..6138e543fae7 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1807,9 +1807,7 @@ uipc_filt_sowrite(struct knote *kn, long hint)
kn->kn_data = uipc_stream_sbspace(&so2->so_rcv);
if (so2->so_rcv.sb_state & SBS_CANTRCVMORE) {
- /*
- * XXXGL: maybe kn->kn_flags |= EV_EOF ?
- */
+ kn->kn_flags |= EV_EOF;
return (1);
} else if (kn->kn_sfflags & NOTE_LOWAT)
return (kn->kn_data >= kn->kn_sdata);
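
With the hunk above, an EVFILT_WRITE knote on a unix-domain stream socket now reports EV_EOF once the peer's receive side is shut down, instead of merely returning ready. An illustrative (hypothetical) userspace consumer that relies on that flag:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <err.h>
	#include <stdbool.h>

	/*
	 * Wait until "sock" is writable and report whether the peer can no
	 * longer receive; with the change above EV_EOF is set in that case.
	 */
	static bool
	peer_cannot_receive(int kq, int sock)
	{
		struct kevent kev;

		EV_SET(&kev, sock, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
		if (kevent(kq, &kev, 1, &kev, 1, NULL) == -1)
			err(1, "kevent");
		return ((kev.flags & EV_EOF) != 0);
	}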
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
index cd30d5cfae47..ceda770cb714 100644
--- a/sys/kern/vfs_init.c
+++ b/sys/kern/vfs_init.c
@@ -103,6 +103,16 @@ struct vattr va_null;
* Routines having to do with the management of the vnode table.
*/
+void
+vfs_unref_vfsconf(struct vfsconf *vfsp)
+{
+ vfsconf_lock();
+ KASSERT(vfsp->vfc_refcount > 0,
+ ("vfs %p refcount underflow %d", vfsp, vfsp->vfc_refcount));
+ vfsp->vfc_refcount--;
+ vfsconf_unlock();
+}
+
static struct vfsconf *
vfs_byname_locked(const char *name)
{
@@ -123,9 +133,11 @@ vfs_byname(const char *name)
{
struct vfsconf *vfsp;
- vfsconf_slock();
+ vfsconf_lock();
vfsp = vfs_byname_locked(name);
- vfsconf_sunlock();
+ if (vfsp != NULL)
+ vfsp->vfc_refcount++;
+ vfsconf_unlock();
return (vfsp);
}
@@ -387,7 +399,7 @@ vfs_register(struct vfsconf *vfc)
static int once;
struct vfsconf *tvfc;
uint32_t hashval;
- int secondpass;
+ int error, prevmaxconf, secondpass;
if (!once) {
vattr_null(&va_null);
@@ -405,6 +417,7 @@ vfs_register(struct vfsconf *vfc)
return (EEXIST);
}
+ prevmaxconf = maxvfsconf;
if (vfs_typenumhash != 0) {
/*
* Calculate a hash on vfc_name to use for vfc_typenum. Unless
@@ -497,16 +510,24 @@ vfs_register(struct vfsconf *vfc)
vfc->vfc_vfsops = &vfsops_sigdefer;
}
- if (vfc->vfc_flags & VFCF_JAIL)
- prison_add_vfs(vfc);
-
/*
* Call init function for this VFS...
*/
if ((vfc->vfc_flags & VFCF_SBDRY) != 0)
- vfc->vfc_vfsops_sd->vfs_init(vfc);
+ error = vfc->vfc_vfsops_sd->vfs_init(vfc);
else
- vfc->vfc_vfsops->vfs_init(vfc);
+ error = vfc->vfc_vfsops->vfs_init(vfc);
+
+ if (error != 0) {
+ maxvfsconf = prevmaxconf;
+ TAILQ_REMOVE(&vfsconf, vfc, vfc_list);
+ vfsconf_unlock();
+ return (error);
+ }
+
+ if ((vfc->vfc_flags & VFCF_JAIL) != 0)
+ prison_add_vfs(vfc);
+
vfsconf_unlock();
/*
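
The vfs_init.c hunks above change the vfsconf reference discipline: vfs_byname() now bumps vfc_refcount under vfsconf_lock(), so a caller that does not hand the reference over to a mount has to drop it with the new vfs_unref_vfsconf(). A sketch of the expected pairing, with a hypothetical in-kernel caller; only the acquire/release ordering is the point.

	#include <sys/param.h>
	#include <sys/errno.h>
	#include <sys/mount.h>

	static int
	check_fs_type(const char *fstype)
	{
		struct vfsconf *vfsp;

		vfsp = vfs_byname(fstype);	/* returns with vfc_refcount bumped */
		if (vfsp == NULL)
			return (ENOENT);
		/* ... inspect vfsp->vfc_flags, vfsp->vfc_typenum, ... */
		vfs_unref_vfsconf(vfsp);	/* drop the reference taken above */
		return (0);
	}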
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 8e64a7fe966b..13403acacc08 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -683,7 +683,6 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
MPASSERT(mp->mnt_vfs_ops == 1, mp,
("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
(void) vfs_busy(mp, MBF_NOWAIT);
- atomic_add_acq_int(&vfsp->vfc_refcount, 1);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
mp->mnt_stat.f_type = vfsp->vfc_typenum;
@@ -731,7 +730,6 @@ vfs_mount_destroy(struct mount *mp)
__FILE__, __LINE__));
MPPASS(mp->mnt_writeopcount == 0, mp);
MPPASS(mp->mnt_secondary_writes == 0, mp);
- atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
struct vnode *vp;
@@ -769,6 +767,9 @@ vfs_mount_destroy(struct mount *mp)
vfs_free_addrlist(mp->mnt_export);
free(mp->mnt_export, M_MOUNT);
}
+ vfsconf_lock();
+ mp->mnt_vfc->vfc_refcount--;
+ vfsconf_unlock();
crfree(mp->mnt_cred);
uma_zfree(mount_zone, mp);
}
@@ -1133,6 +1134,7 @@ vfs_domount_first(
if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
vput(vp);
+ vfs_unref_vfsconf(vfsp);
return (EPERM);
}
@@ -1169,6 +1171,7 @@ vfs_domount_first(
}
if (error != 0) {
vput(vp);
+ vfs_unref_vfsconf(vfsp);
return (error);
}
vn_seqc_write_begin(vp);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index a6e38be89291..57732ddab7d9 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2186,6 +2186,8 @@ freevnode(struct vnode *vp)
{
struct bufobj *bo;
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+
/*
* The vnode has been marked for destruction, so free it.
*
@@ -2222,12 +2224,16 @@ freevnode(struct vnode *vp)
mac_vnode_destroy(vp);
#endif
if (vp->v_pollinfo != NULL) {
+ int error __diagused;
+
/*
* Use LK_NOWAIT to shut up witness about the lock. We may get
* here while having another vnode locked when trying to
* satisfy a lookup and needing to recycle.
*/
- VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ VNASSERT(error == 0, vp,
+ ("freevnode: cannot lock vp %p for pollinfo destroy", vp));
destroy_vpollinfo(vp->v_pollinfo);
VOP_UNLOCK(vp);
vp->v_pollinfo = NULL;