Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/imgact_elf.c | 44
-rw-r--r--  sys/kern/init_main.c | 11
-rw-r--r--  sys/kern/init_sysent.c | 3
-rw-r--r--  sys/kern/kern_boottrace.c | 2
-rw-r--r--  sys/kern/kern_conf.c | 5
-rw-r--r--  sys/kern/kern_descrip.c | 92
-rw-r--r--  sys/kern/kern_devctl.c | 3
-rw-r--r--  sys/kern/kern_environment.c | 32
-rw-r--r--  sys/kern/kern_event.c | 367
-rw-r--r--  sys/kern/kern_exec.c | 4
-rw-r--r--  sys/kern/kern_exit.c | 48
-rw-r--r--  sys/kern/kern_fork.c | 12
-rw-r--r--  sys/kern/kern_jail.c | 434
-rw-r--r--  sys/kern/kern_jaildesc.c | 413
-rw-r--r--  sys/kern/kern_jailmeta.c | 8
-rw-r--r--  sys/kern/kern_kexec.c | 350
-rw-r--r--  sys/kern/kern_linker.c | 2
-rw-r--r--  sys/kern/kern_lock.c | 6
-rw-r--r--  sys/kern/kern_malloc.c | 15
-rw-r--r--  sys/kern/kern_mutex.c | 33
-rw-r--r--  sys/kern/kern_proc.c | 16
-rw-r--r--  sys/kern/kern_prot.c | 47
-rw-r--r--  sys/kern/kern_racct.c | 4
-rw-r--r--  sys/kern/kern_rangelock.c | 2
-rw-r--r--  sys/kern/kern_rctl.c | 4
-rw-r--r--  sys/kern/kern_rmlock.c | 4
-rw-r--r--  sys/kern/kern_rwlock.c | 4
-rw-r--r--  sys/kern/kern_sharedpage.c | 3
-rw-r--r--  sys/kern/kern_sig.c | 7
-rw-r--r--  sys/kern/kern_sx.c | 15
-rw-r--r--  sys/kern/kern_thr.c | 11
-rw-r--r--  sys/kern/kern_thread.c | 6
-rw-r--r--  sys/kern/kern_time.c | 4
-rw-r--r--  sys/kern/kern_tslog.c | 10
-rw-r--r--  sys/kern/link_elf.c | 6
-rw-r--r--  sys/kern/link_elf_obj.c | 8
-rw-r--r--  sys/kern/md4c.c | 298
-rw-r--r--  sys/kern/md5c.c | 341
-rw-r--r--  sys/kern/subr_asan.c | 1
-rw-r--r--  sys/kern/subr_bus.c | 13
-rw-r--r--  sys/kern/subr_devstat.c | 2
-rw-r--r--  sys/kern/subr_kdb.c | 2
-rw-r--r--  sys/kern/subr_log.c | 1
-rw-r--r--  sys/kern/subr_msan.c | 1
-rw-r--r--  sys/kern/subr_param.c | 13
-rw-r--r--  sys/kern/subr_pcpu.c | 2
-rw-r--r--  sys/kern/subr_power.c | 130
-rw-r--r--  sys/kern/subr_prf.c | 2
-rw-r--r--  sys/kern/subr_smp.c | 15
-rw-r--r--  sys/kern/subr_witness.c | 144
-rw-r--r--  sys/kern/sys_eventfd.c | 7
-rw-r--r--  sys/kern/sys_generic.c | 36
-rw-r--r--  sys/kern/sys_pipe.c | 32
-rw-r--r--  sys/kern/sys_procdesc.c | 3
-rw-r--r--  sys/kern/sys_socket.c | 2
-rw-r--r--  sys/kern/syscalls.c | 3
-rw-r--r--  sys/kern/syscalls.master | 22
-rw-r--r--  sys/kern/systrace_args.c | 78
-rw-r--r--  sys/kern/tty.c | 2
-rw-r--r--  sys/kern/tty_pts.c | 2
-rw-r--r--  sys/kern/uipc_mqueue.c | 8
-rw-r--r--  sys/kern/uipc_socket.c | 3
-rw-r--r--  sys/kern/uipc_usrreq.c | 55
-rw-r--r--  sys/kern/vfs_aio.c | 4
-rw-r--r--  sys/kern/vfs_cache.c | 214
-rw-r--r--  sys/kern/vfs_cluster.c | 4
-rw-r--r--  sys/kern/vfs_default.c | 7
-rw-r--r--  sys/kern/vfs_init.c | 37
-rw-r--r--  sys/kern/vfs_inotify.c | 1
-rw-r--r--  sys/kern/vfs_lookup.c | 14
-rw-r--r--  sys/kern/vfs_mount.c | 7
-rw-r--r--  sys/kern/vfs_mountroot.c | 2
-rw-r--r--  sys/kern/vfs_subr.c | 90
-rw-r--r--  sys/kern/vfs_syscalls.c | 6
-rw-r--r--  sys/kern/vfs_vnops.c | 186
75 files changed, 2588 insertions(+), 1237 deletions(-)
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 5a53fac50f2c..779158b41221 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -92,7 +92,7 @@
#define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE)
static int __elfN(check_header)(const Elf_Ehdr *hdr);
-static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
+static const Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
const char *interp, int32_t *osrel, uint32_t *fctl0);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long *entry);
@@ -104,7 +104,7 @@ static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note,
int32_t *osrel);
static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static bool __elfN(check_note)(struct image_params *imgp,
- Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0,
+ const Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0,
uint32_t *fctl0);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);
@@ -227,11 +227,11 @@ SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx,
CTLFLAG_RWTUN, &__elfN(allow_wx), 0,
"Allow pages to be mapped simultaneously writable and executable");
-static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
+static const Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
#define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a))
-Elf_Brandnote __elfN(freebsd_brandnote) = {
+const Elf_Brandnote __elfN(freebsd_brandnote) = {
.hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
.hdr.n_descsz = sizeof(int32_t),
.hdr.n_type = NT_FREEBSD_ABI_TAG,
@@ -254,7 +254,7 @@ __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
static int GNU_KFREEBSD_ABI_DESC = 3;
-Elf_Brandnote __elfN(kfreebsd_brandnote) = {
+const Elf_Brandnote __elfN(kfreebsd_brandnote) = {
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
.hdr.n_descsz = 16, /* XXX at least 16 */
.hdr.n_type = 1,
@@ -286,7 +286,7 @@ kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
}
int
-__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
+__elfN(insert_brand_entry)(const Elf_Brandinfo *entry)
{
int i;
@@ -305,7 +305,7 @@ __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
}
int
-__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
+__elfN(remove_brand_entry)(const Elf_Brandinfo *entry)
{
int i;
@@ -321,7 +321,7 @@ __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
}
bool
-__elfN(brand_inuse)(Elf_Brandinfo *entry)
+__elfN(brand_inuse)(const Elf_Brandinfo *entry)
{
struct proc *p;
bool rval = false;
@@ -338,12 +338,12 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry)
return (rval);
}
-static Elf_Brandinfo *
+static const Elf_Brandinfo *
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
int32_t *osrel, uint32_t *fctl0)
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
- Elf_Brandinfo *bi, *bi_m;
+ const Elf_Brandinfo *bi, *bi_m;
bool ret, has_fctl0;
int i, interp_name_len;
@@ -492,7 +492,7 @@ __elfN(phdr_in_zero_page)(const Elf_Ehdr *hdr)
static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
- Elf_Brandinfo *bi;
+ const Elf_Brandinfo *bi;
int i;
if (!IS_ELF(*hdr) ||
@@ -1109,7 +1109,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
struct vmspace *vmspace;
vm_map_t map;
char *interp;
- Elf_Brandinfo *brand_info;
+ const Elf_Brandinfo *brand_info;
struct sysentvec *sv;
u_long addr, baddr, entry, proghdr;
u_long maxalign, maxsalign, mapsz, maxv, maxv1, anon_loc;
@@ -1925,7 +1925,7 @@ __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
Elf_Phdr *phdr;
Elf_Shdr *shdr;
struct phdr_closure phc;
- Elf_Brandinfo *bi;
+ const Elf_Brandinfo *bi;
ehdr = (Elf_Ehdr *)hdr;
bi = td->td_proc->p_elf_brandinfo;
@@ -2610,11 +2610,13 @@ note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep)
int structsize;
p = arg;
- size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t);
+ size = sizeof(structsize) +
+ (1 + p->p_ucred->cr_ngroups) * sizeof(gid_t);
if (sb != NULL) {
KASSERT(*sizep == size, ("invalid size"));
structsize = sizeof(gid_t);
sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &p->p_ucred->cr_gid, sizeof(gid_t));
sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups *
sizeof(gid_t));
}
@@ -2829,7 +2831,7 @@ __elfN(parse_notes)(const struct image_params *imgp, const Elf_Note *checknote,
}
if ((const char *)note_end - (const char *)note <
sizeof(Elf_Note)) {
- uprintf("ELF note to short\n");
+ uprintf("ELF note too short\n");
goto retf;
}
if (note->n_namesz != checknote->n_namesz ||
@@ -2837,9 +2839,9 @@ __elfN(parse_notes)(const struct image_params *imgp, const Elf_Note *checknote,
note->n_type != checknote->n_type)
goto nextnote;
note_name = (const char *)(note + 1);
- if (note_name + checknote->n_namesz >=
- (const char *)note_end || strncmp(note_vendor,
- note_name, checknote->n_namesz) != 0)
+ if (note_name + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) +
+ note->n_descsz >= (const char *)note_end ||
+ strncmp(note_vendor, note_name, checknote->n_namesz) != 0)
goto nextnote;
if (cb(note, cb_arg, &res))
@@ -2859,7 +2861,7 @@ ret:
}
struct brandnote_cb_arg {
- Elf_Brandnote *brandnote;
+ const Elf_Brandnote *brandnote;
int32_t *osrel;
};
@@ -2881,7 +2883,7 @@ brandnote_cb(const Elf_Note *note, void *arg0, bool *res)
return (true);
}
-static Elf_Note fctl_note = {
+static const Elf_Note fctl_note = {
.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
.n_descsz = sizeof(uint32_t),
.n_type = NT_FREEBSD_FEATURE_CTL,
@@ -2916,7 +2918,7 @@ note_fctl_cb(const Elf_Note *note, void *arg0, bool *res)
* as for headers.
*/
static bool
-__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
+__elfN(check_note)(struct image_params *imgp, const Elf_Brandnote *brandnote,
int32_t *osrel, bool *has_fctl0, uint32_t *fctl0)
{
const Elf_Phdr *phdr;
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 36ce44b988be..6612ac685936 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -145,13 +145,6 @@ FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance");
#endif
/*
- * This ensures that there is at least one entry so that the sysinit_set
- * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never
- * executed.
- */
-SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
-
-/*
* The sysinit linker set compiled into the kernel. These are placed onto the
* sysinit list by mi_startup; sysinit_add can add (e.g., from klds) additional
* sysinits to the linked list but the linker set here does not change.
@@ -296,7 +289,7 @@ mi_startup(void)
BOOTTRACE_INIT("sysinit 0x%7x", sip->subsystem);
#if defined(VERBOSE_SYSINIT)
- if (sip->subsystem > last && verbose_sysinit != 0) {
+ if (sip->subsystem != last && verbose_sysinit != 0) {
verbose = 1;
printf("subsystem %x\n", sip->subsystem);
}
@@ -391,7 +384,7 @@ C_SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH,
#if __SIZEOF_LONG__ == 4
static const char ilp32_warn[] =
- "WARNING: 32-bit kernels are deprecated and may be removed in FreeBSD 15.0.\n";
+ "WARNING: 32-bit kernels are deprecated and may be removed in FreeBSD 16.0.\n";
C_SYSINIT(ilp32warn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH,
print_caddr_t, ilp32_warn);
C_SYSINIT(ilp32warn2, SI_SUB_LAST, SI_ORDER_FIFTH,
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index fcd232cde21e..cd305de1ed44 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -663,4 +663,7 @@ struct sysent sysent[] = {
{ .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */
{ .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */
{ .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */
+ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */
+ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */
+ { .sy_narg = AS(kexec_load_args), .sy_call = (sy_call_t *)sys_kexec_load, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 599 = kexec_load */
};
diff --git a/sys/kern/kern_boottrace.c b/sys/kern/kern_boottrace.c
index 1fa87955a299..c83255bc74ee 100644
--- a/sys/kern/kern_boottrace.c
+++ b/sys/kern/kern_boottrace.c
@@ -579,7 +579,7 @@ sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
}
static void
-boottrace_init(void)
+boottrace_init(void *dummy __unused)
{
if (!boottrace_enabled)
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
index b891ed84957a..2da51d84ff60 100644
--- a/sys/kern/kern_conf.c
+++ b/sys/kern/kern_conf.c
@@ -664,7 +664,7 @@ prep_cdevsw(struct cdevsw *devsw, int flags)
if ((devsw->d_flags & D_GIANTOK) == 0) {
printf(
"WARNING: Device \"%s\" is Giant locked and may be "
- "deleted before FreeBSD 15.0.\n",
+ "deleted before FreeBSD 16.0.\n",
devsw->d_name == NULL ? "???" : devsw->d_name);
}
if (devsw->d_gianttrick == NULL) {
@@ -1163,6 +1163,9 @@ destroy_devl(struct cdev *dev)
devfs_destroy_cdevpriv(p);
mtx_lock(&cdevpriv_mtx);
}
+ while (cdp->cdp_fdpriv_dtrc != 0) {
+ msleep(&cdp->cdp_fdpriv_dtrc, &cdevpriv_mtx, 0, "cdfdpc", 0);
+ }
mtx_unlock(&cdevpriv_mtx);
dev_lock();
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index a27ab33b34da..a71a601733e5 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -658,6 +658,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = EBADF;
break;
}
+ fsetfl_lock(fp);
do {
tmp = flg = fp->f_flag;
tmp &= ~FCNTLFLAGS;
@@ -665,26 +666,34 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
got_set = tmp & ~flg;
got_cleared = flg & ~tmp;
- tmp = fp->f_flag & FNONBLOCK;
- error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
- if (error != 0)
- goto revert_f_setfl;
- tmp = fp->f_flag & FASYNC;
- error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
- if (error == 0) {
- fdrop(fp, td);
- break;
+ if (((got_set | got_cleared) & FNONBLOCK) != 0) {
+ tmp = fp->f_flag & FNONBLOCK;
+ error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ if (error != 0)
+ goto revert_flags;
+ }
+ if (((got_set | got_cleared) & FASYNC) != 0) {
+ tmp = fp->f_flag & FASYNC;
+ error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
+ if (error != 0)
+ goto revert_nonblock;
}
- atomic_clear_int(&fp->f_flag, FNONBLOCK);
- tmp = 0;
- (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
-revert_f_setfl:
+ fsetfl_unlock(fp);
+ fdrop(fp, td);
+ break;
+revert_nonblock:
+ if (((got_set | got_cleared) & FNONBLOCK) != 0) {
+ tmp = ~fp->f_flag & FNONBLOCK;
+ (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ }
+revert_flags:
do {
tmp = flg = fp->f_flag;
tmp &= ~FCNTLFLAGS;
tmp |= got_cleared;
tmp &= ~got_set;
} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
+ fsetfl_unlock(fp);
fdrop(fp, td);
break;
@@ -2477,7 +2486,7 @@ fdunshare(struct thread *td)
if (refcount_load(&p->p_fd->fd_refcnt) == 1)
return;
- tmp = fdcopy(p->p_fd);
+ tmp = fdcopy(p->p_fd, p);
fdescfree(td);
p->p_fd = tmp;
}
@@ -2506,14 +2515,17 @@ pdunshare(struct thread *td)
* this is to ease callers, not catch errors.
*/
struct filedesc *
-fdcopy(struct filedesc *fdp)
+fdcopy(struct filedesc *fdp, struct proc *p1)
{
struct filedesc *newfdp;
struct filedescent *nfde, *ofde;
+ struct file *fp;
int i, lastfile;
+ bool fork_pass;
MPASS(fdp != NULL);
+ fork_pass = false;
newfdp = fdinit();
FILEDESC_SLOCK(fdp);
for (;;) {
@@ -2524,10 +2536,35 @@ fdcopy(struct filedesc *fdp)
fdgrowtable(newfdp, lastfile + 1);
FILEDESC_SLOCK(fdp);
}
- /* copy all passable descriptors (i.e. not kqueue) */
+
+ /*
+ * Copy all passable descriptors (i.e. not kqueue), and
+ * prepare to handle copyable but not passable descriptors
+ * (kqueues).
+ *
+ * The pass to handle copying is performed after all passable
+ * files are installed into the new file descriptor's table,
+ * since kqueues need all referenced file descriptors already
+ * valid, including other kqueues. For the same reason the
+ * copying is done in two passes by itself, first installing
+ * not fully initialized ('empty') copyable files into the new
+ * fd table, and then giving the subsystems a second chance to
+ * really fill the copied file backing structure with the
+ * content.
+ */
newfdp->fd_freefile = fdp->fd_freefile;
FILEDESC_FOREACH_FDE(fdp, i, ofde) {
- if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
+ const struct fileops *ops;
+
+ ops = ofde->fde_file->f_ops;
+ fp = NULL;
+ if ((ops->fo_flags & DFLAG_FORK) != 0 &&
+ (ofde->fde_flags & UF_FOCLOSE) == 0) {
+ if (ops->fo_fork(newfdp, ofde->fde_file, &fp, p1,
+ curthread) != 0)
+ continue;
+ fork_pass = true;
+ } else if ((ops->fo_flags & DFLAG_PASSABLE) == 0 ||
(ofde->fde_flags & UF_FOCLOSE) != 0 ||
!fhold(ofde->fde_file)) {
if (newfdp->fd_freefile == fdp->fd_freefile)
@@ -2536,11 +2573,30 @@ fdcopy(struct filedesc *fdp)
}
nfde = &newfdp->fd_ofiles[i];
*nfde = *ofde;
+ if (fp != NULL)
+ nfde->fde_file = fp;
filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
fdused_init(newfdp, i);
}
MPASS(newfdp->fd_freefile != -1);
FILEDESC_SUNLOCK(fdp);
+
+ /*
+ * Now handle copying kqueues, since all fds, including
+ * kqueues, are in place.
+ */
+ if (__predict_false(fork_pass)) {
+ FILEDESC_FOREACH_FDE(newfdp, i, nfde) {
+ const struct fileops *ops;
+
+ ops = nfde->fde_file->f_ops;
+ if ((ops->fo_flags & DFLAG_FORK) == 0 ||
+ nfde->fde_file == NULL)
+ continue;
+ ops->fo_fork(newfdp, NULL, &nfde->fde_file, p1,
+ curthread);
+ }
+ }
return (newfdp);
}
@@ -5250,6 +5306,8 @@ file_type_to_name(short type)
return ("eventfd");
case DTYPE_TIMERFD:
return ("timerfd");
+ case DTYPE_JAILDESC:
+ return ("jail");
default:
return ("unkn");
}
diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c
index 7a2818c29b1a..a37cb23efed8 100644
--- a/sys/kern/kern_devctl.c
+++ b/sys/kern/kern_devctl.c
@@ -130,6 +130,7 @@ static const struct filterops devctl_rfiltops = {
.f_isfd = 1,
.f_detach = filt_devctl_detach,
.f_event = filt_devctl_read,
+ .f_copy = knote_triv_copy,
};
static struct cdev *devctl_dev;
@@ -140,7 +141,7 @@ static struct devctlbridge {
} devctl_notify_hook = { .send_f = NULL };
static void
-devctl_init(void)
+devctl_init(void *dummy __unused)
{
int reserve;
uma_zone_t z;
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
index 0cb0f566a839..7c0654769581 100644
--- a/sys/kern/kern_environment.c
+++ b/sys/kern/kern_environment.c
@@ -1098,65 +1098,65 @@ kernenv_next(char *cp)
}
void
-tunable_int_init(void *data)
+tunable_int_init(const void *data)
{
- struct tunable_int *d = (struct tunable_int *)data;
+ const struct tunable_int *d = data;
TUNABLE_INT_FETCH(d->path, d->var);
}
void
-tunable_long_init(void *data)
+tunable_long_init(const void *data)
{
- struct tunable_long *d = (struct tunable_long *)data;
+ const struct tunable_long *d = data;
TUNABLE_LONG_FETCH(d->path, d->var);
}
void
-tunable_ulong_init(void *data)
+tunable_ulong_init(const void *data)
{
- struct tunable_ulong *d = (struct tunable_ulong *)data;
+ const struct tunable_ulong *d = data;
TUNABLE_ULONG_FETCH(d->path, d->var);
}
void
-tunable_int64_init(void *data)
+tunable_int64_init(const void *data)
{
- struct tunable_int64 *d = (struct tunable_int64 *)data;
+ const struct tunable_int64 *d = data;
TUNABLE_INT64_FETCH(d->path, d->var);
}
void
-tunable_uint64_init(void *data)
+tunable_uint64_init(const void *data)
{
- struct tunable_uint64 *d = (struct tunable_uint64 *)data;
+ const struct tunable_uint64 *d = data;
TUNABLE_UINT64_FETCH(d->path, d->var);
}
void
-tunable_quad_init(void *data)
+tunable_quad_init(const void *data)
{
- struct tunable_quad *d = (struct tunable_quad *)data;
+ const struct tunable_quad *d = data;
TUNABLE_QUAD_FETCH(d->path, d->var);
}
void
-tunable_bool_init(void *data)
+tunable_bool_init(const void *data)
{
- struct tunable_bool *d = (struct tunable_bool *)data;
+ const struct tunable_bool *d = data;
TUNABLE_BOOL_FETCH(d->path, d->var);
}
void
-tunable_str_init(void *data)
+tunable_str_init(const void *data)
{
- struct tunable_str *d = (struct tunable_str *)data;
+ const struct tunable_str *d = data;
TUNABLE_STR_FETCH(d->path, d->var, d->size);
}
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index eb77a5064113..1baa24d278bf 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -50,6 +50,8 @@
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/jaildesc.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
@@ -132,6 +134,7 @@ static fo_kqfilter_t kqueue_kqfilter;
static fo_stat_t kqueue_stat;
static fo_close_t kqueue_close;
static fo_fill_kinfo_t kqueue_fill_kinfo;
+static fo_fork_t kqueue_fork;
static const struct fileops kqueueops = {
.fo_read = invfo_rdwr,
@@ -146,7 +149,9 @@ static const struct fileops kqueueops = {
.fo_chown = invfo_chown,
.fo_sendfile = invfo_sendfile,
.fo_cmp = file_kcmp_generic,
+ .fo_fork = kqueue_fork,
.fo_fill_kinfo = kqueue_fill_kinfo,
+ .fo_flags = DFLAG_FORK,
};
static int knote_attach(struct knote *kn, struct kqueue *kq);
@@ -154,7 +159,7 @@ static void knote_drop(struct knote *kn, struct thread *td);
static void knote_drop_detached(struct knote *kn, struct thread *td);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
-static void knote_init(void);
+static void knote_init(void *);
static struct knote *knote_alloc(int mflag);
static void knote_free(struct knote *kn);
@@ -163,6 +168,9 @@ static int filt_kqueue(struct knote *kn, long hint);
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
+static int filt_jailattach(struct knote *kn);
+static void filt_jaildetach(struct knote *kn);
+static int filt_jail(struct knote *kn, long hint);
static int filt_fileattach(struct knote *kn);
static void filt_timerexpire(void *knx);
static void filt_timerexpire_l(struct knote *kn, bool proc_locked);
@@ -171,6 +179,7 @@ static void filt_timerdetach(struct knote *kn);
static void filt_timerstart(struct knote *kn, sbintime_t to);
static void filt_timertouch(struct knote *kn, struct kevent *kev,
u_long type);
+static int filt_timercopy(struct knote *kn, struct proc *p1);
static int filt_timervalidate(struct knote *kn, sbintime_t *to);
static int filt_timer(struct knote *kn, long hint);
static int filt_userattach(struct knote *kn);
@@ -182,11 +191,13 @@ static void filt_usertouch(struct knote *kn, struct kevent *kev,
static const struct filterops file_filtops = {
.f_isfd = 1,
.f_attach = filt_fileattach,
+ .f_copy = knote_triv_copy,
};
static const struct filterops kqread_filtops = {
.f_isfd = 1,
.f_detach = filt_kqdetach,
.f_event = filt_kqueue,
+ .f_copy = knote_triv_copy,
};
/* XXX - move to kern_proc.c? */
static const struct filterops proc_filtops = {
@@ -194,6 +205,14 @@ static const struct filterops proc_filtops = {
.f_attach = filt_procattach,
.f_detach = filt_procdetach,
.f_event = filt_proc,
+ .f_copy = knote_triv_copy,
+};
+static const struct filterops jail_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_jailattach,
+ .f_detach = filt_jaildetach,
+ .f_event = filt_jail,
+ .f_copy = knote_triv_copy,
};
static const struct filterops timer_filtops = {
.f_isfd = 0,
@@ -201,12 +220,14 @@ static const struct filterops timer_filtops = {
.f_detach = filt_timerdetach,
.f_event = filt_timer,
.f_touch = filt_timertouch,
+ .f_copy = filt_timercopy,
};
static const struct filterops user_filtops = {
.f_attach = filt_userattach,
.f_detach = filt_userdetach,
.f_event = filt_user,
.f_touch = filt_usertouch,
+ .f_copy = knote_triv_copy,
};
static uma_zone_t knote_zone;
@@ -336,6 +357,7 @@ filt_nullattach(struct knote *kn)
static const struct filterops null_filtops = {
.f_isfd = 0,
.f_attach = filt_nullattach,
+ .f_copy = knote_triv_copy,
};
/* XXX - make SYSINIT to add these, and move into respective modules. */
@@ -365,6 +387,8 @@ static struct {
[~EVFILT_USER] = { &user_filtops, 1 },
[~EVFILT_SENDFILE] = { &null_filtops },
[~EVFILT_EMPTY] = { &file_filtops, 1 },
+ [~EVFILT_JAIL] = { &jail_filtops, 1 },
+ [~EVFILT_JAILDESC] = { &file_filtops, 1 },
};
/*
@@ -614,6 +638,86 @@ knote_fork(struct knlist *list, int pid)
}
}
+int
+filt_jailattach(struct knote *kn)
+{
+ struct prison *pr;
+
+ if (kn->kn_id == 0) {
+ /* Let jid=0 watch the current prison (including prison0). */
+ pr = curthread->td_ucred->cr_prison;
+ mtx_lock(&pr->pr_mtx);
+ } else {
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(curthread->td_ucred->cr_prison,
+ kn->kn_id);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENOENT);
+ if (!prison_isalive(pr)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (ENOENT);
+ }
+ }
+ kn->kn_ptr.p_prison = pr;
+ kn->kn_flags |= EV_CLEAR;
+ knlist_add(pr->pr_klist, kn, 1);
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+void
+filt_jaildetach(struct knote *kn)
+{
+ if (kn->kn_ptr.p_prison != NULL) {
+ knlist_remove(kn->kn_knlist, kn, 0);
+ kn->kn_ptr.p_prison = NULL;
+ } else
+ kn->kn_status |= KN_DETACHED;
+}
+
+int
+filt_jail(struct knote *kn, long hint)
+{
+ struct prison *pr;
+ u_int event;
+
+ pr = kn->kn_ptr.p_prison;
+ if (pr == NULL) /* already activated, from attach filter */
+ return (0);
+
+ /*
+ * Mask off extra data. In the NOTE_JAIL_CHILD case, that's
+ * everything except the NOTE_JAIL_CHILD bit itself, since a
+ * JID is any positive integer.
+ */
+ event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD :
+ (u_int)hint & NOTE_JAIL_CTRLMASK;
+
+ /* If the user is interested in this event, record it. */
+ if (kn->kn_sfflags & event) {
+ kn->kn_fflags |= event;
+ /* Report the created jail id or attached process id. */
+ if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) {
+ if (kn->kn_data != 0)
+ kn->kn_fflags |= NOTE_JAIL_MULTI;
+ kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U :
+ (u_int)hint & ~event;
+ }
+ }
+
+ /* Prison is gone, so flag the event as finished. */
+ if (event == NOTE_JAIL_REMOVE) {
+ kn->kn_flags |= EV_EOF | EV_ONESHOT;
+ kn->kn_ptr.p_prison = NULL;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
+ return (1);
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
/*
* XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
* interval timer support code.
@@ -847,6 +951,30 @@ filt_timerattach(struct knote *kn)
return (0);
}
+static int
+filt_timercopy(struct knote *kn, struct proc *p)
+{
+ struct kq_timer_cb_data *kc_src, *kc;
+
+ if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
+ atomic_subtract_int(&kq_ncallouts, 1);
+ return (ENOMEM);
+ }
+
+ kn->kn_status &= ~KN_DETACHED;
+ kc_src = kn->kn_ptr.p_v;
+ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
+ kc->kn = kn;
+ kc->p = p;
+ kc->flags = kc_src->flags & ~KQ_TIMER_CB_ENQUEUED;
+ kc->next = kc_src->next;
+ kc->to = kc_src->to;
+ kc->cpuid = PCPU_GET(cpuid);
+ callout_init(&kc->c, 1);
+ kqtimer_sched_callout(kc);
+ return (0);
+}
+
static void
filt_timerstart(struct knote *kn, sbintime_t to)
{
@@ -1058,7 +1186,7 @@ int
sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
- return (kern_kqueue(td, 0, NULL));
+ return (kern_kqueue(td, 0, false, NULL));
}
int
@@ -1066,55 +1194,76 @@ sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
{
int flags;
- if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
+ if ((uap->flags & ~(KQUEUE_CLOEXEC | KQUEUE_CPONFORK)) != 0)
return (EINVAL);
flags = 0;
if ((uap->flags & KQUEUE_CLOEXEC) != 0)
flags |= O_CLOEXEC;
- return (kern_kqueue(td, flags, NULL));
+ return (kern_kqueue(td, flags, (uap->flags & KQUEUE_CPONFORK) != 0,
+ NULL));
}
static void
-kqueue_init(struct kqueue *kq)
+kqueue_init(struct kqueue *kq, bool cponfork)
{
mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
TAILQ_INIT(&kq->kq_head);
knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+ if (cponfork)
+ kq->kq_state |= KQ_CPONFORK;
}
-int
-kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
+static int
+kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
+ struct file **fpp, int flags, struct filecaps *fcaps, bool cponfork,
+ struct kqueue **kqp)
{
- struct filedesc *fdp;
- struct kqueue *kq;
- struct file *fp;
struct ucred *cred;
- int fd, error;
+ struct kqueue *kq;
+ int error;
- fdp = td->td_proc->p_fd;
cred = td->td_ucred;
if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
return (ENOMEM);
- error = falloc_caps(td, &fp, &fd, flags, fcaps);
+ error = fdip != NULL ? falloc_caps(td, fpp, fdip, flags, fcaps) :
+ _falloc_noinstall(td, fpp, 1);
if (error != 0) {
chgkqcnt(cred->cr_ruidinfo, -1, 0);
return (error);
}
/* An extra reference on `fp' has been held for us by falloc(). */
- kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
- kqueue_init(kq);
+ kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO);
+ kqueue_init(kq, cponfork);
kq->kq_fdp = fdp;
kq->kq_cred = crhold(cred);
- FILEDESC_XLOCK(fdp);
+ if (fdip != NULL)
+ FILEDESC_XLOCK(fdp);
TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
- FILEDESC_XUNLOCK(fdp);
+ if (fdip != NULL)
+ FILEDESC_XUNLOCK(fdp);
+
+ finit(*fpp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
+ *kqp = kq;
+ return (0);
+}
+
+int
+kern_kqueue(struct thread *td, int flags, bool cponfork, struct filecaps *fcaps)
+{
+ struct kqueue *kq;
+ struct file *fp;
+ int fd, error;
+
+ error = kern_kqueue_alloc(td, td->td_proc->p_fd, &fd, &fp, flags,
+ fcaps, cponfork, &kq);
+ if (error != 0)
+ return (error);
- finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
fdrop(fp, td);
td->td_retval[0] = fd;
@@ -1395,7 +1544,7 @@ kern_kevent_anonymous(struct thread *td, int nevents,
struct kqueue kq = {};
int error;
- kqueue_init(&kq);
+ kqueue_init(&kq, false);
kq.kq_refcnt = 1;
error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
kqueue_drain(&kq, td);
@@ -1483,7 +1632,7 @@ kqueue_fo_release(int filt)
mtx_lock(&filterops_lock);
KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
- ("filter object refcount not valid on release"));
+ ("filter object %d refcount not valid on release", filt));
sysfilt_ops[~filt].for_refcnt--;
mtx_unlock(&filterops_lock);
}
@@ -1762,17 +1911,8 @@ done:
}
static int
-kqueue_acquire(struct file *fp, struct kqueue **kqp)
+kqueue_acquire_ref(struct kqueue *kq)
{
- int error;
- struct kqueue *kq;
-
- error = 0;
-
- kq = fp->f_data;
- if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
- return (EBADF);
- *kqp = kq;
KQ_LOCK(kq);
if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
KQ_UNLOCK(kq);
@@ -1780,8 +1920,22 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp)
}
kq->kq_refcnt++;
KQ_UNLOCK(kq);
+ return (0);
+}
- return error;
+static int
+kqueue_acquire(struct file *fp, struct kqueue **kqp)
+{
+ struct kqueue *kq;
+ int error;
+
+ kq = fp->f_data;
+ if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
+ return (EINVAL);
+ error = kqueue_acquire_ref(kq);
+ if (error == 0)
+ *kqp = kq;
+ return (error);
}
static void
@@ -2794,12 +2948,13 @@ knote_dequeue(struct knote *kn)
}
static void
-knote_init(void)
+knote_init(void *dummy __unused)
{
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
+ prison0.pr_klist = knlist_alloc(&prison0.pr_mtx);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
@@ -2843,6 +2998,152 @@ noacquire:
return (error);
}
+static int
+kqueue_fork_alloc(struct filedesc *fdp, struct file *fp, struct file **fp1,
+ struct thread *td)
+{
+ struct kqueue *kq, *kq1;
+ int error;
+
+ MPASS(fp->f_type == DTYPE_KQUEUE);
+ kq = fp->f_data;
+ if ((kq->kq_state & KQ_CPONFORK) == 0)
+ return (EOPNOTSUPP);
+ error = kqueue_acquire_ref(kq);
+ if (error != 0)
+ return (error);
+ error = kern_kqueue_alloc(td, fdp, NULL, fp1, 0, NULL, true, &kq1);
+ if (error == 0) {
+ kq1->kq_forksrc = kq;
+ (*fp1)->f_flag = fp->f_flag & (FREAD | FWRITE | FEXEC |
+ O_CLOEXEC | O_CLOFORK);
+ } else {
+ kqueue_release(kq, 0);
+ }
+ return (error);
+}
+
+static void
+kqueue_fork_copy_knote(struct kqueue *kq1, struct knote *kn, struct proc *p1,
+ struct filedesc *fdp)
+{
+ struct knote *kn1;
+ const struct filterops *fop;
+ int error;
+
+ fop = kn->kn_fop;
+ if (fop->f_copy == NULL || (fop->f_isfd &&
+ fdp->fd_files->fdt_ofiles[kn->kn_kevent.ident].fde_file == NULL))
+ return;
+ error = kqueue_expand(kq1, fop, kn->kn_kevent.ident, M_WAITOK);
+ if (error != 0)
+ return;
+
+ kn1 = knote_alloc(M_WAITOK);
+ *kn1 = *kn;
+ kn1->kn_status |= KN_DETACHED;
+ kn1->kn_status &= ~KN_QUEUED;
+ kn1->kn_kq = kq1;
+ error = fop->f_copy(kn1, p1);
+ if (error != 0) {
+ knote_free(kn1);
+ return;
+ }
+ (void)kqueue_fo_find(kn->kn_kevent.filter);
+ if (fop->f_isfd && !fhold(kn1->kn_fp)) {
+ fop->f_detach(kn1);
+ kqueue_fo_release(kn->kn_kevent.filter);
+ knote_free(kn1);
+ return;
+ }
+ if (kn->kn_knlist != NULL)
+ knlist_add(kn->kn_knlist, kn1, 0);
+ KQ_LOCK(kq1);
+ knote_attach(kn1, kq1);
+ kn1->kn_influx = 0;
+ if ((kn->kn_status & KN_QUEUED) != 0)
+ knote_enqueue(kn1);
+ KQ_UNLOCK(kq1);
+}
+
+static void
+kqueue_fork_copy_list(struct klist *knlist, struct knote *marker,
+ struct kqueue *kq, struct kqueue *kq1, struct proc *p1,
+ struct filedesc *fdp)
+{
+ struct knote *kn;
+
+ KQ_OWNED(kq);
+ kn = SLIST_FIRST(knlist);
+ while (kn != NULL) {
+ if ((kn->kn_status & KN_DETACHED) != 0 ||
+ (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0)) {
+ kn = SLIST_NEXT(kn, kn_link);
+ continue;
+ }
+ kn_enter_flux(kn);
+ SLIST_INSERT_AFTER(kn, marker, kn_link);
+ KQ_UNLOCK(kq);
+ kqueue_fork_copy_knote(kq1, kn, p1, fdp);
+ KQ_LOCK(kq);
+ kn_leave_flux(kn);
+ kn = SLIST_NEXT(marker, kn_link);
+ /* XXXKIB switch kn_link to LIST? */
+ SLIST_REMOVE(knlist, marker, knote, kn_link);
+ }
+}
+
+static int
+kqueue_fork_copy(struct filedesc *fdp, struct file *fp, struct file *fp1,
+ struct proc *p1, struct thread *td)
+{
+ struct kqueue *kq, *kq1;
+ struct knote *marker;
+ int error, i;
+
+ error = 0;
+ MPASS(fp == NULL);
+ MPASS(fp1->f_type == DTYPE_KQUEUE);
+
+ kq1 = fp1->f_data;
+ kq = kq1->kq_forksrc;
+ marker = knote_alloc(M_WAITOK);
+ marker->kn_status = KN_MARKER;
+
+ KQ_LOCK(kq);
+ for (i = 0; i < kq->kq_knlistsize; i++) {
+ kqueue_fork_copy_list(&kq->kq_knlist[i], marker, kq, kq1,
+ p1, fdp);
+ }
+ if (kq->kq_knhashmask != 0) {
+ for (i = 0; i <= kq->kq_knhashmask; i++) {
+ kqueue_fork_copy_list(&kq->kq_knhash[i], marker, kq,
+ kq1, p1, fdp);
+ }
+ }
+ kqueue_release(kq, 1);
+ kq1->kq_forksrc = NULL;
+ KQ_UNLOCK(kq);
+
+ knote_free(marker);
+ return (error);
+}
+
+static int
+kqueue_fork(struct filedesc *fdp, struct file *fp, struct file **fp1,
+ struct proc *p1, struct thread *td)
+{
+ if (*fp1 == NULL)
+ return (kqueue_fork_alloc(fdp, fp, fp1, td));
+ return (kqueue_fork_copy(fdp, fp, *fp1, p1, td));
+}
+
+int
+knote_triv_copy(struct knote *kn __unused, struct proc *p1 __unused)
+{
+ return (0);
+}
+
struct knote_status_export_bit {
int kn_status_bit;
int knt_status_bit;
@@ -3033,7 +3334,7 @@ sysctl_kern_proc_kqueue(SYSCTL_HANDLER_ARGS)
return (error);
td = curthread;
-#ifdef FREEBSD_COMPAT32
+#ifdef COMPAT_FREEBSD32
compat32 = SV_CURPROC_FLAG(SV_ILP32);
#else
compat32 = false;
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 0fc2d0e7f1bc..2bdd6faa025a 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -418,7 +418,7 @@ do_execve(struct thread *td, struct image_args *args, struct mac *mac_p,
#endif
int error, i, orig_osrel;
uint32_t orig_fctl0;
- Elf_Brandinfo *orig_brandinfo;
+ const Elf_Brandinfo *orig_brandinfo;
size_t freepath_size;
static const char fexecv_proc_title[] = "(fexecv)";
@@ -1314,7 +1314,7 @@ exec_map_stack(struct image_params *imgp)
MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
} else {
sharedpage_addr = sv->sv_shared_page_base;
- vm_map_fixed(map, obj, 0,
+ error = vm_map_fixed(map, obj, 0,
sharedpage_addr, sv->sv_shared_page_len,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_READ | VM_PROT_EXECUTE,
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index a32b5a1b3354..c4b1c8201ff2 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -127,6 +127,27 @@ proc_realparent(struct proc *child)
return (parent);
}
+static void
+reaper_clear(struct proc *p, struct proc *rp)
+{
+ struct proc *p1;
+ bool clear;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ LIST_REMOVE(p, p_reapsibling);
+ if (p->p_reapsubtree == 1)
+ return;
+ clear = true;
+ LIST_FOREACH(p1, &rp->p_reaplist, p_reapsibling) {
+ if (p1->p_reapsubtree == p->p_reapsubtree) {
+ clear = false;
+ break;
+ }
+ }
+ if (clear)
+ proc_id_clear(PROC_ID_REAP, p->p_reapsubtree);
+}
+
void
reaper_abandon_children(struct proc *p, bool exiting)
{
@@ -138,7 +159,7 @@ reaper_abandon_children(struct proc *p, bool exiting)
return;
p1 = p->p_reaper;
LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
- LIST_REMOVE(p2, p_reapsibling);
+ reaper_clear(p2, p);
p2->p_reaper = p1;
p2->p_reapsubtree = p->p_reapsubtree;
LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling);
@@ -152,27 +173,6 @@ reaper_abandon_children(struct proc *p, bool exiting)
p->p_treeflag &= ~P_TREE_REAPER;
}
-static void
-reaper_clear(struct proc *p)
-{
- struct proc *p1;
- bool clear;
-
- sx_assert(&proctree_lock, SX_LOCKED);
- LIST_REMOVE(p, p_reapsibling);
- if (p->p_reapsubtree == 1)
- return;
- clear = true;
- LIST_FOREACH(p1, &p->p_reaper->p_reaplist, p_reapsibling) {
- if (p1->p_reapsubtree == p->p_reapsubtree) {
- clear = false;
- break;
- }
- }
- if (clear)
- proc_id_clear(PROC_ID_REAP, p->p_reapsubtree);
-}
-
void
proc_clear_orphan(struct proc *p)
{
@@ -807,7 +807,7 @@ kern_abort2(struct thread *td, const char *why, int nargs, void **uargs)
}
if (nargs > 0) {
sbuf_putc(sb, '(');
- for (i = 0;i < nargs; i++)
+ for (i = 0; i < nargs; i++)
sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
sbuf_putc(sb, ')');
}
@@ -972,7 +972,7 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options)
sx_xunlock(PIDHASHLOCK(p->p_pid));
LIST_REMOVE(p, p_sibling);
reaper_abandon_children(p, true);
- reaper_clear(p);
+ reaper_clear(p, p->p_reaper);
PROC_LOCK(p);
proc_clear_orphan(p);
PROC_UNLOCK(p);
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 2ab9b363f8b5..8b237b6dbd17 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -423,7 +423,7 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *
pd = pdshare(p1->p_pd);
else
pd = pdcopy(p1->p_pd);
- fd = fdcopy(p1->p_fd);
+ fd = fdcopy(p1->p_fd, p2);
fdtol = NULL;
} else {
if (fr->fr_flags2 & FR2_SHARE_PATHS)
@@ -610,10 +610,12 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *
p2->p_flag |= p1->p_flag & P_SUGID;
td2->td_pflags |= td->td_pflags & (TDP_ALTSTACK | TDP_SIGFASTBLOCK);
td2->td_pflags2 |= td->td_pflags2 & TDP2_UEXTERR;
- SESS_LOCK(p1->p_session);
- if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
- p2->p_flag |= P_CONTROLT;
- SESS_UNLOCK(p1->p_session);
+ if (p1->p_flag & P_CONTROLT) {
+ SESS_LOCK(p1->p_session);
+ if (p1->p_session->s_ttyvp != NULL)
+ p2->p_flag |= P_CONTROLT;
+ SESS_UNLOCK(p1->p_session);
+ }
if (fr->fr_flags & RFPPWAIT)
p2->p_flag |= P_PPWAIT;
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 7c9a15ae18f3..267b60ffb5bc 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -39,15 +39,18 @@
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/errno.h>
+#include <sys/file.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/osd.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/epoch.h>
+#include <sys/event.h>
#include <sys/taskqueue.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
+#include <sys/jaildesc.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/mman.h>
@@ -154,7 +157,8 @@ static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
-static void prison_cleanup(struct prison *pr);
+static void prison_cleanup_locked(struct prison *pr);
+static void prison_cleanup_unlocked(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
@@ -167,6 +171,7 @@ static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
+static void prison_knote(struct prison *pr, long hint);
/* Flags for prison_deref */
#define PD_DEREF 0x01 /* Decrement pr_ref */
@@ -238,6 +243,9 @@ static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
{"allow.unprivileged_parent_tampering",
"allow.nounprivileged_parent_tampering",
PR_ALLOW_UNPRIV_PARENT_TAMPER},
+#ifdef AUDIT
+ {"allow.setaudit", "allow.nosetaudit", PR_ALLOW_SETAUDIT},
+#endif
};
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
@@ -985,6 +993,7 @@ prison_ip_cnt(const struct prison *pr, const pr_family_t af)
int
kern_jail_set(struct thread *td, struct uio *optuio, int flags)
{
+ struct file *jfp_out;
struct nameidata nd;
#ifdef INET
struct prison_ip *ip4;
@@ -995,6 +1004,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
struct vfsopt *opt;
struct vfsoptlist *opts;
struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
+ struct ucred *jdcred;
struct vnode *root;
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
char *g_path, *osrelstr;
@@ -1008,7 +1018,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
int created, cuflags, descend, drflags, enforce;
int error, errmsg_len, errmsg_pos;
int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
- int deadid, jid, jsys, len, level;
+ int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level;
int childmax, osreldt, rsnum, slevel;
#ifdef INET
int ip4s;
@@ -1018,22 +1028,32 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
int ip6s;
bool redo_ip6;
#endif
+ bool maybe_changed;
uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
uint64_t pr_allow_diff;
unsigned tallow;
char numbuf[12];
- error = priv_check(td, PRIV_JAIL_SET);
- if (!error && (flags & JAIL_ATTACH))
- error = priv_check(td, PRIV_JAIL_ATTACH);
- if (error)
- return (error);
mypr = td->td_ucred->cr_prison;
- if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
+ if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) &&
+ mypr->pr_childmax == 0)
return (EPERM);
if (flags & ~JAIL_SET_MASK)
return (EINVAL);
+ if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) ==
+ (JAIL_USE_DESC | JAIL_AT_DESC))
+ return (EINVAL);
+ prison_hold(mypr);
+#ifdef INET
+ ip4 = NULL;
+#endif
+#ifdef INET6
+ ip6 = NULL;
+#endif
+ g_path = NULL;
+ jfp_out = NULL;
+ jfd_out = -1;
/*
* Check all the parameters before committing to anything. Not all
* errors can be caught early, but we may as well try. Also, this
@@ -1046,14 +1066,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
*/
error = vfs_buildopts(optuio, &opts);
if (error)
- return (error);
-#ifdef INET
- ip4 = NULL;
-#endif
-#ifdef INET6
- ip6 = NULL;
-#endif
- g_path = NULL;
+ goto done_free;
cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
if (!cuflags) {
@@ -1062,6 +1075,62 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
goto done_errmsg;
}
+ error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
+ if (error == ENOENT) {
+ if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
+ JAIL_OWN_DESC)) {
+ vfs_opterror(opts, "missing desc");
+ goto done_errmsg;
+ }
+ jfd_in = -1;
+ } else if (error != 0)
+ goto done_free;
+ else {
+ if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
+ JAIL_OWN_DESC))) {
+ error = EINVAL;
+ vfs_opterror(opts, "unexpected desc");
+ goto done_errmsg;
+ }
+ if (flags & JAIL_AT_DESC) {
+ /*
+ * Look up and create jails based on the
+ * descriptor's prison.
+ */
+ prison_free(mypr);
+ error = jaildesc_find(td, jfd_in, &mypr, NULL);
+ if (error != 0) {
+ vfs_opterror(opts, error == ENOENT ?
+ "descriptor to dead jail" :
+ "not a jail descriptor");
+ goto done_errmsg;
+ }
+ if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) {
+ error = EPERM;
+ goto done_free;
+ }
+ }
+ if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
+ /* Allocate a jail descriptor to return later. */
+ error = jaildesc_alloc(td, &jfp_out, &jfd_out,
+ flags & JAIL_OWN_DESC);
+ if (error)
+ goto done_free;
+ }
+ }
+
+ /*
+ * Delay the permission check if using a jail descriptor,
+ * until we get the descriptor's credentials.
+ */
+ if (!(flags & JAIL_USE_DESC)) {
+ error = priv_check(td, PRIV_JAIL_SET);
+ if (error == 0 && (flags & JAIL_ATTACH))
+ error = priv_check(td, PRIV_JAIL_ATTACH);
+ if (error)
+ goto done_free;
+ }
+
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == ENOENT)
jid = 0;
@@ -1422,6 +1491,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
pr = NULL;
inspr = NULL;
deadpr = NULL;
+ maybe_changed = false;
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
namelc = strrchr(name, '.');
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
@@ -1436,7 +1506,45 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
error = EAGAIN;
goto done_deref;
}
- if (jid != 0) {
+ if (flags & JAIL_USE_DESC) {
+ /* Get the jail from its descriptor. */
+ error = jaildesc_find(td, jfd_in, &pr, &jdcred);
+ if (error) {
+ vfs_opterror(opts, error == ENOENT ?
+ "descriptor to dead jail" :
+ "not a jail descriptor");
+ goto done_deref;
+ }
+ drflags |= PD_DEREF;
+ error = priv_check_cred(jdcred, PRIV_JAIL_SET);
+ if (error == 0 && (flags & JAIL_ATTACH))
+ error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
+ crfree(jdcred);
+ if (error)
+ goto done_deref;
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+ if (cuflags == JAIL_CREATE) {
+ error = EEXIST;
+ vfs_opterror(opts, "jail %d already exists",
+ pr->pr_id);
+ goto done_deref;
+ }
+ if (!prison_isalive(pr)) {
+ /* While a jid can be resurrected, the prison
+ * itself cannot.
+ */
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying", pr->pr_id);
+ goto done_deref;
+ }
+ if (jid != 0 && jid != pr->pr_id) {
+ error = EINVAL;
+ vfs_opterror(opts, "cannot change jid");
+ goto done_deref;
+ }
+ jid = pr->pr_id;
+ } else if (jid != 0) {
if (jid < 0) {
error = EINVAL;
vfs_opterror(opts, "negative jid");
@@ -1570,7 +1678,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
}
}
}
- /* Update: must provide a jid or name. */
+ /* Update: must provide a desc, jid, or name. */
else if (cuflags == JAIL_UPDATE && pr == NULL) {
error = ENOENT;
vfs_opterror(opts, "update specified no jail");
@@ -1643,6 +1751,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount++;
+ pr->pr_klist = knlist_alloc(&pr->pr_mtx);
/* Set some default values, and inherit some from the parent. */
if (namelc == NULL)
@@ -1722,8 +1831,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
* Grab a reference for existing prisons, to ensure they
* continue to exist for the duration of the call.
*/
- prison_hold(pr);
- drflags |= PD_DEREF;
+ if (!(drflags & PD_DEREF)) {
+ prison_hold(pr);
+ drflags |= PD_DEREF;
+ }
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
if ((pr->pr_flags & PR_VNET) &&
(ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
@@ -1880,6 +1991,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
goto done_deref;
}
}
+ maybe_changed = true;
/* Set the parameters of the prison. */
#ifdef INET
@@ -2112,7 +2224,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
* reference via persistence, or is about to gain one via attachment.
*/
if (created) {
- drflags = prison_lock_xlock(pr, drflags);
+ sx_assert(&allprison_lock, SX_XLOCKED);
+ prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id);
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
pr->pr_state = PRISON_STATE_ALIVE;
}
@@ -2146,10 +2261,37 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
printf("Warning jail jid=%d: mountd/nfsd requires a separate"
" file system\n", pr->pr_id);
+ /*
+ * Now that the prison is fully created without error, set the
+ * jail descriptor if one was requested. This is the only
+ * parameter that is returned to the caller (except the error
+ * message).
+ */
+ if (jfd_out >= 0) {
+ if (!(drflags & PD_LOCKED)) {
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+ }
+ jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1;
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out;
+ else
+ (void)copyout(&jfd_out,
+ optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out));
+ jaildesc_set_prison(jfp_out, pr);
+ }
+
drflags &= ~PD_KILL;
td->td_retval[0] = pr->pr_id;
done_deref:
+ /*
+ * Report changes to kevent. This can happen even if the
+ * system call fails, as changes might have been made before
+ * the failure.
+ */
+ if (maybe_changed && !created)
+ prison_knote(pr, NOTE_JAIL_SET);
/* Release any temporary prison holds and/or locks. */
if (pr != NULL)
prison_deref(pr, drflags);
@@ -2176,15 +2318,21 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
}
}
done_free:
+ /* Clean up other resources. */
#ifdef INET
prison_ip_free(ip4);
#endif
#ifdef INET6
prison_ip_free(ip6);
#endif
+ if (jfp_out != NULL)
+ fdrop(jfp_out, td);
+ if (error && jfd_out >= 0)
+ (void)kern_close(td, jfd_out);
if (g_path != NULL)
free(g_path, M_TEMP);
vfs_freeopts(opts);
+ prison_free(mypr);
return (error);
}
@@ -2329,16 +2477,21 @@ int
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
{
struct bool_flags *bf;
+ struct file *jfp_out;
struct jailsys_flags *jsf;
struct prison *pr, *mypr;
struct vfsopt *opt;
struct vfsoptlist *opts;
char *errmsg, *name;
int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
+ int jfd_in, jfd_out;
unsigned f;
if (flags & ~JAIL_GET_MASK)
return (EINVAL);
+ if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) ==
+ (JAIL_USE_DESC | JAIL_AT_DESC))
+ return (EINVAL);
/* Get the parameter list. */
error = vfs_buildopts(optuio, &opts);
@@ -2346,13 +2499,71 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
return (error);
errmsg_pos = vfs_getopt_pos(opts, "errmsg");
mypr = td->td_ucred->cr_prison;
+ prison_hold(mypr);
pr = NULL;
+ jfp_out = NULL;
+ jfd_out = -1;
/*
- * Find the prison specified by one of: lastjid, jid, name.
+ * Find the prison specified by one of: desc, lastjid, jid, name.
*/
sx_slock(&allprison_lock);
drflags = PD_LIST_SLOCKED;
+
+ error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
+ if (error == ENOENT) {
+ if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) {
+ vfs_opterror(opts, "missing desc");
+ goto done;
+ }
+ } else if (error == 0) {
+ if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
+ JAIL_OWN_DESC))) {
+ error = EINVAL;
+ vfs_opterror(opts, "unexpected desc");
+ goto done;
+ }
+ if (flags & JAIL_USE_DESC) {
+ /* Get the jail from its descriptor. */
+ error = jaildesc_find(td, jfd_in, &pr, NULL);
+ if (error) {
+ vfs_opterror(opts, error == ENOENT ?
+ "descriptor to dead jail" :
+ "not a jail descriptor");
+ goto done;
+ }
+ drflags |= PD_DEREF;
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+ if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying",
+ pr->pr_id);
+ goto done;
+ }
+ goto found_prison;
+ }
+ if (flags & JAIL_AT_DESC) {
+ /* Look up jails based on the descriptor's prison. */
+ prison_free(mypr);
+ error = jaildesc_find(td, jfd_in, &mypr, NULL);
+ if (error != 0) {
+ vfs_opterror(opts, error == ENOENT ?
+ "descriptor to dead jail" :
+ "not a jail descriptor");
+ goto done;
+ }
+ }
+ if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
+ /* Allocate a jail descriptor to return later. */
+ error = jaildesc_alloc(td, &jfp_out, &jfd_out,
+ flags & JAIL_OWN_DESC);
+ if (error)
+ goto done;
+ }
+ } else
+ goto done;
+
error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
if (error == 0) {
TAILQ_FOREACH(pr, &allprison, pr_list) {
@@ -2421,9 +2632,17 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
found_prison:
/* Get the parameters of the prison. */
- prison_hold(pr);
- drflags |= PD_DEREF;
+ if (!(drflags & PD_DEREF)) {
+ prison_hold(pr);
+ drflags |= PD_DEREF;
+ }
td->td_retval[0] = pr->pr_id;
+ if (jfd_out >= 0) {
+ error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out));
+ if (error != 0 && error != ENOENT)
+ goto done;
+ jaildesc_set_prison(jfp_out, pr);
+ }
error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
if (error != 0 && error != ENOENT)
goto done;
@@ -2603,6 +2822,13 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
prison_deref(pr, drflags);
else if (drflags & PD_LIST_SLOCKED)
sx_sunlock(&allprison_lock);
+ else if (drflags & PD_LIST_XLOCKED)
+ sx_xunlock(&allprison_lock);
+ /* Clean up other resources. */
+ if (jfp_out != NULL)
+ (void)fdrop(jfp_out, td);
+ if (error && jfd_out >= 0)
+ (void)kern_close(td, jfd_out);
if (error && errmsg_pos >= 0) {
/* Write the error message back to userspace. */
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
@@ -2619,6 +2845,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
}
}
vfs_freeopts(opts);
+ prison_free(mypr);
return (error);
}
@@ -2643,17 +2870,51 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
sx_xunlock(&allprison_lock);
return (EINVAL);
}
- if (!prison_isalive(pr)) {
- /* Silently ignore already-dying prisons. */
- mtx_unlock(&pr->pr_mtx);
- sx_xunlock(&allprison_lock);
- return (0);
+ prison_hold(pr);
+ prison_remove(pr);
+ return (0);
+}
+
+/*
+ * struct jail_remove_jd_args {
+ * int fd;
+ * };
+ */
+int
+sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap)
+{
+ struct prison *pr;
+ struct ucred *jdcred;
+ int error;
+
+ error = jaildesc_find(td, uap->fd, &pr, &jdcred);
+ if (error)
+ return (error);
+ error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE);
+ crfree(jdcred);
+ if (error) {
+ prison_free(pr);
+ return (error);
}
- prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
+ sx_xlock(&allprison_lock);
+ mtx_lock(&pr->pr_mtx);
+ prison_remove(pr);
return (0);
}
/*
+ * Begin the removal process for a prison. The allprison lock should
+ * be held exclusively, and the prison should be both locked and held.
+ */
+void
+prison_remove(struct prison *pr)
+{
+ sx_assert(&allprison_lock, SA_XLOCKED);
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
+}
+
+/*
* struct jail_attach_args {
* int jid;
* };
@@ -2685,6 +2946,44 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
}
+/*
+ * struct jail_attach_jd_args {
+ * int fd;
+ * };
+ */
+int
+sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap)
+{
+ struct prison *pr;
+ struct ucred *jdcred;
+ int drflags, error;
+
+ sx_slock(&allprison_lock);
+ drflags = PD_LIST_SLOCKED;
+ error = jaildesc_find(td, uap->fd, &pr, &jdcred);
+ if (error)
+ goto fail;
+ drflags |= PD_DEREF;
+ error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
+ crfree(jdcred);
+ if (error)
+ goto fail;
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
+
+ /* Do not allow a process to attach to a prison that is not alive. */
+ if (!prison_isalive(pr)) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ return (do_jail_attach(td, pr, drflags));
+
+ fail:
+ prison_deref(pr, drflags);
+ return (error);
+}
+
static int
do_jail_attach(struct thread *td, struct prison *pr, int drflags)
{
@@ -2703,9 +3002,12 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
* a process root from one prison, but attached to the jail
* of another.
*/
- prison_hold(pr);
+ if (!(drflags & PD_DEREF)) {
+ prison_hold(pr);
+ drflags |= PD_DEREF;
+ }
refcount_acquire(&pr->pr_uref);
- drflags |= PD_DEREF | PD_DEUREF;
+ drflags |= PD_DEUREF;
mtx_unlock(&pr->pr_mtx);
drflags &= ~PD_LOCKED;
@@ -2755,6 +3057,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
prison_proc_relink(oldcred->cr_prison, pr, p);
prison_deref(oldcred->cr_prison, drflags);
crfree(oldcred);
+ prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
/*
* If the prison was killed while changing credentials, die along
@@ -3154,12 +3457,17 @@ prison_deref(struct prison *pr, int flags)
/* Kill the prison and its descendents. */
KASSERT(pr != &prison0,
("prison_deref trying to kill prison0"));
- if (!(flags & PD_DEREF)) {
- prison_hold(pr);
- flags |= PD_DEREF;
+ if (!prison_isalive(pr)) {
+ /* Silently ignore already-dying prisons. */
+ flags &= ~PD_KILL;
+ } else {
+ if (!(flags & PD_DEREF)) {
+ prison_hold(pr);
+ flags |= PD_DEREF;
+ }
+ flags = prison_lock_xlock(pr, flags);
+ prison_deref_kill(pr, &freeprison);
}
- flags = prison_lock_xlock(pr, flags);
- prison_deref_kill(pr, &freeprison);
}
if (flags & PD_DEUREF) {
/* Drop a user reference. */
@@ -3182,9 +3490,10 @@ prison_deref(struct prison *pr, int flags)
refcount_load(&prison0.pr_uref) > 0,
("prison0 pr_uref=0"));
pr->pr_state = PRISON_STATE_DYING;
+ prison_cleanup_locked(pr);
mtx_unlock(&pr->pr_mtx);
flags &= ~PD_LOCKED;
- prison_cleanup(pr);
+ prison_cleanup_unlocked(pr);
}
}
}
@@ -3327,8 +3636,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
}
if (!(cpr->pr_flags & PR_REMOVE))
continue;
- prison_cleanup(cpr);
+ prison_cleanup_unlocked(cpr);
mtx_lock(&cpr->pr_mtx);
+ prison_cleanup_locked(cpr);
cpr->pr_flags &= ~PR_REMOVE;
if (cpr->pr_flags & PR_PERSIST) {
cpr->pr_flags &= ~PR_PERSIST;
@@ -3363,8 +3673,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
if (rpr != NULL)
LIST_REMOVE(rpr, pr_sibling);
- prison_cleanup(pr);
+ prison_cleanup_unlocked(pr);
mtx_lock(&pr->pr_mtx);
+ prison_cleanup_locked(pr);
if (pr->pr_flags & PR_PERSIST) {
pr->pr_flags &= ~PR_PERSIST;
prison_proc_free_not_last(pr);
@@ -3411,10 +3722,22 @@ prison_lock_xlock(struct prison *pr, int flags)
/*
* Release a prison's resources when it starts dying (when the last user
- * reference is dropped, or when it is killed).
+ * reference is dropped, or when it is killed). Two functions are called,
+ * for work that requires a locked prison or an unlocked one.
*/
static void
-prison_cleanup(struct prison *pr)
+prison_cleanup_locked(struct prison *pr)
+{
+ sx_assert(&allprison_lock, SA_XLOCKED);
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ prison_knote(pr, NOTE_JAIL_REMOVE);
+ knlist_detach(pr->pr_klist);
+ jaildesc_prison_cleanup(pr);
+ pr->pr_klist = NULL;
+}
+
+static void
+prison_cleanup_unlocked(struct prison *pr)
{
sx_assert(&allprison_lock, SA_XLOCKED);
mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
@@ -3970,7 +4293,6 @@ prison_priv_check(struct ucred *cred, int priv)
*/
case PRIV_KTRACE:
-#if 0
/*
* Allow jailed processes to configure audit identity and
* submit audit records (login, etc). In the future we may
@@ -3979,6 +4301,11 @@ prison_priv_check(struct ucred *cred, int priv)
*/
case PRIV_AUDIT_GETAUDIT:
case PRIV_AUDIT_SETAUDIT:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_SETAUDIT)
+ return (0);
+ else
+ return (EPERM);
+#if 0
case PRIV_AUDIT_SUBMIT:
#endif
@@ -4715,6 +5042,10 @@ SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may set system time");
SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may modify routing table");
+#ifdef AUDIT
+SYSCTL_JAIL_PARAM(_allow, setaudit, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may set and get audit session state");
+#endif
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
@@ -5039,6 +5370,23 @@ prison_racct_detach(struct prison *pr)
}
#endif /* RACCT */
+/*
+ * Submit a knote for a prison, locking if necessary.
+ */
+static void
+prison_knote(struct prison *pr, long hint)
+{
+ int locked;
+
+ locked = mtx_owned(&pr->pr_mtx);
+ if (!locked)
+ mtx_lock(&pr->pr_mtx);
+ KNOTE_LOCKED(pr->pr_klist, hint);
+ jaildesc_knote(pr, hint);
+ if (!locked)
+ mtx_unlock(&pr->pr_mtx);
+}
+
#ifdef DDB
static void
diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c
new file mode 100644
index 000000000000..a564393d3366
--- /dev/null
+++ b/sys/kern/kern_jaildesc.c
@@ -0,0 +1,413 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 James Gritton.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/jail.h>
+#include <sys/jaildesc.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/stat.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors");
+
+static fo_poll_t jaildesc_poll;
+static fo_kqfilter_t jaildesc_kqfilter;
+static fo_stat_t jaildesc_stat;
+static fo_close_t jaildesc_close;
+static fo_fill_kinfo_t jaildesc_fill_kinfo;
+static fo_cmp_t jaildesc_cmp;
+
+static struct fileops jaildesc_ops = {
+ .fo_read = invfo_rdwr,
+ .fo_write = invfo_rdwr,
+ .fo_truncate = invfo_truncate,
+ .fo_ioctl = invfo_ioctl,
+ .fo_poll = jaildesc_poll,
+ .fo_kqfilter = jaildesc_kqfilter,
+ .fo_stat = jaildesc_stat,
+ .fo_close = jaildesc_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = jaildesc_fill_kinfo,
+ .fo_cmp = jaildesc_cmp,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+/*
+ * Given a jail descriptor number, return its prison and/or its
+ * credential. They are returned held, and will need to be released
+ * by the caller.
+ */
+int
+jaildesc_find(struct thread *td, int fd, struct prison **prp,
+ struct ucred **ucredp)
+{
+ struct file *fp;
+ struct jaildesc *jd;
+ struct prison *pr;
+ int error;
+
+ error = fget(td, fd, &cap_no_rights, &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_JAILDESC) {
+ error = EINVAL;
+ goto out;
+ }
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ pr = jd->jd_prison;
+ if (pr == NULL || !prison_isvalid(pr)) {
+ error = ENOENT;
+ JAILDESC_UNLOCK(jd);
+ goto out;
+ }
+ if (prp != NULL) {
+ prison_hold(pr);
+ *prp = pr;
+ }
+ JAILDESC_UNLOCK(jd);
+ if (ucredp != NULL)
+ *ucredp = crhold(fp->f_cred);
+ out:
+ fdrop(fp, td);
+ return (error);
+}
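A hedged usage sketch for jaildesc_find(), following the same hold/release discipline as the sys_jail_attach_jd() hunk above; the helper name is hypothetical.

/*
 * Illustrative only: resolve a jail descriptor to its jail ID, then drop
 * the prison and credential references that jaildesc_find() returns held.
 */
static int
example_jid_from_fd(struct thread *td, int fd, int *jidp)
{
	struct prison *pr;
	struct ucred *jdcred;
	int error;

	error = jaildesc_find(td, fd, &pr, &jdcred);
	if (error != 0)
		return (error);
	*jidp = pr->pr_id;
	crfree(jdcred);
	prison_free(pr);
	return (0);
}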
+
+/*
+ * Allocate a new jail descriptor, not yet associated with a prison.
+ * Return the file pointer (with a reference held) and the descriptor
+ * number.
+ */
+int
+jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning)
+{
+ struct file *fp;
+ struct jaildesc *jd;
+ int error;
+
+ if (owning) {
+ error = priv_check(td, PRIV_JAIL_REMOVE);
+ if (error != 0)
+ return (error);
+ }
+ jd = malloc(sizeof(*jd), M_JAILDESC, M_WAITOK | M_ZERO);
+ error = falloc_caps(td, &fp, fdp, 0, NULL);
+ if (error != 0) {
+ free(jd, M_JAILDESC);
+ return (error);
+ }
+ finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 ?
+ FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops);
+ JAILDESC_LOCK_INIT(jd);
+ knlist_init_mtx(&jd->jd_selinfo.si_note, &jd->jd_lock);
+ if (owning)
+ jd->jd_flags |= JDF_OWNING;
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * Associate a jail descriptor with its prison.
+ */
+void
+jaildesc_set_prison(struct file *fp, struct prison *pr)
+{
+ struct jaildesc *jd;
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ jd->jd_prison = pr;
+ LIST_INSERT_HEAD(&pr->pr_descs, jd, jd_list);
+ prison_hold(pr);
+ JAILDESC_UNLOCK(jd);
+}
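A sketch (assumed names, error unwinding elided) of how a jail-creation path might pair jaildesc_alloc() with jaildesc_set_prison(); the prison-lock requirement comes from the mtx_assert() above, and the final fdrop() of the reference returned by jaildesc_alloc() is an assumption.

/*
 * Sketch only: allocate a descriptor up front, then link it to the newly
 * created prison while holding the prison mutex.
 */
static int
example_create_jail_descriptor(struct thread *td, struct prison *pr, int *fdp)
{
	struct file *jfp;
	int error;

	error = jaildesc_alloc(td, &jfp, fdp, 0 /* not owning */);
	if (error != 0)
		return (error);
	mtx_lock(&pr->pr_mtx);
	jaildesc_set_prison(jfp, pr);	/* links jd to pr and takes a prison hold */
	mtx_unlock(&pr->pr_mtx);
	fdrop(jfp, td);			/* assumption: drop the returned reference */
	return (0);
}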
+
+/*
+ * Detach all the jail descriptors from a prison.
+ */
+void
+jaildesc_prison_cleanup(struct prison *pr)
+{
+ struct jaildesc *jd;
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ while ((jd = LIST_FIRST(&pr->pr_descs))) {
+ JAILDESC_LOCK(jd);
+ LIST_REMOVE(jd, jd_list);
+ jd->jd_prison = NULL;
+ JAILDESC_UNLOCK(jd);
+ prison_free(pr);
+ }
+}
+
+/*
+ * Pass a note to all listening kqueues.
+ */
+void
+jaildesc_knote(struct prison *pr, long hint)
+{
+ struct jaildesc *jd;
+ int prison_locked;
+
+ if (!LIST_EMPTY(&pr->pr_descs)) {
+ prison_locked = mtx_owned(&pr->pr_mtx);
+ if (!prison_locked)
+ prison_lock(pr);
+ LIST_FOREACH(jd, &pr->pr_descs, jd_list) {
+ JAILDESC_LOCK(jd);
+ if (hint == NOTE_JAIL_REMOVE) {
+ jd->jd_flags |= JDF_REMOVED;
+ if (jd->jd_flags & JDF_SELECTED) {
+ jd->jd_flags &= ~JDF_SELECTED;
+ selwakeup(&jd->jd_selinfo);
+ }
+ }
+ KNOTE_LOCKED(&jd->jd_selinfo.si_note, hint);
+ JAILDESC_UNLOCK(jd);
+ }
+ if (!prison_locked)
+ prison_unlock(pr);
+ }
+}
+
+static int
+jaildesc_close(struct file *fp, struct thread *td)
+{
+ struct jaildesc *jd;
+ struct prison *pr;
+
+ jd = fp->f_data;
+ fp->f_data = NULL;
+ if (jd != NULL) {
+ JAILDESC_LOCK(jd);
+ pr = jd->jd_prison;
+ if (pr != NULL) {
+ /*
+ * Free or remove the associated prison.
+ * This requires a second check after re-
+ * ordering locks. This jaildesc can remain
+ * unlocked once we have a prison reference,
+ * because that prison is the only place that
+ * still points back to it.
+ */
+ prison_hold(pr);
+ JAILDESC_UNLOCK(jd);
+ if (jd->jd_flags & JDF_OWNING) {
+ sx_xlock(&allprison_lock);
+ prison_lock(pr);
+ if (jd->jd_prison != NULL) {
+ /*
+ * Unlink the prison, but don't free
+					 * it; that will be done as part
+ * of prison_remove.
+ */
+ LIST_REMOVE(jd, jd_list);
+ prison_remove(pr);
+ } else {
+ prison_unlock(pr);
+ sx_xunlock(&allprison_lock);
+ }
+ } else {
+ prison_lock(pr);
+ if (jd->jd_prison != NULL) {
+ LIST_REMOVE(jd, jd_list);
+ prison_free(pr);
+ }
+ prison_unlock(pr);
+ }
+ prison_free(pr);
+ }
+ knlist_destroy(&jd->jd_selinfo.si_note);
+ JAILDESC_LOCK_DESTROY(jd);
+ free(jd, M_JAILDESC);
+ }
+ return (0);
+}
+
+static int
+jaildesc_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct jaildesc *jd;
+ int revents;
+
+ revents = 0;
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ if (jd->jd_flags & JDF_REMOVED)
+ revents |= POLLHUP;
+ if (revents == 0) {
+ selrecord(td, &jd->jd_selinfo);
+ jd->jd_flags |= JDF_SELECTED;
+ }
+ JAILDESC_UNLOCK(jd);
+ return (revents);
+}
+
+static void
+jaildesc_kqops_detach(struct knote *kn)
+{
+ struct jaildesc *jd;
+
+ jd = kn->kn_fp->f_data;
+ knlist_remove(&jd->jd_selinfo.si_note, kn, 0);
+}
+
+static int
+jaildesc_kqops_event(struct knote *kn, long hint)
+{
+ struct jaildesc *jd;
+ u_int event;
+
+ jd = kn->kn_fp->f_data;
+ if (hint == 0) {
+ /*
+ * Initial test after registration. Generate a
+ * NOTE_JAIL_REMOVE in case the prison already died
+ * before registration.
+ */
+ event = jd->jd_flags & JDF_REMOVED ? NOTE_JAIL_REMOVE : 0;
+ } else {
+ /*
+ * Mask off extra data. In the NOTE_JAIL_CHILD case,
+ * that's everything except the NOTE_JAIL_CHILD bit
+ * itself, since a JID is any positive integer.
+ */
+ event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD :
+ (u_int)hint & NOTE_JAIL_CTRLMASK;
+ }
+
+ /* If the user is interested in this event, record it. */
+ if (kn->kn_sfflags & event) {
+ kn->kn_fflags |= event;
+ /* Report the created jail id or attached process id. */
+ if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) {
+ if (kn->kn_data != 0)
+ kn->kn_fflags |= NOTE_JAIL_MULTI;
+ kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U :
+ (u_int)hint & ~event;
+ }
+ }
+
+ /* Prison is gone, so flag the event as finished. */
+ if (event == NOTE_JAIL_REMOVE) {
+ kn->kn_flags |= EV_EOF | EV_ONESHOT;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
+ return (1);
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
+static const struct filterops jaildesc_kqops = {
+ .f_isfd = 1,
+ .f_detach = jaildesc_kqops_detach,
+ .f_event = jaildesc_kqops_event,
+ .f_copy = knote_triv_copy,
+};
+
+static int
+jaildesc_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct jaildesc *jd;
+
+ jd = fp->f_data;
+ switch (kn->kn_filter) {
+ case EVFILT_JAILDESC:
+ kn->kn_fop = &jaildesc_kqops;
+ kn->kn_flags |= EV_CLEAR;
+ knlist_add(&jd->jd_selinfo.si_note, kn, 0);
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
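For context, a userspace sketch of consuming these events. Assumptions: <sys/event.h> exposes EVFILT_JAILDESC and the NOTE_JAIL_* flags added by this change, and jfd is a jail descriptor obtained from the new jail-descriptor syscalls.

#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

static void
watch_jail(int jfd)
{
	struct kevent ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&ev, jfd, EVFILT_JAILDESC, EV_ADD | EV_CLEAR,
	    NOTE_JAIL_REMOVE | NOTE_JAIL_CHILD | NOTE_JAIL_ATTACH, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
		err(1, "kevent wait");
	if (ev.fflags & (NOTE_JAIL_CHILD | NOTE_JAIL_ATTACH))
		printf("child jid / attached pid: %jd%s\n", (intmax_t)ev.data,
		    (ev.fflags & NOTE_JAIL_MULTI) ? " (multiple, data cleared)" : "");
	if (ev.fflags & NOTE_JAIL_REMOVE)
		printf("jail removed\n");
}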
+
+static int
+jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
+{
+ struct jaildesc *jd;
+
+ bzero(sb, sizeof(struct stat));
+ jd = fp->f_data;
+ JAILDESC_LOCK(jd);
+ if (jd->jd_prison != NULL) {
+ sb->st_ino = jd->jd_prison->pr_id;
+ sb->st_mode = S_IFREG | S_IRWXU;
+ } else
+ sb->st_mode = S_IFREG;
+ JAILDESC_UNLOCK(jd);
+ return (0);
+}
+
+static int
+jaildesc_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+ struct filedesc *fdp)
+{
+ struct jaildesc *jd;
+
+ jd = fp->f_data;
+ kif->kf_type = KF_TYPE_JAILDESC;
+ kif->kf_un.kf_jail.kf_jid = jd->jd_prison ? jd->jd_prison->pr_id : 0;
+ return (0);
+}
+
+static int
+jaildesc_cmp(struct file *fp1, struct file *fp2, struct thread *td)
+{
+ struct jaildesc *jd1, *jd2;
+ int jid1, jid2;
+
+ if (fp2->f_type != DTYPE_JAILDESC)
+ return (3);
+ jd1 = fp1->f_data;
+ JAILDESC_LOCK(jd1);
+ jid1 = jd1->jd_prison ? (uintptr_t)jd1->jd_prison->pr_id : 0;
+ JAILDESC_UNLOCK(jd1);
+ jd2 = fp2->f_data;
+ JAILDESC_LOCK(jd2);
+ jid2 = jd2->jd_prison ? (uintptr_t)jd2->jd_prison->pr_id : 0;
+ JAILDESC_UNLOCK(jd2);
+ return (kcmp_cmp(jid1, jid2));
+}
diff --git a/sys/kern/kern_jailmeta.c b/sys/kern/kern_jailmeta.c
index 4e37eccad03a..91bb7155820d 100644
--- a/sys/kern/kern_jailmeta.c
+++ b/sys/kern/kern_jailmeta.c
@@ -599,22 +599,18 @@ SYSCTL_PROC(_security_jail, OID_AUTO, env,
/* Setup and tear down. */
-static int
+static void
jm_sysinit(void *arg __unused)
{
meta.osd_slot = osd_jail_register(jm_osd_destructor, meta.methods);
env.osd_slot = osd_jail_register(jm_osd_destructor, env.methods);
-
- return (0);
}
-static int
+static void
jm_sysuninit(void *arg __unused)
{
osd_jail_deregister(meta.osd_slot);
osd_jail_deregister(env.osd_slot);
-
- return (0);
}
SYSINIT(jailmeta, SI_SUB_DRIVERS, SI_ORDER_ANY, jm_sysinit, NULL);
diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c
new file mode 100644
index 000000000000..2efea7dcf9a7
--- /dev/null
+++ b/sys/kern/kern_kexec.c
@@ -0,0 +1,350 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#ifdef INTRNG
+#include <sys/intr.h>
+#endif
+#include <sys/kexec.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/priv.h>
+#include <sys/reboot.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+
+#include <machine/kexec.h>
+
+#ifndef KEXEC_MD_PAGES
+/*
+ * Number of MD pages for extra bookkeeping.
+ * This is a macro because it can be a constant (some architectures make it 0).
+ * It accepts an argument, which is an array of
+ * kexec_segment[KEXEC_SEGMENT_MAX].
+ */
+#define KEXEC_MD_PAGES(x) 0
+#endif
+
+/*
+ * Basic design:
+ *
+ * Given an array of "segment descriptors" stage an image to be loaded and
+ * jumped to at reboot, instead of rebooting via firmware.
+ *
+ * Constraints:
+ * - The segment descriptors' "mem" and "memsz" must each fit within a
+ * vm_phys_seg segment, which can be obtained via the `vm.phys_segs` sysctl.
+ * A single segment cannot span multiple vm_phys_seg segments, even if the
+ * vm_phys_seg segments are adjacent.
+ *
+ * Technical details:
+ *
+ * Take advantage of the VM subsystem and create a vm_object to hold the staged
+ * image. When grabbing pages for the object, sort the pages so that if a page
+ * in the object is located in the physical range of any of the kexec segment
+ * targets then it gets placed at the pindex corresponding to that physical
+ * address. This avoids the chance of corruption by writing over the page in
+ * the final copy, or the need for a copy buffer page.
+ */
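A hedged userspace sketch of staging a one-segment image. Assumptions: struct kexec_segment exposes the buf/bufsz/mem/memsz fields used by kern_kexec_load() below, a kexec_load() wrapper (or syscall(2) stub) for the new syscall exists, and the load and entry addresses are placeholders that must satisfy the vm_phys_seg constraint described above.

#include <sys/param.h>
#include <sys/kexec.h>
#include <err.h>

static void
stage_image(void *image_buf, size_t image_len, unsigned long load_pa,
    unsigned long entry_pa)
{
	struct kexec_segment seg = {
		.buf = image_buf,		/* userspace copy of the image */
		.bufsz = image_len,
		.mem = (void *)load_pa,		/* must fit within one vm_phys_seg */
		.memsz = roundup2(image_len, PAGE_SIZE),
	};

	/* Hypothetical wrapper for the new kexec_load syscall. */
	if (kexec_load(entry_pa, 1, &seg, 0) == -1)
		err(1, "kexec_load");
	/* A subsequent reboot with RB_KEXEC jumps to the staged image. */
}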
+
+static struct kexec_image staged_image;
+static vm_offset_t stage_addr;
+static vm_object_t kexec_obj;
+
+static eventhandler_tag kexec_reboot_handler;
+static struct mtx kexec_mutex;
+
+static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");
+
+
+static void
+kexec_reboot(void *junk __unused, int howto)
+{
+ if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
+ return;
+
+#ifdef SMP
+ cpu_mp_stop();
+#endif /* SMP */
+ intr_disable();
+ printf("Starting kexec reboot\n");
+
+ scheduler_stopped = true;
+ kexec_reboot_md(&staged_image);
+}
+
+MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);
+
+/* Sort the segment list once copied in */
+static int
+seg_cmp(const void *seg1, const void *seg2)
+{
+ const struct kexec_segment *s1, *s2;
+
+ s1 = seg1;
+ s2 = seg2;
+
+ return ((uintptr_t)s1->mem - (uintptr_t)s2->mem);
+}
+
+static bool
+segment_fits(struct kexec_segment *seg)
+{
+ vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem;
+
+ for (int i = 0; i < vm_phys_nsegs; i++) {
+ if (v >= vm_phys_segs[i].start &&
+ (v + seg->memsz - 1) <= vm_phys_segs[i].end)
+ return (true);
+ }
+
+ return (false);
+}
+
+static vm_paddr_t
+pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
+{
+ for (int i = count; i > 0; --i) {
+ if (pind >= segs[i - 1].pindex)
+ return (ptoa(pind - segs[i-1].pindex) + segs[i - 1].target);
+ }
+
+ panic("No segment for pindex %ju\n", (uintmax_t)pind);
+}
+
+/*
+ * For now still tied to the system call, so assumes all memory is userspace.
+ */
+int
+kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
+ struct kexec_segment *seg, u_long flags)
+{
+ static int kexec_loading;
+ struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
+ struct kexec_image *new_image_stage = 0;
+ vm_object_t new_segments = NULL;
+ uint8_t *buf;
+ int err = 0;
+ int i;
+ const size_t segsize = nseg * sizeof(struct kexec_segment);
+ vm_page_t *page_list = 0;
+ vm_size_t image_count, md_pages, page_count, tmpsize;
+ vm_offset_t segment_va = 0;
+ /*
+ * - Do any sanity checking
+ * - Load the new segments to temporary
+ * - Remove the old segments
+ * - Install the new segments
+ */
+
+ if (nseg > KEXEC_SEGMENT_MAX)
+ return (EINVAL);
+
+ if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
+ return (EBUSY);
+
+ /* Only do error checking if we're installing new segments. */
+ if (nseg > 0) {
+ /* Create the new kexec object before destroying the old one. */
+ bzero(&segtmp, sizeof(segtmp));
+ err = copyin(seg, segtmp, segsize);
+ if (err != 0)
+ goto out;
+ qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
+ new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP, M_WAITOK | M_ZERO);
+ /*
+ * Sanity checking:
+ * - All segments must not overlap the kernel, so must be fully enclosed
+ * in a vm_phys_seg (each kexec segment must be in a single
+ * vm_phys_seg segment, cannot cross even adjacent segments).
+ */
+ image_count = 0;
+ for (i = 0; i < nseg; i++) {
+ if (!segment_fits(&segtmp[i]) ||
+ segtmp[i].bufsz > segtmp[i].memsz) {
+ err = EINVAL;
+ goto out;
+ }
+ new_image_stage->segments[i].pindex = image_count;
+ new_image_stage->segments[i].target = (vm_offset_t)segtmp[i].mem;
+ new_image_stage->segments[i].size = segtmp[i].memsz;
+ image_count += atop(segtmp[i].memsz);
+ }
+ md_pages = KEXEC_MD_PAGES(segtmp);
+ page_count = image_count + md_pages;
+ new_segments = vm_object_allocate(OBJT_PHYS, page_count);
+ page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP, M_WAITOK);
+
+ /*
+ * - Grab all pages for all segments (use pindex to slice it)
+ * - Walk the list (once)
+ * - At each pindex, check if the target PA that corresponds
+ * to that index is in the object. If so, swap the pages.
+ * - At the end of this the list will be "best" sorted.
+ */
+ vm_page_grab_pages_unlocked(new_segments, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO,
+ page_list, page_count);
+
+ /* Sort the pages to best match the PA */
+ VM_OBJECT_WLOCK(new_segments);
+ for (i = 0; i < image_count; i++) {
+ vm_page_t curpg, otherpg, tmp;
+ vm_pindex_t otheridx;
+
+ curpg = page_list[i];
+ otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(new_image_stage->segments,
+ nseg, curpg->pindex));
+ otheridx = otherpg->pindex;
+
+ if (otherpg->object == new_segments) {
+ /*
+ * Swap 'curpg' and 'otherpg', since 'otherpg'
+ * is at the PA 'curpg' covers.
+ */
+ vm_radix_remove(&new_segments->rtree, otheridx);
+ vm_radix_remove(&new_segments->rtree, i);
+ otherpg->pindex = i;
+ curpg->pindex = otheridx;
+ vm_radix_insert(&new_segments->rtree, curpg);
+ vm_radix_insert(&new_segments->rtree, otherpg);
+ tmp = curpg;
+ page_list[i] = otherpg;
+ page_list[otheridx] = tmp;
+ }
+ }
+ for (i = 0; i < nseg; i++) {
+ new_image_stage->segments[i].first_page =
+ vm_radix_lookup(&new_segments->rtree,
+ new_image_stage->segments[i].pindex);
+ }
+ if (md_pages > 0)
+ new_image_stage->first_md_page =
+ vm_radix_lookup(&new_segments->rtree,
+ page_count - md_pages);
+ else
+ new_image_stage->first_md_page = NULL;
+ VM_OBJECT_WUNLOCK(new_segments);
+
+ /* Map the object to do the copies */
+ err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
+ ptoa(page_count), 0, VMFS_ANY_SPACE,
+ VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT);
+ if (err != 0)
+ goto out;
+ buf = (void *)segment_va;
+ new_image_stage->map_addr = segment_va;
+ new_image_stage->map_size = ptoa(new_segments->size);
+ new_image_stage->entry = entry;
+ new_image_stage->map_obj = new_segments;
+ for (i = 0; i < nseg; i++) {
+ err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
+ if (err != 0) {
+ goto out;
+ }
+ new_image_stage->segments[i].map_buf = buf;
+ buf += segtmp[i].bufsz;
+ tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
+ if (tmpsize > 0)
+ memset(buf, 0, tmpsize);
+ buf += tmpsize;
+ }
+ /* What's left are the MD pages, so zero them all out. */
+ if (md_pages > 0)
+ bzero(buf, ptoa(md_pages));
+
+ cpu_flush_dcache((void *)segment_va, ptoa(page_count));
+ if ((err = kexec_load_md(new_image_stage)) != 0)
+ goto out;
+ }
+ if (kexec_obj != NULL) {
+ vm_object_unwire(kexec_obj, 0, kexec_obj->size, 0);
+ KASSERT(stage_addr != 0, ("Mapped kexec_obj without address"));
+ vm_map_remove(kernel_map, stage_addr, stage_addr + kexec_obj->size);
+ }
+ kexec_obj = new_segments;
+ bzero(&staged_image, sizeof(staged_image));
+ if (nseg > 0)
+ memcpy(&staged_image, new_image_stage, sizeof(*new_image_stage));
+
+ printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry);
+ if (nseg > 0) {
+ if (kexec_reboot_handler == NULL)
+ kexec_reboot_handler =
+ EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot, NULL,
+ SHUTDOWN_PRI_DEFAULT - 150);
+ } else {
+ if (kexec_reboot_handler != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_final, kexec_reboot_handler);
+ }
+out:
+	/* Clean up the mess if we've gotten this far. */
+ if (err != 0 && new_segments != NULL) {
+ vm_object_unwire(new_segments, 0, new_segments->size, 0);
+ if (segment_va != 0)
+ vm_map_remove(kernel_map, segment_va, segment_va + kexec_obj->size);
+ else
+ vm_object_deallocate(new_segments);
+ }
+ atomic_store_rel_int(&kexec_loading, false);
+ if (new_image_stage != NULL)
+ free(new_image_stage, M_TEMP);
+ if (page_list != 0)
+ free(page_list, M_TEMP);
+
+ return (err);
+}
+
+int
+sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
+{
+ int error;
+
+	// FIXME: Do we need a better privilege check than PRIV_REBOOT here?
+ error = priv_check(td, PRIV_REBOOT);
+ if (error != 0)
+ return (error);
+ return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments, uap->flags));
+}
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
index d566bc01bc5e..e2f63cbc0c5a 100644
--- a/sys/kern/kern_linker.c
+++ b/sys/kern/kern_linker.c
@@ -435,7 +435,7 @@ linker_file_register_modules(linker_file_t lf)
}
static void
-linker_init_kernel_modules(void)
+linker_init_kernel_modules(void *dummy __unused)
{
sx_xlock(&kld_sx);
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 31bff6d2c1aa..76f68677e292 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -1780,9 +1780,11 @@ lockmgr_chain(struct thread *td, struct thread **ownerp)
lk = td->td_wchan;
- if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
+ if (!TD_ON_SLEEPQ(td) || sleepq_type(td->td_wchan) != SLEEPQ_LK ||
+ LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
return (0);
- db_printf("blocked on lockmgr %s", lk->lock_object.lo_name);
+ db_printf("blocked on lock %p (%s) \"%s\" ", &lk->lock_object,
+ lock_class_lockmgr.lc_name, lk->lock_object.lo_name);
if (lk->lk_lock & LK_SHARE)
db_printf("SHARED (count %ju)\n",
(uintmax_t)LK_SHARERS(lk->lk_lock));
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index 879220be050b..fcbfbe64f854 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -303,7 +303,7 @@ sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS)
*/
#if MALLOC_DEBUG_MAXZONES > 1
static void
-tunable_set_numzones(void)
+tunable_set_numzones(void *dummy __unused)
{
TUNABLE_INT_FETCH("debug.malloc.numzones",
@@ -751,11 +751,14 @@ malloc_domainset(size_t size, struct malloc_type *mtp, struct domainset *ds,
return (malloc_large(size, mtp, DOMAINSET_RR(), flags
DEBUG_REDZONE_ARG));
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
- do {
- va = malloc_domain(&size, &indx, mtp, domain, flags);
- } while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
+ indx = -1;
+ va = NULL;
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) == 0)
+ do {
+ va = malloc_domain(&size, &indx, mtp, domain, flags);
+ } while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
+
if (__predict_false(va == NULL)) {
KASSERT((flags & M_WAITOK) == 0,
("malloc(M_WAITOK) returned NULL"));
@@ -1299,7 +1302,7 @@ mallocinit(void *dummy)
#endif
align, UMA_ZONE_MALLOC);
}
- for (;i <= size; i+= KMEM_ZBASE)
+ for (; i <= size; i+= KMEM_ZBASE)
kmemsize[i >> KMEM_ZSHIFT] = indx;
}
}
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index f952b3fc8805..d67c70984528 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -503,8 +503,8 @@ _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
/*
* __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
*
- * We call this if the lock is either contested (i.e. we need to go to
- * sleep waiting for it), or if we need to recurse on it.
+ * We get here if lock profiling is enabled, the lock is already held by
+ * someone else or we are recursing on it.
*/
#if LOCK_DEBUG > 0
void
@@ -660,13 +660,8 @@ retry_turnstile:
}
#endif
- /*
- * If the mutex isn't already contested and a failure occurs
- * setting the contested bit, the mutex was either released
- * or the state of the MTX_RECURSED bit changed.
- */
- if ((v & MTX_CONTESTED) == 0 &&
- !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_CONTESTED)) {
+ if ((v & MTX_WAITERS) == 0 &&
+ !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_WAITERS)) {
goto retry_turnstile;
}
@@ -869,7 +864,7 @@ _thread_lock(struct thread *td)
WITNESS_LOCK(&m->lock_object, LOP_EXCLUSIVE, file, line);
return;
}
- _mtx_release_lock_quick(m);
+ atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED);
slowpath_unlocked:
spinlock_exit();
slowpath_noirq:
@@ -959,7 +954,7 @@ retry:
}
if (m == td->td_lock)
break;
- _mtx_release_lock_quick(m);
+ atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED);
}
LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
line);
@@ -1029,8 +1024,8 @@ thread_lock_set(struct thread *td, struct mtx *new)
/*
* __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
*
- * We are only called here if the lock is recursed, contested (i.e. we
- * need to wake up a blocked thread) or lockstat probe is active.
+ * We get here if lock profiling is enabled, the lock is already held by
+ * someone else or we are recursing on it.
*/
#if LOCK_DEBUG > 0
void
@@ -1071,7 +1066,7 @@ __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v)
* can be removed from the hash list if it is empty.
*/
turnstile_chain_lock(&m->lock_object);
- _mtx_release_lock_quick(m);
+ atomic_store_rel_ptr(&m->mtx_lock, MTX_UNOWNED);
ts = turnstile_lookup(&m->lock_object);
MPASS(ts != NULL);
if (LOCK_LOG_TEST(&m->lock_object, opts))
@@ -1136,9 +1131,9 @@ __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line)
* General init routine used by the MTX_SYSINIT() macro.
*/
void
-mtx_sysinit(void *arg)
+mtx_sysinit(const void *arg)
{
- struct mtx_args *margs = arg;
+ const struct mtx_args *margs = arg;
mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL,
margs->ma_opts);
@@ -1207,7 +1202,7 @@ _mtx_destroy(volatile uintptr_t *c)
if (!mtx_owned(m))
MPASS(mtx_unowned(m));
else {
- MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+ MPASS((m->mtx_lock & (MTX_RECURSED|MTX_WAITERS)) == 0);
/* Perform the non-mtx related part of mtx_unlock_spin(). */
if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) {
@@ -1359,8 +1354,8 @@ db_show_mtx(const struct lock_object *lock)
db_printf("DESTROYED");
else {
db_printf("OWNED");
- if (m->mtx_lock & MTX_CONTESTED)
- db_printf(", CONTESTED");
+ if (m->mtx_lock & MTX_WAITERS)
+ db_printf(", WAITERS");
if (m->mtx_lock & MTX_RECURSED)
db_printf(", RECURSED");
}
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 379fbda619c0..6e56664d12ce 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -1112,13 +1112,14 @@ fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
if (cred->cr_flags & CRED_FLAG_CAPMODE)
kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
/* XXX bde doesn't like KI_NGROUPS */
- if (cred->cr_ngroups > KI_NGROUPS) {
+ if (1 + cred->cr_ngroups > KI_NGROUPS) {
kp->ki_ngroups = KI_NGROUPS;
kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
} else
- kp->ki_ngroups = cred->cr_ngroups;
- bcopy(cred->cr_groups, kp->ki_groups,
- kp->ki_ngroups * sizeof(gid_t));
+ kp->ki_ngroups = 1 + cred->cr_ngroups;
+ kp->ki_groups[0] = cred->cr_gid;
+ bcopy(cred->cr_groups, kp->ki_groups + 1,
+ (kp->ki_ngroups - 1) * sizeof(gid_t));
kp->ki_rgid = cred->cr_rgid;
kp->ki_svgid = cred->cr_svgid;
/* If jailed(cred), emulate the old P_JAILED flag. */
@@ -2943,8 +2944,11 @@ sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
cred = crhold(p->p_ucred);
PROC_UNLOCK(p);
- error = SYSCTL_OUT(req, cred->cr_groups,
- cred->cr_ngroups * sizeof(gid_t));
+ error = SYSCTL_OUT(req, &cred->cr_gid, sizeof(gid_t));
+ if (error == 0)
+ error = SYSCTL_OUT(req, cred->cr_groups,
+ cred->cr_ngroups * sizeof(gid_t));
+
crfree(cred);
return (error);
}
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index 0ca42d640767..a4c5bcc52529 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -291,11 +291,6 @@ sys_getgid(struct thread *td, struct getgid_args *uap)
return (0);
}
-/*
- * Get effective group ID. The "egid" is groups[0], and could be obtained
- * via getgroups. This syscall exists because it is somewhat painful to do
- * correctly in a library function.
- */
#ifndef _SYS_SYSPROTO_H_
struct getegid_args {
int dummy;
@@ -1803,12 +1798,6 @@ groupmember(gid_t gid, const struct ucred *cred)
bool
realgroupmember(gid_t gid, const struct ucred *cred)
{
- /*
- * Although the equality test on 'cr_rgid' below doesn't access
- * 'cr_groups', we check for the latter's length here as we assume that,
- * if 'cr_ngroups' is 0, the passed 'struct ucred' is invalid, and
- * 'cr_rgid' may not have been filled.
- */
groups_check_positive_len(cred->cr_ngroups);
if (gid == cred->cr_rgid)
@@ -1896,19 +1885,22 @@ SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
static int
cr_canseeothergids(struct ucred *u1, struct ucred *u2)
{
- if (!see_other_gids) {
- if (realgroupmember(u1->cr_rgid, u2))
- return (0);
+ if (see_other_gids)
+ return (0);
- for (int i = 1; i < u1->cr_ngroups; i++)
- if (realgroupmember(u1->cr_groups[i], u2))
- return (0);
+ /* Restriction in force. */
- if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) != 0)
- return (ESRCH);
- }
+ if (realgroupmember(u1->cr_rgid, u2))
+ return (0);
- return (0);
+ for (int i = 0; i < u1->cr_ngroups; i++)
+ if (realgroupmember(u1->cr_groups[i], u2))
+ return (0);
+
+ if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) == 0)
+ return (0);
+
+ return (ESRCH);
}
/*
@@ -2276,6 +2268,7 @@ cr_xids_subset(struct ucred *active_cred, struct ucred *obj_cred)
}
}
grpsubset = grpsubset &&
+ groupmember(obj_cred->cr_gid, active_cred) &&
groupmember(obj_cred->cr_rgid, active_cred) &&
groupmember(obj_cred->cr_svgid, active_cred);
@@ -2921,8 +2914,8 @@ crextend(struct ucred *cr, int n)
* Normalizes a set of groups to be applied to a 'struct ucred'.
*
* Normalization ensures that the supplementary groups are sorted in ascending
- * order and do not contain duplicates. This allows group_is_supplementary
- * to do a binary search.
+ * order and do not contain duplicates. This allows group_is_supplementary() to
+ * do a binary search.
*/
static void
groups_normalize(int *ngrp, gid_t *groups)
@@ -2985,9 +2978,9 @@ crsetgroups_internal(struct ucred *cr, int ngrp, const gid_t *groups)
* Copy groups in to a credential after expanding it if required.
*
* May sleep in order to allocate memory (except if, e.g., crextend() was called
- * before with 'ngrp' or greater). Truncates the list to ngroups_max if
+ * before with 'ngrp' or greater). Truncates the list to 'ngroups_max' if
* it is too large. Array 'groups' doesn't need to be sorted. 'ngrp' must be
- * strictly positive.
+ * positive.
*/
void
crsetgroups(struct ucred *cr, int ngrp, const gid_t *groups)
@@ -3018,8 +3011,8 @@ crsetgroups(struct ucred *cr, int ngrp, const gid_t *groups)
* Same as crsetgroups() but sets the effective GID as well.
*
* This function ensures that an effective GID is always present in credentials.
- * An empty array will only set the effective GID to the default_egid, while a
- * non-empty array will peel off groups[0] to set as the effective GID and use
+ * An empty array will only set the effective GID to 'default_egid', while
+ * a non-empty array will peel off groups[0] to set as the effective GID and use
* the remainder, if any, as supplementary groups.
*/
void
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7351e9cb6313..2aab151aba08 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -1312,7 +1312,7 @@ static struct kproc_desc racctd_kp = {
};
static void
-racctd_init(void)
+racctd_init(void *dummy __unused)
{
if (!racct_enable)
return;
@@ -1322,7 +1322,7 @@ racctd_init(void)
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
static void
-racct_init(void)
+racct_init(void *dummy __unused)
{
if (!racct_enable)
return;
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
index 3854ffbeec29..cd66bff62608 100644
--- a/sys/kern/kern_rangelock.c
+++ b/sys/kern/kern_rangelock.c
@@ -300,7 +300,7 @@ static void rangelock_free_free(struct rl_q_entry *free);
static void rangelock_noncheating_destroy(struct rangelock *lock);
static void
-rangelock_sys_init(void)
+rangelock_sys_init(void *dummy __unused)
{
rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 4232c71f86fb..682ba86d23ff 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -209,7 +209,7 @@ static struct dict actionnames[] = {
{ "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
-static void rctl_init(void);
+static void rctl_init(void *);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
static uma_zone_t rctl_rule_zone;
@@ -2175,7 +2175,7 @@ rctl_racct_release(struct racct *racct)
}
static void
-rctl_init(void)
+rctl_init(void *dummy __unused)
{
if (!racct_enable)
diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
index c1633dd19de2..7206572ffc02 100644
--- a/sys/kern/kern_rmlock.c
+++ b/sys/kern/kern_rmlock.c
@@ -337,9 +337,9 @@ rm_wowned(const struct rmlock *rm)
}
void
-rm_sysinit(void *arg)
+rm_sysinit(const void *arg)
{
- struct rm_args *args;
+ const struct rm_args *args;
args = arg;
rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags);
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index e182d1fe9baf..84a3a890be63 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -266,9 +266,9 @@ _rw_destroy(volatile uintptr_t *c)
}
void
-rw_sysinit(void *arg)
+rw_sysinit(const void *arg)
{
- struct rw_args *args;
+ const struct rw_args *args;
args = arg;
rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c
index 5b8398caaca9..f48d0e3d616b 100644
--- a/sys/kern/kern_sharedpage.c
+++ b/sys/kern/kern_sharedpage.c
@@ -130,8 +130,7 @@ shared_page_init(void *dummy __unused)
shared_page_mapping = (char *)addr;
}
-SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
- NULL);
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, shared_page_init, NULL);
/*
* Push the timehands update to the shared page.
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index da0efac0598d..a55f3c761449 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -113,7 +113,7 @@ static int filt_sigattach(struct knote *kn);
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock);
-static void sigqueue_start(void);
+static void sigqueue_start(void *);
static void sigfastblock_setpend(struct thread *td, bool resched);
static void sig_handle_first_stop(struct thread *td, struct proc *p,
int sig);
@@ -124,6 +124,7 @@ const struct filterops sig_filtops = {
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
+ .f_copy = knote_triv_copy,
};
static int kern_forcesigexit = 1;
@@ -344,7 +345,7 @@ ast_sigsuspend(struct thread *td, int tda __unused)
}
static void
-sigqueue_start(void)
+sigqueue_start(void *dummy __unused)
{
ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
@@ -2656,9 +2657,11 @@ ptrace_coredumpreq(struct thread *td, struct proc *p,
return;
}
+ memset(&wctx, 0, sizeof(wctx));
wctx.vp = tcq->tc_vp;
wctx.fcred = NOCRED;
+	memset(&cdw, 0, sizeof(cdw));
cdw.ctx = &wctx;
cdw.write_fn = core_vn_write;
cdw.extend_fn = core_vn_extend;
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
index accea5d288eb..249faf5b1ec4 100644
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@@ -222,9 +222,9 @@ owner_sx(const struct lock_object *lock, struct thread **owner)
#endif
void
-sx_sysinit(void *arg)
+sx_sysinit(const void *arg)
{
- struct sx_args *sargs = arg;
+ const struct sx_args *sargs = arg;
sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags);
}
@@ -1539,16 +1539,19 @@ sx_chain(struct thread *td, struct thread **ownerp)
/*
* Check to see if this thread is blocked on an sx lock.
- * First, we check the lock class. If that is ok, then we
- * compare the lock name against the wait message.
+ * The thread should be on a sleep queue with type SLEEPQ_SX, the
+ * purported lock should have the lock class index of sx, and the lock
+ * name should match the wait message.
*/
sx = td->td_wchan;
- if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
+ if (!TD_ON_SLEEPQ(td) || sleepq_type(td->td_wchan) != SLEEPQ_SX ||
+ LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
sx->lock_object.lo_name != td->td_wmesg)
return (0);
/* We think we have an sx lock, so output some details. */
- db_printf("blocked on sx \"%s\" ", td->td_wmesg);
+ db_printf("blocked on lock %p (%s) \"%s\" ", &sx->lock_object,
+ lock_class_sx.lc_name, td->td_wmesg);
*ownerp = sx_xholder(sx);
if (sx->sx_lock & SX_LOCK_SHARED)
db_printf("SLOCK (count %ju)\n",
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index 0e8c2b9f362e..4329959a2ef4 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -347,6 +347,17 @@ kern_thr_exit(struct thread *td)
p = td->td_proc;
/*
+ * Clear kernel ASTs in advance of selecting the last exiting
+ * thread and acquiring schedulers locks. It is fine to
+ * clear the ASTs here even if we are not going to exit after
+ * all. On the other hand, leaving them pending could trigger
+ * execution in subsystems in a context where they are not
+ * prepared to handle top kernel actions, even in execution of
+ * an unrelated thread.
+ */
+ ast_kclear(td);
+
+ /*
* If all of the threads in a process call this routine to
* exit (e.g. all threads call pthread_exit()), exactly one
* thread should return to the caller to terminate the process
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 50b040132396..3180c66cb42b 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -1694,8 +1694,10 @@ thread_single_end(struct proc *p, int mode)
thread_unlock(td);
}
}
- KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
- ("inconsistent boundary count %d", p->p_boundary_count));
+ KASSERT(mode != SINGLE_BOUNDARY || P_SHOULDSTOP(p) ||
+ p->p_boundary_count == 0,
+ ("pid %d proc %p flags %#x inconsistent boundary count %d",
+ p->p_pid, p, p->p_flag, p->p_boundary_count));
PROC_SUNLOCK(p);
wakeup(&p->p_flag);
}
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index 2a6f0989f6aa..5b7485c25cd7 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -90,7 +90,7 @@ static int user_clock_nanosleep(struct thread *td, clockid_t clock_id,
int flags, const struct timespec *ua_rqtp,
struct timespec *ua_rmtp);
-static void itimer_start(void);
+static void itimer_start(void *);
static int itimer_init(void *, int, int);
static void itimer_fini(void *, int);
static void itimer_enter(struct itimer *);
@@ -1170,7 +1170,7 @@ eventratecheck(struct timeval *lasttime, int *cureps, int maxeps)
}
static void
-itimer_start(void)
+itimer_start(void *dummy __unused)
{
static const struct kclock rt_clock = {
.timer_create = realtimer_create,
diff --git a/sys/kern/kern_tslog.c b/sys/kern/kern_tslog.c
index fbf81d423b95..09070eea284f 100644
--- a/sys/kern/kern_tslog.c
+++ b/sys/kern/kern_tslog.c
@@ -220,3 +220,13 @@ SYSCTL_PROC(_debug, OID_AUTO, tslog_user,
CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_SKIP,
0, 0, sysctl_debug_tslog_user,
"", "Dump recorded userland event timestamps");
+
+void
+sysinit_tslog_shim(const void *data)
+{
+ const struct sysinit_tslog *x = data;
+
+ tslog(curthread, TS_ENTER, "SYSINIT", x->name);
+ (x->func)(x->data);
+ tslog(curthread, TS_EXIT, "SYSINIT", x->name);
+}
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
index bbebadc4c395..ebd203858b66 100644
--- a/sys/kern/link_elf.c
+++ b/sys/kern/link_elf.c
@@ -518,9 +518,15 @@ link_elf_init(void* arg)
(void)link_elf_link_common_finish(linker_kernel_file);
linker_kernel_file->flags |= LINKER_FILE_LINKED;
TAILQ_INIT(&set_pcpu_list);
+ ef->pcpu_start = DPCPU_START;
+ ef->pcpu_stop = DPCPU_STOP;
+ ef->pcpu_base = DPCPU_START;
#ifdef VIMAGE
TAILQ_INIT(&set_vnet_list);
vnet_save_init((void *)VNET_START, VNET_STOP - VNET_START);
+ ef->vnet_start = VNET_START;
+ ef->vnet_stop = VNET_STOP;
+ ef->vnet_base = VNET_START;
#endif
}
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
index 151aab96f9be..a3a53a39bfd6 100644
--- a/sys/kern/link_elf_obj.c
+++ b/sys/kern/link_elf_obj.c
@@ -70,6 +70,7 @@
typedef struct {
void *addr;
+ void *origaddr; /* Used by debuggers. */
Elf_Off size;
int flags; /* Section flags. */
int sec; /* Original section number. */
@@ -492,7 +493,8 @@ link_elf_link_preload(linker_class_t cls, const char *filename,
case SHT_FINI_ARRAY:
if (shdr[i].sh_addr == 0)
break;
- ef->progtab[pb].addr = (void *)shdr[i].sh_addr;
+ ef->progtab[pb].addr = ef->progtab[pb].origaddr =
+ (void *)shdr[i].sh_addr;
if (shdr[i].sh_type == SHT_PROGBITS)
ef->progtab[pb].name = "<<PROGBITS>>";
#ifdef __amd64__
@@ -1088,6 +1090,8 @@ link_elf_load_file(linker_class_t cls, const char *filename,
ef->progtab[pb].name = "<<NOBITS>>";
if (ef->progtab[pb].name != NULL &&
!strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) {
+ ef->progtab[pb].origaddr =
+ (void *)(uintptr_t)mapbase;
ef->progtab[pb].addr =
dpcpu_alloc(shdr[i].sh_size);
if (ef->progtab[pb].addr == NULL) {
@@ -1101,6 +1105,8 @@ link_elf_load_file(linker_class_t cls, const char *filename,
#ifdef VIMAGE
else if (ef->progtab[pb].name != NULL &&
!strcmp(ef->progtab[pb].name, VNET_SETNAME)) {
+ ef->progtab[pb].origaddr =
+ (void *)(uintptr_t)mapbase;
ef->progtab[pb].addr =
vnet_data_alloc(shdr[i].sh_size);
if (ef->progtab[pb].addr == NULL) {
diff --git a/sys/kern/md4c.c b/sys/kern/md4c.c
deleted file mode 100644
index e173e17e3387..000000000000
--- a/sys/kern/md4c.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm
- */
-
-/*-
- SPDX-License-Identifier: RSA-MD
-
- Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.
-
- License to copy and use this software is granted provided that it
- is identified as the "RSA Data Security, Inc. MD4 Message-Digest
- Algorithm" in all material mentioning or referencing this software
- or this function.
-
- License is also granted to make and use derivative works provided
- that such works are identified as "derived from the RSA Data
- Security, Inc. MD4 Message-Digest Algorithm" in all material
- mentioning or referencing the derived work.
-
- RSA Data Security, Inc. makes no representations concerning either
- the merchantability of this software or the suitability of this
- software for any particular purpose. It is provided "as is"
- without express or implied warranty of any kind.
-
- These notices must be retained in any copies of any part of this
- documentation and/or software.
- */
-
-#include <sys/param.h>
-#ifdef _KERNEL
-#include <sys/systm.h>
-#else
-#include <string.h>
-#endif
-#include <sys/md4.h>
-
-typedef unsigned char *POINTER;
-typedef uint16_t UINT2;
-typedef uint32_t UINT4;
-
-#define PROTO_LIST(list) list
-
-/* Constants for MD4Transform routine.
- */
-#define S11 3
-#define S12 7
-#define S13 11
-#define S14 19
-#define S21 3
-#define S22 5
-#define S23 9
-#define S24 13
-#define S31 3
-#define S32 9
-#define S33 11
-#define S34 15
-
-static void MD4Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
-static void Encode PROTO_LIST
- ((unsigned char *, UINT4 *, unsigned int));
-static void Decode PROTO_LIST
- ((UINT4 *, const unsigned char *, unsigned int));
-
-static unsigned char PADDING[64] = {
- 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* F, G and H are basic MD4 functions.
- */
-#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
-#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-
-/* ROTATE_LEFT rotates x left n bits.
- */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
-
-/* FF, GG and HH are transformations for rounds 1, 2 and 3 */
-/* Rotation is separate from addition to prevent recomputation */
-#define FF(a, b, c, d, x, s) { \
- (a) += F ((b), (c), (d)) + (x); \
- (a) = ROTATE_LEFT ((a), (s)); \
- }
-#define GG(a, b, c, d, x, s) { \
- (a) += G ((b), (c), (d)) + (x) + (UINT4)0x5a827999; \
- (a) = ROTATE_LEFT ((a), (s)); \
- }
-#define HH(a, b, c, d, x, s) { \
- (a) += H ((b), (c), (d)) + (x) + (UINT4)0x6ed9eba1; \
- (a) = ROTATE_LEFT ((a), (s)); \
- }
-
-/* MD4 initialization. Begins an MD4 operation, writing a new context.
- */
-void
-MD4Init(MD4_CTX *context)
-{
- context->count[0] = context->count[1] = 0;
-
- /* Load magic initialization constants.
- */
- context->state[0] = 0x67452301;
- context->state[1] = 0xefcdab89;
- context->state[2] = 0x98badcfe;
- context->state[3] = 0x10325476;
-}
-
-/* MD4 block update operation. Continues an MD4 message-digest
- operation, processing another message block, and updating the
- context.
- */
-void
-MD4Update(MD4_CTX *context, const unsigned char *input,
- unsigned int inputLen)
-{
- unsigned int i, index, partLen;
-
- /* Compute number of bytes mod 64 */
- index = (unsigned int)((context->count[0] >> 3) & 0x3F);
- /* Update number of bits */
- if ((context->count[0] += ((UINT4)inputLen << 3))
- < ((UINT4)inputLen << 3))
- context->count[1]++;
- context->count[1] += ((UINT4)inputLen >> 29);
-
- partLen = 64 - index;
- /* Transform as many times as possible.
- */
- if (inputLen >= partLen) {
- bcopy(input, &context->buffer[index], partLen);
- MD4Transform (context->state, context->buffer);
-
- for (i = partLen; i + 63 < inputLen; i += 64)
- MD4Transform (context->state, &input[i]);
-
- index = 0;
- }
- else
- i = 0;
-
- /* Buffer remaining input */
- bcopy(&input[i], &context->buffer[index], inputLen-i);
-}
-
-/* MD4 padding. */
-void
-MD4Pad(MD4_CTX *context)
-{
- unsigned char bits[8];
- unsigned int index, padLen;
-
- /* Save number of bits */
- Encode (bits, context->count, 8);
-
- /* Pad out to 56 mod 64.
- */
- index = (unsigned int)((context->count[0] >> 3) & 0x3f);
- padLen = (index < 56) ? (56 - index) : (120 - index);
- MD4Update (context, PADDING, padLen);
-
- /* Append length (before padding) */
- MD4Update (context, bits, 8);
-}
-
-/* MD4 finalization. Ends an MD4 message-digest operation, writing the
- the message digest and zeroizing the context.
- */
-void
-MD4Final(unsigned char digest[static 16], MD4_CTX *context)
-{
- /* Do padding */
- MD4Pad (context);
-
- /* Store state in digest */
- Encode (digest, context->state, 16);
-
- /* Zeroize sensitive information.
- */
- bzero(context, sizeof (*context));
-}
-
-/* MD4 basic transformation. Transforms state based on block.
- */
-static void
-MD4Transform(UINT4 state[4], const unsigned char block[64])
-{
- UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
-
- Decode (x, block, 64);
-
- /* Round 1 */
- FF (a, b, c, d, x[ 0], S11); /* 1 */
- FF (d, a, b, c, x[ 1], S12); /* 2 */
- FF (c, d, a, b, x[ 2], S13); /* 3 */
- FF (b, c, d, a, x[ 3], S14); /* 4 */
- FF (a, b, c, d, x[ 4], S11); /* 5 */
- FF (d, a, b, c, x[ 5], S12); /* 6 */
- FF (c, d, a, b, x[ 6], S13); /* 7 */
- FF (b, c, d, a, x[ 7], S14); /* 8 */
- FF (a, b, c, d, x[ 8], S11); /* 9 */
- FF (d, a, b, c, x[ 9], S12); /* 10 */
- FF (c, d, a, b, x[10], S13); /* 11 */
- FF (b, c, d, a, x[11], S14); /* 12 */
- FF (a, b, c, d, x[12], S11); /* 13 */
- FF (d, a, b, c, x[13], S12); /* 14 */
- FF (c, d, a, b, x[14], S13); /* 15 */
- FF (b, c, d, a, x[15], S14); /* 16 */
-
- /* Round 2 */
- GG (a, b, c, d, x[ 0], S21); /* 17 */
- GG (d, a, b, c, x[ 4], S22); /* 18 */
- GG (c, d, a, b, x[ 8], S23); /* 19 */
- GG (b, c, d, a, x[12], S24); /* 20 */
- GG (a, b, c, d, x[ 1], S21); /* 21 */
- GG (d, a, b, c, x[ 5], S22); /* 22 */
- GG (c, d, a, b, x[ 9], S23); /* 23 */
- GG (b, c, d, a, x[13], S24); /* 24 */
- GG (a, b, c, d, x[ 2], S21); /* 25 */
- GG (d, a, b, c, x[ 6], S22); /* 26 */
- GG (c, d, a, b, x[10], S23); /* 27 */
- GG (b, c, d, a, x[14], S24); /* 28 */
- GG (a, b, c, d, x[ 3], S21); /* 29 */
- GG (d, a, b, c, x[ 7], S22); /* 30 */
- GG (c, d, a, b, x[11], S23); /* 31 */
- GG (b, c, d, a, x[15], S24); /* 32 */
-
- /* Round 3 */
- HH (a, b, c, d, x[ 0], S31); /* 33 */
- HH (d, a, b, c, x[ 8], S32); /* 34 */
- HH (c, d, a, b, x[ 4], S33); /* 35 */
- HH (b, c, d, a, x[12], S34); /* 36 */
- HH (a, b, c, d, x[ 2], S31); /* 37 */
- HH (d, a, b, c, x[10], S32); /* 38 */
- HH (c, d, a, b, x[ 6], S33); /* 39 */
- HH (b, c, d, a, x[14], S34); /* 40 */
- HH (a, b, c, d, x[ 1], S31); /* 41 */
- HH (d, a, b, c, x[ 9], S32); /* 42 */
- HH (c, d, a, b, x[ 5], S33); /* 43 */
- HH (b, c, d, a, x[13], S34); /* 44 */
- HH (a, b, c, d, x[ 3], S31); /* 45 */
- HH (d, a, b, c, x[11], S32); /* 46 */
- HH (c, d, a, b, x[ 7], S33); /* 47 */
- HH (b, c, d, a, x[15], S34); /* 48 */
-
- state[0] += a;
- state[1] += b;
- state[2] += c;
- state[3] += d;
-
- /* Zeroize sensitive information.
- */
- bzero((POINTER)x, sizeof (x));
-}
-
-/* Encodes input (UINT4) into output (unsigned char). Assumes len is
- a multiple of 4.
- */
-static void
-Encode(unsigned char *output, UINT4 *input, unsigned int len)
-{
- unsigned int i, j;
-
- for (i = 0, j = 0; j < len; i++, j += 4) {
- output[j] = (unsigned char)(input[i] & 0xff);
- output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
- output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
- output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
- }
-}
-
-/* Decodes input (unsigned char) into output (UINT4). Assumes len is
- a multiple of 4.
- */
-static void
-Decode(UINT4 *output, const unsigned char *input, unsigned int len)
-{
- unsigned int i, j;
-
- for (i = 0, j = 0; j < len; i++, j += 4)
- output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
- (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
-}
-
-#ifdef WEAK_REFS
-/* When building libmd, provide weak references. Note: this is not
- activated in the context of compiling these sources for internal
- use in libcrypt.
- */
-#undef MD4Init
-__weak_reference(_libmd_MD4Init, MD4Init);
-#undef MD4Update
-__weak_reference(_libmd_MD4Update, MD4Update);
-#undef MD4Pad
-__weak_reference(_libmd_MD4Pad, MD4Pad);
-#undef MD4Final
-__weak_reference(_libmd_MD4Final, MD4Final);
-#endif
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c
deleted file mode 100644
index 0922d0f8cc61..000000000000
--- a/sys/kern/md5c.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*-
- * SPDX-License-Identifier: RSA-MD
- *
- * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
- *
- * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
- * rights reserved.
- *
- * License to copy and use this software is granted provided that it
- * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
- * Algorithm" in all material mentioning or referencing this software
- * or this function.
- *
- * License is also granted to make and use derivative works provided
- * that such works are identified as "derived from the RSA Data
- * Security, Inc. MD5 Message-Digest Algorithm" in all material
- * mentioning or referencing the derived work.
- *
- * RSA Data Security, Inc. makes no representations concerning either
- * the merchantability of this software or the suitability of this
- * software for any particular purpose. It is provided "as is"
- * without express or implied warranty of any kind.
- *
- * These notices must be retained in any copies of any part of this
- * documentation and/or software.
- *
- * This code is the same as the code published by RSA Inc. It has been
- * edited for clarity and style only.
- */
-
-#include <sys/types.h>
-
-#ifdef _KERNEL
-#include <sys/systm.h>
-#else
-#include <string.h>
-#endif
-
-#include <machine/endian.h>
-#include <sys/endian.h>
-#include <sys/md5.h>
-
-static void MD5Transform(uint32_t [4], const unsigned char [64]);
-
-#if (BYTE_ORDER == LITTLE_ENDIAN)
-#define Encode memcpy
-#define Decode memcpy
-#else
-
-/*
- * Encodes input (uint32_t) into output (unsigned char). Assumes len is
- * a multiple of 4.
- */
-
-static void
-Encode (unsigned char *output, uint32_t *input, unsigned int len)
-{
- unsigned int i;
- uint32_t ip;
-
- for (i = 0; i < len / 4; i++) {
- ip = input[i];
- *output++ = ip;
- *output++ = ip >> 8;
- *output++ = ip >> 16;
- *output++ = ip >> 24;
- }
-}
-
-/*
- * Decodes input (unsigned char) into output (uint32_t). Assumes len is
- * a multiple of 4.
- */
-
-static void
-Decode (uint32_t *output, const unsigned char *input, unsigned int len)
-{
- unsigned int i;
-
- for (i = 0; i < len; i += 4) {
- *output++ = input[i] | (input[i+1] << 8) | (input[i+2] << 16) |
- (input[i+3] << 24);
- }
-}
-#endif
-
-static unsigned char PADDING[64] = {
- 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* F, G, H and I are basic MD5 functions. */
-#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
-#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-#define I(x, y, z) ((y) ^ ((x) | (~z)))
-
-/* ROTATE_LEFT rotates x left n bits. */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
-
-/*
- * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
- * Rotation is separate from addition to prevent recomputation.
- */
-#define FF(a, b, c, d, x, s, ac) { \
- (a) += F ((b), (c), (d)) + (x) + (uint32_t)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define GG(a, b, c, d, x, s, ac) { \
- (a) += G ((b), (c), (d)) + (x) + (uint32_t)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define HH(a, b, c, d, x, s, ac) { \
- (a) += H ((b), (c), (d)) + (x) + (uint32_t)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define II(a, b, c, d, x, s, ac) { \
- (a) += I ((b), (c), (d)) + (x) + (uint32_t)(ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-
-/* MD5 initialization. Begins an MD5 operation, writing a new context. */
-
-void
-MD5Init(MD5_CTX *context)
-{
-
- context->count[0] = context->count[1] = 0;
-
- /* Load magic initialization constants. */
- context->state[0] = 0x67452301;
- context->state[1] = 0xefcdab89;
- context->state[2] = 0x98badcfe;
- context->state[3] = 0x10325476;
-}
-
-/*
- * MD5 block update operation. Continues an MD5 message-digest
- * operation, processing another message block, and updating the
- * context.
- */
-
-void
-MD5Update(MD5_CTX *context, const void *in, unsigned int inputLen)
-{
- unsigned int i, index, partLen;
- const unsigned char *input = in;
-
- /* Compute number of bytes mod 64 */
- index = (unsigned int)((context->count[0] >> 3) & 0x3F);
-
- /* Update number of bits */
- if ((context->count[0] += ((uint32_t)inputLen << 3))
- < ((uint32_t)inputLen << 3))
- context->count[1]++;
- context->count[1] += ((uint32_t)inputLen >> 29);
-
- partLen = 64 - index;
-
- /* Transform as many times as possible. */
- if (inputLen >= partLen) {
- memcpy((void *)&context->buffer[index], (const void *)input,
- partLen);
- MD5Transform (context->state, context->buffer);
-
- for (i = partLen; i + 63 < inputLen; i += 64)
- MD5Transform (context->state, &input[i]);
-
- index = 0;
- }
- else
- i = 0;
-
- /* Buffer remaining input */
- memcpy ((void *)&context->buffer[index], (const void *)&input[i],
- inputLen-i);
-}
-
-/*
- * MD5 padding. Adds padding followed by original length.
- */
-
-static void
-MD5Pad(MD5_CTX *context)
-{
- unsigned char bits[8];
- unsigned int index, padLen;
-
- /* Save number of bits */
- Encode (bits, context->count, 8);
-
- /* Pad out to 56 mod 64. */
- index = (unsigned int)((context->count[0] >> 3) & 0x3f);
- padLen = (index < 56) ? (56 - index) : (120 - index);
- MD5Update (context, PADDING, padLen);
-
- /* Append length (before padding) */
- MD5Update (context, bits, 8);
-}
-
-/*
- * MD5 finalization. Ends an MD5 message-digest operation, writing the
- * the message digest and zeroizing the context.
- */
-
-void
-MD5Final(unsigned char digest[static MD5_DIGEST_LENGTH], MD5_CTX *context)
-{
- /* Do padding. */
- MD5Pad (context);
-
- /* Store state in digest */
- Encode (digest, context->state, MD5_DIGEST_LENGTH);
-
- /* Zeroize sensitive information. */
- explicit_bzero (context, sizeof (*context));
-}
-
-/* MD5 basic transformation. Transforms state based on block. */
-
-static void
-MD5Transform(uint32_t state[4], const unsigned char block[64])
-{
- uint32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];
-
- Decode (x, block, 64);
-
- /* Round 1 */
-#define S11 7
-#define S12 12
-#define S13 17
-#define S14 22
- FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
- FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
- FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
- FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
- FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
- FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
- FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
- FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
- FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
- FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
- FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
- FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
- FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
- FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
- FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
- FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
-
- /* Round 2 */
-#define S21 5
-#define S22 9
-#define S23 14
-#define S24 20
- GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
- GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
- GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
- GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
- GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
- GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
- GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
- GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
- GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
- GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
- GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
- GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
- GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
- GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
- GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
- GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
-
- /* Round 3 */
-#define S31 4
-#define S32 11
-#define S33 16
-#define S34 23
- HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
- HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
- HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
- HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
- HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
- HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
- HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
- HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
- HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
- HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
- HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
- HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
- HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
- HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
- HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
- HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
-
- /* Round 4 */
-#define S41 6
-#define S42 10
-#define S43 15
-#define S44 21
- II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
- II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
- II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
- II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
- II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
- II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
- II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
- II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
- II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
- II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
- II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
- II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
- II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
- II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
- II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
- II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
-
- state[0] += a;
- state[1] += b;
- state[2] += c;
- state[3] += d;
-
- /* Zeroize sensitive information. */
- memset ((void *)x, 0, sizeof (x));
-}
-
-#ifdef WEAK_REFS
-/* When building libmd, provide weak references. Note: this is not
- activated in the context of compiling these sources for internal
- use in libcrypt.
- */
-#undef MD5Init
-__weak_reference(_libmd_MD5Init, MD5Init);
-#undef MD5Update
-__weak_reference(_libmd_MD5Update, MD5Update);
-#undef MD5Final
-__weak_reference(_libmd_MD5Final, MD5Final);
-#endif
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 464efda1e91a..fee6c1a844e2 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -835,6 +835,7 @@ ASAN_ATOMIC_FUNC_TESTANDSET(32, uint32_t);
ASAN_ATOMIC_FUNC_TESTANDSET(64, uint64_t);
ASAN_ATOMIC_FUNC_TESTANDSET(int, u_int);
ASAN_ATOMIC_FUNC_TESTANDSET(long, u_long);
+ASAN_ATOMIC_FUNC_TESTANDSET(acq_long, u_long);
ASAN_ATOMIC_FUNC_TESTANDSET(ptr, uintptr_t);
ASAN_ATOMIC_FUNC_SWAP(32, uint32_t);
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
index 62a3da964c37..bf5bda7e058d 100644
--- a/sys/kern/subr_bus.c
+++ b/sys/kern/subr_bus.c
@@ -280,6 +280,9 @@ device_sysctl_handler(SYSCTL_HANDLER_ARGS)
struct sbuf sb;
device_t dev = (device_t)arg1;
device_t iommu;
+#ifdef IOMMU
+ device_t requester;
+#endif
int error;
uint16_t rid;
const char *c;
@@ -314,9 +317,15 @@ device_sysctl_handler(SYSCTL_HANDLER_ARGS)
}
rid = 0;
#ifdef IOMMU
- iommu_get_requester(dev, &rid);
+ error = iommu_get_requester(dev, &requester, &rid);
+ /*
+	 * Do not return the requester error from sysctl; the
+	 * iommu unit might be assigned by other means.
+ */
+#else
+ error = ENXIO;
#endif
- if (rid != 0)
+ if (error == 0)
sbuf_printf(&sb, "%srid=%#x", c, rid);
break;
default:
diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c
index 07a9cc0f57be..c4d0223d484f 100644
--- a/sys/kern/subr_devstat.c
+++ b/sys/kern/subr_devstat.c
@@ -415,7 +415,7 @@ sysctl_devstat(SYSCTL_HANDLER_ARGS)
if (error != 0)
return (error);
- for (;nds != NULL;) {
+ while (nds != NULL) {
error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
if (error != 0)
return (error);
diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c
index 56264a96c9fa..909dd10a6e69 100644
--- a/sys/kern/subr_kdb.c
+++ b/sys/kern/subr_kdb.c
@@ -330,7 +330,7 @@ kdb_reboot(void)
#define KEY_CRTLP 16 /* ^P */
#define KEY_CRTLR 18 /* ^R */
-/* States of th KDB "alternate break sequence" detecting state machine. */
+/* States of the KDB "alternate break sequence" detecting state machine. */
enum {
KDB_ALT_BREAK_SEEN_NONE,
KDB_ALT_BREAK_SEEN_CR,
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
index 5380902e602f..aac35a56130e 100644
--- a/sys/kern/subr_log.c
+++ b/sys/kern/subr_log.c
@@ -79,6 +79,7 @@ static const struct filterops log_read_filterops = {
.f_attach = NULL,
.f_detach = logkqdetach,
.f_event = logkqread,
+ .f_copy = knote_triv_copy,
};
static struct logsoftc {
diff --git a/sys/kern/subr_msan.c b/sys/kern/subr_msan.c
index a3238b61482b..883dbd2b7604 100644
--- a/sys/kern/subr_msan.c
+++ b/sys/kern/subr_msan.c
@@ -1301,6 +1301,7 @@ MSAN_ATOMIC_FUNC_TESTANDSET(32, uint32_t);
MSAN_ATOMIC_FUNC_TESTANDSET(64, uint64_t);
MSAN_ATOMIC_FUNC_TESTANDSET(int, u_int);
MSAN_ATOMIC_FUNC_TESTANDSET(long, u_long);
+MSAN_ATOMIC_FUNC_TESTANDSET(acq_long, u_long);
MSAN_ATOMIC_FUNC_TESTANDSET(ptr, uintptr_t);
MSAN_ATOMIC_FUNC_SWAP(32, uint32_t);
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
index 471640c290a7..a67e5fa6cbff 100644
--- a/sys/kern/subr_param.c
+++ b/sys/kern/subr_param.c
@@ -235,14 +235,11 @@ init_param1(void)
* specification for <limits.h>, paragraph "Runtime Increasable
* Values").
*
- * On the other hand, INT_MAX would result in an overflow for the common
- * 'ngroups_max + 1' computation (to obtain the size of the internal
- * groups array, its first element being reserved for the effective
- * GID). Also, the number of allocated bytes for the group array must
- * not overflow on 32-bit machines. For all these reasons, we limit the
- * number of supplementary groups to some very high number that we
- * expect will never be reached in all practical uses and ensures we
- * avoid the problems just exposed, even if 'gid_t' was to be enlarged
+	 * On the other hand, too high a value would result in an overflow when
+ * computing the number of bytes to allocate for the groups array. We
+ * thus limit the number of supplementary groups to some very high
+ * number that we expect will never be reached in all practical uses,
+ * avoiding the problem just exposed even if 'gid_t' were to be enlarged
* by a magnitude.
*/
ngroups_max = NGROUPS_MAX;
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
index 5c14e15830f4..c9a387a5e87b 100644
--- a/sys/kern/subr_pcpu.c
+++ b/sys/kern/subr_pcpu.c
@@ -140,7 +140,7 @@ uma_zone_t pcpu_zone_32;
uma_zone_t pcpu_zone_64;
static void
-pcpu_zones_startup(void)
+pcpu_zones_startup(void *dummy __unused)
{
pcpu_zone_4 = uma_zcreate("pcpu-4", 4,
diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c
index db0e7bf5b0e3..f5a581e42bf3 100644
--- a/sys/kern/subr_power.c
+++ b/sys/kern/subr_power.c
@@ -3,6 +3,10 @@
*
* Copyright (c) 2001 Mitsuru IWASAKI
* All rights reserved.
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Aymeric Wibo
+ * <obiwac@freebsd.org> under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,24 +34,113 @@
#include <sys/eventhandler.h>
#include <sys/power.h>
#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
+enum power_stype power_standby_stype = POWER_STYPE_UNKNOWN;
+enum power_stype power_suspend_stype = POWER_STYPE_UNKNOWN;
+enum power_stype power_hibernate_stype = POWER_STYPE_UNKNOWN;
+
static u_int power_pm_type = POWER_PM_TYPE_NONE;
static power_pm_fn_t power_pm_fn = NULL;
static void *power_pm_arg = NULL;
+static bool power_pm_supported[POWER_STYPE_COUNT] = {0};
static struct task power_pm_task;
+enum power_stype
+power_name_to_stype(const char *name)
+{
+ enum power_stype stype;
+
+ for (stype = 0; stype < POWER_STYPE_COUNT; stype++) {
+ if (strcasecmp(name, power_stype_names[stype]) == 0)
+ return (stype);
+ }
+ return (POWER_STYPE_UNKNOWN);
+}
+
+const char *
+power_stype_to_name(enum power_stype stype)
+{
+ if (stype == POWER_STYPE_UNKNOWN)
+ return ("NONE");
+ if (stype < POWER_STYPE_AWAKE || stype >= POWER_STYPE_COUNT)
+ return (NULL);
+ return (power_stype_names[stype]);
+}
+
+static int
+sysctl_supported_stypes(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct sbuf sb;
+ enum power_stype stype;
+
+ sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
+ for (stype = 0; stype < POWER_STYPE_COUNT; stype++) {
+ if (power_pm_supported[stype])
+ sbuf_printf(&sb, "%s ", power_stype_to_name(stype));
+ }
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+ sbuf_delete(&sb);
+
+ return (error);
+}
+
+static int
+power_sysctl_stype(SYSCTL_HANDLER_ARGS)
+{
+ char name[10];
+ int err;
+ enum power_stype new_stype, old_stype;
+
+ old_stype = *(enum power_stype *)oidp->oid_arg1;
+ strlcpy(name, power_stype_to_name(old_stype), sizeof(name));
+ err = sysctl_handle_string(oidp, name, sizeof(name), req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ new_stype = power_name_to_stype(name);
+ if (new_stype == POWER_STYPE_UNKNOWN)
+ return (EINVAL);
+ if (!power_pm_supported[new_stype])
+ return (EOPNOTSUPP);
+ if (new_stype != old_stype)
+ *(enum power_stype *)oidp->oid_arg1 = new_stype;
+ return (0);
+}
+
+static SYSCTL_NODE(_kern, OID_AUTO, power, CTLFLAG_RW, 0,
+ "Generic power management related sysctls");
+
+SYSCTL_PROC(_kern_power, OID_AUTO, supported_stype,
+ CTLTYPE_STRING | CTLFLAG_RD, 0, 0, sysctl_supported_stypes, "A",
+ "List supported sleep types");
+SYSCTL_PROC(_kern_power, OID_AUTO, standby, CTLTYPE_STRING | CTLFLAG_RW,
+ &power_standby_stype, 0, power_sysctl_stype, "A",
+ "Sleep type to enter on standby");
+SYSCTL_PROC(_kern_power, OID_AUTO, suspend, CTLTYPE_STRING | CTLFLAG_RW,
+ &power_suspend_stype, 0, power_sysctl_stype, "A",
+ "Sleep type to enter on suspend");
+SYSCTL_PROC(_kern_power, OID_AUTO, hibernate, CTLTYPE_STRING | CTLFLAG_RW,
+ &power_hibernate_stype, 0, power_sysctl_stype, "A",
+ "Sleep type to enter on hibernate");
+
static void
power_pm_deferred_fn(void *arg, int pending)
{
- int state = (intptr_t)arg;
+ enum power_stype stype = (intptr_t)arg;
- power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+ power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, stype);
}
int
-power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
+power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg,
+ bool pm_supported[static POWER_STYPE_COUNT])
{
int error;
@@ -56,6 +149,16 @@ power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
power_pm_type = pm_type;
power_pm_fn = pm_fn;
power_pm_arg = pm_arg;
+ memcpy(power_pm_supported, pm_supported,
+ sizeof(power_pm_supported));
+ if (power_pm_supported[POWER_STYPE_STANDBY])
+ power_standby_stype = POWER_STYPE_STANDBY;
+ if (power_pm_supported[POWER_STYPE_SUSPEND_TO_MEM])
+ power_suspend_stype = POWER_STYPE_SUSPEND_TO_MEM;
+ else if (power_pm_supported[POWER_STYPE_SUSPEND_TO_IDLE])
+ power_suspend_stype = POWER_STYPE_SUSPEND_TO_IDLE;
+ if (power_pm_supported[POWER_STYPE_HIBERNATE])
+ power_hibernate_stype = POWER_STYPE_HIBERNATE;
error = 0;
TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL);
} else {
@@ -75,14 +178,27 @@ power_pm_get_type(void)
void
power_pm_suspend(int state)
{
+ enum power_stype stype;
+
if (power_pm_fn == NULL)
return;
- if (state != POWER_SLEEP_STATE_STANDBY &&
- state != POWER_SLEEP_STATE_SUSPEND &&
- state != POWER_SLEEP_STATE_HIBERNATE)
+ switch (state) {
+ case POWER_SLEEP_STATE_STANDBY:
+ stype = power_standby_stype;
+ break;
+ case POWER_SLEEP_STATE_SUSPEND:
+ stype = power_suspend_stype;
+ break;
+ case POWER_SLEEP_STATE_HIBERNATE:
+ stype = power_hibernate_stype;
+ break;
+ default:
+ printf("%s: unknown sleep state %d\n", __func__, state);
return;
- power_pm_task.ta_context = (void *)(intptr_t)state;
+ }
+
+ power_pm_task.ta_context = (void *)(intptr_t)stype;
taskqueue_enqueue(taskqueue_thread, &power_pm_task);
}
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
index db0ceb17b9f0..e2070ae3f865 100644
--- a/sys/kern/subr_prf.c
+++ b/sys/kern/subr_prf.c
@@ -766,7 +766,7 @@ reswitch: switch (ch = (u_char)*fmt++) {
PCHAR(hex2ascii(*up & 0x0f));
up++;
if (width)
- for (q=p;*q;q++)
+ for (q = p; *q; q++)
PCHAR(*q);
}
break;
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 1f9577fddf9c..9f5106316018 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -242,7 +242,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
KASSERT(
type == IPI_STOP || type == IPI_STOP_HARD
#if X86
- || type == IPI_SUSPEND
+ || type == IPI_SUSPEND || type == IPI_OFF
#endif
, ("%s: invalid stop type", __func__));
@@ -260,7 +260,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
* will be lost, violating FreeBSD's assumption of reliable
* IPI delivery.
*/
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
mtx_lock_spin(&smp_ipi_mtx);
#endif
@@ -280,7 +280,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
#endif
#if X86
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
cpus = &suspended_cpus;
else
#endif
@@ -298,7 +298,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
}
#if X86
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
mtx_unlock_spin(&smp_ipi_mtx);
#endif
@@ -327,6 +327,13 @@ suspend_cpus(cpuset_t map)
return (generic_stop_cpus(map, IPI_SUSPEND));
}
+
+int
+offline_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_OFF));
+}
#endif
/*
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index ab47b6ad29a3..c937f6a82757 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -57,7 +57,7 @@
* b : public affirmation by word or example of usually
* religious faith or conviction <the heroic witness to divine
* life -- Pilot>
- * 6 capitalized : a member of the Jehovah's Witnesses
+ * 6 capitalized : a member of the Jehovah's Witnesses
*/
/*
@@ -131,7 +131,7 @@
#define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. */
#ifndef WITNESS_COUNT
-#define WITNESS_COUNT 1536
+#define WITNESS_COUNT 1536
#endif
#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
#define WITNESS_PENDLIST (512 + (MAXCPU * 4))
@@ -158,20 +158,18 @@
* These flags go in the witness relationship matrix and describe the
* relationship between any two struct witness objects.
*/
-#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
-#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
-#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
-#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
-#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
-#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
-#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
-#define WITNESS_RELATED_MASK \
- (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
-#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
- * observed. */
-#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
-#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
-#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
+#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
+#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
+#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
+#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
+#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
+#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
+#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
+#define WITNESS_RELATED_MASK (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
+#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been observed. */
+#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
+#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
+#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
/* Descendant to ancestor flags */
#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
@@ -218,20 +216,18 @@ struct lock_list_entry {
* (for example, "vnode interlock").
*/
struct witness {
- char w_name[MAX_W_NAME];
- uint32_t w_index; /* Index in the relationship matrix */
+ char w_name[MAX_W_NAME];
+ uint32_t w_index; /* Index in the relationship matrix */
struct lock_class *w_class;
- STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
- STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
- struct witness *w_hash_next; /* Linked list in hash buckets. */
- const char *w_file; /* File where last acquired */
- uint32_t w_line; /* Line where last acquired */
- uint32_t w_refcount;
- uint16_t w_num_ancestors; /* direct/indirect
- * ancestor count */
- uint16_t w_num_descendants; /* direct/indirect
- * descendant count */
- int16_t w_ddb_level;
+ STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
+ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
+ struct witness *w_hash_next; /* Linked list in hash buckets. */
+ const char *w_file; /* File where last acquired */
+ uint32_t w_line; /* Line where last acquired */
+ uint32_t w_refcount;
+ uint16_t w_num_ancestors; /* direct/indirect ancestor count */
+ uint16_t w_num_descendants; /* direct/indirect descendant count */
+ int16_t w_ddb_level;
unsigned w_displayed:1;
unsigned w_reversed:1;
};
@@ -265,7 +261,7 @@ struct witness_lock_order_data {
/*
* The witness lock order data hash table. Keys are witness index tuples
* (struct witness_lock_order_key), elements are lock order data objects
- * (struct witness_lock_order_data).
+ * (struct witness_lock_order_data).
*/
struct witness_lock_order_hash {
struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
@@ -295,7 +291,6 @@ struct witness_order_list_entry {
static __inline int
witness_lock_type_equal(struct witness *w1, struct witness *w2)
{
-
return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
}
@@ -304,7 +299,6 @@ static __inline int
witness_lock_order_key_equal(const struct witness_lock_order_key *a,
const struct witness_lock_order_key *b)
{
-
return (a->from == b->from && a->to == b->to);
}
@@ -415,7 +409,7 @@ SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin,
int badstack_sbuf_size;
int witness_count = WITNESS_COUNT;
-SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN,
+SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN,
&witness_count, 0, "");
/*
@@ -760,7 +754,6 @@ static int witness_spin_warn = 0;
static const char *
fixup_filename(const char *file)
{
-
if (file == NULL)
return (NULL);
while (strncmp(file, "../", 3) == 0)
@@ -835,7 +828,7 @@ witness_startup(void *mem)
w_free_cnt--;
for (i = 0; i < witness_count; i++) {
- memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
+ memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
(witness_count + 1));
}
@@ -989,16 +982,16 @@ witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
{
int i;
- for (i = 0; i < indent; i++)
- prnt(" ");
+ for (i = 0; i < indent; i++)
+ prnt(" ");
prnt("%s (type: %s, depth: %d, active refs: %d)",
w->w_name, w->w_class->lc_name,
w->w_ddb_level, w->w_refcount);
- if (w->w_displayed) {
- prnt(" -- (already displayed)\n");
- return;
- }
- w->w_displayed = 1;
+ if (w->w_displayed) {
+ prnt(" -- (already displayed)\n");
+ return;
+ }
+ w->w_displayed = 1;
if (w->w_file != NULL && w->w_line != 0)
prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
w->w_line);
@@ -1079,7 +1072,6 @@ witness_ddb_display(int(*prnt)(const char *fmt, ...))
int
witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
{
-
if (witness_watch == -1 || KERNEL_PANICKED())
return (0);
@@ -1257,7 +1249,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file,
w->w_reversed = 1;
mtx_unlock_spin(&w_mtx);
witness_output(
- "acquiring duplicate lock of same type: \"%s\"\n",
+ "acquiring duplicate lock of same type: \"%s\"\n",
w->w_name);
witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
fixup_filename(plock->li_file), plock->li_line);
@@ -1523,6 +1515,10 @@ witness_lock(struct lock_object *lock, int flags, const char *file, int line)
else
lock_list = PCPU_PTR(spinlocks);
+ /* Update per-witness last file and line acquire. */
+ w->w_file = file;
+ w->w_line = line;
+
/* Check to see if we are recursing on a lock we already own. */
instance = find_instance(*lock_list, lock);
if (instance != NULL) {
@@ -1530,15 +1526,9 @@ witness_lock(struct lock_object *lock, int flags, const char *file, int line)
CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
td->td_proc->p_pid, lock->lo_name,
instance->li_flags & LI_RECURSEMASK);
- instance->li_file = file;
- instance->li_line = line;
return;
}
- /* Update per-witness last file and line acquire. */
- w->w_file = file;
- w->w_line = line;
-
/* Find the next open lock instance in the list and fill it. */
lle = *lock_list;
if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
@@ -1743,7 +1733,7 @@ found:
/*
* In order to reduce contention on w_mtx, we want to keep always an
- * head object into lists so that frequent allocation from the
+ * head object into lists so that frequent allocation from the
* free witness pool (and subsequent locking) is avoided.
* In order to maintain the current code simple, when the head
* object is totally unloaded it means also that we do not have
@@ -1781,7 +1771,7 @@ witness_thread_exit(struct thread *td)
n++;
witness_list_lock(&lle->ll_children[i],
witness_output);
-
+
}
kassert_panic(
"Thread %p cannot exit while holding sleeplocks\n", td);
@@ -1948,7 +1938,6 @@ found:
static void
depart(struct witness *w)
{
-
MPASS(w->w_refcount == 0);
if (w->w_class->lc_flags & LC_SLEEPLOCK) {
w_sleep_cnt--;
@@ -1999,18 +1988,18 @@ adopt(struct witness *parent, struct witness *child)
child->w_num_ancestors++;
}
- /*
- * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
+ /*
+ * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
* an ancestor of 'pi' during this loop.
*/
for (i = 1; i <= w_max_used_index; i++) {
- if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
+ if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
(i != pi))
continue;
/* Find each descendant of 'i' and mark it as a descendant. */
for (j = 1; j <= w_max_used_index; j++) {
- /*
+ /*
* Skip children that are already marked as
* descendants of 'i'.
*/
@@ -2021,7 +2010,7 @@ adopt(struct witness *parent, struct witness *child)
* We are only interested in descendants of 'ci'. Note
* that 'ci' itself is counted as a descendant of 'ci'.
*/
- if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
+ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
(j != ci))
continue;
w_rmatrix[i][j] |= WITNESS_ANCESTOR;
@@ -2029,16 +2018,16 @@ adopt(struct witness *parent, struct witness *child)
w_data[i].w_num_descendants++;
w_data[j].w_num_ancestors++;
- /*
+ /*
* Make sure we aren't marking a node as both an
- * ancestor and descendant. We should have caught
+ * ancestor and descendant. We should have caught
* this as a lock order reversal earlier.
*/
if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
- i, j, w_rmatrix[i][j]);
+ i, j, w_rmatrix[i][j]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
@@ -2047,7 +2036,7 @@ adopt(struct witness *parent, struct witness *child)
(w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
- j, i, w_rmatrix[j][i]);
+ j, i, w_rmatrix[j][i]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
@@ -2124,7 +2113,6 @@ _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
static int
isitmychild(struct witness *parent, struct witness *child)
{
-
return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
}
@@ -2134,7 +2122,6 @@ isitmychild(struct witness *parent, struct witness *child)
static int
isitmydescendant(struct witness *ancestor, struct witness *descendant)
{
-
return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
__func__));
}
@@ -2182,7 +2169,7 @@ witness_get(void)
STAILQ_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
index = w->w_index;
- MPASS(index > 0 && index == w_max_used_index+1 &&
+ MPASS(index > 0 && index == w_max_used_index + 1 &&
index < witness_count);
bzero(w, sizeof(*w));
w->w_index = index;
@@ -2194,7 +2181,6 @@ witness_get(void)
static void
witness_free(struct witness *w)
{
-
STAILQ_INSERT_HEAD(&w_free, w, w_list);
w_free_cnt++;
}
@@ -2219,11 +2205,10 @@ witness_lock_list_get(void)
bzero(lle, sizeof(*lle));
return (lle);
}
-
+
static void
witness_lock_list_free(struct lock_list_entry *lle)
{
-
mtx_lock_spin(&w_mtx);
lle->ll_next = w_lock_list_free;
w_lock_list_free = lle;
@@ -2297,7 +2282,6 @@ witness_voutput(const char *fmt, va_list ap)
static int
witness_thread_has_locks(struct thread *td)
{
-
if (td->td_sleeplocks == NULL)
return (0);
return (td->td_sleeplocks->ll_count != 0);
@@ -2573,14 +2557,12 @@ witness_setflag(struct lock_object *lock, int flag, int set)
void
witness_norelease(struct lock_object *lock)
{
-
witness_setflag(lock, LI_NORELEASE, 1);
}
void
witness_releaseok(struct lock_object *lock)
{
-
witness_setflag(lock, LI_NORELEASE, 0);
}
@@ -2588,7 +2570,6 @@ witness_releaseok(struct lock_object *lock)
static void
witness_ddb_list(struct thread *td)
{
-
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
KASSERT(kdb_active, ("%s: not in the debugger", __func__));
@@ -2653,7 +2634,6 @@ DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE);
DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE)
{
-
witness_ddb_display(db_printf);
}
#endif
@@ -2673,9 +2653,9 @@ sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx)
/* Allocate and init temporary storage space. */
tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
- tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
- tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
stack_zero(&tmp_data1->wlod_stack);
stack_zero(&tmp_data2->wlod_stack);
@@ -2750,12 +2730,12 @@ restart:
sbuf_printf(sb,
"\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
- tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
if (data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
- tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data1->wlod_stack);
sbuf_putc(sb, '\n');
@@ -2763,7 +2743,7 @@ restart:
if (data2 && data2 != data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
- tmp_w2->w_name, tmp_w2->w_class->lc_name,
+ tmp_w2->w_name, tmp_w2->w_class->lc_name,
tmp_w1->w_name, tmp_w1->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data2->wlod_stack);
sbuf_putc(sb, '\n');
@@ -2823,7 +2803,6 @@ sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
static int
sbuf_db_printf_drain(void *arg __unused, const char *data, int len)
{
-
return (db_printf("%.*s", len, data));
}
@@ -3068,7 +3047,7 @@ witness_lock_order_get(struct witness *parent, struct witness *child)
& WITNESS_LOCK_ORDER_KNOWN) == 0)
goto out;
- hash = witness_hash_djb2((const char*)&key,
+ hash = witness_hash_djb2((const char *)&key,
sizeof(key)) % w_lohash.wloh_size;
data = w_lohash.wloh_array[hash];
while (data != NULL) {
@@ -3089,7 +3068,6 @@ out:
static int
witness_lock_order_check(struct witness *parent, struct witness *child)
{
-
if (parent != child &&
w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN &&
@@ -3115,7 +3093,7 @@ witness_lock_order_add(struct witness *parent, struct witness *child)
& WITNESS_LOCK_ORDER_KNOWN)
return (1);
- hash = witness_hash_djb2((const char*)&key,
+ hash = witness_hash_djb2((const char *)&key,
sizeof(key)) % w_lohash.wloh_size;
w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
data = w_lofree;
@@ -3134,7 +3112,6 @@ witness_lock_order_add(struct witness *parent, struct witness *child)
static void
witness_increment_graph_generation(void)
{
-
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
w_generation++;
@@ -3143,7 +3120,6 @@ witness_increment_graph_generation(void)
static int
witness_output_drain(void *arg __unused, const char *data, int len)
{
-
witness_output("%.*s", len, data);
return (len);
}
diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c
index c2a0f67cae85..04ed107c933d 100644
--- a/sys/kern/sys_eventfd.c
+++ b/sys/kern/sys_eventfd.c
@@ -85,13 +85,16 @@ static int filt_eventfdwrite(struct knote *kn, long hint);
static const struct filterops eventfd_rfiltops = {
.f_isfd = 1,
.f_detach = filt_eventfddetach,
- .f_event = filt_eventfdread
+ .f_event = filt_eventfdread,
+ .f_copy = knote_triv_copy,
};
+
static const struct filterops eventfd_wfiltops = {
.f_isfd = 1,
.f_detach = filt_eventfddetach,
- .f_event = filt_eventfdwrite
+ .f_event = filt_eventfdwrite,
+ .f_copy = knote_triv_copy,
};
struct eventfd {
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 5606b36f772f..7d666da9f88b 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -729,7 +729,7 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
struct file *fp;
struct filedesc *fdp;
- int error, tmp, locked;
+ int error, f_flag, tmp, locked;
AUDIT_ARG_FD(fd);
AUDIT_ARG_CMD(com);
@@ -782,30 +782,36 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
goto out;
}
+ f_flag = 0;
switch (com) {
case FIONCLEX:
fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
- goto out;
+ break;
case FIOCLEX:
fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
- goto out;
- case FIONBIO:
- if ((tmp = *(int *)data))
- atomic_set_int(&fp->f_flag, FNONBLOCK);
- else
- atomic_clear_int(&fp->f_flag, FNONBLOCK);
- data = (void *)&tmp;
break;
+ case FIONBIO:
case FIOASYNC:
- if ((tmp = *(int *)data))
- atomic_set_int(&fp->f_flag, FASYNC);
- else
- atomic_clear_int(&fp->f_flag, FASYNC);
- data = (void *)&tmp;
+ f_flag = com == FIONBIO ? FNONBLOCK : FASYNC;
+ tmp = *(int *)data;
+ fsetfl_lock(fp);
+ if (((fp->f_flag & f_flag) != 0) != (tmp != 0)) {
+ error = fo_ioctl(fp, com, (void *)&tmp, td->td_ucred,
+ td);
+ if (error == 0) {
+ if (tmp != 0)
+ atomic_set_int(&fp->f_flag, f_flag);
+ else
+ atomic_clear_int(&fp->f_flag, f_flag);
+ }
+ }
+ fsetfl_unlock(fp);
+ break;
+ default:
+ error = fo_ioctl(fp, com, data, td->td_ucred, td);
break;
}
- error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
switch (locked) {
case LA_XLOCKED:
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index ed651da96b14..6531cea31423 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -181,20 +181,23 @@ static int filt_pipedump(struct proc *p, struct knote *kn,
static const struct filterops pipe_nfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach_notsup,
- .f_event = filt_pipenotsup
+ .f_event = filt_pipenotsup,
/* no userdump */
+ .f_copy = knote_triv_copy,
};
static const struct filterops pipe_rfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach,
.f_event = filt_piperead,
.f_userdump = filt_pipedump,
+ .f_copy = knote_triv_copy,
};
static const struct filterops pipe_wfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach,
.f_event = filt_pipewrite,
.f_userdump = filt_pipedump,
+ .f_copy = knote_triv_copy,
};
/*
@@ -234,6 +237,7 @@ static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, bool backing);
+static void pipe_destroy(struct pipe *pipe);
static int pipe_paircreate(struct thread *td, struct pipepair **p_pp);
static __inline int pipelock(struct pipe *cpipe, bool catch);
static __inline void pipeunlock(struct pipe *cpipe);
@@ -399,16 +403,7 @@ pipe_paircreate(struct thread *td, struct pipepair **p_pp)
goto fail;
error = pipe_create(wpipe, false);
if (error != 0) {
- /*
- * This cleanup leaves the pipe inode number for rpipe
- * still allocated, but never used. We do not free
- * inode numbers for opened pipes, which is required
- * for correctness because numbers must be unique.
- * But also it avoids any memory use by the unr
- * allocator, so stashing away the transient inode
- * number is reasonable.
- */
- pipe_free_kmem(rpipe);
+ pipe_destroy(rpipe);
goto fail;
}
@@ -575,7 +570,7 @@ pipespace_new(struct pipe *cpipe, int size)
static int curfail = 0;
static struct timeval lastfail;
- KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
+ PIPE_LOCK_ASSERT(cpipe, MA_NOTOWNED);
KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
("pipespace: resize of direct writes not allowed"));
retry:
@@ -743,6 +738,16 @@ pipe_create(struct pipe *pipe, bool large_backing)
return (error);
}
+static void
+pipe_destroy(struct pipe *pipe)
+{
+ pipe_free_kmem(pipe);
+ /*
+ * Note: we "leak" pipe_ino -- by design the alloc_unr64 mechanism does
+ * not undo allocations.
+ */
+}
+
/* ARGSUSED */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
@@ -1677,8 +1682,7 @@ static void
pipe_free_kmem(struct pipe *cpipe)
{
- KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
- ("pipe_free_kmem: pipe mutex locked"));
+ PIPE_LOCK_ASSERT(cpipe, MA_NOTOWNED);
if (cpipe->pipe_buffer.buffer != NULL) {
atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c
index 11bd1b6f30e1..c5db21544b0f 100644
--- a/sys/kern/sys_procdesc.c
+++ b/sys/kern/sys_procdesc.c
@@ -129,7 +129,7 @@ procdesc_find(struct thread *td, int fd, const cap_rights_t *rightsp,
if (error)
return (error);
if (fp->f_type != DTYPE_PROCDESC) {
- error = EBADF;
+ error = EINVAL;
goto out;
}
pd = fp->f_data;
@@ -486,6 +486,7 @@ static const struct filterops procdesc_kqops = {
.f_isfd = 1,
.f_detach = procdesc_kqops_detach,
.f_event = procdesc_kqops_event,
+ .f_copy = knote_triv_copy,
};
static int
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index c221106ae067..bc0725230cca 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -586,7 +586,7 @@ soaio_enqueue(struct task *task)
}
static void
-soaio_init(void)
+soaio_init(void *dummy __unused)
{
soaio_lifetime = AIOD_LIFETIME_DEFAULT;
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 4122f9261871..06a4adc3d8cb 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -602,4 +602,7 @@ const char *syscallnames[] = {
"inotify_rm_watch", /* 594 = inotify_rm_watch */
"getgroups", /* 595 = getgroups */
"setgroups", /* 596 = setgroups */
+ "jail_attach_jd", /* 597 = jail_attach_jd */
+ "jail_remove_jd", /* 598 = jail_remove_jd */
+ "kexec_load", /* 599 = kexec_load */
};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index fa64597d14a5..ea6d2b5aa1ef 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -552,13 +552,13 @@
_Out_writes_bytes_(len/PAGE_SIZE) char *vec
);
}
-79 AUE_GETGROUPS STD|CAPENABLED|COMPAT14 {
+79 AUE_GETGROUPS COMPAT14|CAPENABLED {
int getgroups(
int gidsetsize,
_Out_writes_opt_(gidsetsize) gid_t *gidset
);
}
-80 AUE_SETGROUPS STD|COMPAT14 {
+80 AUE_SETGROUPS COMPAT14 {
int setgroups(
int gidsetsize,
_In_reads_(gidsetsize) const gid_t *gidset
@@ -3383,5 +3383,23 @@
_In_reads_(gidsetsize) const gid_t *gidset
);
}
+597 AUE_JAIL_ATTACH STD {
+ int jail_attach_jd(
+ int fd
+ );
+ }
+598 AUE_JAIL_REMOVE STD {
+ int jail_remove_jd(
+ int fd
+ );
+ }
+599 AUE_NULL STD {
+ int kexec_load(
+ uint64_t entry,
+ u_long nseg,
+ _In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments,
+ u_long flags
+ );
+ }
; vim: syntax=off
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 2b1ea9eed8d4..5951cebbe74a 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3500,6 +3500,30 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 2;
break;
}
+ /* jail_attach_jd */
+ case 597: {
+ struct jail_attach_jd_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* jail_remove_jd */
+ case 598: {
+ struct jail_remove_jd_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kexec_load */
+ case 599: {
+ struct kexec_load_args *p = params;
+ uarg[a++] = p->entry; /* uint64_t */
+ uarg[a++] = p->nseg; /* u_long */
+ uarg[a++] = (intptr_t)p->segments; /* struct kexec_segment * */
+ uarg[a++] = p->flags; /* u_long */
+ *n_args = 4;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9367,6 +9391,45 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* jail_attach_jd */
+ case 597:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_remove_jd */
+ case 598:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kexec_load */
+ case 599:
+ switch (ndx) {
+ case 0:
+ p = "uint64_t";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ case 2:
+ p = "userland struct kexec_segment *";
+ break;
+ case 3:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11365,6 +11428,21 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* jail_attach_jd */
+ case 597:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_remove_jd */
+ case 598:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kexec_load */
+ case 599:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index c8e2c561b7cf..067471eb949a 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -754,12 +754,14 @@ static const struct filterops tty_kqops_read = {
.f_isfd = 1,
.f_detach = tty_kqops_read_detach,
.f_event = tty_kqops_read_event,
+ .f_copy = knote_triv_copy,
};
static const struct filterops tty_kqops_write = {
.f_isfd = 1,
.f_detach = tty_kqops_write_detach,
.f_event = tty_kqops_write_event,
+ .f_copy = knote_triv_copy,
};
static int
diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c
index 1291770a9ccb..2672935c2d89 100644
--- a/sys/kern/tty_pts.c
+++ b/sys/kern/tty_pts.c
@@ -491,11 +491,13 @@ static const struct filterops pts_kqops_read = {
.f_isfd = 1,
.f_detach = pts_kqops_read_detach,
.f_event = pts_kqops_read_event,
+ .f_copy = knote_triv_copy,
};
static const struct filterops pts_kqops_write = {
.f_isfd = 1,
.f_detach = pts_kqops_write_detach,
.f_event = pts_kqops_write_event,
+ .f_copy = knote_triv_copy,
};
static int
diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
index 6f2760635bad..4c1bb1ff228e 100644
--- a/sys/kern/uipc_mqueue.c
+++ b/sys/kern/uipc_mqueue.c
@@ -281,11 +281,13 @@ static const struct filterops mq_rfiltops = {
.f_isfd = 1,
.f_detach = filt_mqdetach,
.f_event = filt_mqread,
+ .f_copy = knote_triv_copy,
};
static const struct filterops mq_wfiltops = {
.f_isfd = 1,
.f_detach = filt_mqdetach,
.f_event = filt_mqwrite,
+ .f_copy = knote_triv_copy,
};
/*
@@ -867,7 +869,7 @@ mqfs_lookupx(struct vop_cachedlookup_args *ap)
pd = VTON(dvp);
pn = NULL;
mqfs = pd->mn_info;
- *vpp = NULLVP;
+ *vpp = NULL;
if (dvp->v_type != VDIR)
return (ENOTDIR);
@@ -886,7 +888,7 @@ mqfs_lookupx(struct vop_cachedlookup_args *ap)
return (EINVAL);
pn = pd;
*vpp = dvp;
- VREF(dvp);
+ vref(dvp);
return (0);
}
@@ -921,7 +923,7 @@ mqfs_lookupx(struct vop_cachedlookup_args *ap)
return (error);
}
if (*vpp == dvp) {
- VREF(dvp);
+ vref(dvp);
*vpp = dvp;
mqnode_release(pn);
return (0);
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index fe2d8d056062..eb9544628137 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -191,16 +191,19 @@ static const struct filterops soread_filtops = {
.f_isfd = 1,
.f_detach = filt_sordetach,
.f_event = filt_soread,
+ .f_copy = knote_triv_copy,
};
static const struct filterops sowrite_filtops = {
.f_isfd = 1,
.f_detach = filt_sowdetach,
.f_event = filt_sowrite,
+ .f_copy = knote_triv_copy,
};
static const struct filterops soempty_filtops = {
.f_isfd = 1,
.f_detach = filt_sowdetach,
.f_event = filt_soempty,
+ .f_copy = knote_triv_copy,
};
so_gen_t so_gencnt; /* generation count for sockets */
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 19870e989437..807271488af2 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1069,6 +1069,21 @@ uipc_stream_sbspace(struct sockbuf *sb)
return (min(space, mbspace));
}
+/*
+ * UNIX version of generic sbwait() for writes. We wait on peer's receive
+ * buffer, using our timeout.
+ */
+static int
+uipc_stream_sbwait(struct socket *so, sbintime_t timeo)
+{
+ struct sockbuf *sb = &so->so_rcv;
+
+ SOCK_RECVBUF_LOCK_ASSERT(so);
+ sb->sb_flags |= SB_WAIT;
+ return (msleep_sbt(&sb->sb_acc, SOCK_RECVBUF_MTX(so), PSOCK | PCATCH,
+ "sbwait", timeo, 0, 0));
+}
+
static int
uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr,
struct uio *uio0, struct mbuf *m, struct mbuf *c, int flags,
@@ -1203,7 +1218,8 @@ restart:
error = EWOULDBLOCK;
goto out4;
}
- if ((error = sbwait(so2, SO_RCV)) != 0) {
+ if ((error = uipc_stream_sbwait(so2,
+ so->so_snd.sb_timeo)) != 0) {
SOCK_RECVBUF_UNLOCK(so2);
goto out4;
} else
@@ -1543,15 +1559,19 @@ restart:
mc_init_m(&cmc, control);
SOCK_RECVBUF_LOCK(so);
- MPASS(!(sb->sb_state & SBS_CANTRCVMORE));
-
- if (__predict_false(cmc.mc_len + sb->sb_ccc +
- sb->sb_ctl > sb->sb_hiwat)) {
+ if (__predict_false(
+ (sb->sb_state & SBS_CANTRCVMORE) ||
+ cmc.mc_len + sb->sb_ccc + sb->sb_ctl >
+ sb->sb_hiwat)) {
/*
- * Too bad, while unp_externalize() was
- * failing, the other side had filled
- * the buffer and we can't prepend data
- * back. Losing data!
+ * While the lock was dropped and we
+ * were failing in unp_externalize(),
+				 * the peer could have a) disconnected,
+ * b) filled the buffer so that we
+ * can't prepend data back.
+ * These are two edge conditions that
+ * we just can't handle, so lose the
+ * data and return the error.
*/
SOCK_RECVBUF_UNLOCK(so);
SOCK_IO_RECV_UNLOCK(so);
@@ -1807,9 +1827,7 @@ uipc_filt_sowrite(struct knote *kn, long hint)
kn->kn_data = uipc_stream_sbspace(&so2->so_rcv);
if (so2->so_rcv.sb_state & SBS_CANTRCVMORE) {
- /*
- * XXXGL: maybe kn->kn_flags |= EV_EOF ?
- */
+ kn->kn_flags |= EV_EOF;
return (1);
} else if (kn->kn_sfflags & NOTE_LOWAT)
return (kn->kn_data >= kn->kn_sdata);
@@ -1837,11 +1855,13 @@ static const struct filterops uipc_write_filtops = {
.f_isfd = 1,
.f_detach = uipc_filt_sowdetach,
.f_event = uipc_filt_sowrite,
+ .f_copy = knote_triv_copy,
};
static const struct filterops uipc_empty_filtops = {
.f_isfd = 1,
.f_detach = uipc_filt_sowdetach,
.f_event = uipc_filt_soempty,
+ .f_copy = knote_triv_copy,
};
static int
@@ -2399,7 +2419,7 @@ uipc_sendfile_wait(struct socket *so, off_t need, int *space)
}
if (!sockref)
soref(so2);
- error = sbwait(so2, SO_RCV);
+ error = uipc_stream_sbwait(so2, so->so_snd.sb_timeo);
if (error == 0 &&
__predict_false(sb->sb_state & SBS_CANTRCVMORE))
error = EPIPE;
@@ -3669,11 +3689,14 @@ unp_internalize(struct mbuf *control, struct mchain *mc, struct thread *td)
cmcred->cmcred_uid = td->td_ucred->cr_ruid;
cmcred->cmcred_gid = td->td_ucred->cr_rgid;
cmcred->cmcred_euid = td->td_ucred->cr_uid;
- cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
+ _Static_assert(CMGROUP_MAX >= 1,
+ "Room needed for the effective GID.");
+ cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups + 1,
CMGROUP_MAX);
- for (i = 0; i < cmcred->cmcred_ngroups; i++)
+ cmcred->cmcred_groups[0] = td->td_ucred->cr_gid;
+ for (i = 1; i < cmcred->cmcred_ngroups; i++)
cmcred->cmcred_groups[i] =
- td->td_ucred->cr_groups[i];
+ td->td_ucred->cr_groups[i - 1];
break;
case SCM_RIGHTS:
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index e63fa4c01434..60916a9fbd32 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -345,12 +345,14 @@ static const struct filterops aio_filtops = {
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
+ .f_copy = knote_triv_copy,
};
static const struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
- .f_event = filt_lio
+ .f_event = filt_lio,
+ .f_copy = knote_triv_copy,
};
static eventhandler_tag exit_tag, exec_tag;
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 89c1d779f04c..557e451f9a45 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -86,7 +86,7 @@
*
* This fundamental choice needs to be revisited. In the meantime, the current
* state is described below. Significance of all notable routines is explained
- * in comments placed above their implementation. Scattered thoroughout the
+ * in comments placed above their implementation. Scattered throughout the
* file are TODO comments indicating shortcomings which can be fixed without
* reworking everything (most of the fixes will likely be reusable). Various
* details are omitted from this explanation to not clutter the overview, they
@@ -109,18 +109,19 @@
* The (directory vnode; name) tuple reliably determines the target entry if
* it exists.
*
- * Since there are no small locks at this time (all are 32 bytes in size on
- * LP64), the code works around the problem by introducing lock arrays to
- * protect hash buckets and vnode lists.
+ * Since there were no small locks at the time of writing this comment (all are
+ * 32 bytes in size on LP64), the code works around the problem by introducing
+ * lock arrays to protect hash buckets and vnode lists.
*
* II. Filesystem integration
*
* Filesystems participating in name caching do the following:
* - set vop_lookup routine to vfs_cache_lookup
- * - set vop_cachedlookup to whatever can perform the lookup if the above fails
- * - if they support lockless lookup (see below), vop_fplookup_vexec and
- * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
- * mount point
+ * - set vop_cachedlookup to a routine which can perform the lookup if the
+ * above fails
+ * - if they support lockless lookup (see below), they set vop_fplookup_vexec
+ * and vop_fplookup_symlink along with the MNTK_FPLOOKUP flag on the mount
+ * point
* - call cache_purge or cache_vop_* routines to eliminate stale entries as
* applicable
* - call cache_enter to add entries depending on the MAKEENTRY flag
@@ -134,11 +135,15 @@
* ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
* vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
*
+ * You may notice a degree of CPU waste in this callchain.
+ *
* III. Performance considerations
*
* For lockless case forward lookup avoids any writes to shared areas apart
* from the terminal path component. In other words non-modifying lookups of
- * different files don't suffer any scalability problems in the namecache.
+ * different files don't suffer any scalability problems in the namecache
+ * itself.
+ *
* Looking up the same file is limited by VFS and goes beyond the scope of this
* file.
*
@@ -158,8 +163,10 @@
*
* IV. Observability
*
- * Note not everything has an explicit dtrace probe nor it should have, thus
- * some of the one-liners below depend on implementation details.
+ * Several statistics are collected in the vfs.cache sysctl tree.
+ *
+ * Some of the state can be checked for with explicit dtrace probes; most of it
+ * depends on implementation details.
*
* Examples:
*
@@ -167,7 +174,7 @@
* # line number, column 2 is status code (see cache_fpl_status)
* dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
*
- * # Lengths of names added by binary name
+ * # Histogram of lengths of names added, aggregated by which programs are doing it
* dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
*
* # Same as above but only those which exceed 64 characters
@@ -195,6 +202,11 @@
* - vnodes are subject to being recycled even if target inode is left in memory,
* which loses the name cache entries when it perhaps should not. in case of tmpfs
* names get duplicated -- kept by filesystem itself and namecache separately
+ * - vnode reclamation (see vnlru in kern/vfs_subr.c) defaults to skipping
+ *   directories for this very reason, which arguably further reduces the quality
+ *   of vnode LRU. Per the above this is done to avoid breaking vnode -> path
+ *   resolution (it becomes expensive for directories and impossible for the rest).
+ *   This would not be a factor if namecache entries could persist without vnodes.
* - struct namecache has a fixed size and comes in 2 variants, often wasting
* space. now hard to replace with malloc due to dependence on SMR, which
* requires UMA zones to opt in
@@ -207,7 +219,8 @@
* performance left on the table, most notably from single-threaded standpoint.
* Below is a woefully incomplete list of changes which can help. Ideas are
* mostly sketched out, no claim is made all kinks or prerequisites are laid
- * out.
+ * out. The name of the game is eliding branches altogether and hopefully some
+ * of the memory accesses.
*
* Note there is performance lost all over VFS.
*
@@ -223,13 +236,6 @@
* the vnode to hang around for the long haul, but would work for aforementioned
* stat(2) but also access(2), readlink(2), realpathat(2) and probably more.
*
- * === hotpatching for sdt probes
- *
- * They result in *tons* of branches all over with rather regrettable codegen
- * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
- * Reworking the code to patch itself at runtime with asm goto would solve it.
- * asm goto is fully supported by gcc and clang.
- *
* === copyinstr
*
* On all architectures it operates one byte at a time, while it could be
@@ -251,10 +257,12 @@
* things worked out locklessly. Instead the lockless lookup could be the
* actual entry point which calls what is currently namei as a fallback.
*
+ * It could be hotpatched if lockless lookup is disabled.
+ *
* === avoidable branches in cache_can_fplookup
*
* The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
- * this is off, none of fplookup code should execute).
+ * this is off, none of fplookup code should execute, see above).
*
* Both audit and capsicum branches can be combined into one, but it requires
* paying off a lot of tech debt first.
@@ -277,8 +285,18 @@
*
* === inactive on v_usecount reaching 0
*
- * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
- * processing with a bit in usecount.
+ * VOP_NEED_INACTIVE should not exist. Filesystems can indicate the need for such
+ * processing with a bit in usecount and by adding a hold count. Then the vput
+ * fast path would become as simple as (ACHTUNG: locking ignored):
+ *
+ * ref = atomic_fetchadd_int(&vp->v_count, -1) - 1;
+ * if ((ref & MAGIC_BIT) == 0) // common case
+ * return;
+ * if (ref != 0) // the bit is set but this was not the last user
+ * return;
+ * // do inactive here
+ *
+ * Also see below.
*
* === v_holdcnt
*
@@ -287,7 +305,8 @@
* vnlru et al would consider the vnode not-freeable if has either hold or
* usecount on it.
*
- * This would eliminate 2 atomics.
+ * This would eliminate 2 atomics in the common case of securing a vnode and
+ * undoing it.
*/
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
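To make the vput fast-path idea sketched in the comment above concrete, here is a
minimal self-contained userspace model of the same control flow. MAGIC_BIT, the
counter layout and model_vput() are illustrative assumptions; a real implementation
would need the locking the comment explicitly waves away.

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAGIC_BIT       0x40000000u     /* "inactive processing wanted" */

    static _Atomic unsigned v_usecount;

    static void
    model_vput(void)
    {
            unsigned ref;

            ref = atomic_fetch_sub_explicit(&v_usecount, 1,
                memory_order_release) - 1;
            if ((ref & MAGIC_BIT) == 0)     /* common case: no inactive wanted */
                    return;
            if ((ref & ~MAGIC_BIT) != 0)    /* bit set, but other users remain */
                    return;
            printf("last user gone, inactive processing would run here\n");
    }

    int
    main(void)
    {
            /* two users, inactive processing requested */
            atomic_store(&v_usecount, MAGIC_BIT | 2);
            model_vput();                   /* not the last user: nothing to do */
            model_vput();                   /* last user: "inactive" runs */
            return (0);
    }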
@@ -3321,12 +3340,10 @@ sys___realpathat(struct thread *td, struct __realpathat_args *uap)
uap->flags, UIO_USERSPACE));
}
-/*
- * Retrieve the full filesystem path that correspond to a vnode from the name
- * cache (if available)
- */
-int
-vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
+static int
+vn_fullpath_up_to_pwd_vnode(struct vnode *vp,
+ struct vnode *(*const get_pwd_vnode)(const struct pwd *),
+ char **retbuf, char **freebuf)
{
struct pwd *pwd;
char *buf;
@@ -3340,11 +3357,13 @@ vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
buf = malloc(buflen, M_TEMP, M_WAITOK);
vfs_smr_enter();
pwd = pwd_get_smr();
- error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
+ error = vn_fullpath_any_smr(vp, get_pwd_vnode(pwd), buf, retbuf,
+ &buflen, 0);
VFS_SMR_ASSERT_NOT_ENTERED();
if (error < 0) {
pwd = pwd_hold(curthread);
- error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
+ error = vn_fullpath_any(vp, get_pwd_vnode(pwd), buf, retbuf,
+ &buflen);
pwd_drop(pwd);
}
if (error == 0)
@@ -3354,6 +3373,42 @@ vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
return (error);
}
+static inline struct vnode *
+get_rdir(const struct pwd *pwd)
+{
+ return (pwd->pwd_rdir);
+}
+
+/*
+ * Produce a filesystem path that starts from the current chroot directory and
+ * corresponds to the passed vnode, using the name cache (if available).
+ */
+int
+vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
+{
+ return (vn_fullpath_up_to_pwd_vnode(vp, get_rdir, retbuf, freebuf));
+}
+
+static inline struct vnode *
+get_jdir(const struct pwd *pwd)
+{
+ return (pwd->pwd_jdir);
+}
+
+/*
+ * Produce a filesystem path that starts from the current jail's root directory
+ * and corresponds to the passed vnode, using the name cache (if available).
+ *
+ * This function makes it possible to ignore chroots done inside a jail (or the
+ * host), so that path checks remain unaffected by privileged or unprivileged
+ * chroot calls.
+ */
+int
+vn_fullpath_jail(struct vnode *vp, char **retbuf, char **freebuf)
+{
+ return (vn_fullpath_up_to_pwd_vnode(vp, get_jdir, retbuf, freebuf));
+}
+
/*
* This function is similar to vn_fullpath, but it attempts to lookup the
* pathname relative to the global root mount point. This is required for the
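A hedged caller-side fragment for the new interface. It assumes only the
established vn_fullpath() contract: vp is already secured by the caller, retbuf
points into freebuf, and freebuf is released with free(..., M_TEMP). The printf
and the surrounding error handling are purely illustrative.

    char *retbuf, *freebuf;
    int error;

    error = vn_fullpath_jail(vp, &retbuf, &freebuf);
    if (error == 0) {
            /* retbuf is rooted at the jail's root, ignoring later chroots */
            printf("%s\n", retbuf);
            free(freebuf, M_TEMP);
    }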
@@ -4632,7 +4687,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
}
/*
- * The target vnode is not supported, prepare for the slow path to take over.
+ * Prepare to fall back to the locked lookup while trying to retain the
+ * progress made so far.
*/
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
@@ -6289,53 +6344,90 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
* Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
*
* Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
- * outlined below.
- *
- * Traditional vnode lookup conceptually looks like this:
+ * outlined at the end.
*
- * vn_lock(current);
- * for (;;) {
- * next = find();
- * vn_lock(next);
- * vn_unlock(current);
- * current = next;
- * if (last)
- * break;
- * }
- * return (current);
+ * Traversing from one vnode to another requires atomicity with regard to
+ * permissions, mount points and of course their relative placement (if you are
+ * looking up "bar" in "foo" and you found it, it better be in that directory
+ * at the time).
*
- * Each jump to the next vnode is safe memory-wise and atomic with respect to
- * any modifications thanks to holding respective locks.
+ * Normally this is accomplished with locking, but it comes with a significant
+ * performance hit and is untenable as a fast path even in a moderate core
+ * count environment (at the time of writing this comment this would be a
+ * little south of 100).
*
* The same guarantee can be provided with a combination of safe memory
* reclamation and sequence counters instead. If all operations which affect
* the relationship between the current vnode and the one we are looking for
* also modify the counter, we can verify whether all the conditions held as
- * we made the jump. This includes things like permissions, mount points etc.
- * Counter modification is provided by enclosing relevant places in
- * vn_seqc_write_begin()/end() calls.
+ * we made the jump.
*
- * Thus this translates to:
+ * See places which issue vn_seqc_write_begin()/vn_seqc_write_end() for
+ * operations affected.
+ *
+ * Suppose the variable "cnp" contains lookup metadata (the path etc.). Locked
+ * lookup then conceptually looks like this:
+ *
+ * // lock the current directory
+ * vn_lock(dvp);
+ * for (;;) {
+ * // permission check
+ * if (!canlookup(dvp, cnp))
+ * abort();
+ * // look for the target name inside dvp
+ * tvp = findnext(dvp, cnp);
+ * vn_lock(tvp);
+ * // tvp is still guaranteed to be inside of dvp because of the lock on dvp
+ * vn_unlock(dvp);
+ * // dvp is unlocked. its state is now arbitrary, but that's fine as we
+ * // made the jump while everything relevant was correct, continue with tvp
+ * // as the directory to look up names in
+ * dvp = tvp;
+ * if (last)
+ * break;
+ * // if not last loop back and continue until done
+ * }
+ * vget(tvp);
+ * return (tvp);
+ *
+ * Lockless lookup replaces locking with sequence counter checks:
*
* vfs_smr_enter();
* dvp_seqc = seqc_read_any(dvp);
- * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
+ * // fail if someone is altering the directory vnode
+ * if (seqc_in_modify(dvp_seqc))
* abort();
* for (;;) {
- * tvp = find();
+ * // permission check. note it can race, but we will validate the outcome
+ * // with a seqc
+ * if (!canlookup_smr(dvp, cnp)) {
+ * // has dvp changed from under us? if so, the denial may be invalid
+ * if (!seqc_consistent(dvp, dvp_seqc))
+ * fallback_to_locked();
+ * // nothing changed, lookup denial is valid
+ * fail();
+ * }
+ * // look for the target name inside dvp
+ * tvp = findnext(dvp, cnp);
* tvp_seqc = seqc_read_any(tvp);
- * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
- * abort();
- * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
- * abort();
- * dvp = tvp; // we know nothing of importance has changed
- * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
+ * // bail if someone is altering the target vnode
+ * if (seqc_in_modify(tvp_seqc))
+ * fallback_to_locked();
+ * // bail if someone is altering the directory vnode
+ * if (!seqc_consistent(dvp, dvp_seqc))
+ * fallback_to_locked();
+ * // we confirmed neither dvp nor tvp changed while we were making the
+ * // jump to the next component, thus the result is the same as if we
+ * // held the lock on dvp and tvp the entire time, continue with tvp
+ * // as the directory to look up names in
+ * dvp = tvp;
+ * dvp_seqc = tvp_seqc;
* if (last)
* break;
* }
* vget(); // secure the vnode
* if (!seqc_consistent(tvp, tvp_seqc) // final check
- * abort();
+ * fallback_to_locked();
* // at this point we know nothing has changed for any parent<->child pair
* // as they were crossed during the lookup, meaning we matched the guarantee
* // of the locked variant
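For a runnable counterpart to the pseudocode above, the following standalone
sketch models the seqc validation pattern with an ordinary sequence counter.
All names (struct dir, dir_lookup_smr(), ...) are invented for illustration and
the memory ordering is simplified; the kernel's seqc(9) and SMR machinery handle
the details this sketch glosses over.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct dir {
            _Atomic unsigned seqc;
            _Atomic int child;      /* stands in for what findnext() returns */
    };

    static void
    dir_write_begin(struct dir *d)
    {
            atomic_fetch_add(&d->seqc, 1);  /* counter becomes odd */
    }

    static void
    dir_write_end(struct dir *d)
    {
            atomic_fetch_add(&d->seqc, 1);  /* counter becomes even again */
    }

    /*
     * Lockless read: returns false if a writer ran (or is running), in which
     * case the caller would fall back to the locked variant.
     */
    static bool
    dir_lookup_smr(struct dir *d, int *childp)
    {
            unsigned seqc;

            seqc = atomic_load_explicit(&d->seqc, memory_order_acquire);
            if (seqc & 1)
                    return (false);
            *childp = atomic_load_explicit(&d->child, memory_order_relaxed);
            atomic_thread_fence(memory_order_acquire);
            return (atomic_load_explicit(&d->seqc, memory_order_relaxed) == seqc);
    }

    int
    main(void)
    {
            struct dir d = { .seqc = 0, .child = 42 };
            int child;

            if (dir_lookup_smr(&d, &child))
                    printf("lockless lookup found %d\n", child);

            dir_write_begin(&d);    /* concurrent lookups now bail out */
            atomic_store_explicit(&d.child, 7, memory_order_relaxed);
            dir_write_end(&d);
            return (0);
    }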
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 2e397b8e9e8f..b674313993c4 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -260,8 +260,10 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
*/
while (lblkno < (origblkno + maxra)) {
error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
- if (error)
+ if (error) {
+ error = 0;
break;
+ }
if (blkno == -1)
break;
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 85f67731e1cc..4eca09aef145 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -458,6 +458,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap)
case _PC_HAS_NAMEDATTR:
case _PC_HAS_HIDDENSYSTEM:
case _PC_CLONE_BLKSIZE:
+ case _PC_CASE_INSENSITIVE:
*ap->a_retval = 0;
return (0);
default:
@@ -708,7 +709,7 @@ vop_stdvptocnp(struct vop_vptocnp_args *ap)
if (error)
return (error);
- VREF(vp);
+ vref(vp);
locked = VOP_ISLOCKED(vp);
VOP_UNLOCK(vp);
NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
@@ -727,10 +728,10 @@ vop_stdvptocnp(struct vop_vptocnp_args *ap)
((*dvp)->v_vflag & VV_ROOT) &&
((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
*dvp = (*dvp)->v_mount->mnt_vnodecovered;
- VREF(mvp);
+ vref(mvp);
VOP_UNLOCK(mvp);
vn_close(mvp, FREAD, cred, td);
- VREF(*dvp);
+ vref(*dvp);
vn_lock(*dvp, LK_SHARED | LK_RETRY);
covered = 1;
}
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
index cd30d5cfae47..ceda770cb714 100644
--- a/sys/kern/vfs_init.c
+++ b/sys/kern/vfs_init.c
@@ -103,6 +103,16 @@ struct vattr va_null;
* Routines having to do with the management of the vnode table.
*/
+void
+vfs_unref_vfsconf(struct vfsconf *vfsp)
+{
+ vfsconf_lock();
+ KASSERT(vfsp->vfc_refcount > 0,
+ ("vfs %p refcount underflow %d", vfsp, vfsp->vfc_refcount));
+ vfsp->vfc_refcount--;
+ vfsconf_unlock();
+}
+
static struct vfsconf *
vfs_byname_locked(const char *name)
{
@@ -123,9 +133,11 @@ vfs_byname(const char *name)
{
struct vfsconf *vfsp;
- vfsconf_slock();
+ vfsconf_lock();
vfsp = vfs_byname_locked(name);
- vfsconf_sunlock();
+ if (vfsp != NULL)
+ vfsp->vfc_refcount++;
+ vfsconf_unlock();
return (vfsp);
}
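With vfs_byname() now taking a reference on the returned vfsconf, the fragment
below illustrates the apparent caller protocol: a caller that does not hand the
entry off to a mount drops the reference with vfs_unref_vfsconf(). The filesystem
name and the error handling are illustrative only.

    struct vfsconf *vfsp;

    vfsp = vfs_byname("tmpfs");
    if (vfsp == NULL)
            return (ENOENT);
    /* ... inspect vfsp; the reference keeps the entry alive ... */
    vfs_unref_vfsconf(vfsp);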
@@ -387,7 +399,7 @@ vfs_register(struct vfsconf *vfc)
static int once;
struct vfsconf *tvfc;
uint32_t hashval;
- int secondpass;
+ int error, prevmaxconf, secondpass;
if (!once) {
vattr_null(&va_null);
@@ -405,6 +417,7 @@ vfs_register(struct vfsconf *vfc)
return (EEXIST);
}
+ prevmaxconf = maxvfsconf;
if (vfs_typenumhash != 0) {
/*
* Calculate a hash on vfc_name to use for vfc_typenum. Unless
@@ -497,16 +510,24 @@ vfs_register(struct vfsconf *vfc)
vfc->vfc_vfsops = &vfsops_sigdefer;
}
- if (vfc->vfc_flags & VFCF_JAIL)
- prison_add_vfs(vfc);
-
/*
* Call init function for this VFS...
*/
if ((vfc->vfc_flags & VFCF_SBDRY) != 0)
- vfc->vfc_vfsops_sd->vfs_init(vfc);
+ error = vfc->vfc_vfsops_sd->vfs_init(vfc);
else
- vfc->vfc_vfsops->vfs_init(vfc);
+ error = vfc->vfc_vfsops->vfs_init(vfc);
+
+ if (error != 0) {
+ maxvfsconf = prevmaxconf;
+ TAILQ_REMOVE(&vfsconf, vfc, vfc_list);
+ vfsconf_unlock();
+ return (error);
+ }
+
+ if ((vfc->vfc_flags & VFCF_JAIL) != 0)
+ prison_add_vfs(vfc);
+
vfsconf_unlock();
/*
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
index b265a5ff3a62..e60d8426ee42 100644
--- a/sys/kern/vfs_inotify.c
+++ b/sys/kern/vfs_inotify.c
@@ -111,6 +111,7 @@ static const struct filterops inotify_rfiltops = {
.f_isfd = 1,
.f_detach = filt_inotifydetach,
.f_event = filt_inotifyevent,
+ .f_copy = knote_triv_copy,
};
static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index fb3e6a7a2534..39c7da803de1 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -883,7 +883,7 @@ vfs_lookup_degenerate(struct nameidata *ndp, struct vnode *dp, int wantparent)
}
if (wantparent) {
ndp->ni_dvp = dp;
- VREF(dp);
+ vref(dp);
}
ndp->ni_vp = dp;
cnp->cn_namelen = 0;
@@ -1121,7 +1121,7 @@ vfs_lookup(struct nameidata *ndp)
cnp->cn_lkflags = LK_SHARED;
dp = ndp->ni_startdir;
- ndp->ni_startdir = NULLVP;
+ ndp->ni_startdir = NULL;
/*
* Leading slashes, if any, are supposed to be skipped by the caller.
@@ -1284,7 +1284,7 @@ dirloop:
(cnp->cn_flags & NOCROSSMOUNT) != 0)) {
ndp->ni_dvp = dp;
ndp->ni_vp = dp;
- VREF(dp);
+ vref(dp);
goto nextname;
}
if ((dp->v_vflag & VV_ROOT) == 0)
@@ -1295,7 +1295,7 @@ dirloop:
}
tdp = dp;
dp = dp->v_mount->mnt_vnodecovered;
- VREF(dp);
+ vref(dp);
vput(tdp);
vn_lock(dp,
enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
@@ -1343,7 +1343,7 @@ unionlookup:
(dp->v_mount->mnt_flag & MNT_UNION)) {
tdp = dp;
dp = dp->v_mount->mnt_vnodecovered;
- VREF(dp);
+ vref(dp);
vput(tdp);
vn_lock(dp,
enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
@@ -1615,7 +1615,7 @@ vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
}
/* ASSERT(dvp == ndp->ni_startdir) */
if (refstart)
- VREF(dvp);
+ vref(dvp);
if ((cnp->cn_flags & LOCKPARENT) == 0)
VOP_UNLOCK(dp);
/*
@@ -1653,7 +1653,7 @@ vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
/* ASSERT(dvp == ndp->ni_startdir) */
if (refstart)
- VREF(dvp);
+ vref(dvp);
if ((cnp->cn_flags & LOCKLEAF) == 0)
VOP_UNLOCK(dp);
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 8e64a7fe966b..13403acacc08 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -683,7 +683,6 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
MPASSERT(mp->mnt_vfs_ops == 1, mp,
("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
(void) vfs_busy(mp, MBF_NOWAIT);
- atomic_add_acq_int(&vfsp->vfc_refcount, 1);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
mp->mnt_stat.f_type = vfsp->vfc_typenum;
@@ -731,7 +730,6 @@ vfs_mount_destroy(struct mount *mp)
__FILE__, __LINE__));
MPPASS(mp->mnt_writeopcount == 0, mp);
MPPASS(mp->mnt_secondary_writes == 0, mp);
- atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
struct vnode *vp;
@@ -769,6 +767,9 @@ vfs_mount_destroy(struct mount *mp)
vfs_free_addrlist(mp->mnt_export);
free(mp->mnt_export, M_MOUNT);
}
+ vfsconf_lock();
+ mp->mnt_vfc->vfc_refcount--;
+ vfsconf_unlock();
crfree(mp->mnt_cred);
uma_zfree(mount_zone, mp);
}
@@ -1133,6 +1134,7 @@ vfs_domount_first(
if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
vput(vp);
+ vfs_unref_vfsconf(vfsp);
return (EPERM);
}
@@ -1169,6 +1171,7 @@ vfs_domount_first(
}
if (error != 0) {
vput(vp);
+ vfs_unref_vfsconf(vfsp);
return (error);
}
vn_seqc_write_begin(vp);
diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c
index e0d1cec5bd71..dd2364f5bf6a 100644
--- a/sys/kern/vfs_mountroot.c
+++ b/sys/kern/vfs_mountroot.c
@@ -266,7 +266,7 @@ vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
if (vfsp == NULL)
return (ENOENT);
- mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
+ mp = vfs_mount_alloc(NULL, vfsp, "/dev", td->td_ucred);
error = VFS_MOUNT(mp);
KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index a6e38be89291..58975f7ac932 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2186,6 +2186,8 @@ freevnode(struct vnode *vp)
{
struct bufobj *bo;
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+
/*
* The vnode has been marked for destruction, so free it.
*
@@ -2222,12 +2224,16 @@ freevnode(struct vnode *vp)
mac_vnode_destroy(vp);
#endif
if (vp->v_pollinfo != NULL) {
+ int error __diagused;
+
/*
* Use LK_NOWAIT to shut up witness about the lock. We may get
* here while having another vnode locked when trying to
* satisfy a lookup and needing to recycle.
*/
- VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ VNASSERT(error == 0, vp,
+ ("freevnode: cannot lock vp %p for pollinfo destroy", vp));
destroy_vpollinfo(vp->v_pollinfo);
VOP_UNLOCK(vp);
vp->v_pollinfo = NULL;
@@ -3346,13 +3352,22 @@ vget_abort(struct vnode *vp, enum vgetstate vs)
switch (vs) {
case VGET_USECOUNT:
vrele(vp);
- break;
+ goto out_ok;
case VGET_HOLDCNT:
vdrop(vp);
+ goto out_ok;
+ case VGET_NONE:
break;
- default:
- __assert_unreachable();
}
+
+ __assert_unreachable();
+
+ /*
+ * This label exists in case the cases above ever need to share more than
+ * just the 'return' statement.
+ */
+out_ok:
+ return;
}
int
@@ -3561,11 +3576,6 @@ enum vput_op { VRELE, VPUT, VUNREF };
* exclusive lock on the vnode, while it is legal to call here with only a
* shared lock (or no locks). If locking the vnode in an expected manner fails,
* inactive processing gets deferred to the syncer.
- *
- * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
- * on the lock being held all the way until VOP_INACTIVE. This in particular
- * happens with UFS which adds half-constructed vnodes to the hash, where they
- * can be found by other code.
*/
static void
vput_final(struct vnode *vp, enum vput_op func)
@@ -3643,26 +3653,26 @@ vput_final(struct vnode *vp, enum vput_op func)
}
break;
}
- if (error == 0) {
- if (func == VUNREF) {
- VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
- ("recursive vunref"));
- vp->v_vflag |= VV_UNREF;
- }
- for (;;) {
- error = vinactive(vp);
- if (want_unlock)
- VOP_UNLOCK(vp);
- if (error != ERELOOKUP || !want_unlock)
- break;
- VOP_LOCK(vp, LK_EXCLUSIVE);
- }
- if (func == VUNREF)
- vp->v_vflag &= ~VV_UNREF;
- vdropl(vp);
- } else {
+ if (error != 0) {
vdefer_inactive(vp);
+ return;
}
+ if (func == VUNREF) {
+ VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
+ ("recursive vunref"));
+ vp->v_vflag |= VV_UNREF;
+ }
+ for (;;) {
+ error = vinactive(vp);
+ if (want_unlock)
+ VOP_UNLOCK(vp);
+ if (error != ERELOOKUP || !want_unlock)
+ break;
+ VOP_LOCK(vp, LK_EXCLUSIVE);
+ }
+ if (func == VUNREF)
+ vp->v_vflag &= ~VV_UNREF;
+ vdropl(vp);
return;
out:
if (func == VPUT)
@@ -4501,6 +4511,17 @@ vgonel(struct vnode *vp)
/*
* Done with purge, reset to the standard lock and invalidate
* the vnode.
+ *
+ * FIXME: this is buggy for vnode ops with custom locking primitives.
+ *
+ * vget used to be gated with a special flag serializing it against vgone,
+ * which got lost in the process of SMP-ifying the VFS layer.
+ *
+ * Suppose a custom locking routine references ->v_data.
+ *
+ * Since it is now possible to start executing it while vgone is in
+ * progress, this may well crash as ->v_data gets invalidated and the
+ * memory backing it is freed.
*/
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
@@ -6524,6 +6545,7 @@ const struct filterops fs_filtops = {
.f_attach = filt_fsattach,
.f_detach = filt_fsdetach,
.f_event = filt_fsevent,
+ .f_copy = knote_triv_copy,
};
static int
@@ -6603,24 +6625,28 @@ static int filt_vfsvnode(struct knote *kn, long hint);
static void filt_vfsdetach(struct knote *kn);
static int filt_vfsdump(struct proc *p, struct knote *kn,
struct kinfo_knote *kin);
+static int filt_vfscopy(struct knote *kn, struct proc *p1);
static const struct filterops vfsread_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsread,
.f_userdump = filt_vfsdump,
+ .f_copy = filt_vfscopy,
};
static const struct filterops vfswrite_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfswrite,
.f_userdump = filt_vfsdump,
+ .f_copy = filt_vfscopy,
};
static const struct filterops vfsvnode_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsvnode,
.f_userdump = filt_vfsdump,
+ .f_copy = filt_vfscopy,
};
static void
@@ -6804,6 +6830,16 @@ filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin)
return (0);
}
+static int
+filt_vfscopy(struct knote *kn, struct proc *p1)
+{
+ struct vnode *vp;
+
+ vp = (struct vnode *)kn->kn_hook;
+ vhold(vp);
+ return (0);
+}
+
int
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
{
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index bf3ed9d515dc..1a739d354f1f 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1119,7 +1119,7 @@ flags_to_rights(int flags, cap_rights_t *rightsp)
if (flags & O_TRUNC)
cap_rights_set_one(rightsp, CAP_FTRUNCATE);
- if (flags & (O_SYNC | O_FSYNC))
+ if (flags & (O_SYNC | O_FSYNC | O_DSYNC))
cap_rights_set_one(rightsp, CAP_FSYNC);
if (flags & (O_EXLOCK | O_SHLOCK))
@@ -1932,7 +1932,7 @@ restart:
if (error != 0)
return (error);
- if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ if (nd.ni_vp != NULL || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
NDFREE_PNBUF(&nd);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
@@ -4363,7 +4363,7 @@ unionread:
struct vnode *tvp = vp;
vp = vp->v_mount->mnt_vnodecovered;
- VREF(vp);
+ vref(vp);
fp->f_vnode = vp;
foffset = 0;
vput(tvp);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index a4f41192f684..a53df50c06bd 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -798,58 +798,84 @@ vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
}
#if OFF_MAX <= LONG_MAX
-off_t
-foffset_lock(struct file *fp, int flags)
+static void
+file_v_lock(struct file *fp, short lock_bit, short lock_wait_bit)
{
- volatile short *flagsp;
- off_t res;
+ short *flagsp;
short state;
- KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
-
- if ((flags & FOF_NOLOCK) != 0)
- return (atomic_load_long(&fp->f_offset));
-
- /*
- * According to McKusick the vn lock was protecting f_offset here.
- * It is now protected by the FOFFSET_LOCKED flag.
- */
- flagsp = &fp->f_vnread_flags;
- if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
- return (atomic_load_long(&fp->f_offset));
+ flagsp = &fp->f_vflags;
+ state = atomic_load_16(flagsp);
+ for (;;) {
+ if ((state & lock_bit) != 0)
+ break;
+ if (atomic_fcmpset_acq_16(flagsp, &state, state | lock_bit))
+ return;
+ }
- sleepq_lock(&fp->f_vnread_flags);
+ sleepq_lock(flagsp);
state = atomic_load_16(flagsp);
for (;;) {
- if ((state & FOFFSET_LOCKED) == 0) {
+ if ((state & lock_bit) == 0) {
if (!atomic_fcmpset_acq_16(flagsp, &state,
- FOFFSET_LOCKED))
+ state | lock_bit))
continue;
break;
}
- if ((state & FOFFSET_LOCK_WAITING) == 0) {
+ if ((state & lock_wait_bit) == 0) {
if (!atomic_fcmpset_acq_16(flagsp, &state,
- state | FOFFSET_LOCK_WAITING))
+ state | lock_wait_bit))
continue;
}
DROP_GIANT();
- sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
- sleepq_wait(&fp->f_vnread_flags, PRI_MAX_KERN);
+ sleepq_add(flagsp, NULL, "vofflock", 0, 0);
+ sleepq_wait(flagsp, PRI_MAX_KERN);
PICKUP_GIANT();
- sleepq_lock(&fp->f_vnread_flags);
+ sleepq_lock(flagsp);
state = atomic_load_16(flagsp);
}
- res = atomic_load_long(&fp->f_offset);
- sleepq_release(&fp->f_vnread_flags);
- return (res);
+ sleepq_release(flagsp);
}
-void
-foffset_unlock(struct file *fp, off_t val, int flags)
+static void
+file_v_unlock(struct file *fp, short lock_bit, short lock_wait_bit)
{
- volatile short *flagsp;
+ short *flagsp;
short state;
+ flagsp = &fp->f_vflags;
+ state = atomic_load_16(flagsp);
+ for (;;) {
+ if ((state & lock_wait_bit) != 0)
+ break;
+ if (atomic_fcmpset_rel_16(flagsp, &state, state & ~lock_bit))
+ return;
+ }
+
+ sleepq_lock(flagsp);
+ MPASS((*flagsp & lock_bit) != 0);
+ MPASS((*flagsp & lock_wait_bit) != 0);
+ atomic_clear_16(flagsp, lock_bit | lock_wait_bit);
+ sleepq_broadcast(flagsp, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(flagsp);
+}
+
+off_t
+foffset_lock(struct file *fp, int flags)
+{
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+ if ((flags & FOF_NOLOCK) == 0) {
+ file_v_lock(fp, FILE_V_FOFFSET_LOCKED,
+ FILE_V_FOFFSET_LOCK_WAITING);
+ }
+
+ return (atomic_load_long(&fp->f_offset));
+}
+
+void
+foffset_unlock(struct file *fp, off_t val, int flags)
+{
KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
if ((flags & FOF_NOUPDATE) == 0)
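The file_v_lock()/file_v_unlock() pair above implements a small bit-lock: a single
CAS in the uncontended case, plus a waiter bit and a sleepqueue for the contended
one. Below is a self-contained userspace model of the same pattern, with a pthread
mutex/condvar standing in for the sleepqueue; the bit values and names are
illustrative and memory ordering is left at the defaults for simplicity.

    #include <pthread.h>
    #include <stdatomic.h>

    #define LOCK_BIT        0x1u
    #define WAIT_BIT        0x2u

    static _Atomic unsigned word;
    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t c = PTHREAD_COND_INITIALIZER;

    static void
    bitlock_acquire(void)
    {
            unsigned state;

            /* Fast path: flip the lock bit with one CAS if nobody holds it. */
            state = atomic_load(&word);
            while ((state & LOCK_BIT) == 0) {
                    if (atomic_compare_exchange_weak(&word, &state,
                        state | LOCK_BIT))
                            return;
            }

            /* Slow path: advertise a waiter and sleep until woken. */
            pthread_mutex_lock(&m);
            for (;;) {
                    state = atomic_load(&word);
                    if ((state & LOCK_BIT) == 0) {
                            if (atomic_compare_exchange_weak(&word, &state,
                                state | LOCK_BIT))
                                    break;
                            continue;
                    }
                    if ((state & WAIT_BIT) == 0 &&
                        !atomic_compare_exchange_weak(&word, &state,
                        state | WAIT_BIT))
                            continue;
                    pthread_cond_wait(&c, &m);
            }
            pthread_mutex_unlock(&m);
    }

    static void
    bitlock_release(void)
    {
            unsigned state;

            /* Fast path: clear the lock bit if nobody is waiting. */
            state = atomic_load(&word);
            while ((state & WAIT_BIT) == 0) {
                    if (atomic_compare_exchange_weak(&word, &state,
                        state & ~LOCK_BIT))
                            return;
            }

            /* Waiters present: clear both bits and wake everyone up. */
            pthread_mutex_lock(&m);
            atomic_fetch_and(&word, ~(LOCK_BIT | WAIT_BIT));
            pthread_cond_broadcast(&c);
            pthread_mutex_unlock(&m);
    }

    int
    main(void)
    {
            bitlock_acquire();
            bitlock_release();
            return (0);
    }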
@@ -859,21 +885,10 @@ foffset_unlock(struct file *fp, off_t val, int flags)
if ((flags & FOF_NEXTOFF_W) != 0)
fp->f_nextoff[UIO_WRITE] = val;
- if ((flags & FOF_NOLOCK) != 0)
- return;
-
- flagsp = &fp->f_vnread_flags;
- state = atomic_load_16(flagsp);
- if ((state & FOFFSET_LOCK_WAITING) == 0 &&
- atomic_cmpset_rel_16(flagsp, state, 0))
- return;
-
- sleepq_lock(&fp->f_vnread_flags);
- MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
- MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
- fp->f_vnread_flags = 0;
- sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
- sleepq_release(&fp->f_vnread_flags);
+ if ((flags & FOF_NOLOCK) == 0) {
+ file_v_unlock(fp, FILE_V_FOFFSET_LOCKED,
+ FILE_V_FOFFSET_LOCK_WAITING);
+ }
}
static off_t
@@ -882,7 +897,47 @@ foffset_read(struct file *fp)
return (atomic_load_long(&fp->f_offset));
}
-#else
+
+void
+fsetfl_lock(struct file *fp)
+{
+ file_v_lock(fp, FILE_V_SETFL_LOCKED, FILE_V_SETFL_LOCK_WAITING);
+}
+
+void
+fsetfl_unlock(struct file *fp)
+{
+ file_v_unlock(fp, FILE_V_SETFL_LOCKED, FILE_V_SETFL_LOCK_WAITING);
+}
+
+#else /* OFF_MAX <= LONG_MAX */
+
+static void
+file_v_lock_mtxp(struct file *fp, struct mtx *mtxp, short lock_bit,
+ short lock_wait_bit)
+{
+ mtx_assert(mtxp, MA_OWNED);
+
+ while ((fp->f_vflags & lock_bit) != 0) {
+ fp->f_vflags |= lock_wait_bit;
+ msleep(&fp->f_vflags, mtxp, PRI_MAX_KERN,
+ "vofflock", 0);
+ }
+ fp->f_vflags |= lock_bit;
+}
+
+static void
+file_v_unlock_mtxp(struct file *fp, struct mtx *mtxp, short lock_bit,
+ short lock_wait_bit)
+{
+ mtx_assert(mtxp, MA_OWNED);
+
+ KASSERT((fp->f_vflags & lock_bit) != 0, ("Lost lock_bit"));
+ if ((fp->f_vflags & lock_wait_bit) != 0)
+ wakeup(&fp->f_vflags);
+ fp->f_vflags &= ~(lock_bit | lock_wait_bit);
+}
+
off_t
foffset_lock(struct file *fp, int flags)
{
@@ -894,12 +949,8 @@ foffset_lock(struct file *fp, int flags)
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if ((flags & FOF_NOLOCK) == 0) {
- while (fp->f_vnread_flags & FOFFSET_LOCKED) {
- fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
- msleep(&fp->f_vnread_flags, mtxp, PRI_MAX_KERN,
- "vofflock", 0);
- }
- fp->f_vnread_flags |= FOFFSET_LOCKED;
+ file_v_lock_mtxp(fp, mtxp, FILE_V_FOFFSET_LOCKED,
+ FILE_V_FOFFSET_LOCK_WAITING);
}
res = fp->f_offset;
mtx_unlock(mtxp);
@@ -922,11 +973,8 @@ foffset_unlock(struct file *fp, off_t val, int flags)
if ((flags & FOF_NEXTOFF_W) != 0)
fp->f_nextoff[UIO_WRITE] = val;
if ((flags & FOF_NOLOCK) == 0) {
- KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
- ("Lost FOFFSET_LOCKED"));
- if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
- wakeup(&fp->f_vnread_flags);
- fp->f_vnread_flags = 0;
+ file_v_unlock_mtxp(fp, mtxp, FILE_V_FOFFSET_LOCKED,
+ FILE_V_FOFFSET_LOCK_WAITING);
}
mtx_unlock(mtxp);
}
@@ -937,6 +985,30 @@ foffset_read(struct file *fp)
return (foffset_lock(fp, FOF_NOLOCK));
}
+
+void
+fsetfl_lock(struct file *fp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ file_v_lock_mtxp(fp, mtxp, FILE_V_SETFL_LOCKED,
+ FILE_V_SETFL_LOCK_WAITING);
+ mtx_unlock(mtxp);
+}
+
+void
+fsetfl_unlock(struct file *fp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ file_v_unlock_mtxp(fp, mtxp, FILE_V_SETFL_LOCKED,
+ FILE_V_SETFL_LOCK_WAITING);
+ mtx_unlock(mtxp);
+}
#endif
void