author    Julian Elischer <julian@FreeBSD.org>  2002-06-29 17:26:22 +0000
committer Julian Elischer <julian@FreeBSD.org>  2002-06-29 17:26:22 +0000
commit    e602ba25fd1f9a7ea2215c01f470c08f140de809 (patch)
tree      0a0483a267784fa8e2bf86857d8727edb5b122e9 /sys/kern
parent    cc5dcb202cd7616bae9321687ec46a384a061d99 (diff)
Part 1 of KSE-III
The ability to schedule multiple threads per process (on one cpu) by making ALL system calls optionally asynchronous.
To come: ia64 and power-pc patches, patches for gdb, test program (in tools).
Reviewed by: Almost everyone who counts (at various times: peter, jhb, matt, alfred, mini, bernd, and a cast of thousands)
NOTE: this is still Beta code, and contains lots of debugging stuff. Expect slight instability in signals.
Notes: svn path=/head/; revision=99072
Diffstat (limited to 'sys/kern')
-rw-r--r--   sys/kern/init_main.c        33
-rw-r--r--   sys/kern/init_sysent.c       2
-rw-r--r--   sys/kern/kern_condvar.c     89
-rw-r--r--   sys/kern/kern_exec.c        10
-rw-r--r--   sys/kern/kern_exit.c        97
-rw-r--r--   sys/kern/kern_fork.c        75
-rw-r--r--   sys/kern/kern_idle.c        19
-rw-r--r--   sys/kern/kern_intr.c        27
-rw-r--r--   sys/kern/kern_kthread.c      3
-rw-r--r--   sys/kern/kern_mutex.c       31
-rw-r--r--   sys/kern/kern_poll.c         1
-rw-r--r--   sys/kern/kern_proc.c       217
-rw-r--r--   sys/kern/kern_shutdown.c     1
-rw-r--r--   sys/kern/kern_sig.c        386
-rw-r--r--   sys/kern/kern_subr.c         1
-rw-r--r--   sys/kern/kern_switch.c     662
-rw-r--r--   sys/kern/kern_synch.c      275
-rw-r--r--   sys/kern/ksched.c           27
-rw-r--r--   sys/kern/subr_smp.c          4
-rw-r--r--   sys/kern/subr_trap.c        37
-rw-r--r--   sys/kern/subr_turnstile.c   31
-rw-r--r--   sys/kern/subr_witness.c      1
-rw-r--r--   sys/kern/sys_generic.c       2
-rw-r--r--   sys/kern/sys_process.c       6
-rw-r--r--   sys/kern/syscalls.master     2
-rw-r--r--   sys/kern/tty.c              53
26 files changed, 1601 insertions, 491 deletions
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index d5c565680605..06cc8d831ebb 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -289,6 +289,7 @@ proc0_init(void *dummy __unused)
* Initialize thread, process and pgrp structures.
*/
procinit();
+ threadinit();
/*
* Initialize sleep queue hash table
@@ -322,19 +323,34 @@ proc0_init(void *dummy __unused)
p->p_sysent = &aout_sysvec;
#endif
+ /*
+ * proc_linkup was already done in init_i386() or alphainit() etc.
+ * because the earlier code needed to follow td->td_proc. Otherwise
+ * I would have done it here.. maybe this means this should be
+ * done earlier too.
+ */
ke = &proc0.p_kse; /* XXXKSE */
kg = &proc0.p_ksegrp; /* XXXKSE */
p->p_flag = P_SYSTEM;
p->p_sflag = PS_INMEM;
- p->p_stat = SRUN;
- p->p_ksegrp.kg_nice = NZERO;
- kg->kg_pri_class = PRI_TIMESHARE;
- kg->kg_user_pri = PUSER;
- td->td_priority = PVM;
- td->td_base_pri = PUSER;
-
+ p->p_state = PRS_NORMAL;
+ td->td_state = TDS_RUNNING;
+ kg->kg_nice = NZERO;
+ kg->kg_pri_class = PRI_TIMESHARE;
+ kg->kg_user_pri = PUSER;
+ td->td_priority = PVM;
+ td->td_base_pri = PUSER;
+ td->td_kse = ke; /* XXXKSE */
+ ke->ke_oncpu = 0;
+ ke->ke_state = KES_RUNNING;
+ ke->ke_thread = td;
+ /* proc_linkup puts it in the idle queue, that's not what we want. */
+ TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
+ kg->kg_idle_kses--;
p->p_peers = 0;
p->p_leader = p;
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+
bcopy("swapper", p->p_comm, sizeof ("swapper"));
@@ -662,8 +678,7 @@ kick_init(const void *udata __unused)
td = FIRST_THREAD_IN_PROC(initproc);
mtx_lock_spin(&sched_lock);
- initproc->p_stat = SRUN;
- setrunqueue(FIRST_THREAD_IN_PROC(initproc)); /* XXXKSE */
+ setrunqueue(td); /* XXXKSE */
mtx_unlock_spin(&sched_lock);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
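The proc0 changes above wire together the arrangement this patch introduces: a process owns KSEGRPs, a KSEGRP owns KSEs and threads, and a running thread is paired with a KSE taken off the ksegrp's idle queue. A rough sketch of that pairing, using only fields that appear in this diff (p_kse, p_ksegrp, ke_thread, td_kse, kg_iq, kg_idle_kses); the helper name is invented for illustration and is not part of the commit:

	#include <sys/param.h>
	#include <sys/queue.h>
	#include <sys/proc.h>

	/*
	 * Sketch: pair the boot thread with the process's embedded KSE, the
	 * way proc0_init() does above.  Assumes proc_linkup() already ran
	 * and left the KSE on the ksegrp's idle queue (kg_iq).
	 */
	static void
	pair_boot_thread(struct proc *p, struct thread *td)
	{
		struct kse *ke = &p->p_kse;		/* embedded KSE (XXXKSE) */
		struct ksegrp *kg = &p->p_ksegrp;	/* embedded KSEGRP (XXXKSE) */

		td->td_kse = ke;			/* thread -> KSE */
		ke->ke_thread = td;			/* KSE -> thread */
		ke->ke_state = KES_RUNNING;
		td->td_state = TDS_RUNNING;

		/* proc_linkup() idled the KSE; a running KSE must not stay there. */
		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
		kg->kg_idle_kses--;
	}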
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 425e3b73fc88..cf8ba8038bcf 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -405,7 +405,7 @@ struct sysent sysent[] = {
{ 0, (sy_call_t *)kse_wakeup }, /* 380 = kse_wakeup */
{ AS(kse_new_args), (sy_call_t *)kse_new }, /* 381 = kse_new */
{ AS(thread_wakeup_args), (sy_call_t *)thread_wakeup }, /* 382 = thread_wakeup */
- { 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */
+ { SYF_MPSAFE | 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */
{ 0, (sy_call_t *)nosys }, /* 384 = __mac_get_proc */
{ 0, (sy_call_t *)nosys }, /* 385 = __mac_set_proc */
{ 0, (sy_call_t *)nosys }, /* 386 = __mac_get_fd */
diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c
index 9d30d2503e18..78585b28da6a 100644
--- a/sys/kern/kern_condvar.c
+++ b/sys/kern/kern_condvar.c
@@ -48,7 +48,7 @@
*/
#define CV_ASSERT(cvp, mp, td) do { \
KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \
- KASSERT((td)->td_proc->p_stat == SRUN, ("%s: not SRUN", __func__)); \
+ KASSERT((td)->td_state == TDS_RUNNING, ("%s: not TDS_RUNNING", __func__)); \
KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \
KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \
mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \
@@ -80,6 +80,7 @@
#endif
static void cv_timedwait_end(void *arg);
+static void cv_check_upcall(struct thread *td);
/*
* Initialize a condition variable. Must be called before use.
@@ -109,14 +110,47 @@ cv_destroy(struct cv *cvp)
*/
/*
+ * Decide if we need to queue an upcall.
+ * This is copied from msleep(), perhaps this should be a common function.
+ */
+static void
+cv_check_upcall(struct thread *td)
+{
+
+ /*
+ * If we are capable of async syscalls and there isn't already
+ * another one ready to return, start a new thread
+ * and queue it as ready to run. Note that there is danger here
+ * because we need to make sure that we don't sleep allocating
+ * the thread (recursion here might be bad).
+ * Hence the TDF_INMSLEEP flag.
+ */
+ if ((td->td_proc->p_flag & P_KSES) && td->td_mailbox &&
+ (td->td_flags & TDF_INMSLEEP) == 0) {
+ /*
+ * If we have no queued work to do,
+ * upcall to the UTS to see if it has more work.
+ * We don't need to upcall now, just queue it.
+ */
+ if (TAILQ_FIRST(&td->td_ksegrp->kg_runq) == NULL) {
+ /* Don't recurse here! */
+ td->td_flags |= TDF_INMSLEEP;
+ thread_schedule_upcall(td, td->td_kse);
+ td->td_flags &= ~TDF_INMSLEEP;
+ }
+ }
+}
+
+/*
* Switch context.
*/
static __inline void
cv_switch(struct thread *td)
{
- td->td_proc->p_stat = SSLEEP;
+ td->td_state = TDS_SLP;
td->td_proc->p_stats->p_ru.ru_nvcsw++;
+ cv_check_upcall(td);
mi_switch();
CTR3(KTR_PROC, "cv_switch: resume thread %p (pid %d, %s)", td,
td->td_proc->p_pid, td->td_proc->p_comm);
@@ -135,7 +169,7 @@ cv_switch_catch(struct thread *td)
* We put ourselves on the sleep queue and start our timeout before
* calling cursig, as we could stop there, and a wakeup or a SIGCONT (or
* both) could occur while we were stopped. A SIGCONT would cause us to
- * be marked as SSLEEP without resuming us, thus we must be ready for
+ * be marked as TDS_SLP without resuming us, thus we must be ready for
* sleep when cursig is called. If the wakeup happens while we're
* stopped, td->td_wchan will be 0 upon return from cursig.
*/
@@ -143,13 +177,15 @@ cv_switch_catch(struct thread *td)
mtx_unlock_spin(&sched_lock);
p = td->td_proc;
PROC_LOCK(p);
- sig = cursig(p); /* XXXKSE */
+ sig = cursig(td); /* XXXKSE */
+ if (thread_suspend_check(1))
+ sig = SIGSTOP;
mtx_lock_spin(&sched_lock);
PROC_UNLOCK(p);
if (sig != 0) {
if (td->td_wchan != NULL)
cv_waitq_remove(td);
- td->td_proc->p_stat = SRUN;
+ td->td_state = TDS_RUNNING; /* XXXKSE */
} else if (td->td_wchan != NULL) {
cv_switch(td);
}
@@ -175,7 +211,6 @@ cv_waitq_add(struct cv *cvp, struct thread *td)
td->td_flags |= TDF_CVWAITQ;
td->td_wchan = cvp;
td->td_wmesg = cvp->cv_description;
- td->td_kse->ke_slptime = 0; /* XXXKSE */
td->td_ksegrp->kg_slptime = 0; /* XXXKSE */
td->td_base_pri = td->td_priority;
CTR3(KTR_PROC, "cv_waitq_add: thread %p (pid %d, %s)", td,
@@ -285,7 +320,7 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp)
PROC_LOCK(p);
if (sig == 0)
- sig = cursig(p); /* XXXKSE */
+ sig = cursig(td); /* XXXKSE */
if (sig != 0) {
if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
rval = EINTR;
@@ -293,6 +328,8 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp)
rval = ERESTART;
}
PROC_UNLOCK(p);
+ if (p->p_flag & P_WEXIT)
+ rval = EINTR;
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
@@ -363,6 +400,8 @@ cv_timedwait(struct cv *cvp, struct mtx *mp, int timo)
mi_switch();
}
+ if (td->td_proc->p_flag & P_WEXIT)
+ rval = EWOULDBLOCK;
mtx_unlock_spin(&sched_lock);
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
@@ -436,12 +475,11 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo)
td->td_proc->p_stats->p_ru.ru_nivcsw++;
mi_switch();
}
-
mtx_unlock_spin(&sched_lock);
PROC_LOCK(p);
if (sig == 0)
- sig = cursig(p);
+ sig = cursig(td);
if (sig != 0) {
if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
rval = EINTR;
@@ -450,6 +488,9 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo)
}
PROC_UNLOCK(p);
+ if (p->p_flag & P_WEXIT)
+ rval = EINTR;
+
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0);
@@ -477,15 +518,13 @@ cv_wakeup(struct cv *cvp)
TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq);
td->td_flags &= ~TDF_CVWAITQ;
td->td_wchan = 0;
- if (td->td_proc->p_stat == SSLEEP) {
+ if (td->td_state == TDS_SLP) {
/* OPTIMIZED EXPANSION OF setrunnable(td); */
CTR3(KTR_PROC, "cv_signal: thread %p (pid %d, %s)",
td, td->td_proc->p_pid, td->td_proc->p_comm);
if (td->td_ksegrp->kg_slptime > 1) /* XXXKSE */
updatepri(td);
- td->td_kse->ke_slptime = 0;
td->td_ksegrp->kg_slptime = 0;
- td->td_proc->p_stat = SRUN;
if (td->td_proc->p_sflag & PS_INMEM) {
setrunqueue(td);
maybe_resched(td);
@@ -568,7 +607,7 @@ cv_timedwait_end(void *arg)
td->td_flags &= ~TDF_TIMEOUT;
setrunqueue(td);
} else if (td->td_wchan != NULL) {
- if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */
+ if (td->td_state == TDS_SLP) /* XXXKSE */
setrunnable(td);
else
cv_waitq_remove(td);
@@ -577,3 +616,27 @@ cv_timedwait_end(void *arg)
td->td_flags |= TDF_TIMOFAIL;
mtx_unlock_spin(&sched_lock);
}
+
+/*
+ * For now only abort interruptible waits.
+ * The others will have to either complete on their own or have a timeout.
+ */
+void
+cv_abort(struct thread *td)
+{
+
+ CTR3(KTR_PROC, "cv_abort: thread %p (pid %d, %s)", td,
+ td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ mtx_lock_spin(&sched_lock);
+ if ((td->td_flags & (TDF_SINTR|TDF_TIMEOUT)) == TDF_SINTR) {
+ if (td->td_wchan != NULL) {
+ if (td->td_state == TDS_SLP)
+ setrunnable(td);
+ else
+ cv_waitq_remove(td);
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index feaa12343f77..0cd7f2794482 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -154,12 +154,14 @@ execve(td, uap)
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
+ if ((p->p_flag & P_KSES) && thread_single(SNGLE_EXIT)) {
+ PROC_UNLOCK(p);
+ mtx_unlock(&Giant);
+ return (ERESTART); /* Try again later. */
+ }
+ /* If we get here all other threads are dead. */
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
-
-/* XXXKSE */
-/* !!!!!!!! we need abort all the other threads of this process before we */
-/* proceed beyond his point! */
/*
* Initialize part of the common data
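execve() now forces a threaded (P_KSES) process into single-threaded mode before replacing the address space; exit1() and fork1() below use the same helper with SNGLE_EXIT and SNGLE_NO_EXIT respectively. A minimal sketch of the caller-side pattern, assuming the thread_single()/thread_single_end() interface added elsewhere in this commit; the wrapper name is invented for illustration:

	#include <sys/param.h>
	#include <sys/proc.h>
	#include <sys/errno.h>

	/*
	 * Sketch of the calling convention seen in execve() above and in
	 * exit1()/fork1() below: thread_single() is called with the proc
	 * lock held, and callers must be prepared for failure if another
	 * thread won the race to single-thread the process.
	 */
	static int
	quiesce_other_threads(struct proc *p, int mode)	/* SNGLE_EXIT or SNGLE_NO_EXIT */
	{
		int error = 0;

		PROC_LOCK(p);
		if (p->p_flag & P_KSES) {
			if (thread_single(mode))
				error = ERESTART;	/* lost the race; retry later */
		}
		PROC_UNLOCK(p);
		/* A SNGLE_NO_EXIT caller later undoes this with thread_single_end(). */
		return (error);
	}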
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 63a51351fa49..fea5438f3f22 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -145,6 +145,67 @@ exit1(td, rv)
/*
* XXXXKSE: MUST abort all other threads before proceeding past here.
*/
+ PROC_LOCK(p);
+ if (p->p_flag & P_KSES) {
+ /*
+ * First check if some other thread got here before us..
+ * if so, act appropriately (exit or suspend);
+ */
+ thread_suspend_check(0);
+ /*
+ * Here is a trick..
+ * We need to free up our KSE to process other threads
+ * so that we can safely set the UNBOUND flag
+ * (whether or not we have a mailbox) as we are NEVER
+ * going to return to the user.
+ * The flag will not be set yet if we are exiting
+ * because of a signal, pagefault, or similar
+ * (or even an exit(2) from the UTS).
+ */
+ td->td_flags |= TDF_UNBOUND;
+
+ /*
+ * Kill off the other threads. This requires
+ * Some co-operation from other parts of the kernel
+ * so it may not be instant.
+ * With this state set:
+ * Any thread entering the kernel from userspace will
+ * thread_exit() in trap(). Any thread attempting to
+ * sleep will return immediately
+ * with EINTR or EWOULDBLOCK, which will hopefully force them
+ * to back out to userland, freeing resources as they go, and
+ * anything attempting to return to userland will thread_exit()
+ * from userret(). thread_exit() will unsuspend us
+ * when the last other thread exits.
+ */
+ if (thread_single(SNGLE_EXIT)) {
+ panic ("Exit: Single threading fouled up");
+ }
+ /*
+ * All other activity in this process is now stopped.
+ * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them)
+ * ...
+ * Turn off threading support.
+ */
+ p->p_flag &= ~P_KSES;
+ td->td_flags &= ~TDF_UNBOUND;
+ thread_single_end(); /* Don't need this any more. */
+ }
+ /*
+ * With this state set:
+ * Any thread entering the kernel from userspace will thread_exit()
+ * in trap(). Any thread attempting to sleep will return immediately
+ * with EINTR or EWOULDBLOCK, which will hopefully force them
+ * to back out to userland, freeing resources as they go, and
+ * anything attempting to return to userland will thread_exit()
+ * from userret(). thread_exit() will do a wakeup on p->p_numthreads
+ * if it transitions to 1.
+ */
+
+ p->p_flag |= P_WEXIT;
+ PROC_UNLOCK(p);
+ if (td->td_kse->ke_mdstorage)
+ cpu_free_kse_mdstorage(td->td_kse);
/* Are we a task leader? */
PROC_LOCK(p);
@@ -185,7 +246,6 @@ exit1(td, rv)
*/
PROC_LOCK(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT);
- p->p_flag |= P_WEXIT;
SIGEMPTYSET(p->p_siglist);
PROC_UNLOCK(p);
if (timevalisset(&p->p_realtimer.it_value))
@@ -434,22 +494,24 @@ exit1(td, rv)
/*
* We have to wait until after releasing all locks before
- * changing p_stat. If we block on a mutex then we will be
+ * changing p_state. If we block on a mutex then we will be
* back at SRUN when we resume and our parent will never
* harvest us.
*/
- p->p_stat = SZOMB;
+ p->p_state = PRS_ZOMBIE;
wakeup(p->p_pptr);
PROC_UNLOCK(p->p_pptr);
- PROC_UNLOCK(p);
-
cnt.v_swtch++;
binuptime(PCPU_PTR(switchtime));
PCPU_SET(switchticks, ticks);
- cpu_sched_exit(td);
- cpu_throw();
+ cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */
+ /*
+ * Make sure this thread is discarded from the zombie.
+ * This will also release this thread's reference to the ucred.
+ */
+ thread_exit();
panic("exit1");
}
@@ -504,6 +566,8 @@ wait1(td, uap, compat)
register int nfound;
register struct proc *p, *q, *t;
int status, error;
+ struct kse *ke;
+ struct ksegrp *kg;
q = td->td_proc;
if (uap->pid == 0) {
@@ -540,7 +604,7 @@ loop:
}
nfound++;
- if (p->p_stat == SZOMB) {
+ if (p->p_state == PRS_ZOMBIE) {
/*
* charge childs scheduling cpu usage to parent
* XXXKSE assume only one thread & kse & ksegrp
@@ -656,6 +720,21 @@ loop:
}
/*
+ * There should only be one KSE/KSEGRP but
+ * do it right anyhow.
+ */
+ FOREACH_KSEGRP_IN_PROC(p, kg) {
+ FOREACH_KSE_IN_GROUP(kg, ke) {
+ /* Free the KSE spare thread. */
+ if (ke->ke_tdspare != NULL) {
+ thread_free(ke->ke_tdspare);
+ p->p_kse.ke_tdspare = NULL;
+ }
+ }
+ }
+ thread_reap(); /* check for zombie threads */
+
+ /*
* Give vm and machine-dependent layer a chance
* to free anything that cpu_exit couldn't
* release while still running in process context.
@@ -669,7 +748,7 @@ loop:
mtx_unlock(&Giant);
return (0);
}
- if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 &&
+ if (P_SHOULDSTOP(p) && ((p->p_flag & P_WAITED) == 0) &&
(p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
p->p_flag |= P_WAITED;
sx_xunlock(&proctree_lock);
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 016653bcb471..eac0267ce1bb 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -212,23 +212,6 @@ sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
-#if 0
-void
-kse_init(struct kse *kse1, struct kse *kse2)
-{
-}
-
-void
-thread_init(struct thread *thread1, struct thread *thread2)
-{
-}
-
-void
-ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2)
-{
-}
-#endif
-
int
fork1(td, flags, procp)
struct thread *td; /* parent proc */
@@ -296,6 +279,29 @@ fork1(td, flags, procp)
return (0);
}
+ if (p1->p_flag & P_KSES) {
+ /*
+ * Idle the other threads for a second.
+ * Since the user space is copied, it must remain stable.
+ * In addition, all threads (from the user perspective)
+ * need to either be suspended or in the kernel,
+ * where they will try restart in the parent and will
+ * be aborted in the child.
+ */
+ PROC_LOCK(p1);
+ if (thread_single(SNGLE_NO_EXIT)) {
+ /* Abort.. someone else is single threading before us */
+ PROC_UNLOCK(p1);
+ return (ERESTART);
+ }
+ PROC_UNLOCK(p1);
+ /*
+ * All other activity in this process
+ * is now suspended at the user boundary,
+ * (or other safe places if we think of any).
+ */
+ }
+
/* Allocate new proc. */
newproc = uma_zalloc(proc_zone, M_WAITOK);
@@ -311,6 +317,11 @@ fork1(td, flags, procp)
if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
sx_xunlock(&allproc_lock);
uma_zfree(proc_zone, newproc);
+ if (p1->p_flag & P_KSES) {
+ PROC_LOCK(p1);
+ thread_single_end();
+ PROC_UNLOCK(p1);
+ }
tsleep(&forksleep, PUSER, "fork", hz / 2);
return (EAGAIN);
}
@@ -325,6 +336,11 @@ fork1(td, flags, procp)
if (!ok) {
sx_xunlock(&allproc_lock);
uma_zfree(proc_zone, newproc);
+ if (p1->p_flag & P_KSES) {
+ PROC_LOCK(p1);
+ thread_single_end();
+ PROC_UNLOCK(p1);
+ }
tsleep(&forksleep, PUSER, "fork", hz / 2);
return (EAGAIN);
}
@@ -411,7 +427,7 @@ again:
lastpid = trypid;
p2 = newproc;
- p2->p_stat = SIDL; /* protect against others */
+ p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
@@ -449,7 +465,7 @@ again:
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
- td2 = thread_get(p2);
+ td2 = thread_alloc();
ke2 = &p2->p_kse;
kg2 = &p2->p_ksegrp;
@@ -459,8 +475,10 @@ again:
(unsigned) RANGEOF(struct proc, p_startzero, p_endzero));
bzero(&ke2->ke_startzero,
(unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero));
+#if 0 /* bzero'd by the thread allocator */
bzero(&td2->td_startzero,
(unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
+#endif
bzero(&kg2->kg_startzero,
(unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero));
@@ -482,9 +500,22 @@ again:
* XXXKSE Theoretically only the running thread would get copied
* Others in the kernel would be 'aborted' in the child.
* i.e return E*something*
+ * On SMP we would have to stop them running on
+ * other CPUs! (set a flag in the proc that stops
+ * all returns to userland until completed)
+ * This is wrong but ok for 1:1.
*/
proc_linkup(p2, kg2, ke2, td2);
+ /* Set up the thread as an active thread (as if runnable). */
+ TAILQ_REMOVE(&kg2->kg_iq, ke2, ke_kgrlist);
+ kg2->kg_idle_kses--;
+ ke2->ke_state = KES_UNQUEUED;
+ ke2->ke_thread = td2;
+ td2->td_kse = ke2;
+ td2->td_flags &= ~TDF_UNBOUND; /* For the rest of this syscall. */
+KASSERT((ke2->ke_kgrlist.tqe_next != ke2), ("linked to self!"));
+
/* note.. XXXKSE no pcb or u-area yet */
/*
@@ -699,7 +730,6 @@ again:
p2->p_acflag = AFORK;
if ((flags & RFSTOPPED) == 0) {
mtx_lock_spin(&sched_lock);
- p2->p_stat = SRUN;
setrunqueue(td2);
mtx_unlock_spin(&sched_lock);
}
@@ -803,6 +833,9 @@ fork_exit(callout, arg, frame)
struct proc *p = td->td_proc;
td->td_kse->ke_oncpu = PCPU_GET(cpuid);
+ p->p_state = PRS_NORMAL;
+ td->td_state = TDS_RUNNING; /* Already done in switch() on 386. */
+ td->td_kse->ke_state = KES_RUNNING;
/*
* Finish setting up thread glue. We need to initialize
* the thread into a td_critnest=1 state. Some platforms
@@ -814,7 +847,7 @@ fork_exit(callout, arg, frame)
sched_lock.mtx_lock = (uintptr_t)td;
sched_lock.mtx_recurse = 0;
cpu_critical_fork_exit();
- CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid,
+ CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
if (PCPU_GET(switchtime.sec) == 0)
binuptime(PCPU_PTR(switchtime));
diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c
index 29194b735f45..306f2a57cdad 100644
--- a/sys/kern/kern_idle.c
+++ b/sys/kern/kern_idle.c
@@ -40,6 +40,7 @@ idle_setup(void *dummy)
struct pcpu *pc;
#endif
struct proc *p;
+ struct thread *td;
int error;
#ifdef SMP
@@ -60,7 +61,10 @@ idle_setup(void *dummy)
panic("idle_setup: kthread_create error %d\n", error);
p->p_flag |= P_NOLOAD;
- p->p_stat = SRUN;
+ td = FIRST_THREAD_IN_PROC(p);
+ td->td_state = TDS_RUNQ;
+ td->td_kse->ke_state = KES_ONRUNQ;
+ td->td_kse->ke_flags |= KEF_IDLEKSE;
#ifdef SMP
}
#endif
@@ -75,16 +79,22 @@ idle_proc(void *dummy)
#ifdef DIAGNOSTIC
int count;
#endif
+ struct thread *td;
+ struct proc *p;
+ td = curthread;
+ p = td->td_proc;
+ td->td_state = TDS_RUNNING;
+ td->td_kse->ke_state = KES_RUNNING;
for (;;) {
mtx_assert(&Giant, MA_NOTOWNED);
#ifdef DIAGNOSTIC
count = 0;
- while (count >= 0 && procrunnable() == 0) {
+ while (count >= 0 && kserunnable() == 0) {
#else
- while (procrunnable() == 0) {
+ while (kserunnable() == 0) {
#endif
/*
* This is a good place to put things to be done in
@@ -103,8 +113,9 @@ idle_proc(void *dummy)
}
mtx_lock_spin(&sched_lock);
- curproc->p_stats->p_ru.ru_nvcsw++;
+ p->p_stats->p_ru.ru_nvcsw++;
mi_switch();
+ td->td_kse->ke_state = KES_RUNNING;
mtx_unlock_spin(&sched_lock);
}
}
diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c
index d65dc8228c03..fb9c092d4311 100644
--- a/sys/kern/kern_intr.c
+++ b/sys/kern/kern_intr.c
@@ -201,7 +201,7 @@ ithread_create(struct ithd **ithread, int vector, int flags,
td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */
td->td_ksegrp->kg_pri_class = PRI_ITHD;
td->td_priority = PRI_MAX_ITHD;
- p->p_stat = SWAIT;
+ td->td_state = TDS_IWAIT;
ithd->it_td = td;
td->td_ithd = ithd;
if (ithread != NULL)
@@ -229,8 +229,7 @@ ithread_destroy(struct ithd *ithread)
}
ithread->it_flags |= IT_DEAD;
mtx_lock_spin(&sched_lock);
- if (p->p_stat == SWAIT) {
- p->p_stat = SRUN; /* XXXKSE */
+ if (td->td_state == TDS_IWAIT) {
setrunqueue(td);
}
mtx_unlock_spin(&sched_lock);
@@ -327,7 +326,7 @@ ok:
* handler as being dead and let the ithread do the actual removal.
*/
mtx_lock_spin(&sched_lock);
- if (ithread->it_td->td_proc->p_stat != SWAIT) {
+ if (ithread->it_td->td_state != TDS_IWAIT) {
handler->ih_flags |= IH_DEAD;
/*
@@ -374,8 +373,8 @@ ithread_schedule(struct ithd *ithread, int do_switch)
td = ithread->it_td;
p = td->td_proc;
KASSERT(p != NULL, ("ithread %s has no process", ithread->it_name));
- CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", __func__, p->p_pid, p->p_comm,
- ithread->it_need);
+ CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d",
+ __func__, p->p_pid, p->p_comm, ithread->it_need);
/*
* Set it_need to tell the thread to keep running if it is already
@@ -387,14 +386,16 @@ ithread_schedule(struct ithd *ithread, int do_switch)
*/
ithread->it_need = 1;
mtx_lock_spin(&sched_lock);
- if (p->p_stat == SWAIT) {
+ if (td->td_state == TDS_IWAIT) {
CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid);
- p->p_stat = SRUN;
- setrunqueue(td); /* XXXKSE */
- if (do_switch && curthread->td_critnest == 1 &&
- curthread->td_proc->p_stat == SRUN) {
+ setrunqueue(td);
+ if (do_switch &&
+ (curthread->td_critnest == 1)/* &&
+ (curthread->td_state == TDS_RUNNING) XXXKSE*/) {
+#if 0 /* not needed in KSE */
if (curthread != PCPU_GET(idlethread))
setrunqueue(curthread);
+#endif
curthread->td_proc->p_stats->p_ru.ru_nivcsw++;
mi_switch();
} else {
@@ -402,7 +403,7 @@ ithread_schedule(struct ithd *ithread, int do_switch)
}
} else {
CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d",
- __func__, p->p_pid, ithread->it_need, p->p_stat);
+ __func__, p->p_pid, ithread->it_need, p->p_state);
}
mtx_unlock_spin(&sched_lock);
@@ -550,7 +551,7 @@ restart:
*/
if (ithd->it_enable != NULL)
ithd->it_enable(ithd->it_vector);
- p->p_stat = SWAIT; /* we're idle */
+ td->td_state = TDS_IWAIT; /* we're idle */
p->p_stats->p_ru.ru_nvcsw++;
CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid);
mi_switch();
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
index a456a86fa489..e8e2feaa9c0a 100644
--- a/sys/kern/kern_kthread.c
+++ b/sys/kern/kern_kthread.c
@@ -109,8 +109,7 @@ kthread_create(void (*func)(void *), void *arg,
mtx_lock_spin(&sched_lock);
p2->p_sflag |= PS_INMEM;
if (!(flags & RFSTOPPED)) {
- p2->p_stat = SRUN;
- setrunqueue(FIRST_THREAD_IN_PROC(p2)); /* XXXKSE */
+ setrunqueue(FIRST_THREAD_IN_PROC(p2));
}
mtx_unlock_spin(&sched_lock);
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 08bca8d67b2c..c2e79d02d5f2 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -119,23 +119,20 @@ propagate_priority(struct thread *td)
return;
}
+ KASSERT(td->td_state != TDS_SURPLUS, ("Mutex owner SURPLUS"));
+ MPASS(td->td_proc != NULL);
MPASS(td->td_proc->p_magic == P_MAGIC);
- KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex"));
+ KASSERT(td->td_state != TDS_SLP,
+ ("sleeping thread owns a mutex"));
if (td->td_priority <= pri) /* lower is higher priority */
return;
- /*
- * Bump this thread's priority.
- */
- td->td_priority = pri;
/*
* If lock holder is actually running, just bump priority.
*/
- if (thread_running(td)) {
- MPASS(td->td_proc->p_stat == SRUN
- || td->td_proc->p_stat == SZOMB
- || td->td_proc->p_stat == SSTOP);
+ if (td->td_state == TDS_RUNNING) {
+ td->td_priority = pri;
return;
}
@@ -151,20 +148,26 @@ propagate_priority(struct thread *td)
* If on run queue move to new run queue, and quit.
* XXXKSE this gets a lot more complicated under threads
* but try anyhow.
+ * We should have a special call to do this more efficiently.
*/
- if (td->td_proc->p_stat == SRUN) {
+ if (td->td_state == TDS_RUNQ) {
MPASS(td->td_blocked == NULL);
remrunqueue(td);
+ td->td_priority = pri;
setrunqueue(td);
return;
}
+ /*
+ * Adjust for any other cases.
+ */
+ td->td_priority = pri;
/*
* If we aren't blocked on a mutex, we should be.
*/
- KASSERT(td->td_proc->p_stat == SMTX, (
+ KASSERT(td->td_state == TDS_MTX, (
"process %d(%s):%d holds %s but isn't blocked on a mutex\n",
- td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat,
+ td->td_proc->p_pid, td->td_proc->p_comm, td->td_state,
m->mtx_object.lo_name));
/*
@@ -590,7 +593,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
*/
td->td_blocked = m;
td->td_mtxname = m->mtx_object.lo_name;
- td->td_proc->p_stat = SMTX;
+ td->td_state = TDS_MTX;
propagate_priority(td);
if (LOCK_LOG_TEST(&m->mtx_object, opts))
@@ -727,7 +730,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
m, td1);
td1->td_blocked = NULL;
- td1->td_proc->p_stat = SRUN;
setrunqueue(td1);
if (td->td_critnest == 1 && td1->td_priority < pri) {
@@ -744,7 +746,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
}
}
#endif
- setrunqueue(td);
if (LOCK_LOG_TEST(&m->mtx_object, opts))
CTR2(KTR_LOCK,
"_mtx_unlock_sleep: %p switching out lock=%p", m,
diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c
index a197bc0e3c11..9dd692463209 100644
--- a/sys/kern/kern_poll.c
+++ b/sys/kern/kern_poll.c
@@ -503,7 +503,6 @@ poll_idle(void)
mtx_unlock(&Giant);
mtx_assert(&Giant, MA_NOTOWNED);
mtx_lock_spin(&sched_lock);
- setrunqueue(td);
td->td_proc->p_stats->p_ru.ru_nvcsw++;
mi_switch();
mtx_unlock_spin(&sched_lock);
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index a5378d9c3482..8b15fc2c4d46 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -44,6 +44,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
+#include <sys/kse.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/tty.h>
@@ -111,44 +112,28 @@ procinit()
uihashinit();
}
-/*
- * Note that we do not link to the proc's ucred here
- * The thread is linked as if running but no KSE assigned
- */
-static void
-thread_link(struct thread *td, struct ksegrp *kg)
-{
- struct proc *p = kg->kg_proc;
-
- td->td_proc = p;
- td->td_ksegrp = kg;
- td->td_last_kse = &p->p_kse;
-
- TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
- TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
- td->td_critnest = 0;
- td->td_kse = NULL;
- cpu_thread_link(td);
-}
-
/*
* KSE is linked onto the idle queue.
*/
-static void
+void
kse_link(struct kse *ke, struct ksegrp *kg)
{
struct proc *p = kg->kg_proc;
+KASSERT((ke->ke_state != KES_ONRUNQ), ("linking suspect kse on run queue"));
TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
kg->kg_kses++;
+KASSERT((ke->ke_state != KES_IDLE), ("already on idle queue"));
+ ke->ke_state = KES_IDLE;
TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist);
+ kg->kg_idle_kses++;
ke->ke_proc = p;
ke->ke_ksegrp = kg;
ke->ke_thread = NULL;
ke->ke_oncpu = NOCPU;
}
-static void
+void
ksegrp_link(struct ksegrp *kg, struct proc *p)
{
@@ -159,10 +144,13 @@ ksegrp_link(struct ksegrp *kg, struct proc *p)
TAILQ_INIT(&kg->kg_iq); /* all kses in ksegrp */
kg->kg_proc = p;
/* the following counters are in the -zero- section and may not need clearing */
+ kg->kg_numthreads = 0;
kg->kg_runnable = 0;
kg->kg_kses = 0;
+ kg->kg_idle_kses = 0;
kg->kg_runq_kses = 0; /* XXXKSE change name */
/* link it in now that it's consistent */
+ p->p_numksegrps++;
TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
}
@@ -177,30 +165,13 @@ proc_linkup(struct proc *p, struct ksegrp *kg,
TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */
TAILQ_INIT(&p->p_threads); /* all threads in proc */
+ TAILQ_INIT(&p->p_suspended); /* Threads suspended */
ksegrp_link(kg, p);
kse_link(ke, kg);
thread_link(td, kg);
- /* link them together for 1:1 */
- td->td_kse = ke;
- ke->ke_thread = td;
}
-/* temporary version is ultra simple while we are in 1:1 mode */
-struct thread *
-thread_get(struct proc *p)
-{
- struct thread *td = &p->p_xxthread;
-
- return (td);
-}
-
-
-/*********************
-* STUB KSE syscalls
-*********************/
-
-/* struct thread_wakeup_args { struct thread_mailbox *tmbx; }; */
int
thread_wakeup(struct thread *td, struct thread_wakeup_args *uap)
{
@@ -219,7 +190,11 @@ int
kse_yield(struct thread *td, struct kse_yield_args *uap)
{
- return(ENOSYS);
+ PROC_LOCK(td->td_proc);
+ mtx_lock_spin(&sched_lock);
+ thread_exit();
+ /* NOTREACHED */
+ return(0);
}
int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
@@ -228,16 +203,80 @@ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
return(ENOSYS);
}
-
-int
-kse_new(struct thread *td, struct kse_new_args *uap)
+/*
+ * No new KSEGRP: on the first call, use the current KSE and don't schedule an upcall.
+ * In all other situations, allocate a new KSE and schedule an upcall on it.
+ */
/* struct kse_new_args {
struct kse_mailbox *mbx;
int new_grp_flag;
}; */
+int
+kse_new(struct thread *td, struct kse_new_args *uap)
{
+ struct kse *newkse;
+ struct proc *p;
+ struct kse_mailbox mbx;
+ int err;
- return (ENOSYS);
+ p = td->td_proc;
+ if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
+ return (err);
+ PROC_LOCK(p);
+ /*
+ * If we have no KSE mode set, just set it, and skip KSE and KSEGRP
+ * creation. You cannot request a new group with the first one as
+ * you are effectively getting one. Instead, go directly to saving
+ * the upcall info.
+ */
+ if ((td->td_proc->p_flag & P_KSES) || (uap->new_grp_flag)) {
+
+ return (EINVAL); /* XXX */
+ /*
+ * If newgroup then create the new group.
+ * Check we have the resources for this.
+ */
+ /* Copy lots of fields from the current KSEGRP. */
+ /* Create the new KSE */
+ /* Copy lots of fields from the current KSE. */
+ } else {
+ /*
+ * We are switching to KSEs so just
+ * use the preallocated ones for this call.
+ * XXXKSE if we have to initialise any fields for KSE
+ * mode operation, do it here.
+ */
+ newkse = td->td_kse;
+ }
+ /*
+ * Fill out the KSE-mode specific fields of the new kse.
+ */
+ PROC_UNLOCK(p);
+ mtx_lock_spin(&sched_lock);
+ mi_switch(); /* Save current registers to PCB. */
+ mtx_unlock_spin(&sched_lock);
+ newkse->ke_upcall = mbx.kmbx_upcall;
+ newkse->ke_stackbase = mbx.kmbx_stackbase;
+ newkse->ke_stacksize = mbx.kmbx_stacksize;
+ newkse->ke_mailbox = uap->mbx;
+ cpu_save_upcall(td, newkse);
+ /* Note that we are the returning syscall */
+ td->td_retval[0] = 0;
+ td->td_retval[1] = 0;
+
+ if ((td->td_proc->p_flag & P_KSES) || (uap->new_grp_flag)) {
+ thread_schedule_upcall(td, newkse);
+ } else {
+ /*
+ * Don't set this until we are truly ready, because
+ * things will start acting differently. Return to the
+ * calling code for the first time. Assuming we set up
+ * the mailboxes right, all syscalls after this will be
+ * asynchronous.
+ */
+ td->td_proc->p_flag |= P_KSES;
+ }
+ return (0);
}
/*
@@ -554,7 +593,7 @@ fixjobc(p, pgrp, entering)
LIST_FOREACH(p, &p->p_children, p_sibling) {
if ((hispgrp = p->p_pgrp) != pgrp &&
hispgrp->pg_session == mysession &&
- p->p_stat != SZOMB) {
+ p->p_state != PRS_ZOMBIE) {
PGRP_LOCK(hispgrp);
if (entering)
hispgrp->pg_jobc++;
@@ -583,7 +622,7 @@ orphanpg(pg)
mtx_lock_spin(&sched_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
- if (p->p_stat == SSTOP) {
+ if (P_SHOULDSTOP(p)) {
mtx_unlock_spin(&sched_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
@@ -674,7 +713,9 @@ fill_kinfo_proc(p, kp)
kp->ki_sigcatch = p->p_procsig->ps_sigcatch;
}
mtx_lock_spin(&sched_lock);
- if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) {
+ if (p->p_state != PRS_NEW &&
+ p->p_state != PRS_ZOMBIE &&
+ p->p_vmspace != NULL) {
struct vmspace *vm = p->p_vmspace;
kp->ki_size = vm->vm_map.size;
@@ -697,35 +738,65 @@ fill_kinfo_proc(p, kp)
p->p_stats->p_cru.ru_stime.tv_usec;
}
td = FIRST_THREAD_IN_PROC(p);
- if (td->td_wmesg != NULL)
- strncpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg) - 1);
- if (p->p_stat == SMTX) {
- kp->ki_kiflag |= KI_MTXBLOCK;
- strncpy(kp->ki_mtxname, td->td_mtxname,
- sizeof(kp->ki_mtxname) - 1);
+ if (!(p->p_flag & P_KSES)) {
+ if (td->td_wmesg != NULL) {
+ strncpy(kp->ki_wmesg, td->td_wmesg,
+ sizeof(kp->ki_wmesg) - 1);
+ }
+ if (td->td_state == TDS_MTX) {
+ kp->ki_kiflag |= KI_MTXBLOCK;
+ strncpy(kp->ki_mtxname, td->td_mtxname,
+ sizeof(kp->ki_mtxname) - 1);
+ }
}
- kp->ki_stat = p->p_stat;
+
+ if (p->p_state == PRS_NORMAL) { /* XXXKSE very approximate */
+ if ((td->td_state == TDS_RUNQ) ||
+ (td->td_state == TDS_RUNNING)) {
+ kp->ki_stat = SRUN;
+ } else if (td->td_state == TDS_SLP) {
+ kp->ki_stat = SSLEEP;
+ } else if (P_SHOULDSTOP(p)) {
+ kp->ki_stat = SSTOP;
+ } else if (td->td_state == TDS_MTX) {
+ kp->ki_stat = SMTX;
+ } else {
+ kp->ki_stat = SWAIT;
+ }
+ } else if (p->p_state == PRS_ZOMBIE) {
+ kp->ki_stat = SZOMB;
+ } else {
+ kp->ki_stat = SIDL;
+ }
+
kp->ki_sflag = p->p_sflag;
kp->ki_swtime = p->p_swtime;
kp->ki_pid = p->p_pid;
/* vvv XXXKSE */
- bintime2timeval(&p->p_runtime, &tv);
- kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec;
- kp->ki_pctcpu = p->p_kse.ke_pctcpu;
- kp->ki_estcpu = td->td_ksegrp->kg_estcpu;
- kp->ki_slptime = td->td_ksegrp->kg_slptime;
- kp->ki_wchan = td->td_wchan;
- kp->ki_pri.pri_level = td->td_priority;
- kp->ki_pri.pri_user = td->td_ksegrp->kg_user_pri;
- kp->ki_pri.pri_class = td->td_ksegrp->kg_pri_class;
- kp->ki_pri.pri_native = td->td_base_pri;
- kp->ki_nice = td->td_ksegrp->kg_nice;
- kp->ki_rqindex = p->p_kse.ke_rqindex;
- kp->ki_oncpu = p->p_kse.ke_oncpu;
- kp->ki_lastcpu = td->td_lastcpu;
- kp->ki_tdflags = td->td_flags;
- kp->ki_pcb = td->td_pcb;
- kp->ki_kstack = (void *)td->td_kstack;
+ if (!(p->p_flag & P_KSES)) {
+ bintime2timeval(&p->p_runtime, &tv);
+ kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec;
+ kp->ki_pctcpu = p->p_kse.ke_pctcpu;
+ kp->ki_estcpu = p->p_ksegrp.kg_estcpu;
+ kp->ki_slptime = p->p_ksegrp.kg_slptime;
+ kp->ki_wchan = td->td_wchan;
+ kp->ki_pri.pri_level = td->td_priority;
+ kp->ki_pri.pri_user = p->p_ksegrp.kg_user_pri;
+ kp->ki_pri.pri_class = p->p_ksegrp.kg_pri_class;
+ kp->ki_pri.pri_native = td->td_base_pri;
+ kp->ki_nice = p->p_ksegrp.kg_nice;
+ kp->ki_rqindex = p->p_kse.ke_rqindex;
+ kp->ki_oncpu = p->p_kse.ke_oncpu;
+ kp->ki_lastcpu = td->td_lastcpu;
+ kp->ki_tdflags = td->td_flags;
+ kp->ki_pcb = td->td_pcb;
+ kp->ki_kstack = (void *)td->td_kstack;
+ } else {
+ kp->ki_oncpu = -1;
+ kp->ki_lastcpu = -1;
+ kp->ki_tdflags = -1;
+ /* All the rest are 0 */
+ }
/* ^^^ XXXKSE */
mtx_unlock_spin(&sched_lock);
sp = NULL;
@@ -878,7 +949,7 @@ sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
/*
* Skip embryonic processes.
*/
- if (p->p_stat == SIDL) {
+ if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
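For the single-KSE case that kse_new() accepts so far, a first call from a userland thread scheduler would look roughly like the hypothetical sketch below. The kse_mailbox field names (kmbx_upcall, kmbx_stackbase, kmbx_stacksize) and the syscall arguments match what kse_new() reads above, but the userland prototype, the field types, and the upcall convention are assumptions, not something this diff shows:

	/* Hypothetical userland sketch: switching a process into KSE mode. */
	#include <sys/types.h>
	#include <sys/kse.h>		/* struct kse_mailbox, per this work */

	extern int kse_new(struct kse_mailbox *mbx, int new_grp_flag); /* assumed prototype */

	static char uts_stack[64 * 1024];
	static struct kse_mailbox kmbx;

	void uts_upcall(void);		/* entry point of the user thread scheduler */

	int
	enter_kse_mode(void)
	{
		kmbx.kmbx_upcall = uts_upcall;		/* where upcalls land (type assumed) */
		kmbx.kmbx_stackbase = uts_stack;	/* stack used for upcalls */
		kmbx.kmbx_stacksize = sizeof(uts_stack);

		/*
		 * First call, no new group: the kernel reuses the current KSE,
		 * sets P_KSES on return, and later syscalls may complete
		 * asynchronously via upcalls to uts_upcall().
		 */
		return (kse_new(&kmbx, 0));
	}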
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
index d2cb69d4fe26..0803cff61614 100644
--- a/sys/kern/kern_shutdown.c
+++ b/sys/kern/kern_shutdown.c
@@ -281,7 +281,6 @@ boot(int howto)
DROP_GIANT();
for (subiter = 0; subiter < 50 * iter; subiter++) {
mtx_lock_spin(&sched_lock);
- setrunqueue(curthread);
curthread->td_proc->p_stats->p_ru.ru_nvcsw++;
mi_switch(); /* Allow interrupt threads to run */
mtx_unlock_spin(&sched_lock);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index a561a1967288..e8ded210c749 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -84,7 +84,7 @@ static int killpg1(struct thread *td, int sig, int pgid, int all);
static int sig_ffs(sigset_t *set);
static int sigprop(int sig);
static void stop(struct proc *);
-
+static void tdsignal(struct thread *td, int sig, sig_t action);
static int filt_sigattach(struct knote *kn);
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
@@ -168,16 +168,18 @@ static int sigproptbl[NSIG] = {
* Determine signal that should be delivered to process p, the current
* process, 0 if none. If there is a pending stop signal with default
* action, the process stops in issignal().
+ * XXXKSE the check for a pending stop is not done under KSE
*
* MP SAFE.
*/
int
-cursig(struct proc *p)
+cursig(struct thread *td)
{
+ struct proc *p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_NOTOWNED);
- return (SIGPENDING(p) ? issignal(p) : 0);
+ return (SIGPENDING(p) ? issignal(td) : 0);
}
/*
@@ -1042,7 +1044,7 @@ killpg1(td, sig, pgid, all)
PROC_UNLOCK(p);
continue;
}
- if (p->p_stat == SZOMB) {
+ if (p->p_state == PRS_ZOMBIE) {
PROC_UNLOCK(p);
continue;
}
@@ -1243,12 +1245,10 @@ psignal(p, sig)
register struct proc *p;
register int sig;
{
- register int prop;
register sig_t action;
struct thread *td;
-#ifdef SMP
- struct ksegrp *kg;
-#endif
+ register int prop;
+
KASSERT(_SIG_VALID(sig),
("psignal(): invalid signal %d\n", sig));
@@ -1257,7 +1257,6 @@ psignal(p, sig)
KNOTE(&p->p_klist, NOTE_SIGNAL | sig);
prop = sigprop(sig);
-
/*
* If proc is traced, always give parent a chance;
* if signal event is tracked by procfs, give *that*
@@ -1283,29 +1282,6 @@ psignal(p, sig)
action = SIG_DFL;
}
- /*
- * bring the priority of a process up if we want it to get
- * killed in this lifetime.
- * XXXKSE think if a better way to do this.
- *
- * What we need to do is see if there is a thread that will
- * be able to accept the signal. e.g.
- * FOREACH_THREAD_IN_PROC() {
- * if runnable, we're done
- * else pick one at random.
- * }
- */
- /* XXXKSE
- * For now there is one thread per proc.
- * Effectively select one sucker thread..
- */
- td = FIRST_THREAD_IN_PROC(p);
- mtx_lock_spin(&sched_lock);
- if ((p->p_ksegrp.kg_nice > NZERO) && (action == SIG_DFL) &&
- (prop & SA_KILL) && ((p->p_flag & P_TRACED) == 0))
- p->p_ksegrp.kg_nice = NZERO; /* XXXKSE */
- mtx_unlock_spin(&sched_lock);
-
if (prop & SA_CONT)
SIG_STOPSIGMASK(p->p_siglist);
@@ -1316,48 +1292,125 @@ psignal(p, sig)
* is default; don't stop the process below if sleeping,
* and don't clear any pending SIGCONT.
*/
- if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
- action == SIG_DFL)
+ if ((prop & SA_TTYSTOP) &&
+ (p->p_pgrp->pg_jobc == 0) &&
+ (action == SIG_DFL))
return;
SIG_CONTSIGMASK(p->p_siglist);
}
SIGADDSET(p->p_siglist, sig);
mtx_lock_spin(&sched_lock);
signotify(p);
+ mtx_unlock_spin(&sched_lock);
/*
- * Defer further processing for signals which are held,
- * except that stopped processes must be continued by SIGCONT.
+ * Some signals have a process-wide effect and a per-thread
+ * component. Most processing occurs when the process next
+ * tries to cross the user boundary, however there are some
+ * times when processing needs to be done immediately, such as
+ * waking up threads so that they can cross the user boundary.
+ * We try do the per-process part here.
*/
- if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) {
- mtx_unlock_spin(&sched_lock);
- return;
- }
-
- switch (p->p_stat) {
-
- case SSLEEP:
+ if (P_SHOULDSTOP(p)) {
/*
- * If process is sleeping uninterruptibly
- * we can't interrupt the sleep... the signal will
- * be noticed when the process returns through
- * trap() or syscall().
+ * The process is in stopped mode. All the threads should be
+ * either winding down or already on the suspended queue.
*/
- if ((td->td_flags & TDF_SINTR) == 0)
+ if (p->p_flag & P_TRACED) {
+ /*
+ * The traced process is already stopped,
+ * so no further action is necessary.
+ * No signal can restart us.
+ */
goto out;
+ }
+
+ if (sig == SIGKILL) {
+ /*
+ * SIGKILL sets process running.
+ * It will die elsewhere.
+ * All threads must be restarted.
+ */
+ p->p_flag &= ~P_STOPPED;
+ goto runfast;
+ }
+
+ if (prop & SA_CONT) {
+ /*
+ * If SIGCONT is default (or ignored), we continue the
+ * process but don't leave the signal in p_siglist as
+ * it has no further action. If SIGCONT is held, we
+ * continue the process and leave the signal in
+ * p_siglist. If the process catches SIGCONT, let it
+ * handle the signal itself. If it isn't waiting on
+ * an event, it goes back to run state.
+ * Otherwise, process goes back to sleep state.
+ */
+ p->p_flag &= ~P_STOPPED_SGNL;
+ if (action == SIG_DFL) {
+ SIGDELSET(p->p_siglist, sig);
+ } else if (action == SIG_CATCH) {
+ /*
+ * The process wants to catch it so it needs
+ * to run at least one thread, but which one?
+ * It would seem that the answer would be to
+ * run an upcall in the next KSE to run, and
+ * deliver the signal that way. In a NON KSE
+ * process, we need to make sure that the
+ * single thread is runnable asap.
+ * XXXKSE for now however, make them all run.
+ */
+ goto runfast;
+ }
+ /*
+ * The signal is not ignored or caught.
+ */
+ mtx_lock_spin(&sched_lock);
+ thread_unsuspend(p); /* Checks if should do it. */
+ mtx_unlock_spin(&sched_lock);
+ goto out;
+ }
+
+ if (prop & SA_STOP) {
+ /*
+ * Already stopped, don't need to stop again
+ * (If we did the shell could get confused).
+ */
+ SIGDELSET(p->p_siglist, sig);
+ goto out;
+ }
+
/*
- * Process is sleeping and traced... make it runnable
- * so it can discover the signal in issignal() and stop
- * for the parent.
+ * All other kinds of signals:
+ * If a thread is sleeping interruptibly, simulate a
+ * wakeup so that when it is continued it will be made
+ * runnable and can look at the signal. However, don't make
+ * the process runnable, leave it stopped.
+ * It may run a bit until it hits a thread_suspend_check().
+ *
+ * XXXKSE I don't understand this at all.
*/
- if (p->p_flag & P_TRACED)
- goto run;
+ mtx_lock_spin(&sched_lock);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_wchan && (td->td_flags & TDF_SINTR)) {
+ if (td->td_flags & TDF_CVWAITQ)
+ cv_waitq_remove(td);
+ else
+ unsleep(td);
+ setrunnable(td);
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+ goto out;
/*
- * If SIGCONT is default (or ignored) and process is
- * asleep, we are finished; the process should not
- * be awakened.
+ * XXXKSE What about threads that are waiting on mutexes?
+ * Shouldn't they abort too?
*/
- if ((prop & SA_CONT) && action == SIG_DFL) {
+ } else if (p->p_state == PRS_NORMAL) {
+ if (prop & SA_CONT) {
+ /*
+ * Already active, don't need to start again.
+ */
SIGDELSET(p->p_siglist, sig);
goto out;
}
@@ -1370,133 +1423,128 @@ psignal(p, sig)
if (prop & SA_STOP) {
if (action != SIG_DFL)
goto runfast;
+
/*
* If a child holding parent blocked,
* stopping could cause deadlock.
*/
if (p->p_flag & P_PPWAIT)
goto out;
- mtx_unlock_spin(&sched_lock);
SIGDELSET(p->p_siglist, sig);
p->p_xstat = sig;
PROC_LOCK(p->p_pptr);
- if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
+ if (!(p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP))
psignal(p->p_pptr, SIGCHLD);
PROC_UNLOCK(p->p_pptr);
mtx_lock_spin(&sched_lock);
stop(p);
+ mtx_unlock_spin(&sched_lock);
goto out;
} else
goto runfast;
/* NOTREACHED */
+ } else {
+ /* Not in "NORMAL" state. discard the signal. */
+ SIGDELSET(p->p_siglist, sig);
+ goto out;
+ }
- case SSTOP:
- /*
- * If traced process is already stopped,
- * then no further action is necessary.
- */
- if (p->p_flag & P_TRACED)
- goto out;
+ /*
+ * The process is not stopped so we need to apply the signal to all the
+ * running threads.
+ */
- /*
- * Kill signal always sets processes running.
- */
- if (sig == SIGKILL)
- goto runfast;
+runfast:
+ FOREACH_THREAD_IN_PROC(p, td)
+ tdsignal(td, sig, action);
+ mtx_lock_spin(&sched_lock);
+ thread_unsuspend(p);
+ mtx_unlock_spin(&sched_lock);
+out:
+ /* If we jump here, sched_lock should not be owned. */
+ mtx_assert(&sched_lock, MA_NOTOWNED);
+}
- if (prop & SA_CONT) {
- /*
- * If SIGCONT is default (or ignored), we continue the
- * process but don't leave the signal in p_siglist, as
- * it has no further action. If SIGCONT is held, we
- * continue the process and leave the signal in
- * p_siglist. If the process catches SIGCONT, let it
- * handle the signal itself. If it isn't waiting on
- * an event, then it goes back to run state.
- * Otherwise, process goes back to sleep state.
- */
- if (action == SIG_DFL)
- SIGDELSET(p->p_siglist, sig);
- if (action == SIG_CATCH)
- goto runfast;
- /*
- * XXXKSE
- * do this for each thread.
- */
- if (p->p_flag & P_KSES) {
- mtx_assert(&sched_lock,
- MA_OWNED | MA_NOTRECURSED);
- FOREACH_THREAD_IN_PROC(p, td) {
- if (td->td_wchan == NULL) {
- setrunnable(td); /* XXXKSE */
- } else {
- /* mark it as sleeping */
- }
- }
- } else {
- p->p_flag |= P_CONTINUED;
- wakeup(p->p_pptr);
- if (td->td_wchan == NULL)
- goto run;
- p->p_stat = SSLEEP;
- }
- goto out;
+/*
+ * The force of a signal has been directed against a single
+ * thread. We need to see what we can do about knocking it
+ * out of any sleep it may be in etc.
+ */
+static void
+tdsignal(struct thread *td, int sig, sig_t action)
+{
+ struct proc *p = td->td_proc;
+ register int prop;
+
+ prop = sigprop(sig);
+
+ /*
+ * Bring the priority of a process up if we want it to get
+ * killed in this lifetime.
+ * XXXKSE we should shift the priority to the thread.
+ */
+ mtx_lock_spin(&sched_lock);
+ if ((action == SIG_DFL) && (prop & SA_KILL)) {
+ if (td->td_priority > PUSER) {
+ td->td_priority = PUSER;
}
+ }
+ mtx_unlock_spin(&sched_lock);
- if (prop & SA_STOP) {
- /*
- * Already stopped, don't need to stop again.
- * (If we did the shell could get confused.)
- */
- SIGDELSET(p->p_siglist, sig);
+ /*
+ * Defer further processing for signals which are held,
+ * except that stopped processes must be continued by SIGCONT.
+ */
+ if (action == SIG_HOLD) {
+ goto out;
+ }
+ mtx_lock_spin(&sched_lock);
+ if (td->td_state == TDS_SLP) {
+ /*
+ * If thread is sleeping uninterruptibly
+ * we can't interrupt the sleep... the signal will
+ * be noticed when the process returns through
+ * trap() or syscall().
+ */
+ if ((td->td_flags & TDF_SINTR) == 0) {
+ mtx_unlock_spin(&sched_lock);
goto out;
}
-
/*
- * If process is sleeping interruptibly, then simulate a
- * wakeup so that when it is continued, it will be made
- * runnable and can look at the signal. But don't make
- * the process runnable, leave it stopped.
- * XXXKSE should we wake ALL blocked threads?
+ * Process is sleeping and traced. Make it runnable
+ * so it can discover the signal in issignal() and stop
+ * for its parent.
*/
- if (p->p_flag & P_KSES) {
- FOREACH_THREAD_IN_PROC(p, td) {
- if (td->td_wchan && (td->td_flags & TDF_SINTR)){
- if (td->td_flags & TDF_CVWAITQ)
- cv_waitq_remove(td);
- else
- unsleep(td); /* XXXKSE */
- }
- }
- } else {
- if (td->td_wchan && td->td_flags & TDF_SINTR) {
- if (td->td_flags & TDF_CVWAITQ)
- cv_waitq_remove(td);
- else
- unsleep(td); /* XXXKSE */
- }
+ if (p->p_flag & P_TRACED) {
+ p->p_flag &= ~P_STOPPED_TRACE;
+ goto run;
}
- goto out;
+ mtx_unlock_spin(&sched_lock);
+ /*
+ * If SIGCONT is default (or ignored) and process is
+ * asleep, we are finished; the process should not
+ * be awakened.
+ */
+ if ((prop & SA_CONT) && action == SIG_DFL) {
+ SIGDELSET(p->p_siglist, sig);
+ goto out;
+ }
+ goto runfast;
+ /* NOTREACHED */
- default:
+ } else {
/*
- * SRUN, SIDL, SZOMB do nothing with the signal,
+ * Other states do nothing with the signal immediately,
* other than kicking ourselves if we are running.
* It will either never be noticed, or noticed very soon.
*/
- if (p->p_stat == SRUN) {
+ mtx_unlock_spin(&sched_lock);
+ if (td->td_state == TDS_RUNQ ||
+ td->td_state == TDS_RUNNING) {
+ signotify(td->td_proc);
#ifdef SMP
- struct kse *ke;
- struct thread *td = curthread;
-/* we should only deliver to one thread.. but which one? */
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- FOREACH_KSE_IN_GROUP(kg, ke) {
- if (ke->ke_thread == td) {
- continue;
- }
- forward_signal(ke->ke_thread);
- }
- }
+ if (td->td_state == TDS_RUNNING && td != curthread)
+ forward_signal(td);
#endif
}
goto out;
@@ -1506,21 +1554,17 @@ psignal(p, sig)
runfast:
/*
* Raise priority to at least PUSER.
- * XXXKSE Should we make them all run fast?
- * Maybe just one would be enough?
*/
-
- if (FIRST_THREAD_IN_PROC(p)->td_priority > PUSER) {
- FIRST_THREAD_IN_PROC(p)->td_priority = PUSER;
+ mtx_lock_spin(&sched_lock);
+ if (td->td_priority > PUSER) {
+ td->td_priority = PUSER;
}
run:
- /* If we jump here, sched_lock has to be owned. */
mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
- setrunnable(td); /* XXXKSE */
-out:
+ setrunnable(td);
mtx_unlock_spin(&sched_lock);
- /* Once we get here, sched_lock should not be owned. */
+out:
mtx_assert(&sched_lock, MA_NOTOWNED);
}
@@ -1533,16 +1577,18 @@ out:
* by checking the pending signal masks in cursig.) The normal call
* sequence is
*
- * while (sig = cursig(curproc))
+ * while (sig = cursig(curthread))
* postsig(sig);
*/
int
-issignal(p)
- register struct proc *p;
+issignal(td)
+ struct thread *td;
{
+ struct proc *p;
sigset_t mask;
register int sig, prop;
+ p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
for (;;) {
int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
@@ -1576,6 +1622,7 @@ issignal(p)
PROC_UNLOCK(p->p_pptr);
mtx_lock_spin(&sched_lock);
stop(p);
+ td->td_state = TDS_UNQUEUED;
PROC_UNLOCK(p);
DROP_GIANT();
p->p_stats->p_ru.ru_nivcsw++;
@@ -1633,6 +1680,7 @@ issignal(p)
#endif
break; /* == ignore */
}
+#if 0
/*
* If there is a pending stop signal to process
* with default action, stop here,
@@ -1647,8 +1695,10 @@ issignal(p)
break; /* == ignore */
p->p_xstat = sig;
PROC_LOCK(p->p_pptr);
- if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
+ if ((p->p_pptr->p_procsig->ps_flag &
+ PS_NOCLDSTOP) == 0) {
psignal(p->p_pptr, SIGCHLD);
+ }
PROC_UNLOCK(p->p_pptr);
mtx_lock_spin(&sched_lock);
stop(p);
@@ -1660,7 +1710,9 @@ issignal(p)
PICKUP_GIANT();
PROC_LOCK(p);
break;
- } else if (prop & SA_IGNORE) {
+ } else
+#endif
+ if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
@@ -1706,7 +1758,7 @@ stop(p)
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED);
- p->p_stat = SSTOP;
+ p->p_flag |= P_STOPPED_SGNL;
p->p_flag &= ~P_WAITED;
wakeup(p->p_pptr);
}
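Signal delivery is now split in two: psignal() handles the process-wide part (stopped/continued handling, p_siglist, the SA_STOP and SA_CONT cases), and the new tdsignal() does the per-thread work of knocking an individual thread out of an interruptible sleep and adjusting its priority. The shape of the fan-out, condensed from the runfast path above as it would sit inside kern_sig.c; the wrapper name is invented and the locking assertions and stopped-process cases are omitted:

	/*
	 * Condensed from psignal()'s runfast path above: once the
	 * process-wide bookkeeping is done, the signal is applied to
	 * every thread, then suspended threads are released.
	 */
	static void
	signal_fanout(struct proc *p, int sig, sig_t action)
	{
		struct thread *td;

		FOREACH_THREAD_IN_PROC(p, td)
			tdsignal(td, sig, action);	/* per-thread wakeup/boost */
		mtx_lock_spin(&sched_lock);
		thread_unsuspend(p);			/* let suspended threads see it */
		mtx_unlock_spin(&sched_lock);
	}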
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index 5e32eeeb817e..c63091c008f8 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -538,7 +538,6 @@ uio_yield()
mtx_lock_spin(&sched_lock);
DROP_GIANT();
td->td_priority = td->td_ksegrp->kg_user_pri; /* XXXKSE */
- setrunqueue(td);
td->td_proc->p_stats->p_ru.ru_nivcsw++;
mi_switch();
mtx_unlock_spin(&sched_lock);
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index 2b531c0dae3d..40d3ef87bd33 100644
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -26,6 +26,69 @@
* $FreeBSD$
*/
+/***
+
+Here is the logic..
+
+If there are N processors, then there are at most N KSEs (kernel
+schedulable entities) working to process threads that belong to a
+KSEGROUP (kg). If there are X of these KSEs actually running at the
+moment in question, then there are at most M (N-X) of these KSEs on
+the run queue, as running KSEs are not on the queue.
+
+Runnable threads are queued off the KSEGROUP in priority order.
+If there are M or more threads runnable, the top M threads
+(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
+their priority from those threads and are put on the run queue.
+
+The last thread that had a priority high enough to have a KSE associated
+with it, AND IS ON THE RUN QUEUE is pointed to by
+kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
+assigned as all the available KSEs are actively running, or because there
+are no threads queued, that pointer is NULL.
+
+When a KSE is removed from the run queue to become runnable, we know
+it was associated with the highest priority thread in the queue (at the head
+of the queue). If it is also the last assigned we know M was 1 and must
+now be 0. Since the thread is no longer queued that pointer must be
+removed from it. Since we know there were no more KSEs available,
+(M was 1 and is now 0) and since we are not FREEING our KSE
+but using it, we know there are STILL no more KSEs available, we can prove
+that the next thread in the ksegrp list will not have a KSE to assign to
+it, so we can show that the pointer must be made 'invalid' (NULL).
+
+The pointer exists so that when a new thread is made runnable, it can
+have its priority compared with the last assigned thread to see if
+it should 'steal' its KSE or not.. i.e. is it 'earlier'
+on the list than that thread or later.. If it's earlier, then the KSE is
+removed from the last assigned (which is now not assigned a KSE)
+and reassigned to the new thread, which is placed earlier in the list.
+The pointer is then backed up to the previous thread (which may or may not
+be the new thread).
+
+When a thread sleeps or is removed, the KSE becomes available and if there
+are queued threads that are not assigned KSEs, the highest priority one of
+them is assigned the KSE, which is then placed back on the run queue at
+the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
+to point to it.
+
+The following diagram shows 2 KSEs and 3 threads from a single process.
+
+ RUNQ: --->KSE---KSE--... (KSEs queued at priorities from threads)
+ \ \____
+ \ \
+ KSEGROUP---thread--thread--thread (queued in priority order)
+ \ /
+ \_______________/
+ (last_assigned)
+
+The result of this scheme is that the M available KSEs are always
+queued at the priorities they have inherited from the M highest priority
+threads for that KSEGROUP. If this situation changes, the KSEs are
+reassigned to keep this true.
+
+*/
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
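The block comment above boils down to one invariant: runnable threads hang off the KSEGRP in priority order, the first M of them (those up to and including kg_last_assigned) each own a KSE that sits on the system run queue, and everything after kg_last_assigned is waiting for a KSE. A hedged sketch of a checker for that invariant, in the spirit of the thread_sanity_check() calls used below; the helper name is hypothetical and not part of this commit, while kg_runq, td_runq, td_kse, and kg_last_assigned are the fields used by the code that follows:

	/*
	 * Hypothetical debug check of the invariant described above: every
	 * thread up to and including kg_last_assigned has a KSE, and every
	 * thread after it does not.  Call with sched_lock held.
	 */
	static void
	ksegrp_assign_check(struct ksegrp *kg)
	{
		struct thread *td;
		int assigned = (kg->kg_last_assigned != NULL);

		TAILQ_FOREACH(td, &kg->kg_runq, td_runq) {
			if (assigned)
				KASSERT(td->td_kse != NULL,
				    ("queued thread before last_assigned has no KSE"));
			else
				KASSERT(td->td_kse == NULL,
				    ("thread past last_assigned still has a KSE"));
			if (td == kg->kg_last_assigned)
				assigned = 0;	/* everything after this is bare */
		}
	}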
@@ -44,34 +107,442 @@ CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
static struct runq runq;
SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq)
+static void runq_readjust(struct runq *rq, struct kse *ke);
+/************************************************************************
+ * Functions that manipulate runnability from a thread perspective. *
+ ************************************************************************/
+
/*
- * Wrappers which implement old interface; act on global run queue.
+ * Select the KSE that will be run next. From that, find the thread and
+ * remove it from the KSEGRP's run queue. If there is thread clustering,
+ * this will be what does it.
*/
-
struct thread *
choosethread(void)
{
- return (runq_choose(&runq)->ke_thread);
+ struct kse *ke;
+ struct thread *td;
+ struct ksegrp *kg;
+
+ if ((ke = runq_choose(&runq))) {
+ td = ke->ke_thread;
+ KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
+ kg = ke->ke_ksegrp;
+ if (td->td_flags & TDF_UNBOUND) {
+ TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
+ if (kg->kg_last_assigned == td)
+ if (TAILQ_PREV(td, threadqueue, td_runq)
+ != NULL)
+ printf("Yo MAMA!\n");
+ kg->kg_last_assigned = TAILQ_PREV(td,
+ threadqueue, td_runq);
+ /*
+ * If we have started running an upcall,
+ * Then TDF_UNBOUND WAS set because the thread was
+ * created without a KSE. Now that we have one,
+ * and it is our time to run, we make sure
+ * that BOUND semantics apply for the rest of
+ * the journey to userland, and into the UTS.
+ */
+#ifdef NOTYET
+ if (td->td_flags & TDF_UPCALLING)
+ td->td_flags &= ~TDF_UNBOUND;
+#endif
+ }
+ kg->kg_runnable--;
+ CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
+ td, td->td_priority);
+ } else {
+ /* Pretend the idle thread was on the run queue. */
+ td = PCPU_GET(idlethread);
+ /* Simulate that it was on the run queue */
+ td->td_state = TDS_RUNQ;
+ td->td_kse->ke_state = KES_UNQUEUED;
+ CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
+ }
+ thread_sanity_check(td);
+ return (td);
+}
+
+/*
+ * Given a KSE (now surplus), either assign a new runnable thread to it
+ * (and put it in the run queue) or put it in the ksegrp's idle KSE list.
+ * Assumes the KSE is no longer linked to any threads (it has been cleaned).
+ */
+void
+kse_reassign(struct kse *ke)
+{
+ struct ksegrp *kg;
+ struct thread *td;
+
+ kg = ke->ke_ksegrp;
+
+KASSERT((ke->ke_state != KES_ONRUNQ), ("kse_reassigning non-free kse"));
+ /*
+ * Find the first unassigned thread.
+ * If there is a 'last assigned', see what's next;
+ * otherwise look at what is first.
+ */
+ if ((td = kg->kg_last_assigned)) {
+ td = TAILQ_NEXT(td, td_runq);
+ } else {
+ td = TAILQ_FIRST(&kg->kg_runq);
+ }
+
+ /*
+ * If we found one, assign it the KSE; otherwise idle the KSE.
+ */
+ if (td) {
+ thread_sanity_check(td);
+ kg->kg_last_assigned = td;
+ td->td_kse = ke;
+ ke->ke_thread = td;
+ runq_add(&runq, ke);
+ CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td);
+ } else {
+ KASSERT((ke->ke_state != KES_IDLE), ("kse already idle"));
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ ke->ke_state = KES_IDLE;
+ ke->ke_thread = NULL;
+ TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist);
+ kg->kg_idle_kses++;
+ CTR1(KTR_RUNQ, "kse_reassign: ke%p idled", ke);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self2!"));
+ }
}
int
-procrunnable(void)
+kserunnable(void)
{
return runq_check(&runq);
}
+/*
+ * Remove a thread from its KSEGRP's run queue.
+ * This in turn may dissociate it from a KSE if it was already assigned
+ * to one, possibly causing a new thread to be assigned to that KSE
+ * and the KSE to be given a new priority (unless it's a BOUND thread/KSE pair).
+ */
void
remrunqueue(struct thread *td)
{
- runq_remove(&runq, td->td_kse);
+ struct thread *td2, *td3;
+ struct ksegrp *kg;
+ struct kse *ke;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ thread_sanity_check(td);
+ KASSERT ((td->td_state == TDS_RUNQ),
+ ("remrunqueue: Bad state on run queue"));
+ kg = td->td_ksegrp;
+ ke = td->td_kse;
+ /*
+ * If it's a bound thread/KSE pair, take the shortcut. All non-KSE
+ * threads are BOUND.
+ */
+ CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
+ td->td_state = TDS_UNQUEUED;
+ kg->kg_runnable--;
+ if ((td->td_flags & TDF_UNBOUND) == 0) {
+ /* Bring its kse with it, leave the thread attached */
+ runq_remove(&runq, ke);
+ ke->ke_state = KES_UNQUEUED;
+ return;
+ }
+ if (ke) {
+ /*
+ * This thread has been assigned to a KSE.
+ * We need to dissociate it and try assign the
+ * KSE to the next available thread. Then, we should
+ * see if we need to move the KSE in the run queues.
+ */
+ td2 = kg->kg_last_assigned;
+ KASSERT((td2 != NULL), ("last assigned has wrong value "));
+ td->td_kse = NULL;
+ if ((td3 = TAILQ_NEXT(td2, td_runq))) {
+ KASSERT(td3 != td, ("td3 somehow matched td"));
+ /*
+ * Give the next unassigned thread to the KSE
+ * so the number of runnable KSEs remains
+ * constant.
+ */
+ td3->td_kse = ke;
+ ke->ke_thread = td3;
+ kg->kg_last_assigned = td3;
+ runq_readjust(&runq, ke);
+ } else {
+ /*
+ * There is no unassigned thread.
+ * If we were the last assigned one,
+ * adjust the last assigned pointer back
+ * one, which may result in NULL.
+ */
+ if (td == td2) {
+ kg->kg_last_assigned =
+ TAILQ_PREV(td, threadqueue, td_runq);
+ }
+ runq_remove(&runq, ke);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ KASSERT((ke->ke_state != KES_IDLE),
+ ("kse already idle"));
+ ke->ke_state = KES_IDLE;
+ ke->ke_thread = NULL;
+KASSERT((TAILQ_FIRST(&kg->kg_iq) != ke), ("really bad screwup"));
+ TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist);
+ kg->kg_idle_kses++;
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self2!"));
+ }
+ }
+ TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
+ thread_sanity_check(td);
}
+#if 1 /* use the first version */
+
void
setrunqueue(struct thread *td)
{
- runq_add(&runq, td->td_kse);
+ struct kse *ke;
+ struct ksegrp *kg;
+ struct thread *td2;
+ struct thread *tda;
+
+ CTR1(KTR_RUNQ, "setrunqueue: td%p", td);
+ mtx_assert(&sched_lock, MA_OWNED);
+ thread_sanity_check(td);
+ KASSERT((td->td_state != TDS_RUNQ), ("setrunqueue: bad thread state"));
+ td->td_state = TDS_RUNQ;
+ kg = td->td_ksegrp;
+ kg->kg_runnable++;
+ if ((td->td_flags & TDF_UNBOUND) == 0) {
+ KASSERT((td->td_kse != NULL),
+ ("queueing BAD thread to run queue"));
+ /*
+ * Common path optimisation: Only one of everything
+ * and the KSE is always already attached.
+ * Totally ignore the ksegrp run queue.
+ */
+ runq_add(&runq, td->td_kse);
+ return;
+ }
+ /*
+ * Ok, so we are threading with this thread.
+ * We don't have a KSE, see if we can get one..
+ */
+ tda = kg->kg_last_assigned;
+ if ((ke = td->td_kse) == NULL) {
+ /*
+ * We will need a KSE, see if there is one..
+ * First look for a free one, before getting desperate.
+ * If we can't get one, our priority is not high enough..
+ * that's ok..
+ */
+ if (kg->kg_idle_kses) {
+ /*
+ * There is a free one so it's ours for the asking..
+ */
+ ke = TAILQ_FIRST(&kg->kg_iq);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self3!"));
+ TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
+ ke->ke_state = KES_UNQUEUED;
+ kg->kg_idle_kses--;
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self4!"));
+ } else if (tda && (tda->td_priority > td->td_priority)) {
+ /*
+ * None free, but there is one we can commandeer.
+ */
+ ke = tda->td_kse;
+ tda->td_kse = NULL;
+ ke->ke_thread = NULL;
+ tda = kg->kg_last_assigned =
+ TAILQ_PREV(tda, threadqueue, td_runq);
+ runq_remove(&runq, ke);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self5!"));
+ }
+ } else {
+ KASSERT(ke->ke_thread == td, ("KSE/thread mismatch"));
+ KASSERT(ke->ke_state != KES_IDLE, ("KSE unexpectedly idle"));
+ ke->ke_thread = NULL;
+ td->td_kse = NULL;
+ }
+
+ /*
+ * Add the thread to the ksegrp's run queue at
+ * the appropriate place.
+ */
+ TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
+ if (td2->td_priority > td->td_priority) {
+ TAILQ_INSERT_BEFORE(td2, td, td_runq);
+ break;
+ }
+ }
+ if (td2 == NULL) {
+ /* We ran off the end of the TAILQ or it was empty. */
+ TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
+ }
+
+ /*
+ * If we have a ke to use, then put it on the run queue and,
+ * if needed, readjust the last_assigned pointer.
+ */
+ if (ke) {
+ if (tda == NULL) {
+ /*
+ * No pre-existing last assigned, so whoever is first
+ * gets the KSE we brought in (may be us).
+ */
+ td2 = TAILQ_FIRST(&kg->kg_runq);
+ KASSERT((td2->td_kse == NULL),
+ ("unexpected ke present"));
+ td2->td_kse = ke;
+ ke->ke_thread = td2;
+ kg->kg_last_assigned = td2;
+ } else if (tda->td_priority > td->td_priority) {
+ /*
+ * It's ours, grab it, but last_assigned is past us
+ * so don't change it.
+ */
+ td->td_kse = ke;
+ ke->ke_thread = td;
+ } else {
+ /*
+ * We are past last_assigned, so
+ * put the new kse on whatever is next,
+ * which may or may not be us.
+ */
+ td2 = TAILQ_NEXT(tda, td_runq);
+ kg->kg_last_assigned = td2;
+ td2->td_kse = ke;
+ ke->ke_thread = td2;
+ }
+ runq_add(&runq, ke);
+ }
+ thread_sanity_check(td);
}
+#else
+
+void
+setrunqueue(struct thread *td)
+{
+ struct kse *ke;
+ struct ksegrp *kg;
+ struct thread *td2;
+
+ CTR1(KTR_RUNQ, "setrunqueue: td%p", td);
+ KASSERT((td->td_state != TDS_RUNQ), ("setrunqueue: bad thread state"));
+ td->td_state = TDS_RUNQ;
+ kg = td->td_ksegrp;
+ kg->kg_runnable++;
+ if ((td->td_flags & TDF_UNBOUND) == 0) {
+ /*
+ * Common path optimisation: Only one of everything
+ * and the KSE is always already attached.
+ * Totally ignore the ksegrp run queue.
+ */
+ runq_add(&runq, td->td_kse);
+ return;
+ }
+ /*
+ * First add the thread to the ksegrp's run queue at
+ * the appropriate place.
+ */
+ TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
+ if (td2->td_priority > td->td_priority) {
+ TAILQ_INSERT_BEFORE(td2, td, td_runq);
+ break;
+ }
+ }
+ if (td2 == NULL) {
+ /* We ran off the end of the TAILQ or it was empty. */
+ TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
+ }
+
+ /*
+ * The following could be achieved by simply doing:
+ * td->td_kse = NULL; kse_reassign(ke);
+ * but I felt that I'd try to do it inline here.
+ * All this work may not be worth it.
+ */
+ if ((ke = td->td_kse)) { /* XXXKSE */
+ /*
+ * We have a KSE already. See whether we can keep it
+ * or if we need to give it to someone else.
+ * Either way it will need to be inserted into
+ * the runq. kse_reassign() will do this as will runq_add().
+ */
+ if ((kg->kg_last_assigned) &&
+ (kg->kg_last_assigned->td_priority > td->td_priority)) {
+ /*
+ * We can definitely keep the KSE
+ * as the "last assigned" thread has
+ * a lower priority than we do.
+ * The "last assigned" pointer stays the same.
+ */
+ runq_add(&runq, ke);
+ return;
+
+ }
+ /*
+ * Give it to the correct thread,
+ * which may be (often is) us, but may not be.
+ */
+ td->td_kse = NULL;
+ kse_reassign(ke);
+ return;
+ }
+ /*
+ * There are two cases where KSE adjustment is needed.
+ * Usurpation of an already assigned KSE, and assignment
+ * of a previously IDLE KSE.
+ */
+ if (kg->kg_idle_kses) {
+ /*
+ * If there are unassigned KSEs then we definitely
+ * will be assigned one from the idle KSE list.
+ * If we are the last, we should get the "last
+ * assigned" pointer set to us as well.
+ */
+ ke = TAILQ_FIRST(&kg->kg_iq);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
+ ke->ke_state = KES_UNQUEUED;
+ kg->kg_idle_kses--;
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ ke->ke_thread = td;
+ td->td_kse = ke;
+ runq_add(&runq, ke);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ if (TAILQ_NEXT(td, td_runq) == NULL) {
+ kg->kg_last_assigned = td;
+ }
+ } else if (kg->kg_last_assigned &&
+ (kg->kg_last_assigned->td_priority > td->td_priority)) {
+ /*
+ * If there were none last-assigned, all KSEs
+ * are actually out running as we speak.
+ * If there was a last assigned, but we didn't see it,
+ * we must be inserting before it, so take the KSE from
+ * the last assigned, and back it up one entry. Then,
+ * assign the KSE to the new thread and adjust its priority.
+ */
+ td2 = kg->kg_last_assigned;
+ ke = td2->td_kse;
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ kg->kg_last_assigned =
+ TAILQ_PREV(td2, threadqueue, td_runq);
+ td2->td_kse = NULL;
+ td->td_kse = ke;
+ ke->ke_thread = td;
+ runq_readjust(&runq, ke);
+KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
+ }
+}
+#endif
+
+/************************************************************************
+ * Critical section marker functions *
+ ************************************************************************/
/* Critical sections that prevent preemption. */
void
critical_enter(void)
@@ -98,6 +569,23 @@ critical_exit(void)
}
}
+
+/************************************************************************
+ * SYSTEM RUN QUEUE manipulations and tests *
+ ************************************************************************/
+/*
+ * Initialize a run structure.
+ */
+void
+runq_init(struct runq *rq)
+{
+ int i;
+
+ bzero(rq, sizeof *rq);
+ for (i = 0; i < RQ_NQS; i++)
+ TAILQ_INIT(&rq->rq_queues[i]);
+}
+
/*
* Clear the status bit of the queue corresponding to priority level pri,
* indicating that it is empty.
@@ -156,7 +644,7 @@ runq_setbit(struct runq *rq, int pri)
}
/*
- * Add the process to the queue specified by its priority, and set the
+ * Add the KSE to the queue specified by its priority, and set the
* corresponding status bit.
*/
void
@@ -165,14 +653,16 @@ runq_add(struct runq *rq, struct kse *ke)
struct rqhead *rqh;
int pri;
-#ifdef INVARIANTS
- struct proc *p = ke->ke_proc;
-#endif
- if (ke->ke_flags & KEF_ONRUNQ)
- return;
mtx_assert(&sched_lock, MA_OWNED);
- KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN",
- p, p->p_comm));
+ KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE"));
+ KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread"));
+ if (ke->ke_state == KES_ONRUNQ)
+ return;
+#if defined(INVARIANTS) && defined(DIAGNOSTIC)
+ KASSERT(ke->ke_state != KES_ONRUNQ,
+ ("runq_add: kse %p (%s) already in run queue", ke,
+ ke->ke_proc->p_comm));
+#endif
pri = ke->ke_thread->td_priority / RQ_PPQ;
ke->ke_rqindex = pri;
runq_setbit(rq, pri);
@@ -180,7 +670,8 @@ runq_add(struct runq *rq, struct kse *ke)
CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p",
ke->ke_proc, ke->ke_thread->td_priority, pri, rqh);
TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
- ke->ke_flags |= KEF_ONRUNQ;
+ ke->ke_ksegrp->kg_runq_kses++;
+ ke->ke_state = KES_ONRUNQ;
}
/*
@@ -219,43 +710,38 @@ runq_choose(struct runq *rq)
int pri;
mtx_assert(&sched_lock, MA_OWNED);
- if ((pri = runq_findbit(rq)) != -1) {
+ while ((pri = runq_findbit(rq)) != -1) {
rqh = &rq->rq_queues[pri];
ke = TAILQ_FIRST(rqh);
KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
- KASSERT(ke->ke_proc->p_stat == SRUN,
- ("runq_choose: process %d(%s) in state %d", ke->ke_proc->p_pid,
- ke->ke_proc->p_comm, ke->ke_proc->p_stat));
- CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
+ CTR3(KTR_RUNQ,
+ "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
+KASSERT(ke->ke_procq.tqe_prev != NULL, ("no prev"));
+if (ke->ke_procq.tqe_next)
+ KASSERT(ke->ke_procq.tqe_next->ke_procq.tqe_prev != NULL, ("no next"));
TAILQ_REMOVE(rqh, ke, ke_procq);
+ ke->ke_ksegrp->kg_runq_kses--;
if (TAILQ_EMPTY(rqh)) {
CTR0(KTR_RUNQ, "runq_choose: empty");
runq_clrbit(rq, pri);
}
- ke->ke_flags &= ~KEF_ONRUNQ;
+
+ ke->ke_state = KES_RUNNING;
+ KASSERT((ke->ke_thread != NULL),
+ ("runq_choose: No thread on KSE"));
+ KASSERT((ke->ke_thread->td_kse != NULL),
+ ("runq_choose: No KSE on thread"));
return (ke);
}
CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
- return (PCPU_GET(idlethread)->td_kse);
+ return (NULL);
}
/*
- * Initialize a run structure.
- */
-void
-runq_init(struct runq *rq)
-{
- int i;
-
- bzero(rq, sizeof *rq);
- for (i = 0; i < RQ_NQS; i++)
- TAILQ_INIT(&rq->rq_queues[i]);
-}
-
-/*
- * Remove the process from the queue specified by its priority, and clear the
+ * Remove the KSE from the queue specified by its priority, and clear the
* corresponding status bit if the queue becomes empty.
+ * Caller must set ke->ke_state afterwards.
*/
void
runq_remove(struct runq *rq, struct kse *ke)
@@ -263,8 +749,7 @@ runq_remove(struct runq *rq, struct kse *ke)
struct rqhead *rqh;
int pri;
- if (!(ke->ke_flags & KEF_ONRUNQ))
- return;
+ KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));
mtx_assert(&sched_lock, MA_OWNED);
pri = ke->ke_rqindex;
rqh = &rq->rq_queues[pri];
@@ -276,5 +761,104 @@ runq_remove(struct runq *rq, struct kse *ke)
CTR0(KTR_RUNQ, "runq_remove: empty");
runq_clrbit(rq, pri);
}
- ke->ke_flags &= ~KEF_ONRUNQ;
+ ke->ke_state = KES_UNQUEUED;
+ ke->ke_ksegrp->kg_runq_kses--;
+}
+
+static void
+runq_readjust(struct runq *rq, struct kse *ke)
+{
+
+ if (ke->ke_rqindex != (ke->ke_thread->td_priority / RQ_PPQ)) {
+ runq_remove(rq, ke);
+ runq_add(rq, ke);
+ }
+}
+
+void
+thread_sanity_check(struct thread *td)
+{
+ struct proc *p;
+ struct ksegrp *kg;
+ struct kse *ke;
+ struct thread *td2;
+ unsigned int prevpri;
+ int saw_lastassigned;
+ int unassigned;
+ int assigned;
+
+ p = td->td_proc;
+ kg = td->td_ksegrp;
+ ke = td->td_kse;
+
+ if (kg != &p->p_ksegrp) {
+ panic ("wrong ksegrp");
+ }
+
+ if (ke) {
+ if (ke != &p->p_kse) {
+ panic("wrong kse");
+ }
+ if (ke->ke_thread != td) {
+ panic("wrong thread");
+ }
+ }
+
+ if ((p->p_flag & P_KSES) == 0) {
+ if (ke == NULL) {
+ panic("non KSE thread lost kse");
+ }
+ } else {
+ prevpri = 0;
+ saw_lastassigned = 0;
+ unassigned = 0;
+ assigned = 0;
+ TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
+ if (td2->td_priority < prevpri) {
+ panic("thread runqueue unosorted");
+ }
+ prevpri = td2->td_priority;
+ if (td2->td_kse) {
+ assigned++;
+ if (unassigned) {
+ panic("unassigned before assigned");
+ }
+ if (kg->kg_last_assigned == NULL) {
+ panic("lastassigned corrupt");
+ }
+ if (saw_lastassigned) {
+ panic("last assigned not last");
+ }
+ if (td2->td_kse->ke_thread != td2) {
+ panic("mismatched kse/thread");
+ }
+ } else {
+ unassigned++;
+ }
+ if (td2 == kg->kg_last_assigned) {
+ saw_lastassigned = 1;
+ if (td2->td_kse == NULL) {
+ panic("last assigned not assigned");
+ }
+ }
+ }
+ if (kg->kg_last_assigned && (saw_lastassigned == 0)) {
+ panic("where on earth does lastassigned point?");
+ }
+ FOREACH_THREAD_IN_GROUP(kg, td2) {
+ if (((td2->td_flags & TDF_UNBOUND) == 0) &&
+ (td2->td_state == TDS_RUNQ)) {
+ assigned++;
+ if (td2->td_kse == NULL) {
+ panic ("BOUND thread with no KSE");
+ }
+ }
+ }
+#if 0
+ if ((unassigned + assigned) != kg->kg_runnable) {
+ panic("wrong number in runnable");
+ }
+#endif
+ }
}
+
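The runq_add() and runq_setbit()/runq_clrbit() comments above describe the underlying structure: one queue per RQ_PPQ priority levels, plus a status bitmap with one bit per queue packed into words. Below is a minimal, self-contained sketch of that bookkeeping, using toy sk_* names and constants rather than the kernel's own runq types and RQB_* macros, which may differ in detail.

    /* Toy model of the run-queue bitmap bookkeeping (names are hypothetical). */
    #define SK_RQ_PPQ  4                            /* priorities folded per queue */
    #define SK_RQ_NQS  64                           /* number of queues            */
    #define SK_BPW     (sizeof(unsigned long) * 8)  /* status bits per word        */

    struct sk_runq {
            unsigned long sk_bits[(SK_RQ_NQS + SK_BPW - 1) / SK_BPW];
    };

    /* As in runq_add(): a thread priority selects a queue index. */
    static int
    sk_index(int priority)
    {
            return (priority / SK_RQ_PPQ);
    }

    /* As in runq_setbit(): mark queue 'pri' non-empty. */
    static void
    sk_setbit(struct sk_runq *rq, int pri)
    {
            rq->sk_bits[pri / SK_BPW] |= 1UL << (pri % SK_BPW);
    }

    /* As in runq_clrbit(): mark queue 'pri' empty again. */
    static void
    sk_clrbit(struct sk_runq *rq, int pri)
    {
            rq->sk_bits[pri / SK_BPW] &= ~(1UL << (pri % SK_BPW));
    }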
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index bd1a625e8757..a2a44ff8e143 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -277,9 +277,13 @@ schedcpu(arg)
* with 16-bit int's (remember them?)
* overflow takes 45 days.
*/
- /* XXXKSE */
- /* if ((ke->ke_flags & KEF_ONRUNQ) == 0) */
- if (p->p_stat == SSLEEP || p->p_stat == SSTOP) {
+ /* XXXKSE **WRONG***/
+ /*
+ * the kse slptimes are not touched in wakeup
+ * because the thread may not HAVE a KSE
+ */
+ if (ke->ke_state != KES_ONRUNQ &&
+ ke->ke_state != KES_RUNNING) {
ke->ke_slptime++;
} else {
ke->ke_slptime = 0;
@@ -321,20 +325,31 @@ schedcpu(arg)
}
kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
resetpriority(kg);
- td = FIRST_THREAD_IN_PROC(p);
- if (td->td_priority >= PUSER &&
- (p->p_sflag & PS_INMEM)) {
- int changedqueue =
- ((td->td_priority / RQ_PPQ) !=
- (kg->kg_user_pri / RQ_PPQ));
-
- td->td_priority = kg->kg_user_pri;
- FOREACH_KSE_IN_GROUP(kg, ke) {
- if ((ke->ke_oncpu == NOCPU) &&
- (p->p_stat == SRUN) && /* XXXKSE */
- changedqueue) {
- remrunqueue(ke->ke_thread);
- setrunqueue(ke->ke_thread);
+ FOREACH_THREAD_IN_GROUP(kg, td) {
+ int changedqueue;
+ if (td->td_priority >= PUSER) {
+ /*
+ * Only change the priority
+ * of threads that are still at their
+ * user priority.
+ * XXXKSE This is problematic
+ * as we may need to re-order
+ * the threads on the KSEG list.
+ */
+ changedqueue =
+ ((td->td_priority / RQ_PPQ) !=
+ (kg->kg_user_pri / RQ_PPQ));
+
+ td->td_priority = kg->kg_user_pri;
+ if (changedqueue &&
+ td->td_state == TDS_RUNQ) {
+ /* this could be optimised */
+ remrunqueue(td);
+ td->td_priority =
+ kg->kg_user_pri;
+ setrunqueue(td);
+ } else {
+ td->td_priority = kg->kg_user_pri;
}
}
}
@@ -409,6 +424,7 @@ sleepinit(void)
* entered before msleep returns. If priority includes the PDROP
* flag the mutex is not entered before returning.
*/
+
int
msleep(ident, mtx, priority, wmesg, timo)
void *ident;
@@ -426,9 +442,48 @@ msleep(ident, mtx, priority, wmesg, timo)
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
+ KASSERT((td->td_kse != NULL), ("msleep: NULL KSE?"));
+ KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse state?"));
WITNESS_SLEEP(0, &mtx->mtx_object);
KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
("sleeping without a mutex"));
+ /*
+ * If we are capable of async syscalls and there isn't already
+ * another one ready to return, start a new thread
+ * and queue it as ready to run. Note that there is danger here
+ * because we need to make sure that we don't sleep allocating
+ * the thread (recursion here might be bad).
+ * Hence the TDF_INMSLEEP flag.
+ */
+ if (p->p_flag & P_KSES) {
+ /* Just don't bother if the process is exiting
+ and we are not the exiting thread. */
+ if ((p->p_flag & P_WEXIT) && catch && p->p_singlethread != td)
+ return (EINTR);
+ if (td->td_mailbox && (!(td->td_flags & TDF_INMSLEEP))) {
+ /*
+ * If we have no queued work to do, then
+ * upcall to the UTS to see if it has more to do.
+ * We don't need to upcall now, just make it and
+ * queue it.
+ */
+ mtx_lock_spin(&sched_lock);
+ if (TAILQ_FIRST(&td->td_ksegrp->kg_runq) == NULL) {
+ /* Don't recurse here! */
+ KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse stateX?"));
+ td->td_flags |= TDF_INMSLEEP;
+ thread_schedule_upcall(td, td->td_kse);
+ td->td_flags &= ~TDF_INMSLEEP;
+ KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse stateY?"));
+ }
+ mtx_unlock_spin(&sched_lock);
+ }
+ KASSERT((td->td_kse != NULL), ("msleep: NULL KSE2?"));
+ KASSERT((td->td_kse->ke_state == KES_RUNNING),
+ ("msleep: kse state2?"));
+ KASSERT((td->td_kse->ke_thread == td),
+ ("msleep: kse/thread mismatch?"));
+ }
mtx_lock_spin(&sched_lock);
if (cold || panicstr) {
/*
@@ -454,7 +509,7 @@ msleep(ident, mtx, priority, wmesg, timo)
}
KASSERT(p != NULL, ("msleep1"));
- KASSERT(ident != NULL && td->td_proc->p_stat == SRUN, ("msleep"));
+ KASSERT(ident != NULL && td->td_state == TDS_RUNNING, ("msleep"));
td->td_wchan = ident;
td->td_wmesg = wmesg;
@@ -468,20 +523,23 @@ msleep(ident, mtx, priority, wmesg, timo)
callout_reset(&td->td_slpcallout, timo, endtsleep, td);
/*
* We put ourselves on the sleep queue and start our timeout
- * before calling cursig, as we could stop there, and a wakeup
- * or a SIGCONT (or both) could occur while we were stopped.
- * A SIGCONT would cause us to be marked as SSLEEP
+ * before calling thread_suspend_check, as we could stop there, and
+ * a wakeup or a SIGCONT (or both) could occur while we were stopped
* without resuming us, thus we must be ready for sleep
* when cursig is called. If the wakeup happens while we're
* stopped, td->td_wchan will be 0 upon return from cursig.
*/
if (catch) {
- CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p,
+ CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td,
p->p_pid, p->p_comm);
td->td_flags |= TDF_SINTR;
mtx_unlock_spin(&sched_lock);
PROC_LOCK(p);
- sig = cursig(p);
+ sig = cursig(td);
+ if (thread_suspend_check(1)) {
+ sig = EINTR;
+ rval = EINTR;
+ }
mtx_lock_spin(&sched_lock);
PROC_UNLOCK(p);
if (sig != 0) {
@@ -492,13 +550,13 @@ msleep(ident, mtx, priority, wmesg, timo)
} else
sig = 0;
if (td->td_wchan != NULL) {
- td->td_proc->p_stat = SSLEEP;
p->p_stats->p_ru.ru_nvcsw++;
+ td->td_state = TDS_SLP;
mi_switch();
}
- CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", td, p->p_pid,
+ CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
- KASSERT(td->td_proc->p_stat == SRUN, ("running but not SRUN"));
+ KASSERT(td->td_state == TDS_RUNNING, ("running but not TDS_RUNNING"));
td->td_flags &= ~TDF_SINTR;
if (td->td_flags & TDF_TIMEOUT) {
td->td_flags &= ~TDF_TIMEOUT;
@@ -524,8 +582,8 @@ msleep(ident, mtx, priority, wmesg, timo)
if (rval == 0 && catch) {
PROC_LOCK(p);
- /* XXX: shouldn't we always be calling cursig() */
- if (sig != 0 || (sig = cursig(p))) {
+ /* XXX: shouldn't we always be calling cursig() */
+ if (sig != 0 || (sig = cursig(td))) {
if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
rval = EINTR;
else
@@ -571,7 +629,7 @@ endtsleep(arg)
td->td_flags &= ~TDF_TIMEOUT;
setrunqueue(td);
} else if (td->td_wchan != NULL) {
- if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */
+ if (td->td_state == TDS_SLP) /* XXXKSE */
setrunnable(td);
else
unsleep(td);
@@ -583,6 +641,38 @@ endtsleep(arg)
}
/*
+ * Abort a thread, as if an interrupt had occurred. Only abort
+ * interruptible waits (unfortunately it isn't safe to abort others).
+ * This is almost identical to cv_abort().
+ * Think about merging them?
+ * Also, whatever the signal code does...
+ */
+void
+abortsleep(struct thread *td)
+{
+
+ mtx_lock_spin(&sched_lock);
+ /*
+ * If the TDF_TIMEOUT flag is set, just leave. A
+ * timeout is scheduled anyhow.
+ */
+ if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) {
+ if (td->td_wchan != NULL) {
+ if (td->td_state == TDS_SLP) { /* XXXKSE */
+ setrunnable(td);
+ } else {
+ /*
+ * Probably in a suspended state..
+ * um.. dunno XXXKSE
+ */
+ unsleep(td);
+ }
+ }
+ }
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
* Remove a process from its wait queue
*/
void
@@ -618,25 +708,24 @@ restart:
if (td->td_wchan == ident) {
TAILQ_REMOVE(qp, td, td_slpq);
td->td_wchan = NULL;
- if (td->td_proc->p_stat == SSLEEP) {
+ if (td->td_state == TDS_SLP) {
/* OPTIMIZED EXPANSION OF setrunnable(p); */
CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)",
td, p->p_pid, p->p_comm);
if (td->td_ksegrp->kg_slptime > 1)
updatepri(td);
td->td_ksegrp->kg_slptime = 0;
- td->td_kse->ke_slptime = 0;
- td->td_proc->p_stat = SRUN;
if (p->p_sflag & PS_INMEM) {
setrunqueue(td);
maybe_resched(td);
} else {
+/* XXXKSE Wrong! */ td->td_state = TDS_RUNQ;
p->p_sflag |= PS_SWAPINREQ;
wakeup(&proc0);
}
/* END INLINE EXPANSION */
- goto restart;
}
+ goto restart;
}
}
mtx_unlock_spin(&sched_lock);
@@ -665,20 +754,19 @@ restart:
if (td->td_wchan == ident) {
TAILQ_REMOVE(qp, td, td_slpq);
td->td_wchan = NULL;
- if (td->td_proc->p_stat == SSLEEP) {
+ if (td->td_state == TDS_SLP) {
/* OPTIMIZED EXPANSION OF setrunnable(p); */
- CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)",
- p, p->p_pid, p->p_comm);
+ CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)",
+ td, p->p_pid, p->p_comm);
if (td->td_ksegrp->kg_slptime > 1)
updatepri(td);
td->td_ksegrp->kg_slptime = 0;
- td->td_kse->ke_slptime = 0;
- td->td_proc->p_stat = SRUN;
if (p->p_sflag & PS_INMEM) {
setrunqueue(td);
maybe_resched(td);
break;
} else {
+/* XXXKSE Wrong */ td->td_state = TDS_RUNQ;
p->p_sflag |= PS_SWAPINREQ;
wakeup(&proc0);
}
@@ -698,15 +786,19 @@ mi_switch()
{
struct bintime new_switchtime;
struct thread *td = curthread; /* XXX */
- register struct proc *p = td->td_proc; /* XXX */
+ struct proc *p = td->td_proc; /* XXX */
+ struct kse *ke = td->td_kse;
#if 0
register struct rlimit *rlim;
#endif
u_int sched_nest;
mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
+ KASSERT((ke->ke_state == KES_RUNNING), ("mi_switch: kse state?"));
#ifdef INVARIANTS
- if (p->p_stat != SMTX && p->p_stat != SRUN)
+ if (td->td_state != TDS_MTX &&
+ td->td_state != TDS_RUNQ &&
+ td->td_state != TDS_RUNNING)
mtx_assert(&Giant, MA_NOTOWNED);
#endif
@@ -735,7 +827,8 @@ mi_switch()
*
* XXX drop sched_lock, pickup Giant
*/
- if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
+ if (p->p_state != PRS_ZOMBIE &&
+ p->p_limit->p_cpulimit != RLIM_INFINITY &&
p->p_runtime > p->p_limit->p_cpulimit) {
rlim = &p->p_rlimit[RLIMIT_CPU];
if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
@@ -763,17 +856,35 @@ mi_switch()
*/
cnt.v_swtch++;
PCPU_SET(switchtime, new_switchtime);
- CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid,
+ CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
sched_nest = sched_lock.mtx_recurse;
- td->td_lastcpu = td->td_kse->ke_oncpu;
- td->td_kse->ke_oncpu = NOCPU;
- td->td_kse->ke_flags &= ~KEF_NEEDRESCHED;
+ td->td_lastcpu = ke->ke_oncpu;
+ ke->ke_oncpu = NOCPU;
+ ke->ke_flags &= ~KEF_NEEDRESCHED;
+ /*
+ * At the last moment: if this KSE is not on the run queue,
+ * it needs to be freed correctly and the thread treated accordingly.
+ */
+ if ((td->td_state == TDS_RUNNING) &&
+ ((ke->ke_flags & KEF_IDLEKSE) == 0)) {
+ /* Put us back on the run queue (kse and all). */
+ setrunqueue(td);
+ } else if ((td->td_flags & TDF_UNBOUND) &&
+ (td->td_state != TDS_RUNQ)) { /* in case of old code */
+ /*
+ * We will not be on the run queue.
+ * Someone else can use the KSE if they need it.
+ */
+ td->td_kse = NULL;
+ kse_reassign(ke);
+ }
cpu_switch();
td->td_kse->ke_oncpu = PCPU_GET(cpuid);
+ td->td_kse->ke_state = KES_RUNNING;
sched_lock.mtx_recurse = sched_nest;
sched_lock.mtx_lock = (uintptr_t)td;
- CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid,
+ CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
if (PCPU_GET(switchtime.sec) == 0)
binuptime(PCPU_PTR(switchtime));
@@ -791,37 +902,42 @@ setrunnable(struct thread *td)
struct proc *p = td->td_proc;
mtx_lock_spin(&sched_lock);
- switch (p->p_stat) {
- case SZOMB: /* not a thread flag XXXKSE */
+ switch (p->p_state) {
+ case PRS_ZOMBIE:
panic("setrunnable(1)");
+ default:
+ break;
}
- switch (td->td_proc->p_stat) {
+ switch (td->td_state) {
case 0:
- case SRUN:
- case SWAIT:
+ case TDS_RUNNING:
+ case TDS_IWAIT:
default:
+ printf("state is %d", td->td_state);
panic("setrunnable(2)");
- case SSTOP:
- case SSLEEP: /* e.g. when sending signals */
+ case TDS_SUSPENDED:
+ thread_unsuspend(p);
+ break;
+ case TDS_SLP: /* e.g. when sending signals */
if (td->td_flags & TDF_CVWAITQ)
cv_waitq_remove(td);
else
unsleep(td);
- break;
-
- case SIDL:
+ case TDS_UNQUEUED: /* being put back onto the queue */
+ case TDS_NEW: /* not yet had time to suspend */
+ case TDS_RUNQ: /* not yet had time to suspend */
break;
}
- td->td_proc->p_stat = SRUN;
if (td->td_ksegrp->kg_slptime > 1)
updatepri(td);
td->td_ksegrp->kg_slptime = 0;
- td->td_kse->ke_slptime = 0;
if ((p->p_sflag & PS_INMEM) == 0) {
+ td->td_state = TDS_RUNQ; /* XXXKSE not a good idea */
p->p_sflag |= PS_SWAPINREQ;
wakeup(&proc0);
} else {
- setrunqueue(td);
+ if (td->td_state != TDS_RUNQ)
+ setrunqueue(td); /* XXXKSE */
maybe_resched(td);
}
mtx_unlock_spin(&sched_lock);
@@ -848,7 +964,7 @@ resetpriority(kg)
kg->kg_user_pri = newpriority;
}
FOREACH_THREAD_IN_GROUP(kg, td) {
- maybe_resched(td);
+ maybe_resched(td); /* XXXKSE silly */
}
mtx_unlock_spin(&sched_lock);
}
@@ -865,20 +981,21 @@ loadav(void *arg)
int i, nrun;
struct loadavg *avg;
struct proc *p;
- struct ksegrp *kg;
+ struct thread *td;
avg = &averunnable;
sx_slock(&allproc_lock);
nrun = 0;
FOREACH_PROC_IN_SYSTEM(p) {
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- switch (p->p_stat) {
- case SRUN:
+ FOREACH_THREAD_IN_PROC(p, td) {
+ switch (td->td_state) {
+ case TDS_RUNQ:
+ case TDS_RUNNING:
if ((p->p_flag & P_NOLOAD) != 0)
goto nextproc;
- /* FALLTHROUGH */
- case SIDL:
- nrun++;
+ nrun++; /* XXXKSE */
+ default:
+ break;
}
nextproc:
continue;
@@ -932,19 +1049,18 @@ void
schedclock(td)
struct thread *td;
{
- struct kse *ke = td->td_kse;
- struct ksegrp *kg = td->td_ksegrp;
+ struct kse *ke;
+ struct ksegrp *kg;
- if (td) {
- ke->ke_cpticks++;
- kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
- if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
- resetpriority(td->td_ksegrp);
- if (td->td_priority >= PUSER)
- td->td_priority = kg->kg_user_pri;
- }
- } else {
- panic("schedclock");
+ KASSERT((td != NULL), ("schedclock: null thread pointer"));
+ ke = td->td_kse;
+ kg = td->td_ksegrp;
+ ke->ke_cpticks++;
+ kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
+ if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+ resetpriority(kg);
+ if (td->td_priority >= PUSER)
+ td->td_priority = kg->kg_user_pri;
}
}
@@ -959,7 +1075,6 @@ yield(struct thread *td, struct yield_args *uap)
mtx_assert(&Giant, MA_NOTOWNED);
mtx_lock_spin(&sched_lock);
td->td_priority = PRI_MAX_TIMESHARE;
- setrunqueue(td);
kg->kg_proc->p_stats->p_ru.ru_nvcsw++;
mi_switch();
mtx_unlock_spin(&sched_lock);
diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c
index c9081c314c75..bbe36bea6874 100644
--- a/sys/kern/ksched.c
+++ b/sys/kern/ksched.c
@@ -181,7 +181,18 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched,
mtx_lock_spin(&sched_lock);
rtp_to_pri(&rtp, kg);
- td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */
+ FOREACH_THREAD_IN_GROUP(kg, td) { /* XXXKSE */
+ if (td->td_state == TDS_RUNNING) {
+ td->td_kse->ke_flags |= KEF_NEEDRESCHED;
+ } else if (td->td_state == TDS_RUNQ) {
+ if (td->td_priority > kg->kg_user_pri) {
+ remrunqueue(td);
+ td->td_priority =
+ kg->kg_user_pri;
+ setrunqueue(td);
+ }
+ }
+ }
mtx_unlock_spin(&sched_lock);
}
else
@@ -203,7 +214,19 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched,
* on the scheduling code: You must leave the
* scheduling info alone.
*/
- td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */
+ FOREACH_THREAD_IN_GROUP(kg, td) {
+ if (td->td_state == TDS_RUNNING) {
+ td->td_kse->ke_flags |= KEF_NEEDRESCHED;
+ } else if (td->td_state == TDS_RUNQ) {
+ if (td->td_priority > kg->kg_user_pri) {
+ remrunqueue(td);
+ td->td_priority =
+ kg->kg_user_pri;
+ setrunqueue(td);
+ }
+ }
+
+ }
mtx_unlock_spin(&sched_lock);
}
break;
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 9dad93bb2dd5..afd4c5d0c069 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -124,8 +124,8 @@ forward_signal(struct thread *td)
* executing so that it executes ast().
*/
mtx_assert(&sched_lock, MA_OWNED);
- KASSERT(td->td_proc->p_stat == SRUN,
- ("forward_signal: process is not SRUN"));
+ KASSERT(td->td_state == TDS_RUNNING,
+ ("forward_signal: thread is not TDS_RUNNING"));
CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 3b415de5c401..027aa9c7f34b 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -48,6 +48,8 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/kse.h>
+#include <sys/ktr.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/systm.h>
@@ -71,13 +73,15 @@ userret(td, frame, oticks)
struct kse *ke = td->td_kse;
struct ksegrp *kg = td->td_ksegrp;
+ CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
+ p->p_comm);
#ifdef INVARIANTS
/* Check that we called signotify() enough. */
mtx_lock(&Giant);
PROC_LOCK(p);
mtx_lock_spin(&sched_lock);
if (SIGPENDING(p) && ((p->p_sflag & PS_NEEDSIGCHK) == 0 ||
- (p->p_kse.ke_flags & KEF_ASTPENDING) == 0))
+ (ke->ke_flags & KEF_ASTPENDING) == 0))
printf("failed to set signal flags proprly for ast()\n");
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
@@ -100,6 +104,22 @@ userret(td, frame, oticks)
}
/*
+ * We need to check to see if we have to exit or wait due to a
+ * single threading requirement or some other STOP condition.
+ */
+ PROC_LOCK(p);
+ thread_suspend_check(0); /* Can suspend or kill */
+ PROC_UNLOCK(p);
+
+ /*
+ * DO special thread processing, e.g. upcall tweaking and such
+ */
+ if (p->p_flag & P_KSES) {
+ thread_userret(p, kg, ke, td, frame);
+ /* printf("KSE thread returned"); */
+ }
+
+ /*
* Charge system time if profiling.
*
* XXX should move PS_PROFIL to a place that can obviously be
@@ -121,8 +141,7 @@ userret(td, frame, oticks)
* This function will return with preemption disabled.
*/
void
-ast(framep)
- struct trapframe *framep;
+ast(struct trapframe *framep)
{
struct thread *td = curthread;
struct proc *p = td->td_proc;
@@ -136,6 +155,8 @@ ast(framep)
int ucode;
#endif
+ CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
+ p->p_comm);
KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
#ifdef WITNESS
if (witness_list(td))
@@ -164,6 +185,13 @@ ast(framep)
p->p_stats->p_prof.pr_ticks = 0;
}
mtx_unlock_spin(&sched_lock);
+ /*
+ * XXXKSE While the fact that we owe a user profiling
+ * tick is stored per KSE in this code, the statistics
+ * themselves are still stored per process.
+ * This should probably change, by which I mean that
+ * possibly the location of both might change.
+ */
if (td->td_ucred != p->p_ucred)
cred_update_thread(td);
@@ -192,14 +220,13 @@ ast(framep)
if (flags & KEF_NEEDRESCHED) {
mtx_lock_spin(&sched_lock);
td->td_priority = kg->kg_user_pri;
- setrunqueue(td);
p->p_stats->p_ru.ru_nivcsw++;
mi_switch();
mtx_unlock_spin(&sched_lock);
}
if (sflag & PS_NEEDSIGCHK) {
PROC_LOCK(p);
- while ((sig = cursig(p)) != 0)
+ while ((sig = cursig(td)) != 0)
postsig(sig);
PROC_UNLOCK(p);
}
diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c
index 08bca8d67b2c..c2e79d02d5f2 100644
--- a/sys/kern/subr_turnstile.c
+++ b/sys/kern/subr_turnstile.c
@@ -119,23 +119,20 @@ propagate_priority(struct thread *td)
return;
}
+ KASSERT(td->td_state != TDS_SURPLUS, ("Mutex owner SURPLUS"));
+ MPASS(td->td_proc != NULL);
MPASS(td->td_proc->p_magic == P_MAGIC);
- KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex"));
+ KASSERT(td->td_state != TDS_SLP,
+ ("sleeping thread owns a mutex"));
if (td->td_priority <= pri) /* lower is higher priority */
return;
- /*
- * Bump this thread's priority.
- */
- td->td_priority = pri;
/*
* If lock holder is actually running, just bump priority.
*/
- if (thread_running(td)) {
- MPASS(td->td_proc->p_stat == SRUN
- || td->td_proc->p_stat == SZOMB
- || td->td_proc->p_stat == SSTOP);
+ if (td->td_state == TDS_RUNNING) {
+ td->td_priority = pri;
return;
}
@@ -151,20 +148,26 @@ propagate_priority(struct thread *td)
* If on run queue move to new run queue, and quit.
* XXXKSE this gets a lot more complicated under threads
* but try anyhow.
+ * We should have a special call to do this more efficiently.
*/
- if (td->td_proc->p_stat == SRUN) {
+ if (td->td_state == TDS_RUNQ) {
MPASS(td->td_blocked == NULL);
remrunqueue(td);
+ td->td_priority = pri;
setrunqueue(td);
return;
}
+ /*
+ * Adjust for any other cases.
+ */
+ td->td_priority = pri;
/*
* If we aren't blocked on a mutex, we should be.
*/
- KASSERT(td->td_proc->p_stat == SMTX, (
+ KASSERT(td->td_state == TDS_MTX, (
"process %d(%s):%d holds %s but isn't blocked on a mutex\n",
- td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat,
+ td->td_proc->p_pid, td->td_proc->p_comm, td->td_state,
m->mtx_object.lo_name));
/*
@@ -590,7 +593,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
*/
td->td_blocked = m;
td->td_mtxname = m->mtx_object.lo_name;
- td->td_proc->p_stat = SMTX;
+ td->td_state = TDS_MTX;
propagate_priority(td);
if (LOCK_LOG_TEST(&m->mtx_object, opts))
@@ -727,7 +730,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
m, td1);
td1->td_blocked = NULL;
- td1->td_proc->p_stat = SRUN;
setrunqueue(td1);
if (td->td_critnest == 1 && td1->td_priority < pri) {
@@ -744,7 +746,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
}
}
#endif
- setrunqueue(td);
if (LOCK_LOG_TEST(&m->mtx_object, opts))
CTR2(KTR_LOCK,
"_mtx_unlock_sleep: %p switching out lock=%p", m,
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index 182221d2124f..02b3a0dac965 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -225,6 +225,7 @@ static struct witness_order_list_entry order_lists[] = {
#endif
{ "clk", &lock_class_mtx_spin },
{ "mutex profiling lock", &lock_class_mtx_spin },
+ { "zombie_thread_lock", &lock_class_mtx_spin },
{ NULL, NULL },
{ NULL, NULL }
};
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 1bdd913ea2e9..d8fba59f28e9 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -1187,7 +1187,7 @@ selwakeup(sip)
sip->si_thread = NULL;
mtx_lock_spin(&sched_lock);
if (td->td_wchan == (caddr_t)&selwait) {
- if (td->td_proc->p_stat == SSLEEP)
+ if (td->td_state == TDS_SLP)
setrunnable(td);
else
cv_waitq_remove(td);
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index dacb9d9384a4..ab6f1e88326c 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -467,7 +467,7 @@ ptrace(struct thread *td, struct ptrace_args *uap)
}
/* not currently stopped */
- if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) {
+ if (!P_SHOULDSTOP(p) || (p->p_flag & P_WAITED) == 0) {
error = EBUSY;
goto fail;
}
@@ -566,10 +566,12 @@ ptrace(struct thread *td, struct ptrace_args *uap)
if (proctree_locked)
sx_xunlock(&proctree_lock);
/* deliver or queue signal */
- if (p->p_stat == SSTOP) {
+ if (P_SHOULDSTOP(p)) {
p->p_xstat = uap->data;
mtx_lock_spin(&sched_lock);
+ p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SGNL);
setrunnable(td2); /* XXXKSE */
+ /* Need foreach kse in proc, ... make_kse_queued(). */
mtx_unlock_spin(&sched_lock);
} else if (uap->data)
psignal(p, uap->data);
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index d8115fb2e428..15a5d7cdda7d 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -552,7 +552,7 @@
381 STD BSD { int kse_new(struct kse_mailbox * mbx, \
int new_grp_flag); }
382 STD BSD { int thread_wakeup(struct thread_mailbox *tmbx); }
-383 STD BSD { int kse_yield(void); }
+383 MSTD BSD { int kse_yield(void); }
384 UNIMPL BSD __mac_get_proc
385 UNIMPL BSD __mac_set_proc
386 UNIMPL BSD __mac_get_fd
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index b9c57432699b..6c915e1b39ca 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -2392,17 +2392,35 @@ ttyinfo(struct tty *tp)
PGRP_UNLOCK(tp->t_pgrp);
td = FIRST_THREAD_IN_PROC(pick);
- stmp = pick->p_stat == SRUN ? "running" : /* XXXKSE */
- pick->p_stat == SMTX ? td->td_mtxname :
- td->td_wmesg ? td->td_wmesg : "iowait";
+ if (pick->p_flag & P_KSES) {
+ stmp = "KSE" ; /* XXXKSE */
+ } else {
+ if (td) {
+ if (td->td_state == TDS_RUNQ) {
+ stmp = "running";
+ } else if (td->td_state == TDS_MTX) {
+ stmp = td->td_mtxname;
+ } else if (td->td_wmesg) {
+ stmp = td->td_wmesg;
+ } else {
+ stmp = "iowait";
+ }
+ } else {
+ stmp = "threadless";
+ panic("ttyinfo: no thread!?");
+ }
+ }
calcru(pick, &utime, &stime, NULL);
- ltmp = pick->p_stat == SIDL || pick->p_stat == SWAIT ||
- pick->p_stat == SZOMB ? 0 :
- pgtok(vmspace_resident_count(pick->p_vmspace));
+ ltmp = ((pick->p_state == PRS_NEW)
+ || (td && (td->td_state == TDS_IWAIT))
+ || (pick->p_state == PRS_ZOMBIE)) ? 0 :
+ pgtok(vmspace_resident_count(pick->p_vmspace));
mtx_unlock_spin(&sched_lock);
ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm,
- pick->p_pid, pick->p_stat == SMTX ? "*" : "", stmp);
+ pick->p_pid,
+ td->td_state == TDS_MTX ? "*" : "",
+ stmp);
/* Print user time. */
ttyprintf(tp, "%ld.%02ldu ",
@@ -2433,7 +2451,19 @@ ttyinfo(struct tty *tp)
* we pick out just "short-term" sleepers (P_SINTR == 0).
* 4) Further ties are broken by picking the highest pid.
*/
-#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL))
+#define ISRUN(p, val) \
+do { \
+ struct thread *td; \
+ val = 0; \
+ FOREACH_THREAD_IN_PROC(p, td) { \
+ if (td->td_state == TDS_RUNQ || \
+ td->td_state == TDS_RUNNING) { \
+ val = 1; \
+ break; \
+ } \
+ } \
+} while (0)
+
#define TESTAB(a, b) ((a)<<1 | (b))
#define ONLYA 2
#define ONLYB 1
@@ -2449,10 +2479,13 @@ proc_compare(struct proc *p1, struct proc *p2)
if (p1 == NULL)
return (1);
+ ISRUN(p1, esta);
+ ISRUN(p2, estb);
+
/*
* see if at least one of them is runnable
*/
- switch (TESTAB(ISRUN(p1), ISRUN(p2))) {
+ switch (TESTAB(esta, estb)) {
case ONLYA:
return (0);
case ONLYB:
@@ -2477,7 +2510,7 @@ proc_compare(struct proc *p1, struct proc *p2)
/*
* weed out zombies
*/
- switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) {
+ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
case ONLYA:
return (1);
case ONLYB: