-rw-r--r--  sys/amd64/amd64/pmap.c       | 380
-rw-r--r--  sys/amd64/amd64/trap.c       |   2
-rw-r--r--  sys/amd64/amd64/vm_machdep.c |   3
-rw-r--r--  sys/amd64/include/pmap.h     |   2
-rw-r--r--  sys/amd64/include/proc.h     |  11
-rw-r--r--  sys/kern/kern_thread.c       |   2
6 files changed, 382 insertions(+), 18 deletions(-)
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 6a51696d8b41..5969aa6653a7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -107,6 +107,7 @@ __FBSDID("$FreeBSD$");
* and to when physical maps must be made correct.
*/
+#include "opt_ddb.h"
#include "opt_pmap.h"
#include "opt_vm.h"
@@ -130,6 +131,10 @@ __FBSDID("$FreeBSD$");
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
+#ifdef DDB
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -468,21 +473,45 @@ SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
.lo_name = "invlts",
};
+static struct pmap_invl_gen pmap_invl_gen_head = {
+ .gen = 1,
+ .next = NULL,
+};
+static u_long pmap_invl_gen = 1;
+
+#define PMAP_ASSERT_NOT_IN_DI() \
+ KASSERT(pmap_not_in_di(), ("DI already started"))
+
+static bool pmap_not_in_di_l(void);
+static bool pmap_not_in_di_u(void);
+DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ? pmap_not_in_di_l :
+ pmap_not_in_di_u);
+}
static bool
-pmap_not_in_di(void)
+pmap_not_in_di_l(void)
{
+ struct pmap_invl_gen *invl_gen;
- return (curthread->td_md.md_invl_gen.gen == 0);
+ invl_gen = &curthread->td_md.md_invl_gen;
+ return (invl_gen->gen == 0);
}
-#define PMAP_ASSERT_NOT_IN_DI() \
- KASSERT(pmap_not_in_di(), ("DI already started"))
+static void
+pmap_thread_init_invl_gen_l(struct thread *td)
+{
+ struct pmap_invl_gen *invl_gen;
+
+ invl_gen = &td->td_md.md_invl_gen;
+ invl_gen->gen = 0;
+}
/*
* Start a new Delayed Invalidation (DI) block of code, executed by
@@ -493,7 +522,7 @@ pmap_not_in_di(void)
* pmap active.
*/
static void
-pmap_delayed_invl_started(void)
+pmap_delayed_invl_started_l(void)
{
struct pmap_invl_gen *invl_gen;
u_long currgen;
@@ -525,7 +554,7 @@ pmap_delayed_invl_started(void)
* current thread's DI.
*/
static void
-pmap_delayed_invl_finished(void)
+pmap_delayed_invl_finished_l(void)
{
struct pmap_invl_gen *invl_gen, *next;
struct turnstile *ts;
@@ -551,6 +580,284 @@ pmap_delayed_invl_finished(void)
invl_gen->gen = 0;
}
+static bool
+pmap_not_in_di_u(void)
+{
+ struct pmap_invl_gen *invl_gen;
+
+ invl_gen = &curthread->td_md.md_invl_gen;
+ return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
+}
+
+static void
+pmap_thread_init_invl_gen_u(struct thread *td)
+{
+ struct pmap_invl_gen *invl_gen;
+
+ invl_gen = &td->td_md.md_invl_gen;
+ invl_gen->gen = 0;
+ invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
+}
+
+static bool
+pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
+{
+ uint64_t new_high, new_low, old_high, old_low;
+ char res;
+
+ old_low = new_low = 0;
+ old_high = new_high = (uintptr_t)0;
+
+ __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+ : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+ : "b"(new_low), "c" (new_high)
+ : "memory", "cc");
+ if (res == 0) {
+ if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
+ return (false);
+ out->gen = old_low;
+ out->next = (void *)old_high;
+ } else {
+ out->gen = new_low;
+ out->next = (void *)new_high;
+ }
+ return (true);
+}
+
+static bool
+pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
+ struct pmap_invl_gen *new_val)
+{
+ uint64_t new_high, new_low, old_high, old_low;
+ char res;
+
+ new_low = new_val->gen;
+ new_high = (uintptr_t)new_val->next;
+ old_low = old_val->gen;
+ old_high = (uintptr_t)old_val->next;
+
+ __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+ : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+ : "b"(new_low), "c" (new_high)
+ : "memory", "cc");
+ return (res);
+}
+
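A userspace sketch of the 128-bit CAS pattern above (illustrative only, not part of this commit; the struct and function names are invented). pmap_di_load_invl() relies on a cmpxchg16b property: a failed compare deposits the current 16-byte contents in rdx:rax, so a CAS with expected == desired == 0 doubles as an atomic 16-byte load. Build with cc on amd64:

#include <stdint.h>
#include <stdio.h>

struct pair {
	uint64_t lo;
	uint64_t hi;
} __attribute__((aligned(16)));	/* cmpxchg16b faults on a misaligned operand */

/*
 * 16-byte CAS; on failure the observed value is written back into
 * *expect, mirroring the kernel wrappers above.
 */
static int
cas16(struct pair *p, struct pair *expect, struct pair desired)
{
	char res;

	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
	    : "=r" (res), "+m" (*p), "+a" (expect->lo), "+d" (expect->hi)
	    : "b" (desired.lo), "c" (desired.hi)
	    : "memory", "cc");
	return (res);
}

int
main(void)
{
	struct pair p = { .lo = 1, .hi = 2 };
	struct pair seen = { 0, 0 };

	/* A failed CAS acts as an atomic load: {1,2} lands in seen. */
	cas16(&p, &seen, (struct pair){ 0, 0 });
	printf("loaded {%ju, %ju}\n", (uintmax_t)seen.lo, (uintmax_t)seen.hi);

	/* A successful CAS: the expected value matches, so {3,4} is installed. */
	printf("swapped: %d\n", cas16(&p, &seen, (struct pair){ 3, 4 }));
	printf("p is now {%ju, %ju}\n", (uintmax_t)p.lo, (uintmax_t)p.hi);
	return (0);
}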
+#ifdef PV_STATS
+static long invl_start_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
+ &invl_start_restart, 0,
+ "");
+static long invl_finish_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
+ &invl_finish_restart, 0,
+ "");
+static int invl_max_qlen;
+SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
+ &invl_max_qlen, 0,
+ "");
+#endif
+
+static struct lock_delay_config __read_frequently di_delay;
+LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
+
+static void
+pmap_delayed_invl_started_u(void)
+{
+ struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
+ struct thread *td;
+ struct lock_delay_arg lda;
+ uintptr_t prevl;
+ u_char pri;
+#ifdef PV_STATS
+ int i, ii;
+#endif
+
+ td = curthread;
+ invl_gen = &td->td_md.md_invl_gen;
+ PMAP_ASSERT_NOT_IN_DI();
+ lock_delay_arg_init(&lda, &di_delay);
+ thread_lock(td);
+ pri = td->td_base_pri;
+ if (pri < PVM) {
+ invl_gen->saved_pri = 0;
+ } else {
+ invl_gen->saved_pri = pri;
+ sched_prio(td, PVM);
+ }
+ thread_unlock(td);
+
+again:
+ PV_STAT(i = 0);
+ for (p = &pmap_invl_gen_head;; p = prev.next) {
+ PV_STAT(i++);
+ prevl = atomic_load_ptr(&p->next);
+ if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+ PV_STAT(atomic_add_long(&invl_start_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+ if (prevl == 0)
+ break;
+ prev.next = (void *)prevl;
+ }
+#ifdef PV_STATS
+ if ((ii = invl_max_qlen) < i)
+ atomic_cmpset_int(&invl_max_qlen, ii, i);
+#endif
+
+ if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
+ PV_STAT(atomic_add_long(&invl_start_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+
+ new_prev.gen = prev.gen;
+ new_prev.next = invl_gen;
+ invl_gen->gen = prev.gen + 1;
+
+ /* Formal fence between store to invl_gen->gen and updating *p. */
+ atomic_thread_fence_rel();
+
+ /*
+ * After inserting an invl_gen element with the invalid bit set,
+ * this thread blocks any other thread trying to enter the
+ * delayed invalidation block. Do not allow ourselves to be
+ * preempted off the CPU, since that would starve the other
+ * threads.
+ */
+ critical_enter();
+
+ /*
+ * An ABA problem on *p is not possible here, since p->gen can
+ * only increase. So if the *p thread finished its DI, then
+ * started a new one and was inserted into the list at the
+ * same place, its gen will appear greater than the previously
+ * read gen.
+ */
+ if (!pmap_di_store_invl(p, &prev, &new_prev)) {
+ critical_exit();
+ PV_STAT(atomic_add_long(&invl_start_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+
+ /*
+ * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
+ * invl_gen->next, allowing other threads to iterate past us.
+ * pmap_di_store_invl() provides a fence between the
+ * generation write and the update of next.
+ */
+ invl_gen->next = NULL;
+ critical_exit();
+}
+
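The insertion protocol above depends on PMAP_INVL_GEN_NEXT_INVALID living in bit 0 of the next pointer: since struct pmap_invl_gen is __aligned(16), the low bits of any real element address are zero and bit 0 is free to serve as the "link under construction" tag. A minimal standalone illustration of that tagged-pointer encoding (names invented, not from this commit):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	NEXT_INVALID	0x1UL	/* stands in for PMAP_INVL_GEN_NEXT_INVALID */

struct node {
	unsigned long gen;
	struct node *next;
} __attribute__((aligned(16)));	/* guarantees bit 0 of a node address is clear */

static struct node a, b;

int
main(void)
{
	uintptr_t raw;

	/* Link a -> b with the link marked as being updated. */
	a.next = (struct node *)((uintptr_t)&b | NEXT_INVALID);

	raw = (uintptr_t)a.next;
	printf("invalid bit set: %d\n", (raw & NEXT_INVALID) != 0);

	/* Clearing the tag recovers the real pointer for traversal. */
	a.next = (struct node *)(raw & ~NEXT_INVALID);
	assert(a.next == &b);
	printf("untagged link points at b: %d\n", a.next == &b);
	return (0);
}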
+static bool
+pmap_delayed_invl_finished_u_crit(struct pmap_invl_gen *invl_gen,
+ struct pmap_invl_gen *p)
+{
+ struct pmap_invl_gen prev, new_prev;
+ u_long mygen;
+
+ /*
+ * Load invl_gen->gen after setting PMAP_INVL_GEN_NEXT_INVALID
+ * in invl_gen->next. This prevents larger generations from
+ * propagating to our invl_gen->gen. The lock prefix in
+ * atomic_set_ptr() works as a seq_cst fence.
+ */
+ mygen = atomic_load_long(&invl_gen->gen);
+
+ if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
+ return (false);
+
+ KASSERT(prev.gen < mygen,
+ ("invalid di gen sequence %lu %lu", prev.gen, mygen));
+ new_prev.gen = mygen;
+ new_prev.next = (void *)((uintptr_t)invl_gen->next &
+ ~PMAP_INVL_GEN_NEXT_INVALID);
+
+ /* Formal fence between the load of prev and the store updating it. */
+ atomic_thread_fence_rel();
+
+ return (pmap_di_store_invl(p, &prev, &new_prev));
+}
+
+static void
+pmap_delayed_invl_finished_u(void)
+{
+ struct pmap_invl_gen *invl_gen, *p;
+ struct thread *td;
+ struct lock_delay_arg lda;
+ uintptr_t prevl;
+
+ td = curthread;
+ invl_gen = &td->td_md.md_invl_gen;
+ KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
+ KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
+ ("missed invl_start: INVALID"));
+ lock_delay_arg_init(&lda, &di_delay);
+
+again:
+ for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
+ prevl = atomic_load_ptr(&p->next);
+ if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+ PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+ if ((void *)prevl == invl_gen)
+ break;
+ }
+
+ /*
+ * It is legitimate not to find ourselves on the list if a
+ * thread before us finished its DI and started it again.
+ */
+ if (__predict_false(p == NULL)) {
+ PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+
+ critical_enter();
+ atomic_set_ptr((uintptr_t *)&invl_gen->next,
+ PMAP_INVL_GEN_NEXT_INVALID);
+ if (!pmap_delayed_invl_finished_u_crit(invl_gen, p)) {
+ atomic_clear_ptr((uintptr_t *)&invl_gen->next,
+ PMAP_INVL_GEN_NEXT_INVALID);
+ critical_exit();
+ PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+ critical_exit();
+ if (invl_gen->saved_pri != 0) {
+ thread_lock(td);
+ sched_prio(td, invl_gen->saved_pri);
+ thread_unlock(td);
+ }
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(di_queue, pmap_di_queue)
+{
+ struct pmap_invl_gen *p, *pn;
+ struct thread *td;
+ uintptr_t nextl;
+ bool first;
+
+ for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
+ first = false) {
+ nextl = atomic_load_ptr(&p->next);
+ pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
+ td = first ? NULL : __containerof(p, struct thread,
+ td_md.md_invl_gen);
+ db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
+ (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
+ td != NULL ? td->td_tid : -1);
+ }
+}
+#endif
+
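With "options DDB" in the kernel configuration, the command registered above becomes available at the debugger prompt as "show di_queue". Each output line follows the db_printf() format string: the element's generation, whether its next link still carries the INVALID tag, and the owning thread pointer and tid (NULL and -1 for the static list head).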
#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
@@ -579,7 +886,7 @@ pmap_delayed_invl_genp(vm_page_t m)
* processor.
*/
static void
-pmap_delayed_invl_wait(vm_page_t m)
+pmap_delayed_invl_wait_l(vm_page_t m)
{
struct turnstile *ts;
u_long *m_gen;
@@ -603,6 +910,54 @@ pmap_delayed_invl_wait(vm_page_t m)
}
}
+static void
+pmap_delayed_invl_wait_u(vm_page_t m)
+{
+ u_long *m_gen;
+#ifdef PV_STATS
+ bool accounted = false;
+#endif
+
+ m_gen = pmap_delayed_invl_genp(m);
+ while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
+#ifdef PV_STATS
+ if (!accounted) {
+ atomic_add_long(&invl_wait, 1);
+ accounted = true;
+ }
+#endif
+ kern_yield(PRI_USER);
+ }
+}
+
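pmap_delayed_invl_wait_u() replaces the turnstile sleep of the locked variant with a plain yield loop against the published head generation. The userspace toy below shows the same publish-and-spin shape (illustrative only, not from this commit); build with cc -pthread:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long head_gen = 1;	/* pmap_invl_gen_head.gen stand-in */

/* Stand-in for DI blocks finishing and publishing their generations. */
static void *
finisher(void *arg)
{
	unsigned long target = *(unsigned long *)arg;

	while (atomic_load(&head_gen) < target)
		atomic_fetch_add(&head_gen, 1);
	return (NULL);
}

int
main(void)
{
	pthread_t td;
	unsigned long m_gen = 1000;	/* the page's recorded generation */

	pthread_create(&td, NULL, finisher, &m_gen);

	/* The wait side: yield, lock-free, until m_gen is reached. */
	while (atomic_load(&head_gen) < m_gen)
		sched_yield();

	printf("generation %lu reached\n", atomic_load(&head_gen));
	pthread_join(td, NULL);
	return (0);
}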
+DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_started, (void), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_delayed_invl_started_l : pmap_delayed_invl_started_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_finished, (void), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_delayed_invl_finished_l : pmap_delayed_invl_finished_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u);
+}
+
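The DEFINE_IFUNC blocks above select an implementation once, when the kernel linker processes relocations, keyed on CPUID2_CX16. The same mechanism exists in userspace as the GNU ifunc attribute; a sketch of the dispatch pattern (function names invented; requires a toolchain and libc with IFUNC support, e.g. FreeBSD or glibc on amd64):

#include <stdio.h>

static int
impl_locked(void)
{
	return (1);
}

static int
impl_lockless(void)
{
	return (2);
}

/* The resolver runs once at load time and returns the chosen function. */
static int (*which_impl(void))(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 1: ECX bit 13 is CMPXCHG16B (CPUID2_CX16). */
	__asm volatile("cpuid"
	    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
	    : "a" (1), "c" (0));
	return ((ecx & (1u << 13)) != 0 ? impl_lockless : impl_locked);
}

int do_invl(void) __attribute__((ifunc("which_impl")));

int
main(void)
{
	printf("dispatched to implementation %d\n", do_invl());
	return (0);
}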
/*
* Mark the page m's PV list as participating in the current thread's
* DI block. Any threads concurrently using m's PV list to remove or
@@ -2854,6 +3209,7 @@ void
pmap_pinit0(pmap_t pmap)
{
struct proc *p;
+ struct thread *td;
int i;
PMAP_LOCK_INIT(pmap);
@@ -2872,12 +3228,14 @@ pmap_pinit0(pmap_t pmap)
pmap->pm_pcids[i].pm_gen = 1;
}
pmap_activate_boot(pmap);
+ td = curthread;
if (pti) {
- p = curproc;
+ p = td->td_proc;
PROC_LOCK(p);
p->p_md.md_flags |= P_MD_KPTI;
PROC_UNLOCK(p);
}
+ pmap_thread_init_invl_gen(td);
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
@@ -9327,11 +9685,7 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
return (error);
}
-#include "opt_ddb.h"
#ifdef DDB
-#include <sys/kdb.h>
-#include <ddb/ddb.h>
-
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
pmap_t pmap;
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 264b31caf91c..d1571873264b 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -1183,7 +1183,7 @@ amd64_syscall(struct thread *td, int traced)
KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
("System call %s returning with mangled pcb_save",
syscallname(td->td_proc, td->td_sa.code)));
- KASSERT(td->td_md.md_invl_gen.gen == 0,
+ KASSERT(pmap_not_in_di(),
("System call %s returning with leaked invl_gen %lu",
syscallname(td->td_proc, td->td_sa.code),
td->td_md.md_invl_gen.gen));
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index 8508b6860297..c8f4687901aa 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -228,7 +228,7 @@ cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
/* Setup to release spin count in fork_exit(). */
td2->td_md.md_spinlock_count = 1;
td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
- td2->td_md.md_invl_gen.gen = 0;
+ pmap_thread_init_invl_gen(td2);
/* As an i386, do not copy io permission bitmap. */
pcb2->pcb_tssp = NULL;
@@ -544,6 +544,7 @@ cpu_copy_thread(struct thread *td, struct thread *td0)
/* Setup to release spin count in fork_exit(). */
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
+ pmap_thread_init_invl_gen(td);
}
/*
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 91d6fb2f934b..d6739c2cea4e 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -441,6 +441,7 @@ void *pmap_mapbios(vm_paddr_t, vm_size_t);
void *pmap_mapdev(vm_paddr_t, vm_size_t);
void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
void *pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size);
+bool pmap_not_in_di(void);
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void pmap_pinit_pml4(vm_page_t);
@@ -465,6 +466,7 @@ void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
u_int keyidx, int flags);
+void pmap_thread_init_invl_gen(struct thread *td);
int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
#endif /* _KERNEL */
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
index c063849972fc..21d6cce7954f 100644
--- a/sys/amd64/include/proc.h
+++ b/sys/amd64/include/proc.h
@@ -50,10 +50,17 @@ struct proc_ldt {
int ldt_refcnt;
};
+#define PMAP_INVL_GEN_NEXT_INVALID 0x1ULL
struct pmap_invl_gen {
u_long gen; /* (k) */
- LIST_ENTRY(pmap_invl_gen) link; /* (pp) */
-};
+ union {
+ LIST_ENTRY(pmap_invl_gen) link; /* (pp) */
+ struct {
+ struct pmap_invl_gen *next;
+ u_char saved_pri;
+ };
+ };
+} __aligned(16);
/*
* Machine-dependent part of the proc structure for AMD64.
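The wrappers in pmap.c treat gen and next as the 16 bytes handed to cmpxchg16b, which in turn requires a 16-byte-aligned operand. A standalone compile-time check of those layout assumptions (a sketch assuming LP64 and <sys/queue.h>, not part of this commit):

#include <stddef.h>
#include <sys/queue.h>

struct pmap_invl_gen {
	unsigned long gen;
	union {
		LIST_ENTRY(pmap_invl_gen) link;
		struct {
			struct pmap_invl_gen *next;
			unsigned char saved_pri;
		};
	};
} __attribute__((aligned(16)));

/* gen and next must form the 16 bytes that cmpxchg16b operates on. */
_Static_assert(offsetof(struct pmap_invl_gen, gen) == 0, "gen first");
_Static_assert(offsetof(struct pmap_invl_gen, next) == 8, "next at 8");
_Static_assert(_Alignof(struct pmap_invl_gen) == 16, "cmpxchg16b alignment");
/* 24 bytes of payload pad out to 32 under the 16-byte alignment (LP64). */
_Static_assert(sizeof(struct pmap_invl_gen) == 32, "padded size");

int
main(void)
{
	return (0);
}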
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 9529aa30820d..c1966bf88de2 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -84,7 +84,7 @@ _Static_assert(offsetof(struct thread, td_pflags) == 0x104,
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x478,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x530,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x548,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
"struct proc KBI p_flag");