author     Konstantin Belousov <kib@FreeBSD.org>  2019-05-16 13:28:48 +0000
committer  Konstantin Belousov <kib@FreeBSD.org>  2019-05-16 13:28:48 +0000
commit     4d3b28bcdcc494059c30887ad1721c8eb6c5eada (patch)
tree       db3a21ac2f601c06c225e3a9439a5aa7374a4613 /sys
parent     a9fd669b4a564e3dc4668dd5a8bfe341cbbefaa9 (diff)
amd64 pmap: rework delayed invalidation, removing global mutex.
For machines having the cmpxchg16b instruction, i.e. everything but very
early Athlons, provide a lockless implementation of delayed invalidation.

The implementation maintains a lockless singly-linked list, using the trick
from T.L. Harris' article of marking the next link of elements that are
being removed.  A double-CAS is used to atomically update both the link and
the generation.  A new thread starting DI appends itself to the end of the
queue, setting its generation to the generation of the last element plus
one.  On DI finish, the thread donates its generation to the previous
element.  The generation of the fake head of the list is the last passed DI
generation.  Basically, the implementation is a queued spinlock, but
without the spinlock.

Many thanks both to Peter Holm and Mark Johnston for bearing with me while
I produced intermediate versions of the patch.

Reviewed by:	markj
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	1 month
MFC note:	td_md.md_invl_gen should go to the end of struct thread
Differential revision:	https://reviews.freebsd.org/D19630
Notes:
    svn path=/head/; revision=347695
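[Editor's sketch, not part of the commit.]  The commit message centers on a
16-byte compare-and-swap over a {generation, next} pair.  The small
standalone C program below shows that idea using the compiler's generic
__atomic_compare_exchange builtin rather than the hand-written cmpxchg16b
asm in the patch.  The names di_node and di_cas are invented for
illustration; the example is single-threaded and omits the Harris-style
invalid-bit marking and the retry loops of the real pmap code.  It assumes
an LP64 target such as amd64 (so the pair is exactly 16 bytes) and may need
-mcx16 and/or -latomic depending on the compiler.

/*
 * Toy illustration only (not the committed code): a 16-byte
 * {generation, next} pair updated with one double-width CAS, in the
 * same spirit as struct pmap_invl_gen in the patch.
 * Build on amd64 with, e.g.: cc -O2 -mcx16 dcas.c (GCC may need -latomic).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for struct pmap_invl_gen; must be 16-byte aligned. */
struct di_node {
	uint64_t	gen;
	struct di_node	*next;
} __attribute__((aligned(16)));

/*
 * Replace *n with *newv only if it still equals *expected, atomically
 * across both fields.  The compiler lowers this to lock cmpxchg16b (or a
 * libatomic helper); the patch open-codes the same primitive with inline
 * asm in pmap_di_store_invl().
 */
static bool
di_cas(struct di_node *n, struct di_node *expected, struct di_node *newv)
{
	return (__atomic_compare_exchange(n, expected, newv, false,
	    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}

int
main(void)
{
	struct di_node head = { .gen = 1, .next = NULL };  /* fake list head */
	struct di_node me;			/* this thread's DI node */
	struct di_node expected, newv;

	/* Start DI: append "me" to the tail, gen = predecessor's gen + 1. */
	expected.gen = head.gen;
	expected.next = NULL;
	me.gen = expected.gen + 1;
	me.next = NULL;
	newv.gen = expected.gen;
	newv.next = &me;
	if (di_cas(&head, &expected, &newv))
		printf("appended with gen %lu\n", (unsigned long)me.gen);

	/* Finish DI: donate my generation to the predecessor and unlink. */
	expected = newv;
	newv.gen = me.gen;
	newv.next = NULL;
	if (di_cas(&head, &expected, &newv))
		printf("head gen is now %lu\n", (unsigned long)head.gen);
	return (0);
}

The point of updating the pair as a unit is that a reader can never observe
a node whose link has changed but whose generation has not, which is what
lets a finishing thread donate its generation to its predecessor in a
single atomic step.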
Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/amd64/pmap.c        380
-rw-r--r--  sys/amd64/amd64/trap.c          2
-rw-r--r--  sys/amd64/amd64/vm_machdep.c    3
-rw-r--r--  sys/amd64/include/pmap.h        2
-rw-r--r--  sys/amd64/include/proc.h       11
-rw-r--r--  sys/kern/kern_thread.c          2
6 files changed, 382 insertions, 18 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 6a51696d8b41..5969aa6653a7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -107,6 +107,7 @@ __FBSDID("$FreeBSD$");
* and to when physical maps must be made correct.
*/
+#include "opt_ddb.h"
#include "opt_pmap.h"
#include "opt_vm.h"
@@ -130,6 +131,10 @@ __FBSDID("$FreeBSD$");
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
+#ifdef DDB
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -468,21 +473,45 @@ SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
.lo_name = "invlts",
};
+static struct pmap_invl_gen pmap_invl_gen_head = {
+ .gen = 1,
+ .next = NULL,
+};
+static u_long pmap_invl_gen = 1;
+
+#define PMAP_ASSERT_NOT_IN_DI() \
+ KASSERT(pmap_not_in_di(), ("DI already started"))
+
+static bool pmap_not_in_di_l(void);
+static bool pmap_not_in_di_u(void);
+DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ? pmap_not_in_di_l :
+ pmap_not_in_di_u);
+}
static bool
-pmap_not_in_di(void)
+pmap_not_in_di_l(void)
{
+ struct pmap_invl_gen *invl_gen;
- return (curthread->td_md.md_invl_gen.gen == 0);
+ invl_gen = &curthread->td_md.md_invl_gen;
+ return (invl_gen->gen == 0);
}
-#define PMAP_ASSERT_NOT_IN_DI() \
- KASSERT(pmap_not_in_di(), ("DI already started"))
+static void
+pmap_thread_init_invl_gen_l(struct thread *td)
+{
+ struct pmap_invl_gen *invl_gen;
+
+ invl_gen = &td->td_md.md_invl_gen;
+ invl_gen->gen = 0;
+}
/*
* Start a new Delayed Invalidation (DI) block of code, executed by
@@ -493,7 +522,7 @@ pmap_not_in_di(void)
* pmap active.
*/
static void
-pmap_delayed_invl_started(void)
+pmap_delayed_invl_started_l(void)
{
struct pmap_invl_gen *invl_gen;
u_long currgen;
@@ -525,7 +554,7 @@ pmap_delayed_invl_started(void)
* current thread's DI.
*/
static void
-pmap_delayed_invl_finished(void)
+pmap_delayed_invl_finished_l(void)
{
struct pmap_invl_gen *invl_gen, *next;
struct turnstile *ts;
@@ -551,6 +580,284 @@ pmap_delayed_invl_finished(void)
invl_gen->gen = 0;
}
+static bool
+pmap_not_in_di_u(void)
+{
+ struct pmap_invl_gen *invl_gen;
+
+ invl_gen = &curthread->td_md.md_invl_gen;
+ return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
+}
+
+static void
+pmap_thread_init_invl_gen_u(struct thread *td)
+{
+ struct pmap_invl_gen *invl_gen;
+
+ invl_gen = &td->td_md.md_invl_gen;
+ invl_gen->gen = 0;
+ invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
+}
+
+static bool
+pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
+{
+ uint64_t new_high, new_low, old_high, old_low;
+ char res;
+
+ old_low = new_low = 0;
+ old_high = new_high = (uintptr_t)0;
+
+ __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+ : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+ : "b"(new_low), "c" (new_high)
+ : "memory", "cc");
+ if (res == 0) {
+ if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
+ return (false);
+ out->gen = old_low;
+ out->next = (void *)old_high;
+ } else {
+ out->gen = new_low;
+ out->next = (void *)new_high;
+ }
+ return (true);
+}
+
+static bool
+pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
+ struct pmap_invl_gen *new_val)
+{
+ uint64_t new_high, new_low, old_high, old_low;
+ char res;
+
+ new_low = new_val->gen;
+ new_high = (uintptr_t)new_val->next;
+ old_low = old_val->gen;
+ old_high = (uintptr_t)old_val->next;
+
+ __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+ : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+ : "b"(new_low), "c" (new_high)
+ : "memory", "cc");
+ return (res);
+}
+
+#ifdef PV_STATS
+static long invl_start_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
+ &invl_start_restart, 0,
+ "");
+static long invl_finish_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
+ &invl_finish_restart, 0,
+ "");
+static int invl_max_qlen;
+SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
+ &invl_max_qlen, 0,
+ "");
+#endif
+
+static struct lock_delay_config __read_frequently di_delay;
+LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
+
+static void
+pmap_delayed_invl_started_u(void)
+{
+ struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
+ struct thread *td;
+ struct lock_delay_arg lda;
+ uintptr_t prevl;
+ u_char pri;
+#ifdef PV_STATS
+ int i, ii;
+#endif
+
+ td = curthread;
+ invl_gen = &td->td_md.md_invl_gen;
+ PMAP_ASSERT_NOT_IN_DI();
+ lock_delay_arg_init(&lda, &di_delay);
+ thread_lock(td);
+ pri = td->td_base_pri;
+ if (pri < PVM) {
+ invl_gen->saved_pri = 0;
+ } else {
+ invl_gen->saved_pri = pri;
+ sched_prio(td, PVM);
+ }
+ thread_unlock(td);
+
+again:
+ PV_STAT(i = 0);
+ for (p = &pmap_invl_gen_head;; p = prev.next) {
+ PV_STAT(i++);
+ prevl = atomic_load_ptr(&p->next);
+ if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+ PV_STAT(atomic_add_long(&invl_start_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+ if (prevl == 0)
+ break;
+ prev.next = (void *)prevl;
+ }
+#ifdef PV_STATS
+ if ((ii = invl_max_qlen) < i)
+ atomic_cmpset_int(&invl_max_qlen, ii, i);
+#endif
+
+ if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
+ PV_STAT(atomic_add_long(&invl_start_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+
+ new_prev.gen = prev.gen;
+ new_prev.next = invl_gen;
+ invl_gen->gen = prev.gen + 1;
+
+ /* Formal fence between store to invl->gen and updating *p. */
+ atomic_thread_fence_rel();
+
+ /*
+ * After inserting an invl_gen element with invalid bit set,
+ * this thread blocks any other thread trying to enter the
+ * delayed invalidation block. Do not allow to remove us from
+ * the CPU, because it causes starvation for other threads.
+ */
+ critical_enter();
+
+ /*
+ * ABA for *p is not possible there, since p->gen can only
+ * increase. So if the *p thread finished its di, then
+ * started a new one and got inserted into the list at the
+ * same place, its gen will appear greater than the previously
+ * read gen.
+ */
+ if (!pmap_di_store_invl(p, &prev, &new_prev)) {
+ critical_exit();
+ PV_STAT(atomic_add_long(&invl_start_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+
+ /*
+ * There we clear PMAP_INVL_GEN_NEXT_INVALID in
+ * invl_gen->next, allowing other threads to iterate past us.
+ * pmap_di_store_invl() provides fence between the generation
+ * write and the update of next.
+ */
+ invl_gen->next = NULL;
+ critical_exit();
+}
+
+static bool
+pmap_delayed_invl_finished_u_crit(struct pmap_invl_gen *invl_gen,
+ struct pmap_invl_gen *p)
+{
+ struct pmap_invl_gen prev, new_prev;
+ u_long mygen;
+
+ /*
+ * Load invl_gen->gen after setting invl_gen->next
+ * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger
+ * generations to propagate to our invl_gen->gen. Lock prefix
+ * in atomic_set_ptr() worked as seq_cst fence.
+ */
+ mygen = atomic_load_long(&invl_gen->gen);
+
+ if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
+ return (false);
+
+ KASSERT(prev.gen < mygen,
+ ("invalid di gen sequence %lu %lu", prev.gen, mygen));
+ new_prev.gen = mygen;
+ new_prev.next = (void *)((uintptr_t)invl_gen->next &
+ ~PMAP_INVL_GEN_NEXT_INVALID);
+
+ /* Formal fence between load of prev and storing update to it. */
+ atomic_thread_fence_rel();
+
+ return (pmap_di_store_invl(p, &prev, &new_prev));
+}
+
+static void
+pmap_delayed_invl_finished_u(void)
+{
+ struct pmap_invl_gen *invl_gen, *p;
+ struct thread *td;
+ struct lock_delay_arg lda;
+ uintptr_t prevl;
+
+ td = curthread;
+ invl_gen = &td->td_md.md_invl_gen;
+ KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
+ KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
+ ("missed invl_start: INVALID"));
+ lock_delay_arg_init(&lda, &di_delay);
+
+again:
+ for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
+ prevl = atomic_load_ptr(&p->next);
+ if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+ PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+ if ((void *)prevl == invl_gen)
+ break;
+ }
+
+ /*
+ * It is legitimate to not find ourself on the list if a
+ * thread before us finished its DI and started it again.
+ */
+ if (__predict_false(p == NULL)) {
+ PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+
+ critical_enter();
+ atomic_set_ptr((uintptr_t *)&invl_gen->next,
+ PMAP_INVL_GEN_NEXT_INVALID);
+ if (!pmap_delayed_invl_finished_u_crit(invl_gen, p)) {
+ atomic_clear_ptr((uintptr_t *)&invl_gen->next,
+ PMAP_INVL_GEN_NEXT_INVALID);
+ critical_exit();
+ PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+ lock_delay(&lda);
+ goto again;
+ }
+ critical_exit();
+ if (invl_gen->saved_pri != 0) {
+ thread_lock(td);
+ sched_prio(td, invl_gen->saved_pri);
+ thread_unlock(td);
+ }
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(di_queue, pmap_di_queue)
+{
+ struct pmap_invl_gen *p, *pn;
+ struct thread *td;
+ uintptr_t nextl;
+ bool first;
+
+ for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
+ first = false) {
+ nextl = atomic_load_ptr(&p->next);
+ pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
+ td = first ? NULL : __containerof(p, struct thread,
+ td_md.md_invl_gen);
+ db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
+ (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
+ td != NULL ? td->td_tid : -1);
+ }
+}
+#endif
+
#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
@@ -579,7 +886,7 @@ pmap_delayed_invl_genp(vm_page_t m)
* processor.
*/
static void
-pmap_delayed_invl_wait(vm_page_t m)
+pmap_delayed_invl_wait_l(vm_page_t m)
{
struct turnstile *ts;
u_long *m_gen;
@@ -603,6 +910,54 @@ pmap_delayed_invl_wait(vm_page_t m)
}
}
+static void
+pmap_delayed_invl_wait_u(vm_page_t m)
+{
+ u_long *m_gen;
+#ifdef PV_STATS
+ bool accounted = false;
+#endif
+
+ m_gen = pmap_delayed_invl_genp(m);
+ while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
+#ifdef PV_STATS
+ if (!accounted) {
+ atomic_add_long(&invl_wait, 1);
+ accounted = true;
+ }
+#endif
+ kern_yield(PRI_USER);
+ }
+}
+
+DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_started, (void), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_delayed_invl_started_l : pmap_delayed_invl_started_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_finished, (void), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_delayed_invl_finished_l : pmap_delayed_invl_finished_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
+{
+
+ return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+ pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u);
+}
+
/*
* Mark the page m's PV list as participating in the current thread's
* DI block. Any threads concurrently using m's PV list to remove or
@@ -2854,6 +3209,7 @@ void
pmap_pinit0(pmap_t pmap)
{
struct proc *p;
+ struct thread *td;
int i;
PMAP_LOCK_INIT(pmap);
@@ -2872,12 +3228,14 @@ pmap_pinit0(pmap_t pmap)
pmap->pm_pcids[i].pm_gen = 1;
}
pmap_activate_boot(pmap);
+ td = curthread;
if (pti) {
- p = curproc;
+ p = td->td_proc;
PROC_LOCK(p);
p->p_md.md_flags |= P_MD_KPTI;
PROC_UNLOCK(p);
}
+ pmap_thread_init_invl_gen(td);
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
@@ -9327,11 +9685,7 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
return (error);
}
-#include "opt_ddb.h"
#ifdef DDB
-#include <sys/kdb.h>
-#include <ddb/ddb.h>
-
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
pmap_t pmap;
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 264b31caf91c..d1571873264b 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -1183,7 +1183,7 @@ amd64_syscall(struct thread *td, int traced)
KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
("System call %s returning with mangled pcb_save",
syscallname(td->td_proc, td->td_sa.code)));
- KASSERT(td->td_md.md_invl_gen.gen == 0,
+ KASSERT(pmap_not_in_di(),
("System call %s returning with leaked invl_gen %lu",
syscallname(td->td_proc, td->td_sa.code),
td->td_md.md_invl_gen.gen));
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index 8508b6860297..c8f4687901aa 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -228,7 +228,7 @@ cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
/* Setup to release spin count in fork_exit(). */
td2->td_md.md_spinlock_count = 1;
td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
- td2->td_md.md_invl_gen.gen = 0;
+ pmap_thread_init_invl_gen(td2);
/* As an i386, do not copy io permission bitmap. */
pcb2->pcb_tssp = NULL;
@@ -544,6 +544,7 @@ cpu_copy_thread(struct thread *td, struct thread *td0)
/* Setup to release spin count in fork_exit(). */
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
+ pmap_thread_init_invl_gen(td);
}
/*
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 91d6fb2f934b..d6739c2cea4e 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -441,6 +441,7 @@ void *pmap_mapbios(vm_paddr_t, vm_size_t);
void *pmap_mapdev(vm_paddr_t, vm_size_t);
void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
void *pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size);
+bool pmap_not_in_di(void);
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void pmap_pinit_pml4(vm_page_t);
@@ -465,6 +466,7 @@ void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
u_int keyidx, int flags);
+void pmap_thread_init_invl_gen(struct thread *td);
int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
#endif /* _KERNEL */
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
index c063849972fc..21d6cce7954f 100644
--- a/sys/amd64/include/proc.h
+++ b/sys/amd64/include/proc.h
@@ -50,10 +50,17 @@ struct proc_ldt {
int ldt_refcnt;
};
+#define PMAP_INVL_GEN_NEXT_INVALID 0x1ULL
struct pmap_invl_gen {
u_long gen; /* (k) */
- LIST_ENTRY(pmap_invl_gen) link; /* (pp) */
-};
+ union {
+ LIST_ENTRY(pmap_invl_gen) link; /* (pp) */
+ struct {
+ struct pmap_invl_gen *next;
+ u_char saved_pri;
+ };
+ };
+} __aligned(16);
/*
* Machine-dependent part of the proc structure for AMD64.
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 9529aa30820d..c1966bf88de2 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -84,7 +84,7 @@ _Static_assert(offsetof(struct thread, td_pflags) == 0x104,
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x478,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x530,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x548,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
"struct proc KBI p_flag");