aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndriy Gapon <avg@FreeBSD.org>2021-12-14 14:43:29 +0000
committerEd Maste <emaste@FreeBSD.org>2022-01-10 00:43:13 +0000
commitb8beb716be3d07b3918792e5759f91552fa527d5 (patch)
tree11737f02c1cea9f942249d11277caa3093e5ed67
parentd5f4d9d5245813f89a64ec6f84eebc025f3e9df9 (diff)
downloadsrc-b8beb716be3d07b3918792e5759f91552fa527d5.tar.gz
src-b8beb716be3d07b3918792e5759f91552fa527d5.zip
MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations
r368649 fixed a regression in r362031 that was MFC-ed to stable/12 as a part of r362572. That commit reordered IPI send and local TLB flush in TLB invalidations. Without this fix we've been seeing problems with stale memory content where changes done under a mutex were not immediately observed by another thread after taking the same mutex. Those inconsistenices were correlated to copy-on-write faults for pages contaning the data. The change needed some adaptations as I elected to skip two significant intermediate changes: - r363195 / dc43978a, amd64: allow parallel shootdown IPIs - r363311 / 3ec7e169, amd64 pmap: microoptimize local shootdowns for PCID PTI configurations Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D33413 (cherry picked from commit 1820ca2154611d6f27ce5a5fdd561a16ac54fdd8) Approved by: so Errata: FreeBSD-EN-22:04.pcid
-rw-r--r--sys/amd64/amd64/pmap.c273
-rw-r--r--sys/x86/x86/mp_x86.c32
2 files changed, 153 insertions, 152 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index c6f56c1cd73b..ee9777c788c1 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -2329,45 +2329,19 @@ pmap_invalidate_ept(pmap_t pmap)
static cpuset_t
pmap_invalidate_cpu_mask(pmap_t pmap)
{
-
return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
}
static inline void
-pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
- const bool invpcid_works1)
+pmap_invalidate_preipi_pcid(pmap_t pmap)
{
- struct invpcid_descr d;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
u_int cpuid, i;
+ sched_pin();
+
cpuid = PCPU_GET(cpuid);
- if (pmap == PCPU_GET(curpmap)) {
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- /*
- * Because pm_pcid is recalculated on a
- * context switch, we must disable switching.
- * Otherwise, we might use a stale value
- * below.
- */
- critical_enter();
- pcid = pmap->pm_pcids[cpuid].pm_pcid;
- if (invpcid_works1) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = va;
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlpg(ucr3, kcr3, va);
- }
- critical_exit();
- }
- } else
- pmap->pm_pcids[cpuid].pm_gen = 0;
+ if (pmap != PCPU_GET(curpmap))
+ cpuid = 0xffffffff; /* An impossible value */
CPU_FOREACH(i) {
if (cpuid != i)
@@ -2388,52 +2362,96 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
}
static void
-pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
{
+ sched_pin();
+}
- pmap_invalidate_page_pcid(pmap, va, true);
+DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t), static)
+{
+ return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
+ pmap_invalidate_preipi_nopcid);
+}
+
+static inline void
+pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
+ const bool invpcid_works1)
+{
+ struct invpcid_descr d;
+ uint64_t kcr3, ucr3;
+ uint32_t pcid;
+ u_int cpuid;
+
+ /*
+ * Because pm_pcid is recalculated on a context switch, we
+ * must ensure there is no preemption, not just pinning.
+ * Otherwise, we might use a stale value below.
+ */
+ CRITICAL_ASSERT(curthread);
+
+ /*
+ * No need to do anything with user page tables invalidation
+ * if there is no user page table.
+ */
+ if (pmap->pm_ucr3 == PMAP_NO_CR3)
+ return;
+
+ cpuid = PCPU_GET(cpuid);
+
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
+ if (invpcid_works1) {
+ d.pcid = pcid | PMAP_PCID_USER_PT;
+ d.pad = 0;
+ d.addr = va;
+ invpcid(&d, INVPCID_ADDR);
+ } else {
+ kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+ ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+ pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+ }
}
static void
-pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
{
+ pmap_invalidate_page_pcid_cb(pmap, va, true);
+}
- pmap_invalidate_page_pcid(pmap, va, false);
+static void
+pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
+{
+ pmap_invalidate_page_pcid_cb(pmap, va, false);
}
static void
-pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
{
}
-DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
+DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t),
static)
{
-
if (pmap_pcid_enabled)
- return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
- pmap_invalidate_page_pcid_noinvpcid);
- return (pmap_invalidate_page_nopcid);
+ return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
+ pmap_invalidate_page_pcid_noinvpcid_cb);
+ return (pmap_invalidate_page_nopcid_cb);
}
static void
pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
vm_offset_t addr2 __unused)
{
-
if (pmap == kernel_pmap) {
invlpg(va);
- } else {
- if (pmap == PCPU_GET(curpmap))
- invlpg(va);
- pmap_invalidate_page_mode(pmap, va);
+ } else if (pmap == PCPU_GET(curpmap)) {
+ invlpg(va);
+ pmap_invalidate_page_cb(pmap, va);
}
}
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
-
if (pmap_type_guest(pmap)) {
pmap_invalidate_ept(pmap);
return;
@@ -2442,6 +2460,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
KASSERT(pmap->pm_type == PT_X86,
("pmap_invalidate_page: invalid type %d", pmap->pm_type));
+ pmap_invalidate_preipi(pmap);
smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
pmap_invalidate_page_curcpu_cb);
}
@@ -2450,73 +2469,62 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
#define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
static void
-pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
const bool invpcid_works1)
{
struct invpcid_descr d;
uint64_t kcr3, ucr3;
uint32_t pcid;
- u_int cpuid, i;
+ u_int cpuid;
+
+ CRITICAL_ASSERT(curthread);
+
+ if (pmap != PCPU_GET(curpmap) ||
+ pmap->pm_ucr3 == PMAP_NO_CR3)
+ return;
cpuid = PCPU_GET(cpuid);
- if (pmap == PCPU_GET(curpmap)) {
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap->pm_pcids[cpuid].pm_pcid;
- if (invpcid_works1) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = sva;
- for (; d.addr < eva; d.addr += PAGE_SIZE)
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
- }
- critical_exit();
- }
- } else
- pmap->pm_pcids[cpuid].pm_gen = 0;
- CPU_FOREACH(i) {
- if (cpuid != i)
- pmap->pm_pcids[i].pm_gen = 0;
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
+ if (invpcid_works1) {
+ d.pcid = pcid | PMAP_PCID_USER_PT;
+ d.pad = 0;
+ for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
+ invpcid(&d, INVPCID_ADDR);
+ } else {
+ kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+ ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+ pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
}
- /* See the comment in pmap_invalidate_page_pcid(). */
- atomic_thread_fence_seq_cst();
}
static void
-pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
vm_offset_t eva)
{
-
- pmap_invalidate_range_pcid(pmap, sva, eva, true);
+ pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
}
static void
-pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
vm_offset_t eva)
{
-
- pmap_invalidate_range_pcid(pmap, sva, eva, false);
+ pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
}
static void
-pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
+ vm_offset_t eva __unused)
{
}
-DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
+DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
vm_offset_t), static)
{
-
if (pmap_pcid_enabled)
- return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
- pmap_invalidate_range_pcid_noinvpcid);
- return (pmap_invalidate_range_nopcid);
+ return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
+ pmap_invalidate_range_pcid_noinvpcid_cb);
+ return (pmap_invalidate_range_nopcid_cb);
}
static void
@@ -2527,19 +2535,16 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
if (pmap == kernel_pmap) {
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
- } else {
- if (pmap == PCPU_GET(curpmap)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- }
- pmap_invalidate_range_mode(pmap, sva, eva);
+ } else if (pmap == PCPU_GET(curpmap)) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ pmap_invalidate_range_cb(pmap, sva, eva);
}
}
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
-
if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
pmap_invalidate_all(pmap);
return;
@@ -2553,17 +2558,18 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
KASSERT(pmap->pm_type == PT_X86,
("pmap_invalidate_range: invalid type %d", pmap->pm_type));
+ pmap_invalidate_preipi(pmap);
smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
pmap_invalidate_range_curcpu_cb);
}
static inline void
-pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
+pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
{
struct invpcid_descr d;
uint64_t kcr3, ucr3;
uint32_t pcid;
- u_int cpuid, i;
+ u_int cpuid;
if (pmap == kernel_pmap) {
if (invpcid_works1) {
@@ -2572,87 +2578,72 @@ pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
} else {
invltlb_glob();
}
- } else {
+ } else if (pmap == PCPU_GET(curpmap)) {
+ CRITICAL_ASSERT(curthread);
cpuid = PCPU_GET(cpuid);
- if (pmap == PCPU_GET(curpmap)) {
- critical_enter();
- pcid = pmap->pm_pcids[cpuid].pm_pcid;
- if (invpcid_works1) {
- d.pcid = pcid;
- d.pad = 0;
- d.addr = 0;
+
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
+ if (invpcid_works1) {
+ d.pcid = pcid;
+ d.pad = 0;
+ d.addr = 0;
+ invpcid(&d, INVPCID_CTX);
+ if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+ d.pcid |= PMAP_PCID_USER_PT;
invpcid(&d, INVPCID_CTX);
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- d.pcid |= PMAP_PCID_USER_PT;
- invpcid(&d, INVPCID_CTX);
- }
+ }
+ } else {
+ kcr3 = pmap->pm_cr3 | pcid;
+ ucr3 = pmap->pm_ucr3;
+ if (ucr3 != PMAP_NO_CR3) {
+ ucr3 |= pcid | PMAP_PCID_USER_PT;
+ pmap_pti_pcid_invalidate(ucr3, kcr3);
} else {
- kcr3 = pmap->pm_cr3 | pcid;
- ucr3 = pmap->pm_ucr3;
- if (ucr3 != PMAP_NO_CR3) {
- ucr3 |= pcid | PMAP_PCID_USER_PT;
- pmap_pti_pcid_invalidate(ucr3, kcr3);
- } else {
- load_cr3(kcr3);
- }
+ load_cr3(kcr3);
}
- critical_exit();
- } else
- pmap->pm_pcids[cpuid].pm_gen = 0;
- CPU_FOREACH(i) {
- if (cpuid != i)
- pmap->pm_pcids[i].pm_gen = 0;
}
}
- /* See the comment in pmap_invalidate_page_pcid(). */
- atomic_thread_fence_seq_cst();
}
static void
-pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
{
-
- pmap_invalidate_all_pcid(pmap, true);
+ pmap_invalidate_all_pcid_cb(pmap, true);
}
static void
-pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
{
-
- pmap_invalidate_all_pcid(pmap, false);
+ pmap_invalidate_all_pcid_cb(pmap, false);
}
static void
-pmap_invalidate_all_nopcid(pmap_t pmap)
+pmap_invalidate_all_nopcid_cb(pmap_t pmap)
{
-
if (pmap == kernel_pmap)
invltlb_glob();
else if (pmap == PCPU_GET(curpmap))
invltlb();
}
-DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
+DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t), static)
{
-
if (pmap_pcid_enabled)
- return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
- pmap_invalidate_all_pcid_noinvpcid);
- return (pmap_invalidate_all_nopcid);
+ return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
+ pmap_invalidate_all_pcid_noinvpcid_cb);
+ return (pmap_invalidate_all_nopcid_cb);
}
static void
pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
vm_offset_t addr2 __unused)
{
-
- pmap_invalidate_all_mode(pmap);
+ pmap_invalidate_all_cb(pmap);
}
void
pmap_invalidate_all(pmap_t pmap)
{
-
if (pmap_type_guest(pmap)) {
pmap_invalidate_ept(pmap);
return;
@@ -2661,6 +2652,7 @@ pmap_invalidate_all(pmap_t pmap)
KASSERT(pmap->pm_type == PT_X86,
("pmap_invalidate_all: invalid type %d", pmap->pm_type));
+ pmap_invalidate_preipi(pmap);
smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
pmap_invalidate_all_curcpu_cb);
}
@@ -2669,14 +2661,13 @@ static void
pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
vm_offset_t addr2 __unused)
{
-
wbinvd();
}
void
pmap_invalidate_cache(void)
{
-
+ sched_pin();
smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
}
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 73565360007b..d7cd50e99eed 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1649,13 +1649,16 @@ volatile uint32_t smp_tlb_generation;
* Used by pmap to request invalidation of TLB or cache on local and
* remote processors. Mask provides the set of remote CPUs which are
* to be signalled with the IPI specified by vector. The curcpu_cb
- * callback is invoked on the calling CPU while waiting for remote
- * CPUs to complete the operation.
+ * callback is invoked on the calling CPU in a critical section while
+ * waiting for remote CPUs to complete the operation.
*
* The callback function is called unconditionally on the caller's
* underlying processor, even when this processor is not set in the
* mask. So, the callback function must be prepared to handle such
* spurious invocations.
+ *
+ * This function must be called with the thread pinned, and it unpins on
+ * completion.
*/
static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
@@ -1670,23 +1673,21 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
* It is not necessary to signal other CPUs while booting or
* when in the debugger.
*/
- if (kdb_active || panicstr != NULL || !smp_started) {
- curcpu_cb(pmap, addr1, addr2);
- return;
- }
+ if (kdb_active || panicstr != NULL || !smp_started)
+ goto local_cb;
- sched_pin();
+ KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
/*
* Check for other cpus. Return if none.
*/
if (CPU_ISFULLSET(&mask)) {
if (mp_ncpus <= 1)
- goto nospinexit;
+ goto local_cb;
} else {
CPU_CLR(PCPU_GET(cpuid), &mask);
if (CPU_EMPTY(&mask))
- goto nospinexit;
+ goto local_cb;
}
if (!(read_eflags() & PSL_I))
@@ -1718,13 +1719,22 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
while (*p_cpudone != generation)
ia32_pause();
}
- mtx_unlock_spin(&smp_ipi_mtx);
+
+ /*
+ * Unpin before unlocking smp_ipi_mtx. If the thread owes
+ * preemption, this allows scheduler to select thread on any
+ * CPU from its cpuset.
+ */
sched_unpin();
+ mtx_unlock_spin(&smp_ipi_mtx);
+
return;
-nospinexit:
+local_cb:
+ critical_enter();
curcpu_cb(pmap, addr1, addr2);
sched_unpin();
+ critical_exit();
}
void