author:    Konstantin Belousov <kib@FreeBSD.org>  2022-10-10 23:08:55 +0000
committer: Konstantin Belousov <kib@FreeBSD.org>  2022-12-31 22:09:45 +0000
commit:    cde70e312c3fde5b37a29be1dacb7fde9a45b94a
tree:      84cf1a6e46c20ca5288d6be328460c084a335333
parent:    45ac7755a7c5d8508176b3d015bb27ff58485c80
amd64: for small cores, use (big hammer) INVPCID_CTXGLOB instead of INVLPG
A suspected CPU erratum appears to make invalidation of global PTEs
using INVLPG unreliable when PCID is enabled. Apply the workaround on
all CPUs with small cores, since neither the scope of the issue nor
the right fix is known.
Reviewed by: alc (previous version)
Discussed with: emaste, markj
Tested by: karels
PR: 261169, 266145
Sponsored by: The FreeBSD Foundation
MFC after: 1 week
Differential revision: https://reviews.freebsd.org/D37770
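The core of the change is the pmap_invlpg() inline added to sys/amd64/include/pmap.h (full hunk in the diff below): kernel-pmap invalidations are routed through it, and on an affected small core it substitutes an all-context INVPCID for INVLPG. A condensed view of the committed helper:

```c
/*
 * Condensed from the sys/amd64/include/pmap.h hunk below.  For the
 * kernel pmap on a core flagged with the workaround, fall back from
 * INVLPG to INVPCID type INVPCID_CTXGLOB, which invalidates TLB
 * entries for all PCIDs, global translations included.
 */
static __inline void
pmap_invlpg(pmap_t pmap, vm_offset_t va)
{
        if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
                struct invpcid_descr d = { 0 }; /* contents ignored */

                invpcid(&d, INVPCID_CTXGLOB);
        } else {
                invlpg(va);
        }
}
```

The same fallback is open-coded in the range-invalidation paths, where a single full flush replaces a loop of per-page INVLPGs over the range.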
-rw-r--r--  sys/amd64/amd64/initcpu.c    |  5
-rw-r--r--  sys/amd64/amd64/mp_machdep.c | 16
-rw-r--r--  sys/amd64/amd64/pmap.c       | 36
-rw-r--r--  sys/amd64/include/pcpu.h     |  3
-rw-r--r--  sys/amd64/include/pmap.h     | 20
5 files changed, 67 insertions, 13 deletions
diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 1b731821889e..08385d3095d0 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -324,6 +324,11 @@ initializecpu(void)
 		if ((r[0] & CPUID_HYBRID_CORE_MASK) ==
 		    CPUID_HYBRID_SMALL_CORE) {
 			PCPU_SET(small_core, 1);
+			if (pmap_pcid_enabled &&
+			    pmap_pcid_invlpg_workaround_uena) {
+				PCPU_SET(pcid_invlpg_workaround, 1);
+				pmap_pcid_invlpg_workaround = 1;
+			}
 		}
 	}
 }
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index f41e8dafcc86..5c60d301c1e7 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -861,7 +861,7 @@ invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	invlpg(smp_tlb_addr1);
+	pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
@@ -931,10 +931,16 @@ invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
-	do {
-		invlpg(addr);
-		addr += PAGE_SIZE;
-	} while (addr < smp_tlb_addr2);
+	if (smp_tlb_pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+		struct invpcid_descr d = { 0 };
+
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		do {
+			invlpg(addr);
+			addr += PAGE_SIZE;
+		} while (addr < smp_tlb_addr2);
+	}
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index a44993efb409..07a00963004b 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -529,6 +529,12 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 int invpcid_works = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
+int pmap_pcid_invlpg_workaround = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
+    CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pmap_pcid_invlpg_workaround, 0,
+    "Enable small core PCID/INVLPG workaround");
+int pmap_pcid_invlpg_workaround_uena = 1;
 
 int __read_frequently pti = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -2560,6 +2566,9 @@ pmap_init(void)
 			    VM_PAGE_TO_PHYS(m);
 		}
 	}
+
+	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
+	    &pmap_pcid_invlpg_workaround_uena);
 }
 
 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
@@ -2791,7 +2800,7 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
-		invlpg(va);
+		pmap_invlpg(pmap, va);
 	else if ((newpde & PG_G) == 0)
 		/*
 		 * Promotion: flush every 4KB page mapping from the TLB
@@ -3130,7 +3139,7 @@ pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
 	if (pmap == kernel_pmap) {
-		invlpg(va);
+		pmap_invlpg(kernel_pmap, va);
 	} else if (pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
 		pmap_invalidate_page_cb(pmap, va);
@@ -3221,8 +3230,14 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	vm_offset_t addr;
 
 	if (pmap == kernel_pmap) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
+		if (PCPU_GET(pcid_invlpg_workaround)) {
+			struct invpcid_descr d = { 0 };
+
+			invpcid(&d, INVPCID_CTXGLOB);
+		} else {
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
@@ -3760,7 +3775,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
 	for (; spa < epa; spa += PAGE_SIZE) {
 		sched_pin();
 		pte_store(pte, spa | pte_bits);
-		invlpg(vaddr);
+		pmap_invlpg(kernel_pmap, vaddr);
 		/* XXXKIB atomic inside flush_cache_range are excessive */
 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
 		sched_unpin();
@@ -7668,7 +7683,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i)
 
 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(va, pa);
-	invlpg(va);
+	pmap_invlpg(kernel_pmap, va);
 	return ((void *)crashdumpmap);
 }
@@ -10371,7 +10386,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 				    page[i]->md.pat_mode, 0);
 				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 				    cache_bits);
-				invlpg(vaddr[i]);
+				pmap_invlpg(kernel_pmap, vaddr[i]);
 			}
 		}
 	}
@@ -10420,7 +10435,14 @@ pmap_quick_remove_page(vm_offset_t addr)
 	if (addr != qframe)
 		return;
 	pte_store(vtopte(qframe), 0);
+
+	/*
+	 * Since qframe is exclusively mapped by
+	 * pmap_quick_enter_page() and that function doesn't set PG_G,
+	 * we can use INVLPG here.
+	 */
 	invlpg(qframe);
+
 	mtx_unlock_spin(&qframe_mtx);
 }
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index 70f008fe835a..c0c35f4419e8 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -100,7 +100,8 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 	u_int	pc_smp_tlb_op;		\
 	uint64_t pc_ucr3_load_mask;	\
 	u_int	pc_small_core;		\
-	char	__pad[2912]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	u_int	pc_pcid_invlpg_workaround; \
+	char	__pad[2908]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index a55a14f94ed7..e7497c2f8b4b 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -431,6 +431,8 @@ extern vm_offset_t virtual_end;
 extern vm_paddr_t dmaplimit;
 extern int pmap_pcid_enabled;
 extern int invpcid_works;
+extern int pmap_pcid_invlpg_workaround;
+extern int pmap_pcid_invlpg_workaround_uena;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
 #define	pmap_page_is_write_mapped(m)	(((m)->a.flags & PGA_WRITEABLE) != 0)
@@ -514,6 +516,24 @@ pmap_invalidate_cpu_mask(pmap_t pmap)
 	return (&pmap->pm_active);
 }
 
+/*
+ * It seems that AlderLake+ small cores have some microarchitectural
+ * bug, which results in the INVLPG instruction failing to flush all
+ * global TLB entries when PCID is enabled.  Work around it for now,
+ * by doing global invalidation on small cores instead of INVLPG.
+ */
+static __inline void
+pmap_invlpg(pmap_t pmap, vm_offset_t va)
+{
+	if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+		struct invpcid_descr d = { 0 };
+
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		invlpg(va);
+	}
+}
+
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
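For completeness, a hypothetical userland check of the new knob; the sysctl name comes from the pmap.c hunk above, while the program itself is not part of the commit. Because the OID is CTLFLAG_RDTUN, the workaround can also be disabled at boot by setting vm.pmap.pcid_invlpg_workaround=0 from the loader (that tunable is fetched into pmap_pcid_invlpg_workaround_uena in pmap_init()).

```c
/*
 * Hypothetical example (not from the commit): query the read-only
 * vm.pmap.pcid_invlpg_workaround sysctl that the patch introduces.
 * It reads as nonzero once any small core has engaged the fallback.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
        int on;
        size_t len = sizeof(on);

        if (sysctlbyname("vm.pmap.pcid_invlpg_workaround", &on, &len,
            NULL, 0) == -1) {
                perror("sysctlbyname");  /* kernel predates the patch? */
                return (1);
        }
        printf("PCID/INVLPG workaround: %s\n", on ? "engaged" : "inactive");
        return (0);
}
```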