Diffstat (limited to 'sys/amd64/amd64/pmap.c')
-rw-r--r-- | sys/amd64/amd64/pmap.c | 1046
1 file changed, 637 insertions, 409 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 893774357629..ff702ed2dcfb 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -43,8 +43,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. @@ -86,8 +84,6 @@ #define AMD64_NPT_AWARE #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - /* * Manages physical address maps. * @@ -178,14 +174,14 @@ __FBSDID("$FreeBSD$"); #define PMAP_MEMDOM 1 #endif -static __inline boolean_t +static __inline bool pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } -static __inline boolean_t +static __inline bool pmap_emulate_ad_bits(pmap_t pmap) { @@ -313,15 +309,32 @@ pmap_pku_mask_bit(pmap_t pmap) return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); } -#if !defined(DIAGNOSTIC) -#ifdef __GNUC_GNU_INLINE__ -#define PMAP_INLINE __attribute__((__gnu_inline__)) inline -#else -#define PMAP_INLINE extern inline -#endif -#else -#define PMAP_INLINE -#endif +static __inline bool +safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) +{ + + if (!pmap_emulate_ad_bits(pmap)) + return (true); + + KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); + + /* + * XWR = 010 or 110 will cause an unconditional EPT misconfiguration + * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared + * if the EPT_PG_WRITE bit is set. + */ + if ((pte & EPT_PG_WRITE) != 0) + return (false); + + /* + * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. + */ + if ((pte & EPT_PG_EXECUTE) == 0 || + ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) + return (true); + else + return (false); +} #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) @@ -384,7 +397,15 @@ pmap_pku_mask_bit(pmap_t pmap) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) -struct pmap kernel_pmap_store; +/* + * Statically allocate kernel pmap memory. However, memory for + * pm_pcids is obtained after the dynamic allocator is operational. + * Initialize it with a non-canonical pointer to catch early accesses + * regardless of the active mapping. 
+ */ +struct pmap kernel_pmap_store = { + .pm_pcidp = (void *)0xdeadbeefdeadbeef, +}; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ @@ -401,7 +422,7 @@ pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); -static int pg_ps_enabled = 1; +static int __read_frequently pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); @@ -529,6 +550,12 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); +int pmap_pcid_invlpg_workaround = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pmap_pcid_invlpg_workaround, 0, + "Enable small core PCID/INVLPG workaround"); +int pmap_pcid_invlpg_workaround_uena = 1; int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, @@ -1253,19 +1280,20 @@ static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode, int flags); -static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); -static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, +static bool pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static bool pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); -static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, +static bool pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); -static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, +static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); -static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); +static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, + bool allpte_PG_A_set); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, @@ -1276,10 +1304,10 @@ static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); #if VM_NRESERVLEVEL > 0 -static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp); +static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, + vm_page_t mpte, struct rwlock **lockp); #endif -static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, +static bool pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, @@ 
-1294,10 +1322,10 @@ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); -static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, +static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); -static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, +static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); @@ -1520,7 +1548,7 @@ pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3; vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap; -PMAP_INLINE pt_entry_t * +pt_entry_t * vtopte(vm_offset_t va) { KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); @@ -1654,6 +1682,7 @@ create_pagetables(vm_paddr_t *firstaddr) #endif int i, j, ndm1g, nkpdpe, nkdmpde; + TSENTER(); /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ @@ -1870,6 +1899,7 @@ create_pagetables(vm_paddr_t *firstaddr) } kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); + TSEXIT(); } /* @@ -1888,10 +1918,11 @@ pmap_bootstrap(vm_paddr_t *firstaddr) vm_offset_t va; pt_entry_t *pte, *pcpu_pte; struct region_descriptor r_gdt; - uint64_t cr4, pcpu_phys; + uint64_t cr4, pcpu0_phys; u_long res; int i; + TSENTER(); KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); @@ -1903,7 +1934,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) */ create_pagetables(firstaddr); - pcpu_phys = allocpages(firstaddr, MAXCPU); + pcpu0_phys = allocpages(firstaddr, 1); /* * Add a physical memory segment (vm_phys_seg) corresponding to the @@ -1948,7 +1979,12 @@ pmap_bootstrap(vm_paddr_t *firstaddr) kernel_pmap->pm_ucr3 = PMAP_NO_CR3; TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; + vm_radix_init(&kernel_pmap->pm_root); kernel_pmap->pm_flags = pmap_flags; + if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { + rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range, + pkru_free_range, kernel_pmap, M_NOWAIT); + } /* * The kernel pmap is always active on all CPUs. Once CPUs are @@ -1981,10 +2017,15 @@ pmap_bootstrap(vm_paddr_t *firstaddr) SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; - for (i = 0; i < MAXCPU; i++) { - pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | - pg_g | pg_nx | X86_PG_M | X86_PG_A; - } + /* + * Map the BSP PCPU now, the rest of the PCPUs are mapped by + * amd64_mp_alloc_pcpu()/start_all_aps() when we know the + * number of CPUs and NUMA affinity. + */ + pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx | + X86_PG_M | X86_PG_A; + for (i = 1; i < MAXCPU; i++) + pcpu_pte[i] = 0; /* * Re-initialize PCPU area for BSP after switching. @@ -2020,10 +2061,11 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* Initialize TLB Context Id. 
*/ if (pmap_pcid_enabled) { - for (i = 0; i < MAXCPU; i++) { - kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; - kernel_pmap->pm_pcids[i].pm_gen = 1; - } + kernel_pmap->pm_pcidp = (void *)(uintptr_t) + offsetof(struct pcpu, pc_kpmap_store); + + PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN); + PCPU_SET(kpmap_store.pm_gen, 1); /* * PMAP_PCID_KERN + 1 is used for initialization of @@ -2041,6 +2083,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) */ load_cr4(rcr4() | CR4_PCIDE); } + TSEXIT(); } /* @@ -2287,7 +2330,10 @@ pmap_allow_2m_x_ept_recalculate(void) CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ CPUID_TO_MODEL(cpu_id) == 0x85)))) pmap_allow_2m_x_ept = 1; +#ifndef BURN_BRIDGES TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); +#endif + TUNABLE_INT_FETCH("vm.pmap.allow_2m_x_ept", &pmap_allow_2m_x_ept); } static bool @@ -2308,9 +2354,13 @@ pmap_init_pv_table(void) int domain, i, j, pages; /* - * We strongly depend on the size being a power of two, so the assert - * is overzealous. However, should the struct be resized to a - * different power of two, the code below needs to be revisited. + * For correctness we depend on the size being evenly divisible into a + * page. As a tradeoff between performance and total memory use, the + * entry is 64 bytes (aka one cacheline) in size. Not being smaller + * avoids false-sharing, but not being 128 bytes potentially allows for + * avoidable traffic due to adjacent cacheline prefetcher. + * + * Assert the size so that accidental changes fail to compile. */ CTASSERT((sizeof(*pvd) == 64)); @@ -2390,7 +2440,7 @@ pmap_init_pv_table(void) */ s = (vm_size_t)pv_npg * sizeof(struct md_page); s = round_page(s); - pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); + pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); @@ -2432,7 +2482,7 @@ pmap_init(void) "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + - ptoa(i), FALSE); + ptoa(i), false); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); @@ -2463,7 +2513,7 @@ pmap_init(void) */ if ((i == 0 || kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && - pmap_insert_pt_page(kernel_pmap, mpte, false)) + pmap_insert_pt_page(kernel_pmap, mpte, false, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); @@ -2626,7 +2676,7 @@ pmap_swap_pat(pmap_t pmap, pt_entry_t entry) return (entry); } -boolean_t +bool pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { @@ -2639,7 +2689,7 @@ pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) * caching mode. */ int -pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) +pmap_cache_bits(pmap_t pmap, int mode, bool is_pde) { int cache_bits, pat_flag, pat_idx; @@ -2677,7 +2727,7 @@ pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) } static int -pmap_cache_mask(pmap_t pmap, boolean_t is_pde) +pmap_cache_mask(pmap_t pmap, bool is_pde) { int mask; @@ -2791,7 +2841,7 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. 
*/ - invlpg(va); + pmap_invlpg(pmap, va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB @@ -2897,8 +2947,16 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a * user space page table(s). * - * If the INVPCID instruction is available, it is used to flush entries - * from the kernel page table. + * If the INVPCID instruction is available, it is used to flush user + * entries from the kernel page table. + * + * When PCID is enabled, the INVLPG instruction invalidates all TLB + * entries for the given page that either match the current PCID or + * are global. Since TLB entries for the same page under different + * PCIDs are unaffected, kernel pages which reside in all address + * spaces could be problematic. We avoid the problem by creating + * all kernel PTEs with the global flag (PG_G) set, when PTI is + * disabled. * * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its * address space, all other 4095 PCIDs are used for user mode spaces @@ -3013,6 +3071,7 @@ pmap_invalidate_ept(pmap_t pmap) static inline void pmap_invalidate_preipi_pcid(pmap_t pmap) { + struct pmap_pcid *pcidp; u_int cpuid, i; sched_pin(); @@ -3022,8 +3081,10 @@ pmap_invalidate_preipi_pcid(pmap_t pmap) cpuid = 0xffffffff; /* An impossible value */ CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; + if (cpuid != i) { + pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); + pcidp->pm_gen = 0; + } } /* @@ -3058,7 +3119,6 @@ pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; - u_int cpuid; /* * Because pm_pcid is recalculated on a context switch, we @@ -3077,9 +3137,7 @@ pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) return; - cpuid = PCPU_GET(cpuid); - - pcid = pmap->pm_pcids[cpuid].pm_pcid; + pcid = pmap_get_pcid(pmap); if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; @@ -3122,7 +3180,7 @@ pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, vm_offset_t addr2 __unused) { if (pmap == kernel_pmap) { - invlpg(va); + pmap_invlpg(kernel_pmap, va); } else if (pmap == PCPU_GET(curpmap)) { invlpg(va); pmap_invalidate_page_cb(pmap, va); @@ -3154,7 +3212,6 @@ pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; - u_int cpuid; CRITICAL_ASSERT(curthread); @@ -3163,9 +3220,7 @@ pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) return; - cpuid = PCPU_GET(cpuid); - - pcid = pmap->pm_pcids[cpuid].pm_pcid; + pcid = pmap_get_pcid(pmap); if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; @@ -3213,8 +3268,14 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) vm_offset_t addr; if (pmap == kernel_pmap) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); + if (PCPU_GET(pcid_invlpg_workaround)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + } } else if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); @@ -3249,7 +3310,6 @@ pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) struct invpcid_descr d; uint64_t kcr3; uint32_t pcid; - u_int cpuid; if (pmap == kernel_pmap) { if 
(invpcid_works1) { @@ -3260,9 +3320,8 @@ pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) } } else if (pmap == PCPU_GET(curpmap)) { CRITICAL_ASSERT(curthread); - cpuid = PCPU_GET(cpuid); - pcid = pmap->pm_pcids[cpuid].pm_pcid; + pcid = pmap_get_pcid(pmap); if (invpcid_works1) { d.pcid = pcid; d.pad = 0; @@ -3419,6 +3478,7 @@ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; + struct pmap_pcid *pcidp; uint64_t kcr3, ucr3; uint32_t pcid; @@ -3434,7 +3494,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); - pcid = pmap->pm_pcids[0].pm_pcid; + pcid = pmap_get_pcid(pmap); if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; @@ -3448,16 +3508,20 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) } critical_exit(); } - } else if (pmap_pcid_enabled) - pmap->pm_pcids[0].pm_gen = 0; + } else if (pmap_pcid_enabled) { + pcidp = zpcpu_get(pmap->pm_pcidp); + pcidp->pm_gen = 0; + } } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; + struct pmap_pcid *pcidp; vm_offset_t addr; uint64_t kcr3, ucr3; + uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; @@ -3472,24 +3536,24 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); + pcid = pmap_get_pcid(pmap); if (invpcid_works) { - d.pcid = pmap->pm_pcids[0].pm_pcid | - PMAP_PCID_USER_PT; + d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { - kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. - pm_pcid | CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
- pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { - pmap->pm_pcids[0].pm_gen = 0; + pcidp = zpcpu_get(pmap->pm_pcidp); + pcidp->pm_gen = 0; } } @@ -3497,7 +3561,9 @@ void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; + struct pmap_pcid *pcidp; uint64_t kcr3, ucr3; + uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; @@ -3516,8 +3582,9 @@ pmap_invalidate_all(pmap_t pmap) } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); + pcid = pmap_get_pcid(pmap); if (invpcid_works) { - d.pcid = pmap->pm_pcids[0].pm_pcid; + d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); @@ -3526,10 +3593,10 @@ pmap_invalidate_all(pmap_t pmap) invpcid(&d, INVPCID_CTX); } } else { - kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; + kcr3 = pmap->pm_cr3 | pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { - ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ - 0].pm_pcid | PMAP_PCID_USER_PT; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); @@ -3539,11 +3606,12 @@ pmap_invalidate_all(pmap_t pmap) invltlb(); } } else if (pmap_pcid_enabled) { - pmap->pm_pcids[0].pm_gen = 0; + pcidp = zpcpu_get(pmap->pm_pcidp); + pcidp->pm_gen = 0; } } -PMAP_INLINE void +void pmap_invalidate_cache(void) { @@ -3553,12 +3621,15 @@ pmap_invalidate_cache(void) static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { + struct pmap_pcid *pcidp; pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); - else - pmap->pm_pcids[0].pm_gen = 0; + else { + pcidp = zpcpu_get(pmap->pm_pcidp); + pcidp->pm_gen = 0; + } } #endif /* !SMP */ @@ -3743,7 +3814,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) spa = dmaplimit; } - pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | + pte_bits = pmap_cache_bits(kernel_pmap, mattr, false) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); @@ -3752,7 +3823,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); - invlpg(vaddr); + pmap_invlpg(kernel_pmap, vaddr); /* XXXKIB atomic inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); @@ -3766,7 +3837,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) * Extract the physical page address associated * with the given map/virtual_address pair. */ -vm_paddr_t +vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; @@ -3853,6 +3924,12 @@ out: return (m); } +/* + * Routine: pmap_kextract + * Function: + * Extract the physical page address associated with the given kernel + * virtual address. + */ vm_paddr_t pmap_kextract(vm_offset_t va) { @@ -3891,7 +3968,7 @@ pmap_kextract(vm_offset_t va) * Add a wired page to the kva. * Note: not SMP coherent. 
*/ -PMAP_INLINE void +void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; @@ -3908,7 +3985,7 @@ pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) int cache_bits; pte = vtopte(va); - cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); + cache_bits = pmap_cache_bits(kernel_pmap, mode, false); pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V | cache_bits); } @@ -3917,7 +3994,7 @@ pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ -PMAP_INLINE void +void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; @@ -3965,7 +4042,7 @@ pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) endpte = pte + count; while (pte < endpte) { m = *ma++; - cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); + cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, false); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; @@ -4007,8 +4084,7 @@ pmap_qremove(vm_offset_t sva, int count) * physical memory manager after the TLB has been updated. */ static __inline void -pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, - boolean_t set_PG_ZERO) +pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) { if (set_PG_ZERO) @@ -4024,14 +4100,26 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * - * If "promoted" is false, then the page table page "mpte" must be zero filled. + * If "promoted" is false, then the page table page "mpte" must be zero filled; + * "mpte"'s valid field will be set to 0. + * + * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must + * contain valid mappings with identical attributes except for PG_A; "mpte"'s + * valid field will be set to 1. + * + * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain + * valid mappings with identical attributes including PG_A; "mpte"'s valid + * field will be set to VM_PAGE_BITS_ALL. */ static __inline int -pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, + bool allpte_PG_A_set) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; + KASSERT(promoted || !allpte_PG_A_set, + ("a zero-filled PTP can't have PG_A set in every PTE")); + mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } @@ -4052,19 +4140,19 @@ pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count - * drops to zero, then the page table page is unmapped. Returns TRUE if the - * page table page was unmapped and FALSE otherwise. + * drops to zero, then the page table page is unmapped. Returns true if the + * page table page was unmapped and false otherwise. 
*/ -static inline boolean_t +static inline bool pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); - return (TRUE); + return (true); } else - return (FALSE); + return (false); } static void @@ -4128,7 +4216,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ - pmap_add_delayed_free_list(m, free, TRUE); + pmap_add_delayed_free_list(m, free, true); } /* @@ -4169,12 +4257,24 @@ pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) } } +static void +pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen) +{ + struct pmap_pcid *pcidp; + int i; + + CPU_FOREACH(i) { + pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); + pcidp->pm_pcid = pcid; + pcidp->pm_gen = gen; + } +} + void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; - int i; PMAP_LOCK_INIT(pmap); pmap->pm_pmltop = kernel_pmap->pm_pmltop; @@ -4187,10 +4287,8 @@ pmap_pinit0(pmap_t pmap) TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; - CPU_FOREACH(i) { - pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; - pmap->pm_pcids[i].pm_gen = 1; - } + pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK); + pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1); pmap_activate_boot(pmap); td = curthread; if (pti) { @@ -4264,14 +4362,14 @@ pmap_pinit_pml5(vm_page_t pml5pg) */ pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | - pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false); /* * Install self-referential address mapping entry. */ pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | - pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false); } static void @@ -4301,7 +4399,7 @@ pmap_pinit_pml5_pti(vm_page_t pml5pgu) pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = pmap_kextract((vm_offset_t)pti_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | - pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false); } /* Allocate a page table page and do related bookkeeping */ @@ -4338,6 +4436,8 @@ pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) pmap_pt_page_count_adj(pmap, -1); } +_Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp"); + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
@@ -4347,7 +4447,6 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pmltop_pg, pmltop_pgu; vm_paddr_t pmltop_phys; - int i; bzero(&pmap->pm_stats, sizeof pmap->pm_stats); @@ -4371,9 +4470,11 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); - CPU_FOREACH(i) { - pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; - pmap->pm_pcids[i].pm_gen = 0; + if (pmap_pcid_enabled) { + if (pmap->pm_pcidp == NULL) + pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, + M_WAITOK); + pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0); } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; @@ -4997,13 +5098,22 @@ pmap_growkernel(vm_offset_t addr) vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; + vm_offset_t end; + TSENTER(); mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* - * Return if "addr" is within the range of kernel page table pages - * that were preallocated during pmap bootstrap. Moreover, leave - * "kernel_vm_end" and the kernel page table as they were. + * The kernel map covers two distinct regions of KVA: that used + * for dynamic kernel memory allocations, and the uppermost 2GB + * of the virtual address space. The latter is used to map the + * kernel and loadable kernel modules. This scheme enables the + * use of a special code generation model for kernel code which + * takes advantage of compact addressing modes in machine code. + * + * Both regions grow upwards; to avoid wasting memory, the gap + * in between is unmapped. If "addr" is above "KERNBASE", the + * kernel's region is grown, otherwise the kmem region is grown. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the @@ -5015,22 +5125,35 @@ pmap_growkernel(vm_offset_t addr) * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ - if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) - return; + if (KERNBASE < addr) { + end = KERNBASE + nkpt * NBPDR; + if (end == 0) { + TSEXIT(); + return; + } + } else { + end = kernel_vm_end; + } addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); - if (kernel_vm_end < addr) - kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); - if (kernel_vm_end < addr) - kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); - while (kernel_vm_end < addr) { - pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); + if (addr <= end) { + /* + * The grown region is already mapped, so there is + * nothing to do. 
+ */ + TSEXIT(); + return; + } + + kasan_shadow_map(end, addr - end); + kmsan_shadow_map(end, addr - end); + while (end < addr) { + pdpe = pmap_pdpe(kernel_pmap, end); if ((*pdpe & X86_PG_V) == 0) { - /* We need a new PDP entry */ nkpg = pmap_alloc_pt_page(kernel_pmap, - kernel_vm_end >> PDPSHIFT, VM_ALLOC_WIRED | + pmap_pdpe_pindex(end), VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); @@ -5039,55 +5162,46 @@ pmap_growkernel(vm_offset_t addr) X86_PG_A | X86_PG_M); continue; /* try again */ } - pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); + pde = pmap_pdpe_to_pde(pdpe, end); if ((*pde & X86_PG_V) != 0) { - kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; - if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { - kernel_vm_end = vm_map_max(kernel_map); + end = (end + NBPDR) & ~PDRMASK; + if (end - 1 >= vm_map_max(kernel_map)) { + end = vm_map_max(kernel_map); break; } continue; } - nkpg = pmap_alloc_pt_page(kernel_pmap, - pmap_pde_pindex(kernel_vm_end), VM_ALLOC_WIRED | - VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); + nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), + VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); - kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; - if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { - kernel_vm_end = vm_map_max(kernel_map); + end = (end + NBPDR) & ~PDRMASK; + if (end - 1 >= vm_map_max(kernel_map)) { + end = vm_map_max(kernel_map); break; } } + + if (end <= KERNBASE) + kernel_vm_end = end; + else + nkpt = howmany(end - KERNBASE, NBPDR); + TSEXIT(); } /*************************************************** * page management routines. ***************************************************/ -CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); -CTASSERT(_NPCM == 3); -CTASSERT(_NPCPV == 168); - -static __inline struct pv_chunk * -pv_to_chunk(pv_entry_t pv) -{ - - return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); -} - -#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) - -#define PC_FREE0 0xfffffffffffffffful -#define PC_FREE1 0xfffffffffffffffful -#define PC_FREE2 0x000000fffffffffful - -static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; +static const uint64_t pc_freemask[_NPCM] = { + [0 ... _NPCM - 2] = PC_FREEN, + [_NPCM - 1] = PC_FREEL +}; #ifdef PV_STATS @@ -5299,8 +5413,7 @@ reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) PV_STAT(counter_u64_add(pv_entry_spare, freed)); PV_STAT(counter_u64_add(pv_entry_count, -freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && - pc->pc_map[2] == PC_FREE2) { + if (pc_is_free(pc)) { PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); PV_STAT(counter_u64_add(pc_chunk_count, -1)); PV_STAT(counter_u64_add(pc_chunk_frees, 1)); @@ -5384,8 +5497,7 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv) field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; - if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || - pc->pc_map[2] != PC_FREE2) { + if (!pc_is_free(pc)) { /* 98% of the time, pc is already at the head of the list. 
*/ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); @@ -5510,9 +5622,9 @@ retry: dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; - pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ - pc->pc_map[1] = PC_FREE1; - pc->pc_map[2] = PC_FREE2; + pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ + pc->pc_map[1] = PC_FREEN; + pc->pc_map[2] = PC_FREEL; pvc = &pv_chunks[vm_page_domain(m)]; mtx_lock(&pvc->pvc_lock); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); @@ -5610,9 +5722,9 @@ retry: dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; - pc->pc_map[0] = PC_FREE0; - pc->pc_map[1] = PC_FREE1; - pc->pc_map[2] = PC_FREE2; + pc->pc_map[0] = PC_FREEN; + pc->pc_map[1] = PC_FREEN; + pc->pc_map[2] = PC_FREEL; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); @@ -5786,7 +5898,7 @@ pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ -static boolean_t +static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { @@ -5799,9 +5911,9 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - return (TRUE); + return (true); } else - return (FALSE); + return (false); } /* @@ -5849,11 +5961,11 @@ pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ -static boolean_t +static bool pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; - boolean_t rv; + bool rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); @@ -5906,7 +6018,7 @@ pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, va, pmap); } -static boolean_t +static bool pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { @@ -5923,7 +6035,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); - PG_PTE_CACHE = pmap_cache_mask(pmap, 0); + PG_PTE_CACHE = pmap_cache_mask(pmap, false); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -5940,7 +6052,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); - return (FALSE); + return (false); } mpte = pmap_remove_pt_page(pmap, va); @@ -5977,7 +6089,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); - return (FALSE); + return (false); } if (!in_kernel) @@ -5992,17 +6104,17 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, newpte = pmap_swap_pat(pmap, newpte); /* - * If the page table page is not leftover from an earlier promotion, - * initialize it. + * If the PTP is not leftover from an earlier promotion or it does not + * have PG_A set in every PTE, then fill it. The new PTEs will all + * have PG_A set. 
*/ - if (mpte->valid == 0) + if (!vm_page_all_valid(mpte)) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* - * If the mapping has changed attributes, update the page table - * entries. + * If the mapping has changed attributes, update the PTEs. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); @@ -6045,7 +6157,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, counter_u64_add(pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); - return (TRUE); + return (true); } /* @@ -6071,7 +6183,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ - if (mpte->valid != 0) + if (vm_page_any_valid(mpte)) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* @@ -6137,13 +6249,13 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { - KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_any_valid(mpte), ("pmap_remove_pde: pte page not promoted")); pmap_pt_page_count_adj(pmap, -1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pde: pte page ref count error")); mpte->ref_count = 0; - pmap_add_delayed_free_list(mpte, free, FALSE); + pmap_add_delayed_free_list(mpte, free, false); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); @@ -6250,14 +6362,8 @@ pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, return (anyvalid); } -/* - * Remove the given range of addresses from the specified map. - * - * It is assumed that the start and end are properly - * rounded to the page size. - */ -void -pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +static void +pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) { struct rwlock *lock; vm_page_t mt; @@ -6289,7 +6395,8 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) pmap_delayed_invl_start(); PMAP_LOCK(pmap); - pmap_pkru_on_remove(pmap, sva, eva); + if (map_delete) + pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very @@ -6412,6 +6519,30 @@ out: } /* + * Remove the given range of addresses from the specified map. + * + * It is assumed that the start and end are properly + * rounded to the page size. + */ +void +pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + pmap_remove1(pmap, sva, eva, false); +} + +/* + * Remove the given range of addresses as part of a logical unmap + * operation. This has the effect of calling pmap_remove(), but + * also clears any metadata that should persist for the lifetime + * of a logical mapping. 
+ */ +void +pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + pmap_remove1(pmap, sva, eva, true); +} + +/* * Routine: pmap_remove_all * Function: * Removes this physical page from @@ -6510,12 +6641,12 @@ retry: /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ -static boolean_t +static bool pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; - boolean_t anychanged; + bool anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); @@ -6525,7 +6656,7 @@ pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); - anychanged = FALSE; + anychanged = false; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { @@ -6550,7 +6681,7 @@ retry: if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else - anychanged = TRUE; + anychanged = true; } return (anychanged); } @@ -6569,7 +6700,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; pt_entry_t obits, pbits; - boolean_t anychanged; + bool anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { @@ -6585,7 +6716,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); - anychanged = FALSE; + anychanged = false; /* * Although this function delays and batches the invalidation @@ -6641,7 +6772,7 @@ retry_pdpe: if (!atomic_cmpset_long(pdpe, obits, pbits)) /* PG_PS cannot be cleared under us, */ goto retry_pdpe; - anychanged = TRUE; + anychanged = true; } continue; } @@ -6673,7 +6804,7 @@ retry_pdpe: * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) - anychanged = TRUE; + anychanged = true; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* @@ -6710,7 +6841,7 @@ retry: if (obits & PG_G) pmap_invalidate_page(pmap, sva); else - anychanged = TRUE; + anychanged = true; } } } @@ -6719,7 +6850,6 @@ retry: PMAP_UNLOCK(pmap); } -#if VM_NRESERVLEVEL > 0 static bool pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) { @@ -6729,6 +6859,7 @@ pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) return ((pde & EPT_PG_EXECUTE) != 0); } +#if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion @@ -6736,41 +6867,59 @@ pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. 
*/ -static void -pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, +static bool +pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; - pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; - vm_page_t mpte; + pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; int PG_PTE_CACHE; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (!pmap_ps_enabled(pmap)) + return (false); + PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); - PG_PTE_CACHE = pmap_cache_mask(pmap, 0); - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PG_PTE_CACHE = pmap_cache_mask(pmap, false); /* * Examine the first PTE in the specified PTP. Abort if this PTE is - * either invalid, unused, or does not map the first 4KB physical page - * within a 2MB page. + * ineligible for promotion due to hardware errata, invalid, or does + * not map the first 4KB physical page within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); newpde = *firstpte; - if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) || - !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, - newpde))) { + if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) + return (false); + if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); - return; + return (false); } + + /* + * Both here and in the below "for" loop, to allow for repromotion + * after MADV_FREE, conditionally write protect a clean PTE before + * possibly aborting the promotion due to other PTE attributes. Why? + * Suppose that MADV_FREE is applied to a part of a superpage, the + * address range [S, E). pmap_advise() will demote the superpage + * mapping, destroy the 4KB page mapping at the end of [S, E), and + * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, + * imagine that the memory in [S, E) is recycled, but the last 4KB + * page in [S, E) is not the last to be rewritten, or simply accessed. + * In other words, there is still a 4KB page in [S, E), call it P, + * that is writeable but PG_M and PG_A are clear in P's PTE. Unless + * we write protect P before aborting the promotion, if and when P is + * finally rewritten, there won't be a page fault to trigger + * repromotion. + */ setpde: if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* @@ -6780,6 +6929,8 @@ setpde: if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; + CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" + " in pmap %p", va & ~PDRMASK, pmap); } /* @@ -6787,14 +6938,15 @@ setpde: * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. 
*/ - pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; + allpte_PG_A = newpde & PG_A; + pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { oldpte = *pte; - if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { + if ((oldpte & (PG_FRAME | PG_V)) != pa) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); - return; + return (false); } setpte: if ((oldpte & (PG_M | PG_RW)) == PG_RW) { @@ -6813,17 +6965,35 @@ setpte: counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); - return; + return (false); } + allpte_PG_A &= oldpte; pa -= PAGE_SIZE; } /* - * Save the page table page in its current state until the PDE - * mapping the superpage is demoted by pmap_demote_pde() or - * destroyed by pmap_remove_pde(). + * Unless all PTEs have PG_A set, clear it from the superpage mapping, + * so that promotions triggered by speculative mappings, such as + * pmap_enter_quick(), don't automatically mark the underlying pages + * as referenced. + */ + newpde &= ~PG_A | allpte_PG_A; + + /* + * EPT PTEs with PG_M set and PG_A clear are not supported by early + * MMUs supporting EPT. + */ + KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde), + ("unsupported EPT PTE")); + + /* + * Save the PTP in its current state until the PDE mapping the + * superpage is demoted by pmap_demote_pde() or destroyed by + * pmap_remove_pde(). If PG_A is not set in every PTE, then request + * that the PTP be refilled on demotion. */ - mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + if (mpte == NULL) + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); @@ -6831,12 +7001,12 @@ setpte: ("pmap_promote_pde: page table page's pindex is wrong " "mpte %p pidx %#lx va %#lx va pde pidx %#lx", mpte, mpte->pindex, va, pmap_pde_pindex(va))); - if (pmap_insert_pt_page(pmap, mpte, true)) { + if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); - return; + return (false); } /* @@ -6861,6 +7031,7 @@ setpte: counter_u64_add(pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); + return (true); } #endif /* VM_NRESERVLEVEL > 0 */ @@ -6986,7 +7157,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; - boolean_t nosleep; + bool nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); @@ -7234,10 +7405,9 @@ unchanged: * populated, then attempt promotion. */ if ((mpte == NULL || mpte->ref_count == NPTEPG) && - pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) - pmap_promote_pde(pmap, pde, va, &lock); + (void)pmap_promote_pde(pmap, pde, va, mpte, &lock); #endif rv = KERN_SUCCESS; @@ -7249,13 +7419,12 @@ out: } /* - * Tries to create a read- and/or execute-only 2MB page mapping. Returns true - * if successful. Returns false if (1) a page table page cannot be allocated - * without sleeping, (2) a mapping already exists at the specified virtual - * address, or (3) a PV entry cannot be allocated without reclaiming another - * PV entry. + * Tries to create a read- and/or execute-only 2MB page mapping. 
Returns + * KERN_SUCCESS if the mapping was created. Otherwise, returns an error + * value. See pmap_enter_pde() for the possible error values when "no sleep", + * "no replace", and "no reclaim" are specified. */ -static bool +static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { @@ -7264,8 +7433,8 @@ pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); - newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | - PG_PS | PG_V; + newpde = VM_PAGE_TO_PHYS(m) | + pmap_cache_bits(pmap, m->md.pat_mode, true) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) @@ -7273,8 +7442,7 @@ pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | - PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == - KERN_SUCCESS); + PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); } /* @@ -7297,12 +7465,19 @@ pmap_every_pte_zero(vm_paddr_t pa) /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if - * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE - * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and - * a mapping already exists at the specified virtual address. Returns - * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table - * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if - * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. + * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, + * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. Returns + * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB + * page mapping already exists within the 2MB virtual address range starting + * at the specified virtual address or (2) the requested 2MB page mapping is + * not supported due to hardware errata. Returns KERN_NO_SPACE if + * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at + * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU + * settings are not the same across the 2MB virtual address range starting at + * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either + * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation + * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation + * failed. * * The parameter "m" is only used when creating a managed, writeable mapping. 
*/ @@ -7314,9 +7489,8 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; + vm_page_t uwptpg; - KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, - ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, @@ -7358,14 +7532,23 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, if ((oldpde & PG_V) != 0) { KASSERT(pdpg == NULL || pdpg->ref_count > 1, ("pmap_enter_pde: pdpg's reference count is too low")); - if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < - VM_MAXUSER_ADDRESS || (oldpde & PG_PS) != 0 || - !pmap_every_pte_zero(oldpde & PG_FRAME))) { - if (pdpg != NULL) - pdpg->ref_count--; - CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" - " in pmap %p", va, pmap); - return (KERN_FAILURE); + if ((flags & PMAP_ENTER_NOREPLACE) != 0) { + if ((oldpde & PG_PS) != 0) { + if (pdpg != NULL) + pdpg->ref_count--; + CTR2(KTR_PMAP, + "pmap_enter_pde: no space for va %#lx" + " in pmap %p", va, pmap); + return (KERN_NO_SPACE); + } else if (va < VM_MAXUSER_ADDRESS || + !pmap_every_pte_zero(oldpde & PG_FRAME)) { + if (pdpg != NULL) + pdpg->ref_count--; + CTR2(KTR_PMAP, + "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_FAILURE); + } } /* Break the existing mapping(s). */ SLIST_INIT(&free); @@ -7399,11 +7582,27 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); - if (pmap_insert_pt_page(pmap, mt, false)) + if (pmap_insert_pt_page(pmap, mt, false, false)) panic("pmap_enter_pde: trie insert failed"); } } + /* + * Allocate leaf ptpage for wired userspace pages. + */ + uwptpg = NULL; + if ((newpde & PG_W) != 0 && pmap != kernel_pmap) { + uwptpg = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), + VM_ALLOC_WIRED); + if (uwptpg == NULL) + return (KERN_RESOURCE_SHORTAGE); + if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { + pmap_free_pt_page(pmap, uwptpg, false); + return (KERN_RESOURCE_SHORTAGE); + } + + uwptpg->ref_count = NPTEPG; + } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. 
@@ -7411,6 +7610,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { if (pdpg != NULL) pmap_abort_ptp(pmap, va, pdpg); + if (uwptpg != NULL) { + mt = pmap_remove_pt_page(pmap, va); + KASSERT(mt == uwptpg, + ("removed pt page %p, expected %p", mt, + uwptpg)); + uwptpg->ref_count = 1; + pmap_free_pt_page(pmap, uwptpg, false); + } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); @@ -7460,6 +7667,7 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; + int rv; VM_OBJECT_ASSERT_LOCKED(m_start->object); @@ -7472,7 +7680,8 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && - pmap_enter_2mpage(pmap, va, m, prot, &lock)) + ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == + KERN_SUCCESS || rv == KERN_NO_SPACE)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, @@ -7510,6 +7719,7 @@ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { + pd_entry_t *pde; pt_entry_t newpte, *pte, PG_V; KASSERT(!VA_IS_CLEANMAP(va) || @@ -7517,14 +7727,15 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pde = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { + pdp_entry_t *pdpe; vm_pindex_t ptepindex; - pd_entry_t *ptepa; /* * Calculate pagetable page index @@ -7534,30 +7745,34 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, mpte->ref_count++; } else { /* - * Get the page directory entry - */ - ptepa = pmap_pde(pmap, va); - - /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we - * attempt to allocate a page table page. If this - * attempt fails, we don't retry. Instead, we give up. + * attempt to allocate a page table page, passing NULL + * instead of the PV list lock pointer because we don't + * intend to sleep. If this attempt fails, we don't + * retry. Instead, we give up. */ - if (ptepa && (*ptepa & PG_V) != 0) { - if (*ptepa & PG_PS) + pdpe = pmap_pdpe(pmap, va); + if (pdpe != NULL && (*pdpe & PG_V) != 0) { + if ((*pdpe & PG_PS) != 0) return (NULL); - mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); - mpte->ref_count++; + pde = pmap_pdpe_to_pde(pdpe, va); + if ((*pde & PG_V) != 0) { + if ((*pde & PG_PS) != 0) + return (NULL); + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + mpte->ref_count++; + } else { + mpte = pmap_allocpte_alloc(pmap, + ptepindex, NULL, va); + if (mpte == NULL) + return (NULL); + } } else { - /* - * Pass NULL instead of the PV list lock - * pointer, because we don't intend to sleep. 
- */ mpte = pmap_allocpte_alloc(pmap, ptepindex, NULL, va); if (mpte == NULL) - return (mpte); + return (NULL); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); @@ -7588,7 +7803,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, pmap_resident_count_adj(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | - pmap_cache_bits(pmap, m->md.pat_mode, 0); + pmap_cache_bits(pmap, m->md.pat_mode, false); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) @@ -7596,6 +7811,27 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); + +#if VM_NRESERVLEVEL > 0 + /* + * If both the PTP and the reservation are fully populated, then + * attempt promotion. + */ + if ((mpte == NULL || mpte->ref_count == NPTEPG) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0) { + if (pde == NULL) + pde = pmap_pde(pmap, va); + + /* + * If promotion succeeds, then the next call to this function + * should not be given the unmapped PTP as a hint. + */ + if (pmap_promote_pde(pmap, pde, va, mpte, lockp)) + mpte = NULL; + } +#endif + return (mpte); } @@ -7610,7 +7846,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i) va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); - invlpg(va); + pmap_invlpg(kernel_pmap, va); return ((void *)crashdumpmap); } @@ -7643,7 +7879,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); - KASSERT(p->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(p), ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; @@ -7663,7 +7899,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { - KASSERT(p->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(p), ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) @@ -7677,7 +7913,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, * will not affect the termination of this loop. 
*/ PMAP_LOCK(pmap); - for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); + for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, true); pa < ptepa + size; pa += NBPDR) { pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); if (pde == NULL) { @@ -8036,9 +8272,16 @@ pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) void pmap_zero_page(vm_page_t m) { - vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + vm_offset_t va; +#ifdef TSLOG_PAGEZERO + TSENTER(); +#endif + va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); +#ifdef TSLOG_PAGEZERO + TSEXIT(); +#endif } /* @@ -8078,7 +8321,7 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; - boolean_t mapped; + bool mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; @@ -8087,12 +8330,12 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); - mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); + mapped = pmap_map_io_transient(pages, vaddr, 2, false); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) - pmap_unmap_io_transient(pages, vaddr, 2, FALSE); + pmap_unmap_io_transient(pages, vaddr, 2, false); a_offset += cnt; b_offset += cnt; xfersize -= cnt; @@ -8106,23 +8349,23 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ -boolean_t +bool pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; - boolean_t rv; + bool rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); - rv = FALSE; + rv = false; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { - rv = TRUE; + rv = true; break; } loops++; @@ -8133,7 +8376,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { - rv = TRUE; + rv = true; break; } loops++; @@ -8211,17 +8454,17 @@ restart: } /* - * Returns TRUE if the given page is mapped individually or as part of - * a 2mpage. Otherwise, returns FALSE. + * Returns true if the given page is mapped individually or as part of + * a 2mpage. Otherwise, returns false. */ -boolean_t +bool pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; - boolean_t rv; + bool rv; if ((m->oflags & VPO_UNMANAGED) != 0) - return (FALSE); + return (false); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || @@ -8275,7 +8518,7 @@ pmap_remove_pages(pmap_t pmap) #ifdef PV_STATS int freed; #endif - boolean_t superpage; + bool superpage; vm_paddr_t pa; /* @@ -8325,7 +8568,7 @@ pmap_remove_pages(pmap_t pmap) pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { - superpage = FALSE; + superpage = false; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); @@ -8342,7 +8585,7 @@ pmap_remove_pages(pmap_t pmap) * regular page could be mistaken for * a superpage. 
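pmap_remove_pages() keeps an explicit superpage flag because PG_PS in a PDE is numerically the same bit as the PAT selector in a 4KB PTE, so the superpage test must be made before walking down a level. The following self-contained helper shows that check; the constants are the architectural x86 values and the function is illustrative only.

    #include <stdint.h>

    #define X86_PG_V    0x001ULL
    #define X86_PG_PS   0x080ULL    /* PDE: 2MB page; PTE: PAT bit */

    enum pde_kind { PDE_INVALID, PDE_PTPAGE, PDE_SUPERPAGE };

    enum pde_kind
    classify_pde(uint64_t pde)
    {
            if ((pde & X86_PG_V) == 0)
                    return (PDE_INVALID);
            if ((pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V)
                    return (PDE_PTPAGE);    /* walk down to the 4KB PTEs */
            return (PDE_SUPERPAGE);         /* remove the whole 2MB mapping */
    }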
*/ - superpage = TRUE; + superpage = true; } if ((tpte & PG_V) == 0) { @@ -8410,13 +8653,13 @@ pmap_remove_pages(pmap_t pmap) } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { - KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_any_valid(mpte), ("pmap_remove_pages: pte page not promoted")); pmap_pt_page_count_adj(pmap, -1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pages: pte page reference count error")); mpte->ref_count = 0; - pmap_add_delayed_free_list(mpte, &free, FALSE); + pmap_add_delayed_free_list(mpte, &free, false); } } else { pmap_resident_count_adj(pmap, -1); @@ -8453,8 +8696,8 @@ pmap_remove_pages(pmap_t pmap) vm_page_free_pages_toq(&free, true); } -static boolean_t -pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) +static bool +pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) { struct rwlock *lock; pv_entry_t pv; @@ -8463,9 +8706,9 @@ pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; - boolean_t rv; + bool rv; - rv = FALSE; + rv = false; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: @@ -8543,7 +8786,7 @@ out: * Return whether or not the specified physical page was modified * in any physical maps. */ -boolean_t +bool pmap_is_modified(vm_page_t m) { @@ -8554,8 +8797,8 @@ pmap_is_modified(vm_page_t m) * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) - return (FALSE); - return (pmap_page_test_mappings(m, FALSE, TRUE)); + return (false); + return (pmap_page_test_mappings(m, false, true)); } /* @@ -8564,20 +8807,20 @@ pmap_is_modified(vm_page_t m) * Return whether or not the specified virtual address is eligible * for prefault. */ -boolean_t +bool pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; - boolean_t rv; + bool rv; PG_V = pmap_valid_bit(pmap); /* - * Return TRUE if and only if the PTE for the specified virtual + * Return true if and only if the PTE for the specified virtual * address is allocated but invalid. */ - rv = FALSE; + rv = false; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { @@ -8594,13 +8837,13 @@ pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) * Return whether or not the specified physical page was referenced * in any physical maps. */ -boolean_t +bool pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); - return (pmap_page_test_mappings(m, TRUE, FALSE)); + return (pmap_page_test_mappings(m, true, false)); } /* @@ -8689,33 +8932,6 @@ retry: pmap_delayed_invl_wait(m); } -static __inline boolean_t -safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) -{ - - if (!pmap_emulate_ad_bits(pmap)) - return (TRUE); - - KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); - - /* - * XWR = 010 or 110 will cause an unconditional EPT misconfiguration - * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared - * if the EPT_PG_WRITE bit is set. - */ - if ((pte & EPT_PG_WRITE) != 0) - return (FALSE); - - /* - * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 
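The removed safe_to_clear_referenced() (relocated earlier in the file by this change) encodes the EPT permission-encoding rule: clearing the read/referenced bit while write is set would leave XWR = 010 or 110, which is an EPT misconfiguration, and execute-only (XWR = 100) is legal only when the hardware supports it. A standalone version of that predicate follows, using the architectural EPT permission bit positions; the function name and flag parameter are illustrative.

    #include <stdbool.h>
    #include <stdint.h>

    #define EPT_PG_READ     0x1ULL  /* doubles as the "referenced" bit */
    #define EPT_PG_WRITE    0x2ULL
    #define EPT_PG_EXECUTE  0x4ULL

    bool
    ept_can_clear_read_bit(uint64_t pte, bool exec_only_supported)
    {
            /* Clearing R while W is set would leave XWR = 010 or 110. */
            if ((pte & EPT_PG_WRITE) != 0)
                    return (false);
            /* XWR = 100 (execute-only) needs hardware exec-only support. */
            if ((pte & EPT_PG_EXECUTE) != 0 && !exec_only_supported)
                    return (false);
            return (true);
    }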
- */ - if ((pte & EPT_PG_EXECUTE) == 0 || - ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) - return (TRUE); - else - return (FALSE); -} - /* * pmap_ts_referenced: * @@ -8749,7 +8965,7 @@ pmap_ts_referenced(vm_page_t m) vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; - boolean_t demoted; + bool demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); @@ -8817,7 +9033,7 @@ retry: if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); - demoted = FALSE; + demoted = false; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* @@ -8828,7 +9044,7 @@ retry: * this removal never frees a page * table page. */ - demoted = TRUE; + demoted = true; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); @@ -8836,7 +9052,7 @@ retry: NULL, &lock); pmap_invalidate_page(pmap, va); } else - demoted = TRUE; + demoted = true; if (demoted) { /* @@ -8986,13 +9202,8 @@ pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; - if ((*pdpe & PG_PS) != 0) { - KASSERT(va_next <= eva, - ("partial update of non-transparent 1G mapping " - "pdpe %#lx sva %#lx eva %#lx va_next %#lx", - *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) continue; - } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) @@ -9226,7 +9437,7 @@ pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) panic("%s: too many preinit mappings", __func__); } else { /* - * If we have a preinit mapping, re-use it. + * If we have a preinit mapping, reuse it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; @@ -9295,12 +9506,14 @@ pmap_mapbios(vm_paddr_t pa, vm_size_t size) } void -pmap_unmapdev(vm_offset_t va, vm_size_t size) +pmap_unmapdev(void *p, vm_size_t size) { struct pmap_preinit_mapping *ppim; - vm_offset_t offset; + vm_offset_t offset, va; int i; + va = (vm_offset_t)p; + /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; @@ -9330,7 +9543,7 @@ pmap_unmapdev(vm_offset_t va, vm_size_t size) /* * Tries to demote a 1GB page mapping. */ -static boolean_t +static bool pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; @@ -9353,7 +9566,7 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) if (pdpg == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); - return (FALSE); + return (false); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); @@ -9385,7 +9598,7 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) counter_u64_add(pmap_pdpe_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); - return (TRUE); + return (true); } /* @@ -9728,12 +9941,12 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, * is not mandatory. The caller may, however, request a TLB invalidation. 
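pmap_demote_DMAP() only splits the direct-map mapping down to the granularity the caller asked for: a 1GB mapping is demoted whenever len < NBPDP, and the resulting 2MB mapping is demoted further only when len < NBPDR, with an optional invalidation afterwards. The sketch below loosely models that decision; demote_1g(), demote_2m() and tlb_invalidate() are stand-ins for the real pmap routines, and the real code's panics on demotion failure and dmaplimit check are omitted.

    #include <stdbool.h>
    #include <stdint.h>

    #define NBPDR   (1ULL << 21)    /* 2MB */
    #define NBPDP   (1ULL << 30)    /* 1GB */

    bool demote_1g(uint64_t va);    /* assumed helpers */
    bool demote_2m(uint64_t va);
    void tlb_invalidate(uint64_t va, uint64_t len);

    void
    demote_dmap_range(uint64_t va, uint64_t len, bool invalidate)
    {
            bool changed = false;

            if (len == 0 || len >= NBPDP)
                    return;         /* existing mapping is already small enough */
            if (demote_1g(va))
                    changed = true;
            if (len < NBPDR && demote_2m(va))
                    changed = true;
            if (changed && invalidate)
                    tlb_invalidate(va, len);
    }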
*/ void -pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) +pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; - boolean_t changed; + bool changed; if (len == 0) return; @@ -9742,7 +9955,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); - changed = FALSE; + changed = false; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) @@ -9750,7 +9963,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); - changed = TRUE; + changed = true; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); @@ -9759,7 +9972,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); - changed = TRUE; + changed = true; } } if (changed && invalidate) @@ -9835,20 +10048,20 @@ out: } static uint64_t -pmap_pcid_alloc(pmap_t pmap, u_int cpuid) +pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); - if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) + if (pcidp->pm_pcid == PMAP_PCID_KERN) return (pti ? 0 : CR3_PCID_SAVE); - if (pmap->pm_pcids[cpuid].pm_gen == gen) + if (pcidp->pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), - ("cpu %d pcid_next %#x", cpuid, pcid_next)); + ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; @@ -9859,25 +10072,23 @@ pmap_pcid_alloc(pmap_t pmap, u_int cpuid) } else { new_gen = gen; } - pmap->pm_pcids[cpuid].pm_pcid = pcid_next; - pmap->pm_pcids[cpuid].pm_gen = new_gen; + pcidp->pm_pcid = pcid_next; + pcidp->pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t -pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) +pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp) { uint64_t cached; - cached = pmap_pcid_alloc(pmap, cpuid); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, - ("pmap %p cpu %d pcid %#x", pmap, cpuid, - pmap->pm_pcids[cpuid].pm_pcid)); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || - pmap == kernel_pmap, + cached = pmap_pcid_alloc(pmap, pcidp); + KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX, + ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); + KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", - pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); + pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); return (cached); } @@ -9893,6 +10104,7 @@ static void pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_t old_pmap; + struct pmap_pcid *pcidp, *old_pcidp; uint64_t cached, cr3, kcr3, ucr3; KASSERT((read_rflags() & PSL_I) == 0, @@ -9903,17 +10115,18 @@ pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); old_pmap = PCPU_GET(curpmap); MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); - old_pmap->pm_pcids[cpuid].pm_gen = 0; + old_pcidp = 
zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid); + old_pcidp->pm_gen = 0; } - cached = pmap_pcid_alloc_checked(pmap, cpuid); + pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); + cached = pmap_pcid_alloc_checked(pmap, pcidp); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) - load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); + load_cr3(pmap->pm_cr3 | pcidp->pm_pcid); PCPU_SET(curpmap, pmap); - kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; - ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | - PMAP_PCID_USER_PT; + kcr3 = pmap->pm_cr3 | pcidp->pm_pcid; + ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); @@ -9930,16 +10143,17 @@ static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { + struct pmap_pcid *pcidp; uint64_t cached, cr3; KASSERT((read_rflags() & PSL_I) == 0, ("PCID needs interrupts disabled in pmap_activate_sw()")); - cached = pmap_pcid_alloc_checked(pmap, cpuid); + pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); + cached = pmap_pcid_alloc_checked(pmap, pcidp); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) - load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | - cached); + load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) counter_u64_add(pcid_save_cnt, 1); @@ -10053,7 +10267,7 @@ pmap_activate_boot(pmap_t pmap) if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) - kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; + kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } @@ -10062,6 +10276,12 @@ pmap_activate_boot(pmap_t pmap) } void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} + +void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } @@ -10180,10 +10400,9 @@ pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->ref_count == NPTEPG) && - pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && - vm_reserv_level_iffullpop(m) == 0) { - pmap_promote_pde(pmap, pde, va, &lock); + vm_reserv_level_iffullpop(m) == 0 && + pmap_promote_pde(pmap, pde, va, mpte, &lock)) { #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif @@ -10252,19 +10471,19 @@ done: * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. - * \param can_fault TRUE if the thread using the mapped pages can take - * page faults, FALSE otherwise. + * \param can_fault true if the thread using the mapped pages can take + * page faults, false otherwise. * - * \returns TRUE if the caller must call pmap_unmap_io_transient when - * finished or FALSE otherwise. + * \returns true if the caller must call pmap_unmap_io_transient when + * finished or false otherwise. * */ -boolean_t +bool pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, - boolean_t can_fault) + bool can_fault) { vm_paddr_t paddr; - boolean_t needs_mapping; + bool needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; @@ -10272,14 +10491,14 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. 
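The PCID changes above replace the pm_pcids[cpuid] array with a per-CPU struct pmap_pcid obtained through zpcpu_get_cpu(), but the generation scheme is unchanged: a pmap's cached PCID is valid only while its pm_gen matches the CPU's pcid_gen, and exhausting the PCID space bumps the generation, implicitly invalidating every cached PCID on that CPU. A self-contained model follows; the structure and reserved values are illustrative, while CR3_PCID_SAVE is the architectural bit-63 "no TLB flush" hint.

    #include <stdint.h>

    #define CR3_PCID_SAVE   (1ULL << 63)    /* skip TLB flush on CR3 load */
    #define PCID_MAX        0xfff           /* PCIDs are 12 bits wide */

    struct percpu_pcid {
            uint32_t pm_pcid;       /* PCID this pmap uses on this CPU */
            uint32_t pm_gen;        /* generation it was allocated under */
    };

    struct cpu_state {
            uint32_t pcid_gen;      /* current generation on this CPU */
            uint32_t pcid_next;     /* next PCID to hand out */
    };

    /* Returns CR3_PCID_SAVE if the cached PCID is still valid, 0 otherwise. */
    uint64_t
    pcid_alloc(struct cpu_state *cpu, struct percpu_pcid *pcidp)
    {
            if (pcidp->pm_gen == cpu->pcid_gen)
                    return (CR3_PCID_SAVE);         /* reuse, no flush needed */
            if (cpu->pcid_next == PCID_MAX) {       /* wrap: invalidate all */
                    cpu->pcid_gen++;
                    cpu->pcid_next = 2;             /* 0/1 reserved in this model */
            }
            pcidp->pm_pcid = cpu->pcid_next++;
            pcidp->pm_gen = cpu->pcid_gen;
            return (0);                             /* new PCID: flush on load */
    }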
*/ - needs_mapping = FALSE; + needs_mapping = false; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); - needs_mapping = TRUE; + needs_mapping = true; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } @@ -10287,7 +10506,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) - return (FALSE); + return (false); /* * NB: The sequence of updating a page table followed by accesses @@ -10313,10 +10532,10 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, - page[i]->md.pat_mode, 0); + page[i]->md.pat_mode, false); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); - invlpg(vaddr[i]); + pmap_invlpg(kernel_pmap, vaddr[i]); } } } @@ -10326,7 +10545,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, - boolean_t can_fault) + bool can_fault) { vm_paddr_t paddr; int i; @@ -10353,8 +10572,15 @@ pmap_quick_enter_page(vm_page_t m) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); + + /* + * Since qframe is exclusively mapped by us, and we do not set + * PG_G, we can use INVLPG here. + */ + invlpg(qframe); + pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | - X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); + X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, false)); return (qframe); } @@ -10365,7 +10591,6 @@ pmap_quick_remove_page(vm_offset_t addr) if (addr != qframe) return; pte_store(vtopte(qframe), 0); - invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } @@ -10568,7 +10793,7 @@ pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | - pmap_cache_bits(kernel_pmap, mattr, TRUE); + pmap_cache_bits(kernel_pmap, mattr, true); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { @@ -10576,7 +10801,7 @@ pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | - pmap_cache_bits(kernel_pmap, mattr, TRUE); + pmap_cache_bits(kernel_pmap, mattr, true); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> ref_count++; inc = NBPDR; @@ -10585,7 +10810,7 @@ pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, - mattr, FALSE); + mattr, false); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> ref_count++; inc = PAGE_SIZE; @@ -10805,7 +11030,7 @@ pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) /* * If we saw other write-back - * occuring, we cannot rely on PG_M to + * occurring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, @@ -11099,7 +11324,7 @@ pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, - VM_MEMATTR_DEFAULT, FALSE); + VM_MEMATTR_DEFAULT, false); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); @@ -11440,13 +11665,16 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) /* * Reserve enough memory to: * 1) allocate PDP pages for the shadow map(s), - * 2) shadow one page of memory, so one PD page, one PT page, and one shadow - * page per shadow map. + * 2) shadow the boot stack of KSTACK_PAGES pages, + * so we need one PD page, one or two PT pages, and KSTACK_PAGES shadow pages + * per shadow map. */ #ifdef KASAN -#define SAN_EARLY_PAGES (NKASANPML4E + 3) +#define SAN_EARLY_PAGES \ + (NKASANPML4E + 1 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE)) #else -#define SAN_EARLY_PAGES (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * 3) +#define SAN_EARLY_PAGES \ + (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (1 + 2 + KSTACK_PAGES)) #endif static uint64_t __nosanitizeaddress __nosanitizememory |
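The new SAN_EARLY_PAGES formulas size the early shadow allocation for a whole boot stack rather than a single page: KASAN's stack shadow is scaled down by KASAN_SHADOW_SCALE, while KMSAN needs full-size shadow and origin copies, hence the factor of two. The small program below reproduces the arithmetic with assumed values for KSTACK_PAGES and KASAN_SHADOW_SCALE and placeholder PML4 slot counts; the real values come from the kernel configuration and may differ.

    #include <stdio.h>

    #define howmany(x, y)   (((x) + ((y) - 1)) / (y))

    #define KSTACK_PAGES        4   /* assumed */
    #define KASAN_SHADOW_SCALE  8   /* assumed: 1 shadow byte per 8 bytes */
    #define NKASANPML4E         4   /* placeholder */
    #define NKMSANSHADPML4E     16  /* placeholder */
    #define NKMSANORIGPML4E     16  /* placeholder */

    int
    main(void)
    {
            /* KASAN: PML4 pages + 1 PD + 2 PTs + scaled-down stack shadow. */
            int kasan_pages = NKASANPML4E + 1 + 2 +
                howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE);

            /* KMSAN: shadow and origin maps are both full size, hence 2 *. */
            int kmsan_pages = NKMSANSHADPML4E + NKMSANORIGPML4E +
                2 * (1 + 2 + KSTACK_PAGES);

            printf("KASAN early pages: %d\nKMSAN early pages: %d\n",
                kasan_pages, kmsan_pages);
            return (0);
    }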