author		Leandro Lupori <luporl@FreeBSD.org>	2020-11-06 14:12:45 +0000
committer	Leandro Lupori <luporl@FreeBSD.org>	2020-11-06 14:12:45 +0000
commit		e2d6c417e3030c814e048ec2cda803cb7971bd75 (patch)
tree		41b1c511a9ead5d7bd50a96f4ad93ddb7026caec
parent		5d0e861910978450c46d2a65385aeb3286a17fa4 (diff)
Implement superpages for PowerPC64 (HPT)
This change adds support for transparent superpages for PowerPC64 systems
using Hashed Page Tables (HPT). All pmap operations are supported.

The changes were inspired by the RISC-V implementation of superpages by
@markj (r344106), but heavily adapted to fit the PPC64 HPT architecture
and the existing MMU OEA64 code.

Until these changes are better tested, superpage support is disabled by
default. To enable it, set vm.pmap.superpages_enabled=1.

In this initial implementation, when superpages are disabled, system
performance stays at the same level as without these changes. When
superpages are enabled, buildworld time increases slightly (~2%), but
workloads that put heavy pressure on the TLB get a much bigger performance
boost (see the HPC Challenge and pgbench results on D25237).

Reviewed by:	jhibbits
Sponsored by:	Eldorado Research Institute (eldorado.org.br)
Differential Revision:	https://reviews.freebsd.org/D25237
Notes:
    svn path=/head/; revision=367417
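As a quick way to verify the feature on a kernel that includes this change,
the sketch below (not part of the patch, purely illustrative) reads the
vm.pmap.superpages_enabled sysctl added by this commit and lists the page
sizes the kernel exposes, using only the standard sysctlbyname(3) and
getpagesizes(3) interfaces:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	int enabled, i, n;
	size_t len, sizes[8];

	/* CTLFLAG_RDTUN sysctl added by this commit; reads 0 when disabled. */
	len = sizeof(enabled);
	if (sysctlbyname("vm.pmap.superpages_enabled", &enabled, &len,
	    NULL, 0) != 0)
		enabled = 0;
	printf("vm.pmap.superpages_enabled: %d\n", enabled);

	/* With superpages enabled on HPT, 16777216 (16MB) should be listed. */
	n = getpagesizes(sizes, 8);
	for (i = 0; i < n; i++)
		printf("supported page size: %zu bytes\n", sizes[i]);
	return (0);
}

The tunable itself is set from loader.conf (vm.pmap.superpages_enabled=1),
as noted in the commit message.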
-rw-r--r--	sys/powerpc/aim/mmu_oea64.c		1146
-rw-r--r--	sys/powerpc/aim/mmu_oea64.h		10
-rw-r--r--	sys/powerpc/aim/moea64_native.c		328
-rw-r--r--	sys/powerpc/include/pmap.h		4
-rw-r--r--	sys/powerpc/include/pte.h		12
-rw-r--r--	sys/powerpc/include/slb.h		8
-rw-r--r--	sys/powerpc/include/vmparam.h		23
-rw-r--r--	sys/powerpc/powernv/platform_powernv.c	12
-rw-r--r--	sys/powerpc/powerpc/pmap_dispatch.c	2
-rw-r--r--	sys/powerpc/pseries/mmu_phyp.c		195
-rw-r--r--	sys/vm/vm_fault.c			3
11 files changed, 1583 insertions(+), 160 deletions(-)
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 0a5664127397..0ea9ec55f489 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_dumpset.h>
+#include <vm/vm_reserv.h>
#include <vm/uma.h>
#include <machine/_inttypes.h>
@@ -111,9 +112,6 @@ uintptr_t moea64_get_unique_vsid(void);
#define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff)
#define VSID_HASH_MASK 0x0000007fffffffffULL
-/* Get physical address from PVO. */
-#define PVO_PADDR(pvo) ((pvo)->pvo_pte.pa & LPTE_RPGN)
-
/*
* Locking semantics:
*
@@ -146,6 +144,48 @@ static struct mtx_padalign pv_lock[PV_LOCK_COUNT];
#define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))
+/* Superpage PV lock */
+
+#define PV_LOCK_SIZE (1<<PDRSHIFT)
+
+static __always_inline void
+moea64_sp_pv_lock(vm_paddr_t pa)
+{
+ vm_paddr_t pa_end;
+
+ /* Note: breaking when pa_end is reached to avoid overflows */
+ pa_end = pa + (HPT_SP_SIZE - PV_LOCK_SIZE);
+ for (;;) {
+ mtx_lock_flags(PV_LOCKPTR(pa), MTX_DUPOK);
+ if (pa == pa_end)
+ break;
+ pa += PV_LOCK_SIZE;
+ }
+}
+
+static __always_inline void
+moea64_sp_pv_unlock(vm_paddr_t pa)
+{
+ vm_paddr_t pa_end;
+
+ /* Note: breaking when pa_end is reached to avoid overflows */
+ pa_end = pa;
+ pa += HPT_SP_SIZE - PV_LOCK_SIZE;
+ for (;;) {
+ mtx_unlock_flags(PV_LOCKPTR(pa), MTX_DUPOK);
+ if (pa == pa_end)
+ break;
+ pa -= PV_LOCK_SIZE;
+ }
+}
+
+#define SP_PV_LOCK_ALIGNED(pa) moea64_sp_pv_lock(pa)
+#define SP_PV_UNLOCK_ALIGNED(pa) moea64_sp_pv_unlock(pa)
+#define SP_PV_LOCK(pa) moea64_sp_pv_lock((pa) & ~HPT_SP_MASK)
+#define SP_PV_UNLOCK(pa) moea64_sp_pv_unlock((pa) & ~HPT_SP_MASK)
+#define SP_PV_PAGE_LOCK(m) SP_PV_LOCK(VM_PAGE_TO_PHYS(m))
+#define SP_PV_PAGE_UNLOCK(m) SP_PV_UNLOCK(VM_PAGE_TO_PHYS(m))
+
struct ofw_map {
cell_t om_va;
cell_t om_len;
@@ -234,6 +274,7 @@ struct mtx moea64_scratchpage_mtx;
uint64_t moea64_large_page_mask = 0;
uint64_t moea64_large_page_size = 0;
int moea64_large_page_shift = 0;
+bool moea64_has_lp_4k_16m = false;
/*
* PVO calls.
@@ -255,6 +296,95 @@ static void moea64_kremove(vm_offset_t);
static void moea64_syncicache(pmap_t pmap, vm_offset_t va,
vm_paddr_t pa, vm_size_t sz);
static void moea64_pmap_init_qpages(void);
+static void moea64_remove_locked(pmap_t, vm_offset_t,
+ vm_offset_t, struct pvo_dlist *);
+
+/*
+ * Superpages data and routines.
+ */
+
+/*
+ * PVO flags (in vaddr) that must match for promotion to succeed.
+ * Note that protection bits are checked separately, as they reside in
+ * another field.
+ */
+#define PVO_FLAGS_PROMOTE (PVO_WIRED | PVO_MANAGED | PVO_PTEGIDX_VALID)
+
+#define PVO_IS_SP(pvo) (((pvo)->pvo_vaddr & PVO_LARGE) && \
+ (pvo)->pvo_pmap != kernel_pmap)
+
+/* Get physical address from PVO. */
+#define PVO_PADDR(pvo) moea64_pvo_paddr(pvo)
+
+/* MD page flag indicating that the page is a superpage. */
+#define MDPG_ATTR_SP 0x40000000
+
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
+ "VM/pmap parameters");
+
+static int superpages_enabled = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN,
+ &superpages_enabled, 0, "Enable support for transparent superpages");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0,
+ "SP page mapping counters");
+
+static u_long sp_demotions;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD,
+ &sp_demotions, 0, "SP page demotions");
+
+static u_long sp_mappings;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD,
+ &sp_mappings, 0, "SP page mappings");
+
+static u_long sp_p_failures;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD,
+ &sp_p_failures, 0, "SP page promotion failures");
+
+static u_long sp_p_fail_pa;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD,
+ &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match");
+
+static u_long sp_p_fail_flags;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD,
+ &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match");
+
+static u_long sp_p_fail_prot;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD,
+ &sp_p_fail_prot, 0,
+ "SP page promotion failure: page protections don't match");
+
+static u_long sp_p_fail_wimg;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD,
+ &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match");
+
+static u_long sp_promotions;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD,
+ &sp_promotions, 0, "SP page promotions");
+
+static bool moea64_ps_enabled(pmap_t);
+static void moea64_align_superpage(vm_object_t, vm_ooffset_t,
+ vm_offset_t *, vm_size_t);
+
+static int moea64_sp_enter(pmap_t pmap, vm_offset_t va,
+ vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind);
+static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp,
+ struct pvo_dlist *tofree);
+
+static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m);
+static void moea64_sp_demote_aligned(struct pvo_entry *sp);
+static void moea64_sp_demote(struct pvo_entry *pvo);
+
+static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp);
+static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp,
+ vm_prot_t prot);
+
+static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit);
+static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m,
+ uint64_t ptebit);
+
+static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo,
+ vm_offset_t sva, vm_offset_t eva);
/*
* Kernel MMU interface
@@ -362,6 +492,8 @@ static struct pmap_funcs moea64_methods = {
#ifdef __powerpc64__
.page_array_startup = moea64_page_array_startup,
#endif
+ .ps_enabled = moea64_ps_enabled,
+ .align_superpage = moea64_align_superpage,
/* Internal interfaces */
.mapdev = moea64_mapdev,
@@ -381,6 +513,26 @@ static struct pmap_funcs moea64_methods = {
MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods);
+/*
+ * Get physical address from PVO.
+ *
+ * For superpages, the lower bits are not stored in pvo_pte.pa and must be
+ * obtained from the VA.
+ */
+static __always_inline vm_paddr_t
+moea64_pvo_paddr(struct pvo_entry *pvo)
+{
+ vm_paddr_t pa;
+
+ pa = (pvo)->pvo_pte.pa & LPTE_RPGN;
+
+ if (PVO_IS_SP(pvo)) {
+ pa &= ~HPT_SP_MASK; /* This is needed to clear LPTE_LP bits. */
+ pa |= PVO_VADDR(pvo) & HPT_SP_MASK;
+ }
+ return (pa);
+}
+
static struct pvo_head *
vm_page_to_pvoh(vm_page_t m)
{
@@ -428,8 +580,10 @@ init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va)
pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
| (vsid << 16);
- shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift :
- ADDR_PIDX_SHFT;
+ if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0)
+ shift = moea64_large_page_shift;
+ else
+ shift = ADDR_PIDX_SHFT;
hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
}
@@ -773,6 +927,9 @@ moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
vm_paddr_t kernelphysstart, kernelphysend;
int rm_pavail;
+ /* Level 0 reservations consist of 4096 pages (16MB superpage). */
+ vm_level_0_order = 12;
+
#ifndef __powerpc64__
/* We don't have a direct map since there is no BAT */
hw_direct_map = 0;
@@ -1204,6 +1361,17 @@ moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
pvo != NULL && PVO_VADDR(pvo) < eva;
pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ if (PVO_IS_SP(pvo)) {
+ if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
+ pvo = moea64_sp_unwire(pvo);
+ continue;
+ } else {
+ CTR1(KTR_PMAP, "%s: demote before unwire",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
+ }
+
if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
panic("moea64_unwire: pvo %p is missing PVO_WIRED",
pvo);
@@ -1489,10 +1657,11 @@ int
moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, u_int flags, int8_t psind)
{
- struct pvo_entry *pvo, *oldpvo;
+ struct pvo_entry *pvo, *oldpvo, *tpvo;
struct pvo_head *pvo_head;
uint64_t pte_lo;
int error;
+ vm_paddr_t pa;
if ((m->oflags & VPO_UNMANAGED) == 0) {
if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0)
@@ -1501,14 +1670,18 @@ moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
VM_OBJECT_ASSERT_LOCKED(m->object);
}
+ if (psind > 0)
+ return (moea64_sp_enter(pmap, va, m, prot, flags, psind));
+
pvo = alloc_pvo_entry(0);
if (pvo == NULL)
return (KERN_RESOURCE_SHORTAGE);
pvo->pvo_pmap = NULL; /* to be filled in later */
pvo->pvo_pte.prot = prot;
- pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m));
- pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo;
+ pa = VM_PAGE_TO_PHYS(m);
+ pte_lo = moea64_calc_wimg(pa, pmap_page_get_memattr(m));
+ pvo->pvo_pte.pa = pa | pte_lo;
if ((flags & PMAP_ENTER_WIRED) != 0)
pvo->pvo_vaddr |= PVO_WIRED;
@@ -1520,10 +1693,20 @@ moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
pvo->pvo_vaddr |= PVO_MANAGED;
}
- PV_PAGE_LOCK(m);
+ PV_LOCK(pa);
PMAP_LOCK(pmap);
if (pvo->pvo_pmap == NULL)
init_pvo_entry(pvo, pmap, va);
+
+ if (moea64_ps_enabled(pmap) &&
+ (tpvo = moea64_pvo_find_va(pmap, va & ~HPT_SP_MASK)) != NULL &&
+ PVO_IS_SP(tpvo)) {
+ /* Demote SP before entering a regular page */
+ CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx",
+ __func__, (uintmax_t)va);
+ moea64_sp_demote_aligned(tpvo);
+ }
+
if (prot & VM_PROT_WRITE)
if (pmap_bootstrapped &&
(m->oflags & VPO_UNMANAGED) == 0)
@@ -1544,9 +1727,10 @@ moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
}
/* Then just clean up and go home */
- PV_PAGE_UNLOCK(m);
PMAP_UNLOCK(pmap);
+ PV_UNLOCK(pa);
free_pvo_entry(pvo);
+ pvo = NULL;
goto out;
} else {
/* Otherwise, need to kill it first */
@@ -1557,7 +1741,7 @@ moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
}
}
PMAP_UNLOCK(pmap);
- PV_PAGE_UNLOCK(m);
+ PV_UNLOCK(pa);
/* Free any dead pages */
if (error == EEXIST) {
@@ -1573,8 +1757,23 @@ out:
if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 &&
(pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
vm_page_aflag_set(m, PGA_EXECUTABLE);
- moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE);
+ moea64_syncicache(pmap, va, pa, PAGE_SIZE);
}
+
+ /*
+ * Try to promote pages.
+ *
+ * If the VA of the entered page is not aligned with its PA,
+ * don't try page promotion as it is not possible.
+ * This reduces the number of promotion failures dramatically.
+ */
+ if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL &&
+ (pvo->pvo_vaddr & PVO_MANAGED) != 0 &&
+ (va & HPT_SP_MASK) == (pa & HPT_SP_MASK) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
+ vm_reserv_level_iffullpop(m) == 0)
+ moea64_sp_promote(pmap, va, m);
+
return (KERN_SUCCESS);
}
@@ -1633,15 +1832,25 @@ moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
{
vm_page_t m;
vm_pindex_t diff, psize;
+ vm_offset_t va;
+ int8_t psind;
VM_OBJECT_ASSERT_LOCKED(m_start->object);
psize = atop(end - start);
m = m_start;
while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
- moea64_enter(pm, start + ptoa(diff), m, prot &
- (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP |
- PMAP_ENTER_QUICK_LOCKED, 0);
+ va = start + ptoa(diff);
+ if ((va & HPT_SP_MASK) == 0 && va + HPT_SP_SIZE <= end &&
+ m->psind == 1 && moea64_ps_enabled(pm))
+ psind = 1;
+ else
+ psind = 0;
+ moea64_enter(pm, va, m, prot &
+ (VM_PROT_READ | VM_PROT_EXECUTE),
+ PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind);
+ if (psind == 1)
+ m = &m[HPT_SP_SIZE / PAGE_SIZE - 1];
m = TAILQ_NEXT(m, listq);
}
}
@@ -1755,6 +1964,27 @@ moea64_init()
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
UMA_ZONE_VM | UMA_ZONE_NOFREE);
+ /*
+ * Are large page mappings enabled?
+ */
+ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
+ if (superpages_enabled) {
+ KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
+ ("moea64_init: can't assign to pagesizes[1]"));
+
+ if (moea64_large_page_size == 0) {
+ printf("mmu_oea64: HW does not support large pages. "
+ "Disabling superpages...\n");
+ superpages_enabled = 0;
+ } else if (!moea64_has_lp_4k_16m) {
+ printf("mmu_oea64: "
+ "HW does not support mixed 4KB/16MB page sizes. "
+ "Disabling superpages...\n");
+ superpages_enabled = 0;
+ } else
+ pagesizes[1] = HPT_SP_SIZE;
+ }
+
if (!hw_direct_map) {
uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
}
@@ -1834,7 +2064,7 @@ moea64_remove_write(vm_page_t m)
vm_page_assert_busied(m);
if (!pmap_page_is_write_mapped(m))
- return
+ return;
powerpc_sync();
PV_PAGE_LOCK(m);
@@ -1844,6 +2074,11 @@ moea64_remove_write(vm_page_t m)
PMAP_LOCK(pmap);
if (!(pvo->pvo_vaddr & PVO_DEAD) &&
(pvo->pvo_pte.prot & VM_PROT_WRITE)) {
+ if (PVO_IS_SP(pvo)) {
+ CTR1(KTR_PMAP, "%s: demote before remwr",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
if (ret < 0)
@@ -1892,6 +2127,9 @@ moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
pmap_t pmap;
uint64_t lo;
+ CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
+ __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
+
if ((m->oflags & VPO_UNMANAGED) != 0) {
m->md.mdpg_cache_attrs = ma;
return;
@@ -1904,6 +2142,11 @@ moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
pmap = pvo->pvo_pmap;
PMAP_LOCK(pmap);
if (!(pvo->pvo_vaddr & PVO_DEAD)) {
+ if (PVO_IS_SP(pvo)) {
+ CTR1(KTR_PMAP,
+ "%s: demote before set_memattr", __func__);
+ moea64_sp_demote(pvo);
+ }
pvo->pvo_pte.pa &= ~LPTE_WIMG;
pvo->pvo_pte.pa |= lo;
refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
@@ -2356,7 +2599,7 @@ void
moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
vm_prot_t prot)
{
- struct pvo_entry *pvo, *tpvo, key;
+ struct pvo_entry *pvo, key;
CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
sva, eva, prot);
@@ -2372,8 +2615,18 @@ moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
PMAP_LOCK(pm);
key.pvo_vaddr = sva;
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
- pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
- tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
+ pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ if (PVO_IS_SP(pvo)) {
+ if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
+ pvo = moea64_sp_protect(pvo, prot);
+ continue;
+ } else {
+ CTR1(KTR_PMAP, "%s: demote before protect",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
+ }
moea64_pvo_protect(pm, pvo, prot);
}
PMAP_UNLOCK(pm);
@@ -2473,13 +2726,46 @@ moea64_remove_pages(pmap_t pm)
}
}
+static void
+moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
+ struct pvo_dlist *tofree)
+{
+ struct pvo_entry *pvo, *tpvo, key;
+
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ key.pvo_vaddr = sva;
+ for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
+ pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
+ if (PVO_IS_SP(pvo)) {
+ if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
+ tpvo = moea64_sp_remove(pvo, tofree);
+ continue;
+ } else {
+ CTR1(KTR_PMAP, "%s: demote before remove",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
+ }
+ tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
+
+ /*
+ * For locking reasons, remove this from the page table and
+ * pmap, but save delinking from the vm_page for a second
+ * pass
+ */
+ moea64_pvo_remove_from_pmap(pvo);
+ SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
+ }
+}
+
/*
* Remove the given range of addresses from the specified map.
*/
void
moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
{
- struct pvo_entry *pvo, *tpvo, key;
+ struct pvo_entry *pvo;
struct pvo_dlist tofree;
/*
@@ -2488,23 +2774,9 @@ moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
if (pm->pm_stats.resident_count == 0)
return;
- key.pvo_vaddr = sva;
-
SLIST_INIT(&tofree);
-
PMAP_LOCK(pm);
- for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
- pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
- tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
-
- /*
- * For locking reasons, remove this from the page table and
- * pmap, but save delinking from the vm_page for a second
- * pass
- */
- moea64_pvo_remove_from_pmap(pvo);
- SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
- }
+ moea64_remove_locked(pm, sva, eva, &tofree);
PMAP_UNLOCK(pm);
while (!SLIST_EMPTY(&tofree)) {
@@ -2534,8 +2806,14 @@ moea64_remove_all(vm_page_t m)
pmap = pvo->pvo_pmap;
PMAP_LOCK(pmap);
wasdead = (pvo->pvo_vaddr & PVO_DEAD);
- if (!wasdead)
+ if (!wasdead) {
+ if (PVO_IS_SP(pvo)) {
+ CTR1(KTR_PMAP, "%s: demote before remove_all",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
moea64_pvo_remove_from_pmap(pvo);
+ }
moea64_pvo_remove_from_page_locked(pvo, m);
if (!wasdead)
LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
@@ -2768,11 +3046,17 @@ moea64_query_bit(vm_page_t m, uint64_t ptebit)
struct pvo_entry *pvo;
int64_t ret;
boolean_t rv;
+ vm_page_t sp;
/*
* See if this bit is stored in the page already.
+ *
+ * For superpages, the bit is stored in the first vm page.
*/
- if (m->md.mdpg_attrs & ptebit)
+ if ((m->md.mdpg_attrs & ptebit) != 0 ||
+ ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK)) != NULL &&
+ (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) ==
+ (ptebit | MDPG_ATTR_SP)))
return (TRUE);
/*
@@ -2783,6 +3067,21 @@ moea64_query_bit(vm_page_t m, uint64_t ptebit)
powerpc_sync();
PV_PAGE_LOCK(m);
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
+ if (PVO_IS_SP(pvo)) {
+ ret = moea64_sp_query(pvo, ptebit);
+ /*
+ * If SP was not demoted, check its REF/CHG bits here.
+ */
+ if (ret != -1) {
+ if ((ret & ptebit) != 0) {
+ rv = TRUE;
+ break;
+ }
+ continue;
+ }
+ /* else, fallthrough */
+ }
+
ret = 0;
/*
@@ -2828,6 +3127,12 @@ moea64_clear_bit(vm_page_t m, u_int64_t ptebit)
count = 0;
PV_PAGE_LOCK(m);
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
+ if (PVO_IS_SP(pvo)) {
+ if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) {
+ count += ret;
+ continue;
+ }
+ }
ret = 0;
PMAP_LOCK(pvo->pvo_pmap);
@@ -3231,3 +3536,770 @@ DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method)
DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t),
moea64_null_method)
DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method)
+DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method)
+DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method)
+DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method)
+
+/* Superpage functions */
+
+/* MMU interface */
+
+static bool
+moea64_ps_enabled(pmap_t pmap)
+{
+ return (superpages_enabled);
+}
+
+static void
+moea64_align_superpage(vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t *addr, vm_size_t size)
+{
+ vm_offset_t sp_offset;
+
+ if (size < HPT_SP_SIZE)
+ return;
+
+ CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx",
+ __func__, (uintmax_t)offset, addr, (uintmax_t)size);
+
+ if (object != NULL && (object->flags & OBJ_COLORED) != 0)
+ offset += ptoa(object->pg_color);
+ sp_offset = offset & HPT_SP_MASK;
+ if (size - ((HPT_SP_SIZE - sp_offset) & HPT_SP_MASK) < HPT_SP_SIZE ||
+ (*addr & HPT_SP_MASK) == sp_offset)
+ return;
+ if ((*addr & HPT_SP_MASK) < sp_offset)
+ *addr = (*addr & ~HPT_SP_MASK) + sp_offset;
+ else
+ *addr = ((*addr + HPT_SP_MASK) & ~HPT_SP_MASK) + sp_offset;
+}
+
+/* Helpers */
+
+static __inline void
+moea64_pvo_cleanup(struct pvo_dlist *tofree)
+{
+ struct pvo_entry *pvo;
+
+ /* clean up */
+ while (!SLIST_EMPTY(tofree)) {
+ pvo = SLIST_FIRST(tofree);
+ SLIST_REMOVE_HEAD(tofree, pvo_dlink);
+ if (pvo->pvo_vaddr & PVO_DEAD)
+ moea64_pvo_remove_from_page(pvo);
+ free_pvo_entry(pvo);
+ }
+}
+
+static __inline uint16_t
+pvo_to_vmpage_flags(struct pvo_entry *pvo)
+{
+ uint16_t flags;
+
+ flags = 0;
+ if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0)
+ flags |= PGA_WRITEABLE;
+ if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0)
+ flags |= PGA_EXECUTABLE;
+
+ return (flags);
+}
+
+/*
+ * Check if the given pvo and its superpage are in sva-eva range.
+ */
+static __inline bool
+moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva)
+{
+ vm_offset_t spva;
+
+ spva = PVO_VADDR(pvo) & ~HPT_SP_MASK;
+ if (spva >= sva && spva + HPT_SP_SIZE <= eva) {
+ /*
+ * Because this function is intended to be called from loops
+ * that iterate over ordered pvo entries, if the condition
+ * above is true then the pvo must be the first of its
+ * superpage.
+ */
+ KASSERT(PVO_VADDR(pvo) == spva,
+ ("%s: unexpected unaligned superpage pvo", __func__));
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * Update vm about the REF/CHG bits if the superpage is managed and
+ * has (or had) write access.
+ */
+static void
+moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m,
+ int64_t sp_refchg, vm_prot_t prot)
+{
+ vm_page_t m_end;
+ int64_t refchg;
+
+ if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) {
+ for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) {
+ refchg = sp_refchg |
+ atomic_readandclear_32(&m->md.mdpg_attrs);
+ if (refchg & LPTE_CHG)
+ vm_page_dirty(m);
+ if (refchg & LPTE_REF)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ }
+ }
+}
+
+/* Superpage ops */
+
+static int
+moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_prot_t prot, u_int flags, int8_t psind)
+{
+ struct pvo_entry *pvo, **pvos;
+ struct pvo_head *pvo_head;
+ vm_offset_t sva;
+ vm_page_t sm;
+ vm_paddr_t pa, spa;
+ bool sync;
+ struct pvo_dlist tofree;
+ int error, i;
+ uint16_t aflags;
+
+ KASSERT((va & HPT_SP_MASK) == 0, ("%s: va %#jx unaligned",
+ __func__, (uintmax_t)va));
+ KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind));
+ KASSERT(m->psind == 1, ("%s: invalid m->psind: %d",
+ __func__, m->psind));
+ KASSERT(pmap != kernel_pmap,
+ ("%s: function called with kernel pmap", __func__));
+
+ CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1",
+ __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m),
+ prot, flags);
+
+ SLIST_INIT(&tofree);
+
+ sva = va;
+ sm = m;
+ spa = pa = VM_PAGE_TO_PHYS(sm);
+
+ /* Try to allocate all PVOs first, to make failure handling easier. */
+ pvos = malloc(HPT_SP_PAGES * sizeof(struct pvo_entry *), M_TEMP,
+ M_NOWAIT);
+ if (pvos == NULL) {
+ CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+
+ for (i = 0; i < HPT_SP_PAGES; i++) {
+ pvos[i] = alloc_pvo_entry(0);
+ if (pvos[i] == NULL) {
+ CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__);
+ for (i = i - 1; i >= 0; i--)
+ free_pvo_entry(pvos[i]);
+ free(pvos, M_TEMP);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+ }
+
+ SP_PV_LOCK_ALIGNED(spa);
+ PMAP_LOCK(pmap);
+
+ /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */
+ moea64_remove_locked(pmap, va, va + HPT_SP_SIZE, &tofree);
+
+ /* Enter pages */
+ for (i = 0; i < HPT_SP_PAGES;
+ i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) {
+ pvo = pvos[i];
+
+ pvo->pvo_pte.prot = prot;
+ pvo->pvo_pte.pa = (pa & ~LPTE_LP_MASK) | LPTE_LP_4K_16M |
+ moea64_calc_wimg(pa, pmap_page_get_memattr(m));
+
+ if ((flags & PMAP_ENTER_WIRED) != 0)
+ pvo->pvo_vaddr |= PVO_WIRED;
+ pvo->pvo_vaddr |= PVO_LARGE;
+
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ pvo_head = NULL;
+ else {
+ pvo_head = &m->md.mdpg_pvoh;
+ pvo->pvo_vaddr |= PVO_MANAGED;
+ }
+
+ init_pvo_entry(pvo, pmap, va);
+
+ error = moea64_pvo_enter(pvo, pvo_head, NULL);
+ /*
+ * All superpage PVOs were previously removed, so no errors
+ * should occur while inserting the new ones.
+ */
+ KASSERT(error == 0, ("%s: unexpected error "
+ "when inserting superpage PVO: %d",
+ __func__, error));
+ }
+
+ PMAP_UNLOCK(pmap);
+ SP_PV_UNLOCK_ALIGNED(spa);
+
+ sync = (sm->a.flags & PGA_EXECUTABLE) == 0;
+ /* Note: moea64_pvo_cleanup() also clears page prot. flags. */
+ moea64_pvo_cleanup(&tofree);
+ pvo = pvos[0];
+
+ /* Set vm page flags */
+ aflags = pvo_to_vmpage_flags(pvo);
+ if (aflags != 0)
+ for (m = sm; m < &sm[HPT_SP_PAGES]; m++)
+ vm_page_aflag_set(m, aflags);
+
+ /*
+ * Flush the page from the instruction cache if this page is
+ * mapped executable and cacheable.
+ */
+ if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0)
+ moea64_syncicache(pmap, sva, spa, HPT_SP_SIZE);
+
+ atomic_add_long(&sp_mappings, 1);
+ CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p",
+ __func__, (uintmax_t)sva, pmap);
+
+ free(pvos, M_TEMP);
+ return (KERN_SUCCESS);
+}
+
+static void
+moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+ struct pvo_entry *first, *pvo;
+ vm_paddr_t pa, pa_end;
+ vm_offset_t sva, va_end;
+ int64_t sp_refchg;
+
+ /* This CTR may generate a lot of output. */
+ /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */
+
+ va &= ~HPT_SP_MASK;
+ sva = va;
+ /* Get superpage */
+ pa = VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK;
+ m = PHYS_TO_VM_PAGE(pa);
+
+ PMAP_LOCK(pmap);
+
+ /*
+ * Check if all pages meet promotion criteria.
+ *
+ * XXX In some cases the loop below may be executed for each or most
+ * of the entered pages of a superpage, which can be expensive
+ * (although it was not profiled) and need some optimization.
+ *
+ * Some cases where this seems to happen are:
+ * - When a superpage is first entered read-only and later becomes
+ * read-write.
+ * - When some of the superpage's virtual addresses map to previously
+ * wired/cached pages while others map to pages allocated from a
+ * different physical address range. A common scenario where this
+ * happens is when mmap'ing a file that is already present in FS
+ * block cache and doesn't fill a superpage.
+ */
+ first = pvo = moea64_pvo_find_va(pmap, sva);
+ for (pa_end = pa + HPT_SP_SIZE;
+ pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) {
+ if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR3(KTR_PMAP,
+ "%s: NULL or dead PVO: pmap=%p, va=%#jx",
+ __func__, pmap, (uintmax_t)va);
+ goto error;
+ }
+ if (PVO_PADDR(pvo) != pa) {
+ CTR5(KTR_PMAP, "%s: PAs don't match: "
+ "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx",
+ __func__, pmap, (uintmax_t)va,
+ (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa);
+ atomic_add_long(&sp_p_fail_pa, 1);
+ goto error;
+ }
+ if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) !=
+ (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) {
+ CTR5(KTR_PMAP, "%s: PVO flags don't match: "
+ "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx",
+ __func__, pmap, (uintmax_t)va,
+ (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE),
+ (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE));
+ atomic_add_long(&sp_p_fail_flags, 1);
+ goto error;
+ }
+ if (first->pvo_pte.prot != pvo->pvo_pte.prot) {
+ CTR5(KTR_PMAP, "%s: PVO protections don't match: "
+ "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x",
+ __func__, pmap, (uintmax_t)va,
+ pvo->pvo_pte.prot, first->pvo_pte.prot);
+ atomic_add_long(&sp_p_fail_prot, 1);
+ goto error;
+ }
+ if ((first->pvo_pte.pa & LPTE_WIMG) !=
+ (pvo->pvo_pte.pa & LPTE_WIMG)) {
+ CTR5(KTR_PMAP, "%s: WIMG bits don't match: "
+ "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx",
+ __func__, pmap, (uintmax_t)va,
+ (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG),
+ (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG));
+ atomic_add_long(&sp_p_fail_wimg, 1);
+ goto error;
+ }
+
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
+ }
+
+ /* All OK, promote. */
+
+ /*
+ * Handle superpage REF/CHG bits. If REF or CHG is set in
+ * any page, then it must be set in the superpage.
+ *
+ * Instead of querying each page, we take advantage of two facts:
+ * 1- If a page is being promoted, it was referenced.
+ * 2- If promoted pages are writable, they were modified.
+ */
+ sp_refchg = LPTE_REF |
+ ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0);
+
+ /* Promote pages */
+
+ for (pvo = first, va_end = PVO_VADDR(pvo) + HPT_SP_SIZE;
+ pvo != NULL && PVO_VADDR(pvo) < va_end;
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
+ pvo->pvo_pte.pa &= ~LPTE_LP_MASK;
+ pvo->pvo_pte.pa |= LPTE_LP_4K_16M;
+ pvo->pvo_vaddr |= PVO_LARGE;
+ }
+ moea64_pte_replace_sp(first);
+
+ /* Send REF/CHG bits to VM */
+ moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot);
+
+ /* Use first page to cache REF/CHG bits */
+ atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP);
+
+ PMAP_UNLOCK(pmap);
+
+ atomic_add_long(&sp_mappings, 1);
+ atomic_add_long(&sp_promotions, 1);
+ CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
+ __func__, (uintmax_t)sva, pmap);
+ return;
+
+error:
+ atomic_add_long(&sp_p_failures, 1);
+ PMAP_UNLOCK(pmap);
+}
+
+static void
+moea64_sp_demote_aligned(struct pvo_entry *sp)
+{
+ struct pvo_entry *pvo;
+ vm_offset_t va, va_end;
+ vm_paddr_t pa;
+ vm_page_t m;
+ pmap_t pmap;
+ int64_t refchg;
+
+ CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
+
+ pmap = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ pvo = sp;
+
+ /* Demote pages */
+
+ va = PVO_VADDR(pvo);
+ pa = PVO_PADDR(pvo);
+ m = PHYS_TO_VM_PAGE(pa);
+
+ for (pvo = sp, va_end = va + HPT_SP_SIZE;
+ pvo != NULL && PVO_VADDR(pvo) < va_end;
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo),
+ va += PAGE_SIZE, pa += PAGE_SIZE) {
+ KASSERT(pvo && PVO_VADDR(pvo) == va,
+ ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va));
+
+ pvo->pvo_vaddr &= ~PVO_LARGE;
+ pvo->pvo_pte.pa &= ~LPTE_RPGN;
+ pvo->pvo_pte.pa |= pa;
+
+ }
+ refchg = moea64_pte_replace_sp(sp);
+
+ /*
+ * Clear SP flag
+ *
+ * XXX It is possible that another pmap has this page mapped as
+ * part of a superpage, but since the SP flag is used only for
+ * caching SP REF/CHG bits, which will be queried again if not
+ * found in the cache, it should be OK to clear it here.
+ */
+ atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP);
+
+ /*
+ * Handle superpage REF/CHG bits. A bit set in the superpage
+ * means all pages should consider it set.
+ */
+ moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot);
+
+ atomic_add_long(&sp_demotions, 1);
+ CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
+ __func__, (uintmax_t)PVO_VADDR(sp), pmap);
+}
+
+static void
+moea64_sp_demote(struct pvo_entry *pvo)
+{
+ PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
+
+ if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
+ pvo = moea64_pvo_find_va(pvo->pvo_pmap,
+ PVO_VADDR(pvo) & ~HPT_SP_MASK);
+ KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx",
+ __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
+ }
+ moea64_sp_demote_aligned(pvo);
+}
+
+static struct pvo_entry *
+moea64_sp_unwire(struct pvo_entry *sp)
+{
+ struct pvo_entry *pvo, *prev;
+ vm_offset_t eva;
+ pmap_t pm;
+ int64_t ret, refchg;
+
+ CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
+
+ pm = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ eva = PVO_VADDR(sp) + HPT_SP_SIZE;
+ refchg = 0;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
+ panic("%s: pvo %p is missing PVO_WIRED",
+ __func__, pvo);
+ pvo->pvo_vaddr &= ~PVO_WIRED;
+
+ ret = moea64_pte_replace(pvo, 0 /* No invalidation */);
+ if (ret < 0)
+ refchg |= LPTE_CHG;
+ else
+ refchg |= ret;
+
+ pm->pm_stats.wired_count--;
+ }
+
+ /* Send REF/CHG bits to VM */
+ moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)),
+ refchg, sp->pvo_pte.prot);
+
+ return (prev);
+}
+
+static struct pvo_entry *
+moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot)
+{
+ struct pvo_entry *pvo, *prev;
+ vm_offset_t eva;
+ pmap_t pm;
+ vm_page_t m, m_end;
+ int64_t ret, refchg;
+ vm_prot_t oldprot;
+
+ CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x",
+ __func__, (uintmax_t)PVO_VADDR(sp), prot);
+
+ pm = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ oldprot = sp->pvo_pte.prot;
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
+ KASSERT(m != NULL, ("%s: missing vm page for pa %#jx",
+ __func__, (uintmax_t)PVO_PADDR(sp)));
+ eva = PVO_VADDR(sp) + HPT_SP_SIZE;
+ refchg = 0;
+
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ pvo->pvo_pte.prot = prot;
+ /*
+ * If the PVO is in the page table, update mapping
+ */
+ ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
+ if (ret < 0)
+ refchg |= LPTE_CHG;
+ else
+ refchg |= ret;
+ }
+
+ /* Send REF/CHG bits to VM */
+ moea64_sp_refchg_process(sp, m, refchg, oldprot);
+
+ /* Handle pages that became executable */
+ if ((m->a.flags & PGA_EXECUTABLE) == 0 &&
+ (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
+ if ((m->oflags & VPO_UNMANAGED) == 0)
+ for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++)
+ vm_page_aflag_set(m, PGA_EXECUTABLE);
+ moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp),
+ HPT_SP_SIZE);
+ }
+
+ return (prev);
+}
+
+static struct pvo_entry *
+moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree)
+{
+ struct pvo_entry *pvo, *tpvo;
+ vm_offset_t eva;
+ pmap_t pm;
+
+ CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
+
+ pm = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ eva = PVO_VADDR(sp) + HPT_SP_SIZE;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
+ tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
+
+ /*
+ * For locking reasons, remove this from the page table and
+ * pmap, but save delinking from the vm_page for a second
+ * pass
+ */
+ moea64_pvo_remove_from_pmap(pvo);
+ SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
+ }
+
+ /*
+ * Clear SP bit
+ *
+ * XXX See comment in moea64_sp_demote_aligned() for why it's
+ * ok to always clear the SP bit on remove/demote.
+ */
+ atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs,
+ MDPG_ATTR_SP);
+
+ return (tpvo);
+}
+
+static int64_t
+moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit)
+{
+ int64_t refchg, ret;
+ vm_offset_t eva;
+ vm_page_t m;
+ pmap_t pmap;
+ struct pvo_entry *sp;
+
+ pmap = pvo->pvo_pmap;
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ /* Get first SP PVO */
+ if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
+ sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
+ KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
+ __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
+ } else
+ sp = pvo;
+ eva = PVO_VADDR(sp) + HPT_SP_SIZE;
+
+ refchg = 0;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
+ ret = moea64_pte_synch(pvo);
+ if (ret > 0) {
+ refchg |= ret & (LPTE_CHG | LPTE_REF);
+ if ((refchg & ptebit) != 0)
+ break;
+ }
+ }
+
+ /* Save results */
+ if (refchg != 0) {
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
+ atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP);
+ }
+
+ return (refchg);
+}
+
+static int64_t
+moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit)
+{
+ int64_t refchg;
+ pmap_t pmap;
+
+ pmap = pvo->pvo_pmap;
+ PMAP_LOCK(pmap);
+
+ /*
+ * Check if SP was demoted/removed before pmap lock was acquired.
+ */
+ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
+ __func__, (uintmax_t)PVO_PADDR(pvo));
+ PMAP_UNLOCK(pmap);
+ return (-1);
+ }
+
+ refchg = moea64_sp_query_locked(pvo, ptebit);
+ PMAP_UNLOCK(pmap);
+
+ CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
+ __func__, (uintmax_t)PVO_VADDR(pvo),
+ (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg);
+
+ return (refchg);
+}
+
+static int64_t
+moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit)
+{
+ int64_t refchg, ret;
+ pmap_t pmap;
+ struct pvo_entry *sp;
+ vm_offset_t eva;
+ vm_page_t m;
+
+ pmap = pvo->pvo_pmap;
+ PMAP_LOCK(pmap);
+
+ /*
+ * Check if SP was demoted/removed before pmap lock was acquired.
+ */
+ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
+ __func__, (uintmax_t)PVO_PADDR(pvo));
+ PMAP_UNLOCK(pmap);
+ return (-1);
+ }
+
+ /* Get first SP PVO */
+ if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
+ sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
+ KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
+ __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
+ } else
+ sp = pvo;
+ eva = PVO_VADDR(sp) + HPT_SP_SIZE;
+
+ refchg = 0;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
+ ret = moea64_pte_clear(pvo, ptebit);
+ if (ret > 0)
+ refchg |= ret & (LPTE_CHG | LPTE_REF);
+ }
+
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
+ atomic_clear_32(&m->md.mdpg_attrs, ptebit);
+ PMAP_UNLOCK(pmap);
+
+ CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
+ __func__, (uintmax_t)PVO_VADDR(sp),
+ (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg);
+
+ return (refchg);
+}
+
+static int64_t
+moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit)
+{
+ int64_t count, ret;
+ pmap_t pmap;
+
+ count = 0;
+ pmap = pvo->pvo_pmap;
+
+ /*
+ * Since this reference bit is shared by 4096 4KB pages, it
+ * should not be cleared every time it is tested. Apply a
+ * simple "hash" function on the physical page number, the
+ * virtual superpage number, and the pmap address to select
+ * one 4KB page out of the 4096 on which testing the
+ * reference bit will result in clearing that reference bit.
+ * This function is designed to avoid the selection of the
+ * same 4KB page for every 16MB page mapping.
+ *
+ * Always leave the reference bit of a wired mapping set, as
+ * the current state of its reference bit won't affect page
+ * replacement.
+ */
+ if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^
+ (PVO_VADDR(pvo) >> HPT_SP_SHIFT) ^ (uintptr_t)pmap) &
+ (HPT_SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) {
+ if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1)
+ return (-1);
+
+ if ((ret & ptebit) != 0)
+ count++;
+
+ /*
+ * If this page was not selected by the hash function, then assume
+ * its REF bit was set.
+ */
+ } else if (ptebit == LPTE_REF) {
+ count++;
+
+ /*
+ * To clear the CHG bit of a single SP page, first it must be demoted.
+ * But if no CHG bit is set, no bit needs to be cleared and thus no SP
+ * demotion is needed.
+ */
+ } else {
+ CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx",
+ __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo),
+ (uintmax_t)PVO_PADDR(pvo));
+
+ PMAP_LOCK(pmap);
+
+ /*
+ * Make sure SP wasn't demoted/removed before pmap lock
+ * was acquired.
+ */
+ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
+ __func__, (uintmax_t)PVO_PADDR(pvo));
+ PMAP_UNLOCK(pmap);
+ return (-1);
+ }
+
+ ret = moea64_sp_query_locked(pvo, ptebit);
+ if ((ret & ptebit) != 0)
+ count++;
+ else {
+ PMAP_UNLOCK(pmap);
+ return (0);
+ }
+
+ moea64_sp_demote(pvo);
+ moea64_pte_clear(pvo, ptebit);
+
+ /*
+ * Write protect the mapping to a single page so that a
+ * subsequent write access may repromote.
+ */
+ if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
+ moea64_pvo_protect(pmap, pvo,
+ pvo->pvo_pte.prot & ~VM_PROT_WRITE);
+
+ PMAP_UNLOCK(pmap);
+ }
+
+ return (count);
+}
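As an aside on the PVO_PADDR()/moea64_pvo_paddr() change in mmu_oea64.c
above: for superpage PVOs the low bits of pvo_pte.pa hold the LP encoding
rather than part of the physical address, so the PA is recombined from the
PTE's 16MB-aligned portion and the offset taken from the VA. The standalone
sketch below (not kernel code, with hypothetical example values) shows that
arithmetic:

#include <stdint.h>
#include <stdio.h>

#define HPT_SP_MASK	0xffffffULL	/* 16MB superpage offset mask */

int
main(void)
{
	/*
	 * pvo_pte.pa of the 4KB page at PA 0x20345000 inside a superpage
	 * starting at 0x20000000: bits 19:12 were overwritten with the
	 * LP_4K_16M encoding (0x38), giving 0x20338000.
	 */
	uint64_t pte_pa = 0x20338000;
	uint64_t va = 0xf0345000;	/* VA of the same 4KB page */
	uint64_t pa;

	pa = (pte_pa & ~HPT_SP_MASK) | (va & HPT_SP_MASK);
	printf("reconstructed pa = %#jx\n", (uintmax_t)pa);	/* 0x20345000 */
	return (0);
}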
diff --git a/sys/powerpc/aim/mmu_oea64.h b/sys/powerpc/aim/mmu_oea64.h
index a73a31250188..e0b47bad8eed 100644
--- a/sys/powerpc/aim/mmu_oea64.h
+++ b/sys/powerpc/aim/mmu_oea64.h
@@ -82,12 +82,18 @@ int64_t moea64_pte_insert(struct pvo_entry *);
int64_t moea64_pte_unset(struct pvo_entry *);
int64_t moea64_pte_clear(struct pvo_entry *, uint64_t);
int64_t moea64_pte_synch(struct pvo_entry *);
+int64_t moea64_pte_insert_sp(struct pvo_entry *);
+int64_t moea64_pte_unset_sp(struct pvo_entry *);
+int64_t moea64_pte_replace_sp(struct pvo_entry *);
typedef int64_t (*moea64_pte_replace_t)(struct pvo_entry *, int);
typedef int64_t (*moea64_pte_insert_t)(struct pvo_entry *);
typedef int64_t (*moea64_pte_unset_t)(struct pvo_entry *);
typedef int64_t (*moea64_pte_clear_t)(struct pvo_entry *, uint64_t);
typedef int64_t (*moea64_pte_synch_t)(struct pvo_entry *);
+typedef int64_t (*moea64_pte_insert_sp_t)(struct pvo_entry *);
+typedef int64_t (*moea64_pte_unset_sp_t)(struct pvo_entry *);
+typedef int64_t (*moea64_pte_replace_sp_t)(struct pvo_entry *);
struct moea64_funcs {
moea64_pte_replace_t pte_replace;
@@ -95,6 +101,9 @@ struct moea64_funcs {
moea64_pte_unset_t pte_unset;
moea64_pte_clear_t pte_clear;
moea64_pte_synch_t pte_synch;
+ moea64_pte_insert_sp_t pte_insert_sp;
+ moea64_pte_unset_sp_t pte_unset_sp;
+ moea64_pte_replace_sp_t pte_replace_sp;
};
extern struct moea64_funcs *moea64_ops;
@@ -128,5 +137,6 @@ extern uint64_t moea64_large_page_mask;
extern u_long moea64_pteg_count;
extern u_long moea64_pteg_mask;
extern int n_slbs;
+extern bool moea64_has_lp_4k_16m;
#endif /* _POWERPC_AIM_MMU_OEA64_H */
diff --git a/sys/powerpc/aim/moea64_native.c b/sys/powerpc/aim/moea64_native.c
index 20e6e57e1bdd..c352ea6fb6cc 100644
--- a/sys/powerpc/aim/moea64_native.c
+++ b/sys/powerpc/aim/moea64_native.c
@@ -132,11 +132,32 @@ __FBSDID("$FreeBSD$");
/* POWER9 only permits a 64k partition table size. */
#define PART_SIZE 0x10000
+/* Actual page sizes (to be used with tlbie, when L=0) */
+#define AP_4K 0x00
+#define AP_16M 0x80
+
+#define LPTE_KERNEL_VSID_BIT (KERNEL_VSID_BIT << \
+ (16 - (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)))
+
+/* Abbreviated Virtual Address Page - high bits */
+#define LPTE_AVA_PGNHI_MASK 0x0000000000000F80ULL
+#define LPTE_AVA_PGNHI_SHIFT 7
+
+/* Effective Address Page - low bits */
+#define EA_PAGELO_MASK 0x7ffULL
+#define EA_PAGELO_SHIFT 11
+
static bool moea64_crop_tlbie;
static bool moea64_need_lock;
+/*
+ * The tlbie instruction has two forms: an old one used by PowerISA
+ * 2.03 and prior, and a newer one used by PowerISA 2.06 and later.
+ * We need to support both.
+ */
static __inline void
-TLBIE(uint64_t vpn) {
+TLBIE(uint64_t vpn, uint64_t oldptehi)
+{
#ifndef __powerpc64__
register_t vpn_hi, vpn_lo;
register_t msr;
@@ -153,12 +174,32 @@ TLBIE(uint64_t vpn) {
while (!atomic_cmpset_int(&tlbie_lock, 0, 1));
isync(); /* Flush instruction queue once lock acquired */
- if (moea64_crop_tlbie)
+ if (moea64_crop_tlbie) {
vpn &= ~(0xffffULL << 48);
+#ifdef __powerpc64__
+ if ((oldptehi & LPTE_BIG) != 0)
+ __asm __volatile("tlbie %0, 1" :: "r"(vpn) :
+ "memory");
+ else
+ __asm __volatile("tlbie %0, 0" :: "r"(vpn) :
+ "memory");
+ __asm __volatile("eieio; tlbsync; ptesync" :::
+ "memory");
+ goto done;
+#endif
+ }
}
#ifdef __powerpc64__
/*
+ * If this page has LPTE_BIG set and is from userspace, then
+ * it must be a superpage with 4KB base/16MB actual page size.
+ */
+ if ((oldptehi & LPTE_BIG) != 0 &&
+ (oldptehi & LPTE_KERNEL_VSID_BIT) == 0)
+ vpn |= AP_16M;
+
+ /*
* Explicitly clobber r0. The tlbie instruction has two forms: an old
* one used by PowerISA 2.03 and prior, and a newer one used by PowerISA
* 2.06 (maybe 2.05?) and later. We need to support both, and it just
@@ -168,7 +209,7 @@ TLBIE(uint64_t vpn) {
* in the newer form is in the same position as the L(page size) bit of
* the old form, so as long as RS is 0, we're good on both sides.
*/
- __asm __volatile("li 0, 0 \n tlbie %0" :: "r"(vpn) : "r0", "memory");
+ __asm __volatile("li 0, 0 \n tlbie %0, 0" :: "r"(vpn) : "r0", "memory");
__asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
#else
vpn_hi = (uint32_t)(vpn >> 32);
@@ -194,6 +235,7 @@ TLBIE(uint64_t vpn) {
intr_restore(intr);
#endif
+done:
/* No barriers or special ops -- taken care of by ptesync above */
if (need_lock)
tlbie_lock = 0;
@@ -224,6 +266,9 @@ static int64_t moea64_pte_synch_native(struct pvo_entry *);
static int64_t moea64_pte_clear_native(struct pvo_entry *, uint64_t);
static int64_t moea64_pte_replace_native(struct pvo_entry *, int);
static int64_t moea64_pte_unset_native(struct pvo_entry *);
+static int64_t moea64_pte_insert_sp_native(struct pvo_entry *);
+static int64_t moea64_pte_unset_sp_native(struct pvo_entry *);
+static int64_t moea64_pte_replace_sp_native(struct pvo_entry *);
/*
* Utility routines.
@@ -245,10 +290,13 @@ static struct pmap_funcs moea64_native_methods = {
static struct moea64_funcs moea64_native_funcs = {
.pte_synch = moea64_pte_synch_native,
- .pte_clear = moea64_pte_clear_native,
- .pte_unset = moea64_pte_unset_native,
- .pte_replace = moea64_pte_replace_native,
- .pte_insert = moea64_pte_insert_native,
+ .pte_clear = moea64_pte_clear_native,
+ .pte_unset = moea64_pte_unset_native,
+ .pte_replace = moea64_pte_replace_native,
+ .pte_insert = moea64_pte_insert_native,
+ .pte_insert_sp = moea64_pte_insert_sp_native,
+ .pte_unset_sp = moea64_pte_unset_sp_native,
+ .pte_replace_sp = moea64_pte_replace_sp_native,
};
MMU_DEF_INHERIT(oea64_mmu_native, MMU_TYPE_G5, moea64_native_methods, oea64_mmu);
@@ -321,7 +369,7 @@ moea64_pte_clear_native(struct pvo_entry *pvo, uint64_t ptebit)
rw_runlock(&moea64_eviction_lock);
critical_enter();
- TLBIE(pvo->pvo_vpn);
+ TLBIE(pvo->pvo_vpn, properpt.pte_hi);
critical_exit();
} else {
rw_runlock(&moea64_eviction_lock);
@@ -332,21 +380,10 @@ moea64_pte_clear_native(struct pvo_entry *pvo, uint64_t ptebit)
return (ptelo & (LPTE_REF | LPTE_CHG));
}
-static int64_t
-moea64_pte_unset_native(struct pvo_entry *pvo)
+static __always_inline int64_t
+moea64_pte_unset_locked(volatile struct lpte *pt, uint64_t vpn)
{
- volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot;
- uint64_t ptelo, pvo_ptevpn;
-
- pvo_ptevpn = moea64_pte_vpn_from_pvo_vpn(pvo);
-
- rw_rlock(&moea64_eviction_lock);
- if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != pvo_ptevpn) {
- /* Evicted */
- STAT_MOEA64(moea64_pte_overflow--);
- rw_runlock(&moea64_eviction_lock);
- return (-1);
- }
+ uint64_t ptelo;
/*
* Invalidate the pte, briefly locking it to collect RC bits. No
@@ -356,11 +393,10 @@ moea64_pte_unset_native(struct pvo_entry *pvo)
critical_enter();
pt->pte_hi = htobe64((be64toh(pt->pte_hi) & ~LPTE_VALID) | LPTE_LOCKED);
PTESYNC();
- TLBIE(pvo->pvo_vpn);
+ TLBIE(vpn, pt->pte_hi);
ptelo = be64toh(pt->pte_lo);
*((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */
critical_exit();
- rw_runlock(&moea64_eviction_lock);
/* Keep statistics */
STAT_MOEA64(moea64_pte_valid--);
@@ -369,6 +405,29 @@ moea64_pte_unset_native(struct pvo_entry *pvo)
}
static int64_t
+moea64_pte_unset_native(struct pvo_entry *pvo)
+{
+ volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot;
+ int64_t ret;
+ uint64_t pvo_ptevpn;
+
+ pvo_ptevpn = moea64_pte_vpn_from_pvo_vpn(pvo);
+
+ rw_rlock(&moea64_eviction_lock);
+
+ if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != pvo_ptevpn) {
+ /* Evicted */
+ STAT_MOEA64(moea64_pte_overflow--);
+ ret = -1;
+ } else
+ ret = moea64_pte_unset_locked(pt, pvo->pvo_vpn);
+
+ rw_runlock(&moea64_eviction_lock);
+
+ return (ret);
+}
+
+static int64_t
moea64_pte_replace_inval_native(struct pvo_entry *pvo,
volatile struct lpte *pt)
{
@@ -394,7 +453,7 @@ moea64_pte_replace_inval_native(struct pvo_entry *pvo,
critical_enter();
pt->pte_hi = htobe64((be64toh(pt->pte_hi) & ~LPTE_VALID) | LPTE_LOCKED);
PTESYNC();
- TLBIE(pvo->pvo_vpn);
+ TLBIE(pvo->pvo_vpn, pt->pte_hi);
ptelo = be64toh(pt->pte_lo);
EIEIO();
pt->pte_lo = htobe64(properpt.pte_lo);
@@ -734,7 +793,7 @@ moea64_insert_to_pteg_native(struct lpte *pvo_pt, uintptr_t slotbase,
va |= (oldptehi & LPTE_AVPN_MASK) <<
(ADDR_API_SHFT64 - ADDR_PIDX_SHFT);
PTESYNC();
- TLBIE(va);
+ TLBIE(va, oldptehi);
STAT_MOEA64(moea64_pte_valid--);
STAT_MOEA64(moea64_pte_overflow++);
}
@@ -754,26 +813,18 @@ moea64_insert_to_pteg_native(struct lpte *pvo_pt, uintptr_t slotbase,
return (k);
}
-static int64_t
-moea64_pte_insert_native(struct pvo_entry *pvo)
+static __always_inline int64_t
+moea64_pte_insert_locked(struct pvo_entry *pvo, struct lpte *insertpt,
+ uint64_t mask)
{
- struct lpte insertpt;
uintptr_t slot;
- /* Initialize PTE */
- moea64_pte_from_pvo(pvo, &insertpt);
-
- /* Make sure further insertion is locked out during evictions */
- rw_rlock(&moea64_eviction_lock);
-
/*
* First try primary hash.
*/
- pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */
- slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot,
- LPTE_VALID | LPTE_WIRED | LPTE_LOCKED);
+ slot = moea64_insert_to_pteg_native(insertpt, pvo->pvo_pte.slot,
+ mask | LPTE_WIRED | LPTE_LOCKED);
if (slot != -1) {
- rw_runlock(&moea64_eviction_lock);
pvo->pvo_pte.slot = slot;
return (0);
}
@@ -782,50 +833,52 @@ moea64_pte_insert_native(struct pvo_entry *pvo)
* Now try secondary hash.
*/
pvo->pvo_vaddr ^= PVO_HID;
- insertpt.pte_hi ^= LPTE_HID;
+ insertpt->pte_hi ^= LPTE_HID;
pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
- slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot,
- LPTE_VALID | LPTE_WIRED | LPTE_LOCKED);
+ slot = moea64_insert_to_pteg_native(insertpt, pvo->pvo_pte.slot,
+ mask | LPTE_WIRED | LPTE_LOCKED);
if (slot != -1) {
- rw_runlock(&moea64_eviction_lock);
pvo->pvo_pte.slot = slot;
return (0);
}
- /*
- * Out of luck. Find a PTE to sacrifice.
- */
+ return (-1);
+}
- /* Lock out all insertions for a bit */
- if (!rw_try_upgrade(&moea64_eviction_lock)) {
- rw_runlock(&moea64_eviction_lock);
- rw_wlock(&moea64_eviction_lock);
- }
+static int64_t
+moea64_pte_insert_native(struct pvo_entry *pvo)
+{
+ struct lpte insertpt;
+ int64_t ret;
- slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot,
- LPTE_WIRED | LPTE_LOCKED);
- if (slot != -1) {
- rw_wunlock(&moea64_eviction_lock);
- pvo->pvo_pte.slot = slot;
- return (0);
- }
+ /* Initialize PTE */
+ moea64_pte_from_pvo(pvo, &insertpt);
- /* Try other hash table. Now we're getting desperate... */
- pvo->pvo_vaddr ^= PVO_HID;
- insertpt.pte_hi ^= LPTE_HID;
- pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
- slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot,
- LPTE_WIRED | LPTE_LOCKED);
- if (slot != -1) {
+ /* Make sure further insertion is locked out during evictions */
+ rw_rlock(&moea64_eviction_lock);
+
+ pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */
+ ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_VALID);
+ if (ret == -1) {
+ /*
+ * Out of luck. Find a PTE to sacrifice.
+ */
+
+ /* Lock out all insertions for a bit */
+ if (!rw_try_upgrade(&moea64_eviction_lock)) {
+ rw_runlock(&moea64_eviction_lock);
+ rw_wlock(&moea64_eviction_lock);
+ }
+ /* Don't evict large pages */
+ ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_BIG);
rw_wunlock(&moea64_eviction_lock);
- pvo->pvo_pte.slot = slot;
- return (0);
- }
+ /* No freeable slots in either PTEG? We're hosed. */
+ if (ret == -1)
+ panic("moea64_pte_insert: overflow");
+ } else
+ rw_runlock(&moea64_eviction_lock);
- /* No freeable slots in either PTEG? We're hosed. */
- rw_wunlock(&moea64_eviction_lock);
- panic("moea64_pte_insert: overflow");
- return (-1);
+ return (0);
}
static void *
@@ -846,3 +899,134 @@ moea64_dump_pmap_native(void *ctx, void *buf, u_long *nbytes)
dctx->ptex = ptex_end;
return (__DEVOLATILE(struct lpte *, moea64_pteg_table) + ptex);
}
+
+static __always_inline uint64_t
+moea64_vpn_from_pte(uint64_t ptehi, uintptr_t slot)
+{
+ uint64_t pgn, pgnlo, vsid;
+
+ vsid = (ptehi & LPTE_AVA_MASK) >> LPTE_VSID_SHIFT;
+ if ((ptehi & LPTE_HID) != 0)
+ slot ^= (moea64_pteg_mask << 3);
+ pgnlo = ((vsid & VSID_HASH_MASK) ^ (slot >> 3)) & EA_PAGELO_MASK;
+ pgn = ((ptehi & LPTE_AVA_PGNHI_MASK) << (EA_PAGELO_SHIFT -
+ LPTE_AVA_PGNHI_SHIFT)) | pgnlo;
+ return ((vsid << 16) | pgn);
+}
+
+static __always_inline int64_t
+moea64_pte_unset_sp_locked(struct pvo_entry *pvo)
+{
+ volatile struct lpte *pt;
+ uint64_t ptehi, refchg, vpn;
+ vm_offset_t eva;
+ pmap_t pm;
+
+ pm = pvo->pvo_pmap;
+ refchg = 0;
+ eva = PVO_VADDR(pvo) + HPT_SP_SIZE;
+
+ for (; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ pt = moea64_pteg_table + pvo->pvo_pte.slot;
+ ptehi = be64toh(pt->pte_hi);
+ if ((ptehi & LPTE_AVPN_MASK) !=
+ moea64_pte_vpn_from_pvo_vpn(pvo)) {
+ /* Evicted: invalidate new entry */
+ STAT_MOEA64(moea64_pte_overflow--);
+ vpn = moea64_vpn_from_pte(ptehi, pvo->pvo_pte.slot);
+ CTR1(KTR_PMAP, "Evicted page in pte_unset_sp: vpn=%jx",
+ (uintmax_t)vpn);
+ /* Assume evicted page was modified */
+ refchg |= LPTE_CHG;
+ } else
+ vpn = pvo->pvo_vpn;
+
+ refchg |= moea64_pte_unset_locked(pt, vpn);
+ }
+
+ return (refchg);
+}
+
+static int64_t
+moea64_pte_unset_sp_native(struct pvo_entry *pvo)
+{
+ uint64_t refchg;
+
+ PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
+ KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0,
+ ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo)));
+
+ rw_rlock(&moea64_eviction_lock);
+ refchg = moea64_pte_unset_sp_locked(pvo);
+ rw_runlock(&moea64_eviction_lock);
+
+ return (refchg);
+}
+
+static __always_inline int64_t
+moea64_pte_insert_sp_locked(struct pvo_entry *pvo)
+{
+ struct lpte insertpt;
+ int64_t ret;
+ vm_offset_t eva;
+ pmap_t pm;
+
+ pm = pvo->pvo_pmap;
+ eva = PVO_VADDR(pvo) + HPT_SP_SIZE;
+
+ for (; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ moea64_pte_from_pvo(pvo, &insertpt);
+ pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */
+
+ ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_VALID);
+ if (ret == -1) {
+ /* Lock out all insertions for a bit */
+ if (!rw_try_upgrade(&moea64_eviction_lock)) {
+ rw_runlock(&moea64_eviction_lock);
+ rw_wlock(&moea64_eviction_lock);
+ }
+ /* Don't evict large pages */
+ ret = moea64_pte_insert_locked(pvo, &insertpt,
+ LPTE_BIG);
+ rw_downgrade(&moea64_eviction_lock);
+ /* No freeable slots in either PTEG? We're hosed. */
+ if (ret == -1)
+ panic("moea64_pte_insert_sp: overflow");
+ }
+ }
+
+ return (0);
+}
+
+static int64_t
+moea64_pte_insert_sp_native(struct pvo_entry *pvo)
+{
+ PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
+ KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0,
+ ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo)));
+
+ rw_rlock(&moea64_eviction_lock);
+ moea64_pte_insert_sp_locked(pvo);
+ rw_runlock(&moea64_eviction_lock);
+
+ return (0);
+}
+
+static int64_t
+moea64_pte_replace_sp_native(struct pvo_entry *pvo)
+{
+ uint64_t refchg;
+
+ PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
+ KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0,
+ ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo)));
+
+ rw_rlock(&moea64_eviction_lock);
+ refchg = moea64_pte_unset_sp_locked(pvo);
+ moea64_pte_insert_sp_locked(pvo);
+ rw_runlock(&moea64_eviction_lock);
+
+ return (refchg);
+}
diff --git a/sys/powerpc/include/pmap.h b/sys/powerpc/include/pmap.h
index 1ae90494ebfa..fd36e55a12e0 100644
--- a/sys/powerpc/include/pmap.h
+++ b/sys/powerpc/include/pmap.h
@@ -148,8 +148,8 @@ RB_PROTOTYPE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare);
#define PVO_MANAGED 0x020UL /* PVO entry is managed */
#define PVO_BOOTSTRAP 0x080UL /* PVO entry allocated during
bootstrap */
-#define PVO_DEAD 0x100UL /* waiting to be deleted */
-#define PVO_LARGE 0x200UL /* large page */
+#define PVO_DEAD 0x100UL /* waiting to be deleted */
+#define PVO_LARGE 0x200UL /* large page */
#define PVO_VADDR(pvo) ((pvo)->pvo_vaddr & ~ADDR_POFF)
#define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK)
#define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID)
diff --git a/sys/powerpc/include/pte.h b/sys/powerpc/include/pte.h
index 5e38e6bbcf28..9dc2787cf2ff 100644
--- a/sys/powerpc/include/pte.h
+++ b/sys/powerpc/include/pte.h
@@ -111,6 +111,7 @@ typedef struct lpte lpte_t;
/* High quadword: */
#define LPTE_VSID_SHIFT 12
#define LPTE_AVPN_MASK 0xFFFFFFFFFFFFFF80ULL
+#define LPTE_AVA_MASK 0x3FFFFFFFFFFFFF80ULL
#define LPTE_API 0x0000000000000F80ULL
#define LPTE_SWBITS 0x0000000000000078ULL
#define LPTE_WIRED 0x0000000000000010ULL
@@ -120,8 +121,13 @@ typedef struct lpte lpte_t;
#define LPTE_VALID 0x0000000000000001ULL
/* Low quadword: */
+#define LP_4K_16M 0x38 /* 4KB base, 16MB actual page size */
+
#define EXTEND_PTE(x) UINT64_C(x) /* make constants 64-bit */
#define LPTE_RPGN 0xfffffffffffff000ULL
+#define LPTE_LP_MASK 0x00000000000ff000ULL
+#define LPTE_LP_SHIFT 12
+#define LPTE_LP_4K_16M ((unsigned long long)(LP_4K_16M) << LPTE_LP_SHIFT)
#define LPTE_REF EXTEND_PTE( PTE_REF )
#define LPTE_CHG EXTEND_PTE( PTE_CHG )
#define LPTE_WIMG EXTEND_PTE( PTE_WIMG )
@@ -139,6 +145,12 @@ typedef struct lpte lpte_t;
#define LPTE_RW LPTE_BW
#define LPTE_RO LPTE_BR
+/* HPT superpage definitions */
+#define HPT_SP_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT)
+#define HPT_SP_SIZE (1 << HPT_SP_SHIFT)
+#define HPT_SP_MASK (HPT_SP_SIZE - 1)
+#define HPT_SP_PAGES (1 << VM_LEVEL_0_ORDER)
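+/* With 4KB base pages and VM_LEVEL_0_ORDER 12, superpages are 16MB. */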
+
/* POWER ISA 3.0 Radix Table Definitions */
#define RPTE_VALID 0x8000000000000000ULL
#define RPTE_LEAF 0x4000000000000000ULL /* is a PTE: always 1 */
diff --git a/sys/powerpc/include/slb.h b/sys/powerpc/include/slb.h
index f93280030f94..f710aca8de97 100644
--- a/sys/powerpc/include/slb.h
+++ b/sys/powerpc/include/slb.h
@@ -64,6 +64,14 @@
#define SLBE_ESID_MASK 0xfffffffff0000000UL /* Effective segment ID mask */
#define SLBE_ESID_SHIFT 28
+/*
+ * SLB page size encodings, as present in the ibm,segment-page-sizes
+ * property of the CPU device tree node.
+ *
+ * See LoPAPR: CPU Node Properties, section C.6.1.4.
+ */
+#define SLB_PGSZ_4K_4K 0
+
/* Virtual real-mode VSID in LPARs */
#define VSID_VRMA 0x1ffffff
diff --git a/sys/powerpc/include/vmparam.h b/sys/powerpc/include/vmparam.h
index ab4c6aa2a651..77457717a3fd 100644
--- a/sys/powerpc/include/vmparam.h
+++ b/sys/powerpc/include/vmparam.h
@@ -185,31 +185,34 @@ struct pmap_physseg {
#define VM_NFREELIST 1
#define VM_FREELIST_DEFAULT 0
-/*
- * The largest allocation size is 4MB.
- */
#ifdef __powerpc64__
+/* The largest allocation size is 16MB. */
#define VM_NFREEORDER 13
#else
+/* The largest allocation size is 4MB. */
#define VM_NFREEORDER 11
#endif
#ifndef VM_NRESERVLEVEL
#ifdef __powerpc64__
+/* Enable superpage reservations: 1 level. */
#define VM_NRESERVLEVEL 1
#else
-/*
- * Disable superpage reservations.
- */
+/* Disable superpage reservations. */
#define VM_NRESERVLEVEL 0
#endif
#endif
-/*
- * Level 0 reservations consist of 512 pages.
- */
#ifndef VM_LEVEL_0_ORDER
-#define VM_LEVEL_0_ORDER 9
+/* Level 0 reservations consist of 512 (RPT) or 4096 (HPT) pages. */
+#define VM_LEVEL_0_ORDER vm_level_0_order
+#ifndef __ASSEMBLER__
+extern int vm_level_0_order;
+#endif
+#endif
+
+#ifndef VM_LEVEL_0_ORDER_MAX
+#define VM_LEVEL_0_ORDER_MAX 12
#endif
#ifdef __powerpc64__
diff --git a/sys/powerpc/powernv/platform_powernv.c b/sys/powerpc/powernv/platform_powernv.c
index 569ad6d9359e..434b642a66a8 100644
--- a/sys/powerpc/powernv/platform_powernv.c
+++ b/sys/powerpc/powernv/platform_powernv.c
@@ -141,6 +141,7 @@ powernv_attach(platform_t plat)
phandle_t opal;
int res, len, idx;
register_t msr;
+ bool has_lp;
/* Ping OPAL again just to make sure */
opal_check();
@@ -228,6 +229,7 @@ powernv_attach(platform_t plat)
sizeof(arr));
len /= 4;
idx = 0;
+ has_lp = false;
while (len > 0) {
shift = arr[idx];
slb_encoding = arr[idx + 1];
@@ -238,17 +240,21 @@ powernv_attach(platform_t plat)
lp_size = arr[idx];
lp_encoding = arr[idx+1];
if (slb_encoding == SLBV_L && lp_encoding == 0)
- break;
+ has_lp = true;
+
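+			/* 16MB pages within 4KB page size segments. */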
+ if (slb_encoding == SLB_PGSZ_4K_4K &&
+ lp_encoding == LP_4K_16M)
+ moea64_has_lp_4k_16m = true;
idx += 2;
len -= 2;
nptlp--;
}
- if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
+ if (has_lp && moea64_has_lp_4k_16m)
break;
}
- if (len == 0)
+ if (!has_lp)
panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
"not supported by this system.");
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
index f84d5e0165c6..5a00a0158620 100644
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -77,6 +77,8 @@ vm_offset_t virtual_end;
caddr_t crashdumpmap;
int pmap_bootstrapped;
+/* Default level 0 reservations consist of 512 pages (2MB superpage). */
+int vm_level_0_order = 9;
#ifdef AIM
int
diff --git a/sys/powerpc/pseries/mmu_phyp.c b/sys/powerpc/pseries/mmu_phyp.c
index ca4ee79275a8..709a7dffc995 100644
--- a/sys/powerpc/pseries/mmu_phyp.c
+++ b/sys/powerpc/pseries/mmu_phyp.c
@@ -82,6 +82,9 @@ static int64_t mphyp_pte_synch(struct pvo_entry *pvo);
static int64_t mphyp_pte_clear(struct pvo_entry *pvo, uint64_t ptebit);
static int64_t mphyp_pte_unset(struct pvo_entry *pvo);
static int64_t mphyp_pte_insert(struct pvo_entry *pvo);
+static int64_t mphyp_pte_unset_sp(struct pvo_entry *pvo);
+static int64_t mphyp_pte_insert_sp(struct pvo_entry *pvo);
+static int64_t mphyp_pte_replace_sp(struct pvo_entry *pvo);
static struct pmap_funcs mphyp_methods = {
.install = mphyp_install,
@@ -95,6 +98,9 @@ static struct moea64_funcs mmu_phyp_funcs = {
.pte_clear = mphyp_pte_clear,
.pte_unset = mphyp_pte_unset,
.pte_insert = mphyp_pte_insert,
+ .pte_unset_sp = mphyp_pte_unset_sp,
+ .pte_insert_sp = mphyp_pte_insert_sp,
+ .pte_replace_sp = mphyp_pte_replace_sp,
};
MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, oea64_mmu);
@@ -135,6 +141,7 @@ mphyp_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
uint64_t vsid;
phandle_t dev, node, root;
int idx, len, res;
+ bool has_lp;
rm_init(&mphyp_eviction_lock, "pte eviction");
@@ -199,6 +206,7 @@ mphyp_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
sizeof(arr));
len /= 4;
idx = 0;
+ has_lp = false;
while (len > 0) {
shift = arr[idx];
slb_encoding = arr[idx + 1];
@@ -220,18 +228,22 @@ mphyp_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
lp_encoding);
if (slb_encoding == SLBV_L && lp_encoding == 0)
- break;
+ has_lp = true;
+
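+			/* 16MB pages within 4KB page size segments. */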
+ if (slb_encoding == SLB_PGSZ_4K_4K &&
+ lp_encoding == LP_4K_16M)
+ moea64_has_lp_4k_16m = true;
idx += 2;
len -= 2;
nptlp--;
}
dprintf("\n");
- if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
+ if (has_lp && moea64_has_lp_4k_16m)
break;
}
- if (len > 0) {
+ if (has_lp) {
moea64_large_page_shift = shift;
moea64_large_page_size = 1ULL << lp_size;
moea64_large_page_mask = moea64_large_page_size - 1;
@@ -393,7 +405,7 @@ mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict)
phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi,
&pt.pte_lo, &junk);
- if (pt.pte_hi & LPTE_WIRED)
+ if ((pt.pte_hi & (LPTE_WIRED | LPTE_BIG)) != 0)
continue;
/* This is a candidate, so remember it */
@@ -414,68 +426,61 @@ mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict)
return (k);
}
-static int64_t
-mphyp_pte_insert(struct pvo_entry *pvo)
+static __inline int64_t
+mphyp_pte_insert_locked(struct pvo_entry *pvo, struct lpte *pte)
{
- struct rm_priotracker track;
+ struct lpte evicted;
+ uint64_t index, junk;
int64_t result;
- struct lpte evicted, pte;
- uint64_t index, junk, lastptelo;
-
- PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
-
- /* Initialize PTE */
- moea64_pte_from_pvo(pvo, &pte);
- evicted.pte_hi = 0;
-
- /* Make sure further insertion is locked out during evictions */
- rm_rlock(&mphyp_eviction_lock, &track);
/*
* First try primary hash.
*/
pvo->pvo_pte.slot &= ~7UL; /* Base slot address */
- result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi,
- pte.pte_lo, &index, &evicted.pte_lo, &junk);
+ result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte->pte_hi,
+ pte->pte_lo, &index, &evicted.pte_lo, &junk);
if (result == H_SUCCESS) {
- rm_runlock(&mphyp_eviction_lock, &track);
pvo->pvo_pte.slot = index;
return (0);
}
KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld "
"(ptegidx: %#zx/%#lx, PTE %#lx/%#lx", result, pvo->pvo_pte.slot,
- moea64_pteg_count, pte.pte_hi, pte.pte_lo));
+ moea64_pteg_count, pte->pte_hi, pte->pte_lo));
/*
* Next try secondary hash.
*/
pvo->pvo_vaddr ^= PVO_HID;
- pte.pte_hi ^= LPTE_HID;
+ pte->pte_hi ^= LPTE_HID;
pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot,
- pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk);
+ pte->pte_hi, pte->pte_lo, &index, &evicted.pte_lo, &junk);
if (result == H_SUCCESS) {
- rm_runlock(&mphyp_eviction_lock, &track);
pvo->pvo_pte.slot = index;
return (0);
}
KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld",
result));
- /*
- * Out of luck. Find a PTE to sacrifice.
- */
+ return (-1);
+}
- /* Lock out all insertions for a bit */
- rm_runlock(&mphyp_eviction_lock, &track);
- rm_wlock(&mphyp_eviction_lock);
+
+static __inline int64_t
+mphyp_pte_evict_and_insert_locked(struct pvo_entry *pvo, struct lpte *pte)
+{
+ struct lpte evicted;
+ uint64_t index, junk, lastptelo;
+ int64_t result;
+
+ evicted.pte_hi = 0;
index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
if (index == -1L) {
/* Try other hash table? */
pvo->pvo_vaddr ^= PVO_HID;
- pte.pte_hi ^= LPTE_HID;
+ pte->pte_hi ^= LPTE_HID;
pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
}
@@ -500,18 +505,50 @@ mphyp_pte_insert(struct pvo_entry *pvo)
/*
* Set the new PTE.
*/
- result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi,
- pte.pte_lo, &index, &evicted.pte_lo, &junk);
- rm_wunlock(&mphyp_eviction_lock); /* All clear */
+ result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte->pte_hi,
+ pte->pte_lo, &index, &evicted.pte_lo, &junk);
pvo->pvo_pte.slot = index;
if (result == H_SUCCESS)
return (0);
+ rm_wunlock(&mphyp_eviction_lock);
panic("Page replacement error: %ld", result);
return (result);
}
+static int64_t
+mphyp_pte_insert(struct pvo_entry *pvo)
+{
+ struct rm_priotracker track;
+ int64_t ret;
+ struct lpte pte;
+
+ PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
+
+ /* Initialize PTE */
+ moea64_pte_from_pvo(pvo, &pte);
+
+ /* Make sure further insertion is locked out during evictions */
+ rm_rlock(&mphyp_eviction_lock, &track);
+
+ ret = mphyp_pte_insert_locked(pvo, &pte);
+ rm_runlock(&mphyp_eviction_lock, &track);
+
+ if (ret == -1) {
+ /*
+ * Out of luck. Find a PTE to sacrifice.
+ */
+
+ /* Lock out all insertions for a bit */
+ rm_wlock(&mphyp_eviction_lock);
+ ret = mphyp_pte_evict_and_insert_locked(pvo, &pte);
+ rm_wunlock(&mphyp_eviction_lock); /* All clear */
+ }
+
+ return (ret);
+}
+
static void *
mphyp_dump_pmap(void *ctx, void *buf, u_long *nbytes)
{
@@ -541,3 +578,91 @@ mphyp_dump_pmap(void *ctx, void *buf, u_long *nbytes)
dctx->ptex = ptex;
return (buf);
}
+
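+/*
+ * Remove all PTEs backing a superpage via H_REMOVE and return their
+ * accumulated REF/CHG bits.
+ */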
+static int64_t
+mphyp_pte_unset_sp(struct pvo_entry *pvo)
+{
+ struct lpte pte;
+ uint64_t junk, refchg;
+ int err;
+ vm_offset_t eva;
+ pmap_t pm;
+
+ pm = pvo->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+ KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0,
+ ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo)));
+
+ refchg = 0;
+ eva = PVO_VADDR(pvo) + HPT_SP_SIZE;
+
+ for (; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ moea64_pte_from_pvo(pvo, &pte);
+
+ err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot,
+ pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo,
+ &junk);
+ KASSERT(err == H_SUCCESS || err == H_NOT_FOUND,
+ ("Error removing page: %d", err));
+
+ if (err == H_NOT_FOUND)
+ STAT_MOEA64(moea64_pte_overflow--);
+ refchg |= pte.pte_lo & (LPTE_REF | LPTE_CHG);
+ }
+
+ return (refchg);
+}
+
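+/*
+ * Insert PTEs for all PVOs backing a superpage.  If both PTEGs are
+ * full, evict a spillable entry while holding the eviction lock
+ * exclusively.
+ */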
+static int64_t
+mphyp_pte_insert_sp(struct pvo_entry *pvo)
+{
+ struct rm_priotracker track;
+ int64_t ret;
+ struct lpte pte;
+ vm_offset_t eva;
+ pmap_t pm;
+
+ pm = pvo->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+ KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0,
+ ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo)));
+
+ eva = PVO_VADDR(pvo) + HPT_SP_SIZE;
+
+ /* Make sure further insertion is locked out during evictions */
+ rm_rlock(&mphyp_eviction_lock, &track);
+
+ for (; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ /* Initialize PTE */
+ moea64_pte_from_pvo(pvo, &pte);
+
+ ret = mphyp_pte_insert_locked(pvo, &pte);
+ if (ret == -1) {
+ /*
+ * Out of luck. Find a PTE to sacrifice.
+ */
+
+ /* Lock out all insertions for a bit */
+ rm_runlock(&mphyp_eviction_lock, &track);
+ rm_wlock(&mphyp_eviction_lock);
+ mphyp_pte_evict_and_insert_locked(pvo, &pte);
+ rm_wunlock(&mphyp_eviction_lock); /* All clear */
+ rm_rlock(&mphyp_eviction_lock, &track);
+ }
+ }
+
+ rm_runlock(&mphyp_eviction_lock, &track);
+ return (0);
+}
+
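+/*
+ * Replace the PTEs backing a superpage by unsetting and reinserting
+ * them, returning the accumulated REF/CHG bits.
+ */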
+static int64_t
+mphyp_pte_replace_sp(struct pvo_entry *pvo)
+{
+ int64_t refchg;
+
+ refchg = mphyp_pte_unset_sp(pvo);
+ mphyp_pte_insert_sp(pvo);
+ return (refchg);
+}
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 5a4d392b5e69..81bcfb6b58b2 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -542,7 +542,8 @@ vm_fault_populate(struct faultstate *fs)
pidx += npages, m = vm_page_next(&m[npages - 1])) {
vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
#if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
- __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)
+ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) || \
+ defined(__powerpc64__)
psind = m->psind;
if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||