Diffstat (limited to 'sys/vm')
-rw-r--r--   sys/vm/uma_core.c     |  10
-rw-r--r--   sys/vm/vm_domainset.c | 265
-rw-r--r--   sys/vm/vm_domainset.h |  15
-rw-r--r--   sys/vm/vm_fault.c     | 209
-rw-r--r--   sys/vm/vm_glue.c      |   2
-rw-r--r--   sys/vm/vm_kern.c      |  12
-rw-r--r--   sys/vm/vm_page.c      |  21
7 files changed, 342 insertions, 192 deletions
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 5189f7405400..679b2e20e88b 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -4017,8 +4017,9 @@ restart:
rr = rdomain == UMA_ANYDOMAIN;
if (rr) {
aflags = (flags & ~M_WAITOK) | M_NOWAIT;
- vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
- &aflags);
+ if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+ &aflags) != 0)
+ return (NULL);
} else {
aflags = flags;
domain = rdomain;
@@ -5245,8 +5246,9 @@ uma_prealloc(uma_zone_t zone, int items)
slabs = howmany(items, keg->uk_ipers);
while (slabs-- > 0) {
aflags = M_NOWAIT;
- vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
- &aflags);
+ if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+ &aflags) != 0)
+ panic("%s: Domainset is empty", __func__);
for (;;) {
slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
aflags);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index b44bdb96b0d4..9fa17da954f7 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -58,6 +58,9 @@
static int vm_domainset_default_stride = 64;
+static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);
+
+
/*
* Determine which policy is to be used for this allocation.
*/
@@ -93,28 +96,15 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
pindex += (((uintptr_t)obj) / sizeof(*obj));
di->di_offset = pindex;
}
- /* Skip domains below min on the first pass. */
- di->di_minskip = true;
}
static void
vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
{
+ /* Grab the next domain in 'ds_order'. */
*domain = di->di_domain->ds_order[
- ++(*di->di_iter) % di->di_domain->ds_cnt];
-}
-
-static void
-vm_domainset_iter_prefer(struct vm_domainset_iter *di, int *domain)
-{
- int d;
-
- do {
- d = di->di_domain->ds_order[
- ++(*di->di_iter) % di->di_domain->ds_cnt];
- } while (d == di->di_domain->ds_prefer);
- *domain = d;
+ (*di->di_iter)++ % di->di_domain->ds_cnt];
}
static void
@@ -127,79 +117,144 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
*domain = di->di_domain->ds_order[d];
}
-static void
-vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+/*
+ * Internal function determining the current phase's first candidate domain.
+ *
+ * Returns whether there is an eligible domain, which is returned through
+ * '*domain'. '*domain' can be modified even if there is no eligible domain.
+ *
+ * See the herald comment of vm_domainset_iter_first() below about phases.
+ */
+static bool
+vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
{
-
- KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
switch (di->di_policy) {
case DOMAINSET_POLICY_FIRSTTOUCH:
- /*
- * To prevent impossible allocations we convert an invalid
- * first-touch to round-robin.
- */
- /* FALLTHROUGH */
- case DOMAINSET_POLICY_INTERLEAVE:
- /* FALLTHROUGH */
+ *domain = PCPU_GET(domain);
+ break;
case DOMAINSET_POLICY_ROUNDROBIN:
vm_domainset_iter_rr(di, domain);
break;
case DOMAINSET_POLICY_PREFER:
- vm_domainset_iter_prefer(di, domain);
+ *domain = di->di_domain->ds_prefer;
+ break;
+ case DOMAINSET_POLICY_INTERLEAVE:
+ vm_domainset_iter_interleave(di, domain);
break;
default:
panic("%s: Unknown policy %d", __func__, di->di_policy);
}
KASSERT(*domain < vm_ndomains,
("%s: Invalid domain %d", __func__, *domain));
+
+ /*
+ * Has the policy's start domain already been visited?
+ */
+ if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
+ return (vm_domainset_iter_next(di, domain));
+
+ DOMAINSET_CLR(*domain, &di->di_remain_mask);
+
+ /* Does it have enough free pages (phase 1)? */
+ if (di->di_minskip && vm_page_count_min_domain(*domain)) {
+ /* Mark the domain as eligible for phase 2. */
+ DOMAINSET_SET(*domain, &di->di_min_mask);
+ return (vm_domainset_iter_next(di, domain));
+ }
+
+ return (true);
}
-static void
+/*
+ * Resets an iterator to point to the first candidate domain.
+ *
+ * Returns whether there is an eligible domain to start with. '*domain' may be
+ * modified even if there is none.
+ *
+ * There must have been one call to vm_domainset_iter_init() before.
+ *
+ * This function must be called at least once before calling
+ * vm_domainset_iter_next(). Note that functions wrapping
+ * vm_domainset_iter_init() usually do that themselves.
+ *
+ * This function may be called again to reset the iterator to the policy's first
+ * candidate domain. After each reset, the iterator will visit the same domains
+ * as in the previous iteration minus those on which vm_domainset_iter_ignore()
+ * has been called. Note that the first candidate domain may change at each
+ * reset (at the time of this writing, only with the DOMAINSET_POLICY_ROUNDROBIN
+ * policy).
+ *
+ * Domains whose number of free pages is above 'v_free_min' are always
+ * visited first (this is called "phase 1" in comments, "phase 2" being the
+ * examination of the remaining domains; no domain is ever visited twice).
+ */
+static bool
vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
{
+ /* Initialize the mask of domains to visit. */
+ DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
+ /*
+ * No candidate domains for phase 2 at start. This will be filled by
+ * phase 1.
+ */
+ DOMAINSET_ZERO(&di->di_min_mask);
+ /* Skip domains below 'v_free_min' on phase 1. */
+ di->di_minskip = true;
- switch (di->di_policy) {
- case DOMAINSET_POLICY_FIRSTTOUCH:
- *domain = PCPU_GET(domain);
- if (DOMAINSET_ISSET(*domain, &di->di_valid_mask)) {
- /*
- * Add an extra iteration because we will visit the
- * current domain a second time in the rr iterator.
- */
- di->di_n = di->di_domain->ds_cnt + 1;
- break;
- }
- /*
- * To prevent impossible allocations we convert an invalid
- * first-touch to round-robin.
- */
- /* FALLTHROUGH */
- case DOMAINSET_POLICY_ROUNDROBIN:
- di->di_n = di->di_domain->ds_cnt;
+ return (vm_domainset_iter_phase_first(di, domain));
+}
+
+/*
+ * Advances the iterator to the next candidate domain.
+ *
+ * Returns whether there was another domain to visit. '*domain' may be modified
+ * even if there is none.
+ *
+ * vm_domainset_iter_first() must have been called at least once before using
+ * this function (see its herald comment for more details on iterators).
+ */
+static bool
+vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+{
+ /* Loop while there remain domains to visit in the current phase. */
+ while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
+ /* Grab the next domain in 'ds_order'. */
vm_domainset_iter_rr(di, domain);
- break;
- case DOMAINSET_POLICY_PREFER:
- *domain = di->di_domain->ds_prefer;
- di->di_n = di->di_domain->ds_cnt;
- break;
- case DOMAINSET_POLICY_INTERLEAVE:
- vm_domainset_iter_interleave(di, domain);
- di->di_n = di->di_domain->ds_cnt;
- break;
- default:
- panic("%s: Unknown policy %d", __func__, di->di_policy);
+ KASSERT(*domain < vm_ndomains,
+ ("%s: Invalid domain %d", __func__, *domain));
+
+ if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
+ DOMAINSET_CLR(*domain, &di->di_remain_mask);
+ if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+ return (true);
+ DOMAINSET_SET(*domain, &di->di_min_mask);
+ }
}
- KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
- KASSERT(*domain < vm_ndomains,
- ("%s: Invalid domain %d", __func__, *domain));
+
+ /*
+ * If phase 1 (skip low memory domains) is over, start phase 2 (consider
+ * low memory domains).
+ */
+ if (di->di_minskip) {
+ di->di_minskip = false;
+ /* Browse domains that were under 'v_free_min'. */
+ DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
+ return (vm_domainset_iter_phase_first(di, domain));
+ }
+
+ return (false);
}
-void
+int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
- vm_pindex_t pindex, int *domain, int *req, struct pctrie_iter *pages)
+ vm_pindex_t pindex, int *domain, int *req)
{
struct domainset_ref *dr;
+ di->di_flags = *req;
+ *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
+ VM_ALLOC_NOWAIT;
+
/*
* Object policy takes precedence over thread policy. The policies
* are immutable and unsynchronized. Updates can race but pointer
@@ -209,36 +264,21 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
dr = &obj->domain;
else
dr = &curthread->td_domain;
+
vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
- di->di_flags = *req;
- *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
- VM_ALLOC_NOWAIT;
- vm_domainset_iter_first(di, domain);
- if (vm_page_count_min_domain(*domain))
- vm_domainset_iter_page(di, obj, domain, pages);
+ /*
+ * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
+ * passed?
+ */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
int *domain, struct pctrie_iter *pages)
{
- if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask)))
- return (ENOMEM);
-
- /* If there are more domains to visit we run the iterator. */
- while (--di->di_n != 0) {
- vm_domainset_iter_next(di, domain);
- if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
- (!di->di_minskip || !vm_page_count_min_domain(*domain)))
- return (0);
- }
-
- /* If we skipped domains below min restart the search. */
- if (di->di_minskip) {
- di->di_minskip = false;
- vm_domainset_iter_first(di, domain);
+ if (vm_domainset_iter_next(di, domain))
return (0);
- }
/* If we visited all domains and this was a NOWAIT we return error. */
if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
@@ -257,61 +297,43 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
return (ENOMEM);
/* Restart the search. */
- vm_domainset_iter_first(di, domain);
-
- return (0);
+ /* XXXOC: Shouldn't we just panic on 'false'? */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
-static void
+static int
_vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
int *flags)
{
-
di->di_flags = *flags;
*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
- vm_domainset_iter_first(di, domain);
- if (vm_page_count_min_domain(*domain))
- vm_domainset_iter_policy(di, domain);
+ /* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
-void
+int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
struct domainset *ds, int *domain, int *flags)
{
vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
- _vm_domainset_iter_policy_init(di, domain, flags);
+ return (_vm_domainset_iter_policy_init(di, domain, flags));
}
-void
+int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
struct domainset_ref *dr, int *domain, int *flags)
{
vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
- _vm_domainset_iter_policy_init(di, domain, flags);
+ return (_vm_domainset_iter_policy_init(di, domain, flags));
}
int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{
- if (DOMAINSET_EMPTY(&di->di_valid_mask))
- return (ENOMEM);
-
- /* If there are more domains to visit we run the iterator. */
- while (--di->di_n != 0) {
- vm_domainset_iter_next(di, domain);
- if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
- (!di->di_minskip || !vm_page_count_min_domain(*domain)))
- return (0);
- }
-
- /* If we skipped domains below min restart the search. */
- if (di->di_minskip) {
- di->di_minskip = false;
- vm_domainset_iter_first(di, domain);
+ if (vm_domainset_iter_next(di, domain))
return (0);
- }
/* If we visited all domains and this was a NOWAIT we return error. */
if ((di->di_flags & M_WAITOK) == 0)
@@ -321,9 +343,8 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
vm_wait_doms(&di->di_valid_mask, 0);
/* Restart the search. */
- vm_domainset_iter_first(di, domain);
-
- return (0);
+ /* XXXOC: Shouldn't we just panic on 'false'? */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
void
@@ -345,12 +366,12 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
return (EJUSTRETURN);
}
-void
+int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
- vm_pindex_t pindex, int *domain, int *flags, struct pctrie_iter *pages)
+ vm_pindex_t pindex, int *domain, int *flags)
{
-
*domain = 0;
+ return (0);
}
int
@@ -360,20 +381,20 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
return (EJUSTRETURN);
}
-void
+int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
struct domainset *ds, int *domain, int *flags)
{
-
*domain = 0;
+ return (0);
}
-void
+int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
struct domainset_ref *dr, int *domain, int *flags)
{
-
*domain = 0;
+ return (0);
}
void
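[Editor's note] The rework above replaces the old countdown counter ('di_n') with three domain masks and a two-phase walk: phase 1 only visits domains whose free page count is above 'v_free_min', and phase 2 revisits the domains that phase 1 skipped. The following stand-alone sketch illustrates that mask bookkeeping with plain bitmasks; it is only an illustration of the idea, not kernel code, and domain_is_low() is a hypothetical stand-in for vm_page_count_min_domain().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	NDOM	4

/* Hypothetical stand-in for vm_page_count_min_domain(). */
static bool domain_is_low(int d) { return (d == 1 || d == 2); }

struct iter {
	uint32_t remain;	/* domains still to visit in this phase */
	uint32_t low;		/* domains skipped during phase 1 */
	bool	 minskip;	/* true while in phase 1 */
	int	 rr;		/* round-robin cursor, persists across resets */
};

static void
iter_first(struct iter *it, uint32_t valid)
{
	it->remain = valid;	/* like di_remain_mask <- di_valid_mask */
	it->low = 0;		/* like di_min_mask */
	it->minskip = true;	/* phase 1: skip low-memory domains */
}

static bool
iter_next(struct iter *it, int *dom)
{
	while (it->remain != 0) {
		int d = it->rr++ % NDOM;	/* like vm_domainset_iter_rr() */

		if ((it->remain & (1u << d)) == 0)
			continue;
		it->remain &= ~(1u << d);
		if (it->minskip && domain_is_low(d)) {
			it->low |= 1u << d;	/* revisit in phase 2 */
			continue;
		}
		*dom = d;
		return (true);
	}
	if (it->minskip) {			/* switch to phase 2 */
		it->minskip = false;
		it->remain = it->low;
		return (iter_next(it, dom));
	}
	return (false);
}

int
main(void)
{
	struct iter it = { .rr = 0 };
	int d;

	iter_first(&it, 0xf);			/* all four domains valid */
	while (iter_next(&it, &d))
		printf("visit domain %d\n", d);	/* 0, 3, then 1, 2 */
	return (0);
}

With four domains where domains 1 and 2 are below the free-page minimum, the sketch visits 0 and 3 in phase 1, then 1 and 2 in phase 2; no domain is visited twice, mirroring the behaviour documented in vm_domainset_iter_first().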
diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h
index 0d325a642f40..ef86c8ccb5e4 100644
--- a/sys/vm/vm_domainset.h
+++ b/sys/vm/vm_domainset.h
@@ -33,23 +33,26 @@ struct pctrie_iter;
struct vm_domainset_iter {
struct domainset *di_domain;
unsigned int *di_iter;
+ /* Initialized from 'di_domain'; copied into 'di_remain_mask' on reset. */
domainset_t di_valid_mask;
+ /* Domains to browse in the current phase. */
+ domainset_t di_remain_mask;
+ /* Domains skipped in phase 1 because under 'v_free_min'. */
+ domainset_t di_min_mask;
vm_pindex_t di_offset;
int di_flags;
uint16_t di_policy;
- domainid_t di_n;
bool di_minskip;
};
int vm_domainset_iter_page(struct vm_domainset_iter *, struct vm_object *,
int *, struct pctrie_iter *);
-void vm_domainset_iter_page_init(struct vm_domainset_iter *,
- struct vm_object *, vm_pindex_t, int *, int *,
- struct pctrie_iter *);
+int vm_domainset_iter_page_init(struct vm_domainset_iter *,
+ struct vm_object *, vm_pindex_t, int *, int *);
int vm_domainset_iter_policy(struct vm_domainset_iter *, int *);
-void vm_domainset_iter_policy_init(struct vm_domainset_iter *,
+int vm_domainset_iter_policy_init(struct vm_domainset_iter *,
struct domainset *, int *, int *);
-void vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
+int vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
struct domainset_ref *, int *, int *);
void vm_domainset_iter_ignore(struct vm_domainset_iter *, int);
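[Editor's note] Since the *_init() functions can now fail when the domain set turns out to be empty, callers check their return value before entering the usual retry loop. A minimal sketch of that caller pattern follows (kernel-style C fragment assuming the usual headers; alloc_from_domain() is a placeholder for whatever per-domain allocator the caller uses, e.g. kmem_malloc_domain() in the vm_kern.c hunk below):

/*
 * Caller-side pattern for the new iterator API (sketch only; assumes the
 * usual kernel headers and a hypothetical alloc_from_domain() helper).
 */
static void *
alloc_with_policy(struct domainset *ds, vm_size_t size, int flags)
{
	struct vm_domainset_iter di;
	void *addr;
	int domain;

	/*
	 * The init function strips M_WAITOK from 'flags' and reports an
	 * empty domain set instead of handing back a bogus first domain.
	 */
	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
		return (NULL);
	do {
		/* Try the current candidate domain with M_NOWAIT semantics. */
		addr = alloc_from_domain(domain, size, flags);
		if (addr != NULL)
			break;
		/*
		 * vm_domainset_iter_policy() advances to the next domain;
		 * once every domain has been tried it either sleeps (if the
		 * caller originally passed M_WAITOK) or returns ENOMEM.
		 */
	} while (vm_domainset_iter_policy(&di, &domain) == 0);

	return (addr);
}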
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 58f8ac16fa0c..2e150b368d71 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -71,11 +71,9 @@
* Page fault handling module.
*/
-#include <sys/cdefs.h>
#include "opt_ktrace.h"
#include "opt_vm.h"
-#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
@@ -204,7 +202,10 @@ vm_fault_page_release(vm_page_t *mp)
* pageout while optimizing fault restarts.
*/
vm_page_deactivate(m);
- vm_page_xunbusy(m);
+ if (vm_page_xbusied(m))
+ vm_page_xunbusy(m);
+ else
+ vm_page_sunbusy(m);
*mp = NULL;
}
}
@@ -260,6 +261,12 @@ vm_fault_unlock_vp(struct faultstate *fs)
}
}
+static bool
+vm_fault_might_be_cow(struct faultstate *fs)
+{
+ return (fs->object != fs->first_object);
+}
+
static void
vm_fault_deallocate(struct faultstate *fs)
{
@@ -267,7 +274,7 @@ vm_fault_deallocate(struct faultstate *fs)
vm_fault_page_release(&fs->m_cow);
vm_fault_page_release(&fs->m);
vm_object_pip_wakeup(fs->object);
- if (fs->object != fs->first_object) {
+ if (vm_fault_might_be_cow(fs)) {
VM_OBJECT_WLOCK(fs->first_object);
vm_fault_page_free(&fs->first_m);
VM_OBJECT_WUNLOCK(fs->first_object);
@@ -329,6 +336,13 @@ vm_fault_dirty(struct faultstate *fs, vm_page_t m)
}
+static bool
+vm_fault_is_read(const struct faultstate *fs)
+{
+ return ((fs->prot & VM_PROT_WRITE) == 0 &&
+ (fs->fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) == 0);
+}
+
/*
* Unlocks fs.first_object and fs.map on success.
*/
@@ -694,21 +708,18 @@ _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
#endif
/*
- * vm_fault_trap:
- *
- * Handle a page fault occurring at the given address,
- * requiring the given permissions, in the map specified.
- * If successful, the page is inserted into the
- * associated physical map.
+ * vm_fault_trap:
*
- * NOTE: the given address should be truncated to the
- * proper page address.
+ * Helper for the page fault trap handlers, wrapping vm_fault().
+ * Issues ktrace(2) tracepoints for the faults.
*
- * KERN_SUCCESS is returned if the page fault is handled; otherwise,
- * a standard error specifying why the fault is fatal is returned.
+ * If a fault cannot be handled successfully by satisfying the
+ * required mapping, and the faulted instruction cannot be restarted,
+ * the signal number and si_code values are returned for trapsignal()
+ * to deliver.
*
- * The map in question must be referenced, and remains so.
- * Caller may hold no locks.
+ * Returns Mach error codes, but callers should only check for
+ * KERN_SUCCESS.
*/
int
vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
@@ -1002,12 +1013,22 @@ vm_fault_relookup(struct faultstate *fs)
return (KERN_SUCCESS);
}
+static bool
+vm_fault_can_cow_rename(struct faultstate *fs)
+{
+ return (
+ /* Only one shadow object and no other refs. */
+ fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
+ /* No other ways to look the object up. */
+ fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0);
+}
+
static void
vm_fault_cow(struct faultstate *fs)
{
- bool is_first_object_locked;
+ bool is_first_object_locked, rename_cow;
- KASSERT(fs->object != fs->first_object,
+ KASSERT(vm_fault_might_be_cow(fs),
("source and target COW objects are identical"));
/*
@@ -1019,21 +1040,29 @@ vm_fault_cow(struct faultstate *fs)
* object so that it will go out to swap when needed.
*/
is_first_object_locked = false;
- if (
- /*
- * Only one shadow object and no other refs.
- */
- fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
- /*
- * No other ways to look the object up
- */
- fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
- /*
- * We don't chase down the shadow chain and we can acquire locks.
- */
- (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
- fs->object == fs->first_object->backing_object &&
- VM_OBJECT_TRYWLOCK(fs->object)) {
+ rename_cow = false;
+
+ if (vm_fault_can_cow_rename(fs) && vm_page_xbusied(fs->m)) {
+ /*
+ * Check that we don't chase down the shadow chain and
+ * we can acquire locks. Recheck the conditions for
+ * rename after the shadow chain is stable after the
+ * object locking.
+ */
+ is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object);
+ if (is_first_object_locked &&
+ fs->object == fs->first_object->backing_object) {
+ if (VM_OBJECT_TRYWLOCK(fs->object)) {
+ rename_cow = vm_fault_can_cow_rename(fs);
+ if (!rename_cow)
+ VM_OBJECT_WUNLOCK(fs->object);
+ }
+ }
+ }
+
+ if (rename_cow) {
+ vm_page_assert_xbusied(fs->m);
+
/*
* Remove but keep xbusy for replace. fs->m is moved into
* fs->first_object and left busy while fs->first_m is
@@ -1090,8 +1119,12 @@ vm_fault_cow(struct faultstate *fs)
* address space. If OBJ_ONEMAPPING is set after the check,
* removing mappings will at worse trigger some unnecessary page
* faults.
+ *
+ * In the fs->m shared busy case, the xbusy state of
+ * fs->first_m prevents new mappings of fs->m from
+ * being created because a parallel fault on this
+ * shadow chain should wait for xbusy on fs->first_m.
*/
- vm_page_assert_xbusied(fs->m_cow);
if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0)
pmap_remove_all(fs->m_cow);
}
@@ -1171,7 +1204,7 @@ vm_fault_zerofill(struct faultstate *fs)
* If there's no object left, fill the page in the top
* object with zeros.
*/
- if (fs->object != fs->first_object) {
+ if (vm_fault_might_be_cow(fs)) {
vm_object_pip_wakeup(fs->object);
fs->object = fs->first_object;
fs->pindex = fs->first_pindex;
@@ -1420,14 +1453,13 @@ vm_fault_getpages(struct faultstate *fs, int *behindp, int *aheadp)
* and we could end up trying to pagein and pageout the same page
* simultaneously.
*
- * We can theoretically allow the busy case on a read fault if the page
- * is marked valid, but since such pages are typically already pmap'd,
- * putting that special case in might be more effort then it is worth.
- * We cannot under any circumstances mess around with a shared busied
- * page except, perhaps, to pmap it.
+ * We allow the busy case on a read fault if the page is valid. We
+ * cannot under any circumstances mess around with a shared busied
+ * page except, perhaps, to pmap it. This is controlled by the
+ * VM_ALLOC_SBUSY bit in the allocflags argument.
*/
static void
-vm_fault_busy_sleep(struct faultstate *fs)
+vm_fault_busy_sleep(struct faultstate *fs, int allocflags)
{
/*
* Reference the page before unlocking and
@@ -1435,13 +1467,13 @@ vm_fault_busy_sleep(struct faultstate *fs)
* likely to reclaim it.
*/
vm_page_aflag_set(fs->m, PGA_REFERENCED);
- if (fs->object != fs->first_object) {
+ if (vm_fault_might_be_cow(fs)) {
vm_fault_page_release(&fs->first_m);
vm_object_pip_wakeup(fs->first_object);
}
vm_object_pip_wakeup(fs->object);
vm_fault_unlock_map(fs);
- if (!vm_page_busy_sleep(fs->m, "vmpfw", 0))
+ if (!vm_page_busy_sleep(fs->m, "vmpfw", allocflags))
VM_OBJECT_UNLOCK(fs->object);
VM_CNT_INC(v_intrans);
vm_object_deallocate(fs->first_object);
@@ -1487,8 +1519,53 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
vm_page_iter_init(&pages, fs->object);
fs->m = vm_radix_iter_lookup(&pages, fs->pindex);
if (fs->m != NULL) {
+ /*
+ * If the found page is valid, will be either shadowed
+ * or mapped read-only, and will not be renamed for
+ * COW, then busy it in shared mode. This allows
+ * other faults needing this page to proceed in
+ * parallel.
+ *
+ * Unlocked check for validity, rechecked after busy
+ * is obtained.
+ */
+ if (vm_page_all_valid(fs->m) &&
+ /*
+ * No write permissions for the new fs->m mapping,
+ * or the first object has only one mapping, so
+ * other writeable COW mappings of fs->m cannot
+ * appear under us.
+ */
+ (vm_fault_is_read(fs) || vm_fault_might_be_cow(fs)) &&
+ /*
+ * fs->m cannot be renamed from object to
+ * first_object. These conditions will be
+ * re-checked with proper synchronization in
+ * vm_fault_cow().
+ */
+ (!vm_fault_can_cow_rename(fs) ||
+ fs->object != fs->first_object->backing_object)) {
+ if (!vm_page_trysbusy(fs->m)) {
+ vm_fault_busy_sleep(fs, VM_ALLOC_SBUSY);
+ return (FAULT_RESTART);
+ }
+
+ /*
+ * Now make sure that racily checked
+ * conditions are still valid.
+ */
+ if (__predict_true(vm_page_all_valid(fs->m) &&
+ (vm_fault_is_read(fs) ||
+ vm_fault_might_be_cow(fs)))) {
+ VM_OBJECT_UNLOCK(fs->object);
+ return (FAULT_SOFT);
+ }
+
+ vm_page_sunbusy(fs->m);
+ }
+
if (!vm_page_tryxbusy(fs->m)) {
- vm_fault_busy_sleep(fs);
+ vm_fault_busy_sleep(fs, 0);
return (FAULT_RESTART);
}
@@ -1546,6 +1623,27 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
return (res);
}
+/*
+ * vm_fault:
+ *
+ * Handle a page fault occurring at the given address, requiring the
+ * given permissions, in the map specified. If successful, the page
+ * is inserted into the associated physical map, and optionally
+ * referenced and returned in *m_hold.
+ *
+ * The given address should be truncated to the proper page address.
+ *
+ * KERN_SUCCESS is returned if the page fault is handled; otherwise, a
+ * Mach error specifying why the fault is fatal is returned.
+ *
+ * The map in question must be alive, either by being the map of the
+ * current process, or by having the owner process' hold count
+ * incremented to prevent exit().
+ *
+ * If the thread private TDP_NOFAULTING flag is set, any fault results
+ * in immediate protection failure. Otherwise the fault is processed,
+ * and caller may hold no locks.
+ */
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
@@ -1701,10 +1799,15 @@ RetryFault:
found:
/*
- * A valid page has been found and exclusively busied. The
- * object lock must no longer be held.
+ * A valid page has been found and busied. The object lock
+ * must no longer be held if the page was busied.
+ *
+ * Regardless of the busy state of fs.m, fs.first_m is always
+ * exclusively busied after the first iteration of the loop
+ * calling vm_fault_object(). This is an ordering point for
+ * the parallel faults occurring on the same page.
*/
- vm_page_assert_xbusied(fs.m);
+ vm_page_assert_busied(fs.m);
VM_OBJECT_ASSERT_UNLOCKED(fs.object);
/*
@@ -1712,7 +1815,7 @@ found:
* top-level object, we have to copy it into a new page owned by the
* top-level object.
*/
- if (fs.object != fs.first_object) {
+ if (vm_fault_might_be_cow(&fs)) {
/*
* We only really need to copy if we want to write it.
*/
@@ -1773,7 +1876,7 @@ found:
* Page must be completely valid or it is not fit to
* map into user space. vm_pager_get_pages() ensures this.
*/
- vm_page_assert_xbusied(fs.m);
+ vm_page_assert_busied(fs.m);
KASSERT(vm_page_all_valid(fs.m),
("vm_fault: page %p partially invalid", fs.m));
@@ -1805,7 +1908,13 @@ found:
(*fs.m_hold) = fs.m;
vm_page_wire(fs.m);
}
- vm_page_xunbusy(fs.m);
+
+ KASSERT(fs.first_object == fs.object || vm_page_xbusied(fs.first_m),
+ ("first_m must be xbusy"));
+ if (vm_page_xbusied(fs.m))
+ vm_page_xunbusy(fs.m);
+ else
+ vm_page_sunbusy(fs.m);
fs.m = NULL;
/*
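[Editor's note] The rewritten herald comment of vm_fault_trap() above describes its contract with the machine-dependent trap handlers: on failure it hands back a signal number and si_code for trapsignal() to deliver. A rough sketch of that caller side is shown here; details vary per architecture and the helper name is made up for illustration.

/*
 * Sketch of a machine-dependent user page-fault path calling vm_fault_trap();
 * handle_user_fault() is a hypothetical helper, not an existing function.
 */
static int
handle_user_fault(vm_map_t map, vm_offset_t va, vm_prot_t ftype)
{
	ksiginfo_t ksi;
	int rv, signo, ucode;

	rv = vm_fault_trap(map, va, ftype, VM_FAULT_NORMAL, &signo, &ucode);
	if (rv == KERN_SUCCESS)
		return (0);

	/* The fault is fatal; deliver the signal chosen by vm_fault_trap(). */
	ksiginfo_init_trap(&ksi);
	ksi.ksi_signo = signo;
	ksi.ksi_code = ucode;
	ksi.ksi_addr = (void *)va;
	trapsignal(curthread, &ksi);
	return (rv);
}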
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 94df2c2f9a9e..e0f1807a1b32 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -453,7 +453,7 @@ vm_thread_stack_create(struct domainset *ds, int pages)
obj = vm_thread_kstack_size_to_obj(pages);
if (vm_ndomains > 1)
obj->domain.dr_policy = ds;
- vm_domainset_iter_page_init(&di, obj, 0, &domain, &req, NULL);
+ vm_domainset_iter_page_init(&di, obj, 0, &domain, &req);
do {
/*
* Get a kernel virtual address for this thread's kstack.
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index e7d7b6726d2c..ac327aa37b72 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -323,7 +323,9 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
start_segind = -1;
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+ return (NULL);
+
do {
addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
memattr);
@@ -417,7 +419,9 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
start_segind = -1;
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags))
+ return (NULL);
+
do {
addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
alignment, boundary, memattr);
@@ -517,7 +521,9 @@ kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags)
void *addr;
int domain;
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+ return (NULL);
+
do {
addr = kmem_malloc_domain(domain, size, flags);
if (addr != NULL)
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index abad5efb8a79..16878604fa11 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2015,8 +2015,9 @@ vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req,
vm_page_t m;
int domain;
- vm_domainset_iter_page_init(&di, object, pindex, &domain, &req,
- pages);
+ if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_domain_iter(object, pindex, domain, req,
pages);
@@ -2268,7 +2269,9 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
start_segind = -1;
- vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_contig_domain(object, pindex, domain, req,
npages, low, high, alignment, boundary, memattr);
@@ -2596,7 +2599,9 @@ vm_page_alloc_noobj(int req)
vm_page_t m;
int domain;
- vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_noobj_domain(domain, req);
if (m != NULL)
@@ -2615,7 +2620,9 @@ vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
vm_page_t m;
int domain;
- vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
high, alignment, boundary, memattr);
@@ -3334,7 +3341,9 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
ret = ERANGE;
- vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+ return (ret);
+
do {
status = vm_page_reclaim_contig_domain(domain, req, npages, low,
high, alignment, boundary);