Diffstat (limited to 'sys/vm')
-rw-r--r--   sys/vm/uma_core.c     |  10
-rw-r--r--   sys/vm/vm_domainset.c | 265
-rw-r--r--   sys/vm/vm_domainset.h |  15
-rw-r--r--   sys/vm/vm_fault.c     | 209
-rw-r--r--   sys/vm/vm_glue.c      |   2
-rw-r--r--   sys/vm/vm_kern.c      |  12
-rw-r--r--   sys/vm/vm_page.c      |  21
7 files changed, 342 insertions, 192 deletions
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 5189f7405400..679b2e20e88b 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -4017,8 +4017,9 @@ restart:
rr = rdomain == UMA_ANYDOMAIN;
if (rr) {
aflags = (flags & ~M_WAITOK) | M_NOWAIT;
- vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
- &aflags);
+ if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+ &aflags) != 0)
+ return (NULL);
} else {
aflags = flags;
domain = rdomain;
@@ -5245,8 +5246,9 @@ uma_prealloc(uma_zone_t zone, int items)
slabs = howmany(items, keg->uk_ipers);
while (slabs-- > 0) {
aflags = M_NOWAIT;
- vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
- &aflags);
+ if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+ &aflags) != 0)
+ panic("%s: Domainset is empty", __func__);
for (;;) {
slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
aflags);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index b44bdb96b0d4..9fa17da954f7 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -58,6 +58,9 @@
static int vm_domainset_default_stride = 64;
+static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);
+
+
/*
* Determine which policy is to be used for this allocation.
*/
@@ -93,28 +96,15 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
pindex += (((uintptr_t)obj) / sizeof(*obj));
di->di_offset = pindex;
}
- /* Skip domains below min on the first pass. */
- di->di_minskip = true;
}
static void
vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
{
+ /* Grab the next domain in 'ds_order'. */
*domain = di->di_domain->ds_order[
- ++(*di->di_iter) % di->di_domain->ds_cnt];
-}
-
-static void
-vm_domainset_iter_prefer(struct vm_domainset_iter *di, int *domain)
-{
- int d;
-
- do {
- d = di->di_domain->ds_order[
- ++(*di->di_iter) % di->di_domain->ds_cnt];
- } while (d == di->di_domain->ds_prefer);
- *domain = d;
+ (*di->di_iter)++ % di->di_domain->ds_cnt];
}
static void
@@ -127,79 +117,144 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
*domain = di->di_domain->ds_order[d];
}
-static void
-vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+/*
+ * Internal function determining the current phase's first candidate domain.
+ *
+ * Returns whether there is an eligible domain, which is returned through
+ * '*domain'. '*domain' can be modified even if there is no eligible domain.
+ *
+ * See the herald comment of vm_domainset_iter_first() below about phases.
+ */
+static bool
+vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
{
-
- KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
switch (di->di_policy) {
case DOMAINSET_POLICY_FIRSTTOUCH:
- /*
- * To prevent impossible allocations we convert an invalid
- * first-touch to round-robin.
- */
- /* FALLTHROUGH */
- case DOMAINSET_POLICY_INTERLEAVE:
- /* FALLTHROUGH */
+ *domain = PCPU_GET(domain);
+ break;
case DOMAINSET_POLICY_ROUNDROBIN:
vm_domainset_iter_rr(di, domain);
break;
case DOMAINSET_POLICY_PREFER:
- vm_domainset_iter_prefer(di, domain);
+ *domain = di->di_domain->ds_prefer;
+ break;
+ case DOMAINSET_POLICY_INTERLEAVE:
+ vm_domainset_iter_interleave(di, domain);
break;
default:
panic("%s: Unknown policy %d", __func__, di->di_policy);
}
KASSERT(*domain < vm_ndomains,
("%s: Invalid domain %d", __func__, *domain));
+
+ /*
+ * Has the policy's start domain already been visited?
+ */
+ if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
+ return (vm_domainset_iter_next(di, domain));
+
+ DOMAINSET_CLR(*domain, &di->di_remain_mask);
+
+ /* Does it have enough free pages (phase 1)? */
+ if (di->di_minskip && vm_page_count_min_domain(*domain)) {
+ /* Mark the domain as eligible for phase 2. */
+ DOMAINSET_SET(*domain, &di->di_min_mask);
+ return (vm_domainset_iter_next(di, domain));
+ }
+
+ return (true);
}
-static void
+/*
+ * Resets an iterator to point to the first candidate domain.
+ *
+ * Returns whether there is an eligible domain to start with. '*domain' may be
+ * modified even if there is none.
+ *
+ * There must have been one call to vm_domainset_iter_init() before.
+ *
+ * This function must be called at least once before calling
+ * vm_domainset_iter_next(). Note that functions wrapping
+ * vm_domainset_iter_init() usually do that themselves.
+ *
+ * This function may be called again to reset the iterator to the policy's first
+ * candidate domain. After each reset, the iterator will visit the same domains
+ * as in the previous iteration minus those on which vm_domainset_iter_ignore()
+ * has been called. Note that the first candidate domain may change at each
+ * reset (at the time of this writing, only with the DOMAINSET_POLICY_ROUNDROBIN
+ * policy).
+ *
+ * Domains whose number of free pages is above 'v_free_min' are always
+ * visited first (this is called "phase 1" in comments, "phase 2" being the
+ * examination of the remaining domains; no domain is ever visited twice).
+ */
+static bool
vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
{
+ /* Initialize the mask of domains to visit. */
+ DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
+ /*
+ * No candidate domains for phase 2 at start. This will be filled by
+ * phase 1.
+ */
+ DOMAINSET_ZERO(&di->di_min_mask);
+ /* Skip domains below 'v_free_min' on phase 1. */
+ di->di_minskip = true;
- switch (di->di_policy) {
- case DOMAINSET_POLICY_FIRSTTOUCH:
- *domain = PCPU_GET(domain);
- if (DOMAINSET_ISSET(*domain, &di->di_valid_mask)) {
- /*
- * Add an extra iteration because we will visit the
- * current domain a second time in the rr iterator.
- */
- di->di_n = di->di_domain->ds_cnt + 1;
- break;
- }
- /*
- * To prevent impossible allocations we convert an invalid
- * first-touch to round-robin.
- */
- /* FALLTHROUGH */
- case DOMAINSET_POLICY_ROUNDROBIN:
- di->di_n = di->di_domain->ds_cnt;
+ return (vm_domainset_iter_phase_first(di, domain));
+}
+
+/*
+ * Advances the iterator to the next candidate domain.
+ *
+ * Returns whether there was another domain to visit. '*domain' may be modified
+ * even if there is none.
+ *
+ * vm_domainset_iter_first() must have been called at least once before using
+ * this function (see its herald comment for more details on iterators).
+ */
+static bool
+vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+{
+ /* Loop while there remain domains to visit in the current phase. */
+ while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
+ /* Grab the next domain in 'ds_order'. */
vm_domainset_iter_rr(di, domain);
- break;
- case DOMAINSET_POLICY_PREFER:
- *domain = di->di_domain->ds_prefer;
- di->di_n = di->di_domain->ds_cnt;
- break;
- case DOMAINSET_POLICY_INTERLEAVE:
- vm_domainset_iter_interleave(di, domain);
- di->di_n = di->di_domain->ds_cnt;
- break;
- default:
- panic("%s: Unknown policy %d", __func__, di->di_policy);
+ KASSERT(*domain < vm_ndomains,
+ ("%s: Invalid domain %d", __func__, *domain));
+
+ if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
+ DOMAINSET_CLR(*domain, &di->di_remain_mask);
+ if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+ return (true);
+ DOMAINSET_SET(*domain, &di->di_min_mask);
+ }
}
- KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
- KASSERT(*domain < vm_ndomains,
- ("%s: Invalid domain %d", __func__, *domain));
+
+ /*
+ * If phase 1 (skip low memory domains) is over, start phase 2 (consider
+ * low memory domains).
+ */
+ if (di->di_minskip) {
+ di->di_minskip = false;
+ /* Browse domains that were under 'v_free_min'. */
+ DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
+ return (vm_domainset_iter_phase_first(di, domain));
+ }
+
+ return (false);
}
-void
+int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
- vm_pindex_t pindex, int *domain, int *req, struct pctrie_iter *pages)
+ vm_pindex_t pindex, int *domain, int *req)
{
struct domainset_ref *dr;
+ di->di_flags = *req;
+ *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
+ VM_ALLOC_NOWAIT;
+
/*
* Object policy takes precedence over thread policy. The policies
* are immutable and unsynchronized. Updates can race but pointer
@@ -209,36 +264,21 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
dr = &obj->domain;
else
dr = &curthread->td_domain;
+
vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
- di->di_flags = *req;
- *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
- VM_ALLOC_NOWAIT;
- vm_domainset_iter_first(di, domain);
- if (vm_page_count_min_domain(*domain))
- vm_domainset_iter_page(di, obj, domain, pages);
+ /*
+ * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
+ * passed?
+ */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
int *domain, struct pctrie_iter *pages)
{
- if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask)))
- return (ENOMEM);
-
- /* If there are more domains to visit we run the iterator. */
- while (--di->di_n != 0) {
- vm_domainset_iter_next(di, domain);
- if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
- (!di->di_minskip || !vm_page_count_min_domain(*domain)))
- return (0);
- }
-
- /* If we skipped domains below min restart the search. */
- if (di->di_minskip) {
- di->di_minskip = false;
- vm_domainset_iter_first(di, domain);
+ if (vm_domainset_iter_next(di, domain))
return (0);
- }
/* If we visited all domains and this was a NOWAIT we return error. */
if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
@@ -257,61 +297,43 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
return (ENOMEM);
/* Restart the search. */
- vm_domainset_iter_first(di, domain);
-
- return (0);
+ /* XXXOC: Shouldn't we just panic on 'false'? */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
-static void
+static int
_vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
int *flags)
{
-
di->di_flags = *flags;
*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
- vm_domainset_iter_first(di, domain);
- if (vm_page_count_min_domain(*domain))
- vm_domainset_iter_policy(di, domain);
+ /* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
-void
+int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
struct domainset *ds, int *domain, int *flags)
{
vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
- _vm_domainset_iter_policy_init(di, domain, flags);
+ return (_vm_domainset_iter_policy_init(di, domain, flags));
}
-void
+int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
struct domainset_ref *dr, int *domain, int *flags)
{
vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
- _vm_domainset_iter_policy_init(di, domain, flags);
+ return (_vm_domainset_iter_policy_init(di, domain, flags));
}
int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{
- if (DOMAINSET_EMPTY(&di->di_valid_mask))
- return (ENOMEM);
-
- /* If there are more domains to visit we run the iterator. */
- while (--di->di_n != 0) {
- vm_domainset_iter_next(di, domain);
- if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
- (!di->di_minskip || !vm_page_count_min_domain(*domain)))
- return (0);
- }
-
- /* If we skipped domains below min restart the search. */
- if (di->di_minskip) {
- di->di_minskip = false;
- vm_domainset_iter_first(di, domain);
+ if (vm_domainset_iter_next(di, domain))
return (0);
- }
/* If we visited all domains and this was a NOWAIT we return error. */
if ((di->di_flags & M_WAITOK) == 0)
@@ -321,9 +343,8 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
vm_wait_doms(&di->di_valid_mask, 0);
/* Restart the search. */
- vm_domainset_iter_first(di, domain);
-
- return (0);
+ /* XXXOC: Shouldn't we just panic on 'false'? */
+ return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}
void
@@ -345,12 +366,12 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
return (EJUSTRETURN);
}
-void
+int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
- vm_pindex_t pindex, int *domain, int *flags, struct pctrie_iter *pages)
+ vm_pindex_t pindex, int *domain, int *flags)
{
-
*domain = 0;
+ return (0);
}
int
@@ -360,20 +381,20 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
return (EJUSTRETURN);
}
-void
+int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
struct domainset *ds, int *domain, int *flags)
{
-
*domain = 0;
+ return (0);
}
-void
+int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
struct domainset_ref *dr, int *domain, int *flags)
{
-
*domain = 0;
+ return (0);
}
void
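[Editor's note] The rework above replaces the old countdown counter ('di_n') with three domain masks and a two-phase walk: phase 1 only visits domains whose free page count is above 'v_free_min', and phase 2 revisits the domains that phase 1 skipped. The following stand-alone sketch illustrates that mask bookkeeping with plain bitmasks; it is only an illustration of the idea, not kernel code, and domain_is_low() is a hypothetical stand-in for vm_page_count_min_domain().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	NDOM	4

/* Hypothetical stand-in for vm_page_count_min_domain(). */
static bool domain_is_low(int d) { return (d == 1 || d == 2); }

struct iter {
	uint32_t remain;	/* domains still to visit in this phase */
	uint32_t low;		/* domains skipped during phase 1 */
	bool	 minskip;	/* true while in phase 1 */
	int	 rr;		/* round-robin cursor, persists across resets */
};

static void
iter_first(struct iter *it, uint32_t valid)
{
	it->remain = valid;	/* like di_remain_mask <- di_valid_mask */
	it->low = 0;		/* like di_min_mask */
	it->minskip = true;	/* phase 1: skip low-memory domains */
}

static bool
iter_next(struct iter *it, int *dom)
{
	while (it->remain != 0) {
		int d = it->rr++ % NDOM;	/* like vm_domainset_iter_rr() */

		if ((it->remain & (1u << d)) == 0)
			continue;
		it->remain &= ~(1u << d);
		if (it->minskip && domain_is_low(d)) {
			it->low |= 1u << d;	/* revisit in phase 2 */
			continue;
		}
		*dom = d;
		return (true);
	}
	if (it->minskip) {			/* switch to phase 2 */
		it->minskip = false;
		it->remain = it->low;
		return (iter_next(it, dom));
	}
	return (false);
}

int
main(void)
{
	struct iter it = { .rr = 0 };
	int d;

	iter_first(&it, 0xf);			/* all four domains valid */
	while (iter_next(&it, &d))
		printf("visit domain %d\n", d);	/* 0, 3, then 1, 2 */
	return (0);
}

With four domains where domains 1 and 2 are below the free-page minimum, the sketch visits 0 and 3 in phase 1, then 1 and 2 in phase 2; no domain is visited twice, mirroring the behaviour documented in vm_domainset_iter_first().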
diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h
index 0d325a642f40..ef86c8ccb5e4 100644
--- a/sys/vm/vm_domainset.h
+++ b/sys/vm/vm_domainset.h
@@ -33,23 +33,26 @@ struct pctrie_iter;
struct vm_domainset_iter {
struct domainset *di_domain;
unsigned int *di_iter;
+ /* Initialized from 'di_domain'; copied into 'di_remain_mask' on reset. */
domainset_t di_valid_mask;
+ /* Domains to browse in the current phase. */
+ domainset_t di_remain_mask;
+ /* Domains skipped in phase 1 because under 'v_free_min'. */
+ domainset_t di_min_mask;
vm_pindex_t di_offset;
int di_flags;
uint16_t di_policy;
- domainid_t di_n;
bool di_minskip;
};
int vm_domainset_iter_page(struct vm_domainset_iter *, struct vm_object *,
int *, struct pctrie_iter *);
-void vm_domainset_iter_page_init(struct vm_domainset_iter *,
- struct vm_object *, vm_pindex_t, int *, int *,
- struct pctrie_iter *);
+int vm_domainset_iter_page_init(struct vm_domainset_iter *,
+ struct vm_object *, vm_pindex_t, int *, int *);
int vm_domainset_iter_policy(struct vm_domainset_iter *, int *);
-void vm_domainset_iter_policy_init(struct vm_domainset_iter *,
+int vm_domainset_iter_policy_init(struct vm_domainset_iter *,
struct domainset *, int *, int *);
-void vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
+int vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
struct domainset_ref *, int *, int *);
void vm_domainset_iter_ignore(struct vm_domainset_iter *, int);
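[Editor's note] Since the *_init() functions can now fail when the domain set turns out to be empty, callers check their return value before entering the usual retry loop. A minimal sketch of that caller pattern follows (kernel-style C fragment assuming the usual headers; alloc_from_domain() is a placeholder for whatever per-domain allocator the caller uses, e.g. kmem_malloc_domain() in the vm_kern.c hunk below):

/*
 * Caller-side pattern for the new iterator API (sketch only; assumes the
 * usual kernel headers and a hypothetical alloc_from_domain() helper).
 */
static void *
alloc_with_policy(struct domainset *ds, vm_size_t size, int flags)
{
	struct vm_domainset_iter di;
	void *addr;
	int domain;

	/*
	 * The init function strips M_WAITOK from 'flags' and reports an
	 * empty domain set instead of handing back a bogus first domain.
	 */
	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
		return (NULL);
	do {
		/* Try the current candidate domain with M_NOWAIT semantics. */
		addr = alloc_from_domain(domain, size, flags);
		if (addr != NULL)
			break;
		/*
		 * vm_domainset_iter_policy() advances to the next domain;
		 * once every domain has been tried it either sleeps (if the
		 * caller originally passed M_WAITOK) or returns ENOMEM.
		 */
	} while (vm_domainset_iter_policy(&di, &domain) == 0);

	return (addr);
}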
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 58f8ac16fa0c..2e150b368d71 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -71,11 +71,9 @@
* Page fault handling module.
*/
-#include <sys/cdefs.h>
#include "opt_ktrace.h"
#include "opt_vm.h"
-#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
@@ -204,7 +202,10 @@ vm_fault_page_release(vm_page_t *mp)
* pageout while optimizing fault restarts.
*/
vm_page_deactivate(m);
- vm_page_xunbusy(m);
+ if (vm_page_xbusied(m))
+ vm_page_xunbusy(m);
+ else
+ vm_page_sunbusy(m);
*mp = NULL;
}
}
@@ -260,6 +261,12 @@ vm_fault_unlock_vp(struct faultstate *fs)
}
}
+static bool
+vm_fault_might_be_cow(struct faultstate *fs)
+{
+ return (fs->object != fs->first_object);
+}
+
static void
vm_fault_deallocate(struct faultstate *fs)
{
@@ -267,7 +274,7 @@ vm_fault_deallocate(struct faultstate *fs)
vm_fault_page_release(&fs->m_cow);
vm_fault_page_release(&fs->m);
vm_object_pip_wakeup(fs->object);
- if (fs->object != fs->first_object) {
+ if (vm_fault_might_be_cow(fs)) {
VM_OBJECT_WLOCK(fs->first_object);
vm_fault_page_free(&fs->first_m);
VM_OBJECT_WUNLOCK(fs->first_object);
@@ -329,6 +336,13 @@ vm_fault_dirty(struct faultstate *fs, vm_page_t m)
}
+static bool
+vm_fault_is_read(const struct faultstate *fs)
+{
+ return ((fs->prot & VM_PROT_WRITE) == 0 &&
+ (fs->fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) == 0);
+}
+
/*
* Unlocks fs.first_object and fs.map on success.
*/
@@ -694,21 +708,18 @@ _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
#endif
/*
- * vm_fault_trap:
- *
- * Handle a page fault occurring at the given address,
- * requiring the given permissions, in the map specified.
- * If successful, the page is inserted into the
- * associated physical map.
+ * vm_fault_trap:
*
- * NOTE: the given address should be truncated to the
- * proper page address.
+ * Helper for the page fault trap handlers, wrapping vm_fault().
+ * Issues ktrace(2) tracepoints for the faults.
*
- * KERN_SUCCESS is returned if the page fault is handled; otherwise,
- * a standard error specifying why the fault is fatal is returned.
+ * If a fault cannot be handled successfully by satisfying the
+ * required mapping, and the faulted instruction cannot be restarted,
+ * the signal number and si_code values are returned for trapsignal()
+ * to deliver.
*
- * The map in question must be referenced, and remains so.
- * Caller may hold no locks.
+ * Returns Mach error codes, but callers should only check for
+ * KERN_SUCCESS.
*/
int
vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
@@ -1002,12 +1013,22 @@ vm_fault_relookup(struct faultstate *fs)
return (KERN_SUCCESS);
}
+static bool
+vm_fault_can_cow_rename(struct faultstate *fs)
+{
+ return (
+ /* Only one shadow object and no other refs. */
+ fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
+ /* No other ways to look the object up. */
+ fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0);
+}
+
static void
vm_fault_cow(struct faultstate *fs)
{
- bool is_first_object_locked;
+ bool is_first_object_locked, rename_cow;
- KASSERT(fs->object != fs->first_object,
+ KASSERT(vm_fault_might_be_cow(fs),
("source and target COW objects are identical"));
/*
@@ -1019,21 +1040,29 @@ vm_fault_cow(struct faultstate *fs)
* object so that it will go out to swap when needed.
*/
is_first_object_locked = false;
- if (
- /*
- * Only one shadow object and no other refs.
- */
- fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
- /*
- * No other ways to look the object up
- */
- fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
- /*
- * We don't chase down the shadow chain and we can acquire locks.
- */
- (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
- fs->object == fs->first_object->backing_object &&
- VM_OBJECT_TRYWLOCK(fs->object)) {
+ rename_cow = false;
+
+ if (vm_fault_can_cow_rename(fs) && vm_page_xbusied(fs->m)) {
+ /*
+ * Check that we don't chase down the shadow chain and
+ * we can acquire locks. Recheck the conditions for
+ * rename after the shadow chain is stable after the
+ * object locking.
+ */
+ is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object);
+ if (is_first_object_locked &&
+ fs->object == fs->first_object->backing_object) {
+ if (VM_OBJECT_TRYWLOCK(fs->object)) {
+ rename_cow = vm_fault_can_cow_rename(fs);
+ if (!rename_cow)
+ VM_OBJECT_WUNLOCK(fs->object);
+ }
+ }
+ }
+
+ if (rename_cow) {
+ vm_page_assert_xbusied(fs->m);
+
/*
* Remove but keep xbusy for replace. fs->m is moved into
* fs->first_object and left busy while fs->first_m is
@@ -1090,8 +1119,12 @@ vm_fault_cow(struct faultstate *fs)
* address space. If OBJ_ONEMAPPING is set after the check,
* removing mappings will at worse trigger some unnecessary page
* faults.
+ *
+ * In the fs->m shared busy case, the xbusy state of
+ * fs->first_m prevents new mappings of fs->m from
+ * being created because a parallel fault on this
+ * shadow chain should wait for xbusy on fs->first_m.
*/
- vm_page_assert_xbusied(fs->m_cow);
if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0)
pmap_remove_all(fs->m_cow);
}
@@ -1171,7 +1204,7 @@ vm_fault_zerofill(struct faultstate *fs)
* If there's no object left, fill the page in the top
* object with zeros.
*/
- if (fs->object != fs->first_object) {
+ if (vm_fault_might_be_cow(fs)) {
vm_object_pip_wakeup(fs->object);
fs->object = fs->first_object;
fs->pindex = fs->first_pindex;
@@ -1420,14 +1453,13 @@ vm_fault_getpages(struct faultstate *fs, int *behindp, int *aheadp)
* and we could end up trying to pagein and pageout the same page
* simultaneously.
*
- * We can theoretically allow the busy case on a read fault if the page
- * is marked valid, but since such pages are typically already pmap'd,
- * putting that special case in might be more effort then it is worth.
- * We cannot under any circumstances mess around with a shared busied
- * page except, perhaps, to pmap it.
+ * We allow the busy case on a read fault if the page is valid. We
+ * cannot under any circumstances mess around with a shared busied
+ * page except, perhaps, to pmap it. This is controlled by the
+ * VM_ALLOC_SBUSY bit in the allocflags argument.
*/
static void
-vm_fault_busy_sleep(struct faultstate *fs)
+vm_fault_busy_sleep(struct faultstate *fs, int allocflags)
{
/*
* Reference the page before unlocking and
@@ -1435,13 +1467,13 @@ vm_fault_busy_sleep(struct faultstate *fs)
* likely to reclaim it.
*/
vm_page_aflag_set(fs->m, PGA_REFERENCED);
- if (fs->object != fs->first_object) {
+ if (vm_fault_might_be_cow(fs)) {
vm_fault_page_release(&fs->first_m);
vm_object_pip_wakeup(fs->first_object);
}
vm_object_pip_wakeup(fs->object);
vm_fault_unlock_map(fs);
- if (!vm_page_busy_sleep(fs->m, "vmpfw", 0))
+ if (!vm_page_busy_sleep(fs->m, "vmpfw", allocflags))
VM_OBJECT_UNLOCK(fs->object);
VM_CNT_INC(v_intrans);
vm_object_deallocate(fs->first_object);
@@ -1487,8 +1519,53 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
vm_page_iter_init(&pages, fs->object);
fs->m = vm_radix_iter_lookup(&pages, fs->pindex);
if (fs->m != NULL) {
+ /*
+ * If the found page is valid, will be either shadowed
+ * or mapped read-only, and will not be renamed for
+ * COW, then busy it in shared mode. This allows
+ * other faults needing this page to proceed in
+ * parallel.
+ *
+ * Unlocked check for validity, rechecked after busy
+ * is obtained.
+ */
+ if (vm_page_all_valid(fs->m) &&
+ /*
+ * No write permissions for the new fs->m mapping,
+ * or the first object has only one mapping, so
+ * other writeable COW mappings of fs->m cannot
+ * appear under us.
+ */
+ (vm_fault_is_read(fs) || vm_fault_might_be_cow(fs)) &&
+ /*
+ * fs->m cannot be renamed from object to
+ * first_object. These conditions will be
+ * re-checked with proper synchronization in
+ * vm_fault_cow().
+ */
+ (!vm_fault_can_cow_rename(fs) ||
+ fs->object != fs->first_object->backing_object)) {
+ if (!vm_page_trysbusy(fs->m)) {
+ vm_fault_busy_sleep(fs, VM_ALLOC_SBUSY);
+ return (FAULT_RESTART);
+ }
+
+ /*
+ * Now make sure that racily checked
+ * conditions are still valid.
+ */
+ if (__predict_true(vm_page_all_valid(fs->m) &&
+ (vm_fault_is_read(fs) ||
+ vm_fault_might_be_cow(fs)))) {
+ VM_OBJECT_UNLOCK(fs->object);
+ return (FAULT_SOFT);
+ }
+
+ vm_page_sunbusy(fs->m);
+ }
+
if (!vm_page_tryxbusy(fs->m)) {
- vm_fault_busy_sleep(fs);
+ vm_fault_busy_sleep(fs, 0);
return (FAULT_RESTART);
}
@@ -1546,6 +1623,27 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
return (res);
}
+/*
+ * vm_fault:
+ *
+ * Handle a page fault occurring at the given address, requiring the
+ * given permissions, in the map specified. If successful, the page
+ * is inserted into the associated physical map, and optionally
+ * referenced and returned in *m_hold.
+ *
+ * The given address should be truncated to the proper page address.
+ *
+ * KERN_SUCCESS is returned if the page fault is handled; otherwise, a
+ * Mach error specifying why the fault is fatal is returned.
+ *
+ * The map in question must be alive, either by being the map of the
+ * current process, or by having the owner process' hold count
+ * incremented to prevent exit().
+ *
+ * If the thread private TDP_NOFAULTING flag is set, any fault results
+ * in immediate protection failure. Otherwise the fault is processed,
+ * and caller may hold no locks.
+ */
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
@@ -1701,10 +1799,15 @@ RetryFault:
found:
/*
- * A valid page has been found and exclusively busied. The
- * object lock must no longer be held.
+ * A valid page has been found and busied. The object lock
+ * must no longer be held if the page was busied.
+ *
+ * Regardless of the busy state of fs.m, fs.first_m is always
+ * exclusively busied after the first iteration of the loop
+ * calling vm_fault_object(). This is an ordering point for
+ * the parallel faults occurring on the same page.
*/
- vm_page_assert_xbusied(fs.m);
+ vm_page_assert_busied(fs.m);
VM_OBJECT_ASSERT_UNLOCKED(fs.object);
/*
@@ -1712,7 +1815,7 @@ found:
* top-level object, we have to copy it into a new page owned by the
* top-level object.
*/
- if (fs.object != fs.first_object) {
+ if (vm_fault_might_be_cow(&fs)) {
/*
* We only really need to copy if we want to write it.
*/
@@ -1773,7 +1876,7 @@ found:
* Page must be completely valid or it is not fit to
* map into user space. vm_pager_get_pages() ensures this.
*/
- vm_page_assert_xbusied(fs.m);
+ vm_page_assert_busied(fs.m);
KASSERT(vm_page_all_valid(fs.m),
("vm_fault: page %p partially invalid", fs.m));
@@ -1805,7 +1908,13 @@ found:
(*fs.m_hold) = fs.m;
vm_page_wire(fs.m);
}
- vm_page_xunbusy(fs.m);
+
+ KASSERT(fs.first_object == fs.object || vm_page_xbusied(fs.first_m),
+ ("first_m must be xbusy"));
+ if (vm_page_xbusied(fs.m))
+ vm_page_xunbusy(fs.m);
+ else
+ vm_page_sunbusy(fs.m);
fs.m = NULL;
/*
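[Editor's note] The rewritten herald comment of vm_fault_trap() above describes its contract with the machine-dependent trap handlers: on failure it hands back a signal number and si_code for trapsignal() to deliver. A rough sketch of that caller side is shown here; details vary per architecture and the helper name is made up for illustration.

/*
 * Sketch of a machine-dependent user page-fault path calling vm_fault_trap();
 * handle_user_fault() is a hypothetical helper, not an existing function.
 */
static int
handle_user_fault(vm_map_t map, vm_offset_t va, vm_prot_t ftype)
{
	ksiginfo_t ksi;
	int rv, signo, ucode;

	rv = vm_fault_trap(map, va, ftype, VM_FAULT_NORMAL, &signo, &ucode);
	if (rv == KERN_SUCCESS)
		return (0);

	/* The fault is fatal; deliver the signal chosen by vm_fault_trap(). */
	ksiginfo_init_trap(&ksi);
	ksi.ksi_signo = signo;
	ksi.ksi_code = ucode;
	ksi.ksi_addr = (void *)va;
	trapsignal(curthread, &ksi);
	return (rv);
}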
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 94df2c2f9a9e..e0f1807a1b32 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -453,7 +453,7 @@ vm_thread_stack_create(struct domainset *ds, int pages)
obj = vm_thread_kstack_size_to_obj(pages);
if (vm_ndomains > 1)
obj->domain.dr_policy = ds;
- vm_domainset_iter_page_init(&di, obj, 0, &domain, &req, NULL);
+ vm_domainset_iter_page_init(&di, obj, 0, &domain, &req);
do {
/*
* Get a kernel virtual address for this thread's kstack.
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index e7d7b6726d2c..ac327aa37b72 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -323,7 +323,9 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
start_segind = -1;
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+ return (NULL);
+
do {
addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
memattr);
@@ -417,7 +419,9 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
start_segind = -1;
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags))
+ return (NULL);
+
do {
addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
alignment, boundary, memattr);
@@ -517,7 +521,9 @@ kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags)
void *addr;
int domain;
- vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+ if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+ return (NULL);
+
do {
addr = kmem_malloc_domain(domain, size, flags);
if (addr != NULL)
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index abad5efb8a79..16878604fa11 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2015,8 +2015,9 @@ vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req,
vm_page_t m;
int domain;
- vm_domainset_iter_page_init(&di, object, pindex, &domain, &req,
- pages);
+ if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_domain_iter(object, pindex, domain, req,
pages);
@@ -2268,7 +2269,9 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
start_segind = -1;
- vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_contig_domain(object, pindex, domain, req,
npages, low, high, alignment, boundary, memattr);
@@ -2596,7 +2599,9 @@ vm_page_alloc_noobj(int req)
vm_page_t m;
int domain;
- vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_noobj_domain(domain, req);
if (m != NULL)
@@ -2615,7 +2620,9 @@ vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
vm_page_t m;
int domain;
- vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+ return (NULL);
+
do {
m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
high, alignment, boundary, memattr);
@@ -3334,7 +3341,9 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
ret = ERANGE;
- vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+ if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+ return (ret);
+
do {
status = vm_page_reclaim_contig_domain(domain, req, npages, low,
high, alignment, boundary);