Diffstat (limited to 'sys/vm')

 -rw-r--r--  sys/vm/uma_core.c     |  10
 -rw-r--r--  sys/vm/vm_domainset.c | 265
 -rw-r--r--  sys/vm/vm_domainset.h |  15
 -rw-r--r--  sys/vm/vm_fault.c     | 209
 -rw-r--r--  sys/vm/vm_glue.c      |   2
 -rw-r--r--  sys/vm/vm_kern.c      |  12
 -rw-r--r--  sys/vm/vm_page.c      |  21

 7 files changed, 342 insertions(+), 192 deletions(-)
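The central interface change in this diff is that the vm_domainset_iter_*_init() functions now return an int (0, or ENOMEM when the effective domain set is empty) instead of void, so every caller has to bail out when initialization fails. The sketch below restates the caller pattern as it appears in the kmem_malloc_domainset() hunk further down; it is a simplified illustration of the new contract, not additional code from the change.

```c
/*
 * Sketch of the post-change caller pattern, modeled on the
 * kmem_malloc_domainset() hunk in this diff; simplified, not verbatim.
 */
static void *
alloc_with_domainset(struct domainset *ds, vm_size_t size, int flags)
{
	struct vm_domainset_iter di;
	void *addr;
	int domain;

	/* Initialization can now fail when the domain set is empty. */
	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
		return (NULL);

	do {
		/* Try the current candidate domain. */
		addr = kmem_malloc_domain(domain, size, flags);
		if (addr != NULL)
			break;
		/* A non-zero return means all eligible domains were tried. */
	} while (vm_domainset_iter_policy(&di, &domain) == 0);

	return (addr);
}
```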
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 5189f7405400..679b2e20e88b 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -4017,8 +4017,9 @@ restart: rr = rdomain == UMA_ANYDOMAIN; if (rr) { aflags = (flags & ~M_WAITOK) | M_NOWAIT; - vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, - &aflags); + if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, + &aflags) != 0) + return (NULL); } else { aflags = flags; domain = rdomain; @@ -5245,8 +5246,9 @@ uma_prealloc(uma_zone_t zone, int items) slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { aflags = M_NOWAIT; - vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, - &aflags); + if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, + &aflags) != 0) + panic("%s: Domainset is empty", __func__); for (;;) { slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, aflags); diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index b44bdb96b0d4..9fa17da954f7 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -58,6 +58,9 @@ static int vm_domainset_default_stride = 64; +static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain); + + /* * Determine which policy is to be used for this allocation. */ @@ -93,28 +96,15 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds, pindex += (((uintptr_t)obj) / sizeof(*obj)); di->di_offset = pindex; } - /* Skip domains below min on the first pass. */ - di->di_minskip = true; } static void vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain) { + /* Grab the next domain in 'ds_order'. */ *domain = di->di_domain->ds_order[ - ++(*di->di_iter) % di->di_domain->ds_cnt]; -} - -static void -vm_domainset_iter_prefer(struct vm_domainset_iter *di, int *domain) -{ - int d; - - do { - d = di->di_domain->ds_order[ - ++(*di->di_iter) % di->di_domain->ds_cnt]; - } while (d == di->di_domain->ds_prefer); - *domain = d; + (*di->di_iter)++ % di->di_domain->ds_cnt]; } static void @@ -127,79 +117,144 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain) *domain = di->di_domain->ds_order[d]; } -static void -vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) +/* + * Internal function determining the current phase's first candidate domain. + * + * Returns whether these is an eligible domain, which is returned through + * '*domain'. '*domain' can be modified even if there is no eligible domain. + * + * See herald comment of vm_domainset_iter_first() below about phases. + */ +static bool +vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain) { - - KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n)); switch (di->di_policy) { case DOMAINSET_POLICY_FIRSTTOUCH: - /* - * To prevent impossible allocations we convert an invalid - * first-touch to round-robin. - */ - /* FALLTHROUGH */ - case DOMAINSET_POLICY_INTERLEAVE: - /* FALLTHROUGH */ + *domain = PCPU_GET(domain); + break; case DOMAINSET_POLICY_ROUNDROBIN: vm_domainset_iter_rr(di, domain); break; case DOMAINSET_POLICY_PREFER: - vm_domainset_iter_prefer(di, domain); + *domain = di->di_domain->ds_prefer; + break; + case DOMAINSET_POLICY_INTERLEAVE: + vm_domainset_iter_interleave(di, domain); break; default: panic("%s: Unknown policy %d", __func__, di->di_policy); } KASSERT(*domain < vm_ndomains, ("%s: Invalid domain %d", __func__, *domain)); + + /* + * Has the policy's start domain already been visited? 
+ */ + if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask)) + return (vm_domainset_iter_next(di, domain)); + + DOMAINSET_CLR(*domain, &di->di_remain_mask); + + /* Does it have enough free pages (phase 1)? */ + if (di->di_minskip && vm_page_count_min_domain(*domain)) { + /* Mark the domain as eligible for phase 2. */ + DOMAINSET_SET(*domain, &di->di_min_mask); + return (vm_domainset_iter_next(di, domain)); + } + + return (true); } -static void +/* + * Resets an iterator to point to the first candidate domain. + * + * Returns whether there is an eligible domain to start with. '*domain' may be + * modified even if there is none. + * + * There must have been one call to vm_domainset_iter_init() before. + * + * This function must be called at least once before calling + * vm_domainset_iter_next(). Note that functions wrapping + * vm_domainset_iter_init() usually do that themselves. + * + * This function may be called again to reset the iterator to the policy's first + * candidate domain. After each reset, the iterator will visit the same domains + * as in the previous iteration minus those on which vm_domainset_iter_ignore() + * has been called. Note that the first candidate domain may change at each + * reset (at time of this writing, only on the DOMAINSET_POLICY_ROUNDROBIN + * policy). + * + * Domains which have a number of free pages over 'v_free_min' are always + * visited first (this is called the "phase 1" in comments, "phase 2" being the + * examination of the remaining domains; no domains are ever visited twice). + */ +static bool vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain) { + /* Initialize the mask of domains to visit. */ + DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask); + /* + * No candidate domains for phase 2 at start. This will be filled by + * phase 1. + */ + DOMAINSET_ZERO(&di->di_min_mask); + /* Skip domains below 'v_free_min' on phase 1. */ + di->di_minskip = true; - switch (di->di_policy) { - case DOMAINSET_POLICY_FIRSTTOUCH: - *domain = PCPU_GET(domain); - if (DOMAINSET_ISSET(*domain, &di->di_valid_mask)) { - /* - * Add an extra iteration because we will visit the - * current domain a second time in the rr iterator. - */ - di->di_n = di->di_domain->ds_cnt + 1; - break; - } - /* - * To prevent impossible allocations we convert an invalid - * first-touch to round-robin. - */ - /* FALLTHROUGH */ - case DOMAINSET_POLICY_ROUNDROBIN: - di->di_n = di->di_domain->ds_cnt; + return (vm_domainset_iter_phase_first(di, domain)); +} + +/* + * Advances the iterator to the next candidate domain. + * + * Returns whether there was another domain to visit. '*domain' may be modified + * even if there is none. + * + * vm_domainset_iter_first() must have been called at least once before using + * this function (see its herald comment for more details on iterators). + */ +static bool +vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) +{ + /* Loop while there remains domains to visit in the current phase. */ + while (!DOMAINSET_EMPTY(&di->di_remain_mask)) { + /* Grab the next domain in 'ds_order'. 
*/ vm_domainset_iter_rr(di, domain); - break; - case DOMAINSET_POLICY_PREFER: - *domain = di->di_domain->ds_prefer; - di->di_n = di->di_domain->ds_cnt; - break; - case DOMAINSET_POLICY_INTERLEAVE: - vm_domainset_iter_interleave(di, domain); - di->di_n = di->di_domain->ds_cnt; - break; - default: - panic("%s: Unknown policy %d", __func__, di->di_policy); + KASSERT(*domain < vm_ndomains, + ("%s: Invalid domain %d", __func__, *domain)); + + if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) { + DOMAINSET_CLR(*domain, &di->di_remain_mask); + if (!di->di_minskip || !vm_page_count_min_domain(*domain)) + return (true); + DOMAINSET_SET(*domain, &di->di_min_mask); + } } - KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n)); - KASSERT(*domain < vm_ndomains, - ("%s: Invalid domain %d", __func__, *domain)); + + /* + * If phase 1 (skip low memory domains) is over, start phase 2 (consider + * low memory domains). + */ + if (di->di_minskip) { + di->di_minskip = false; + /* Browse domains that were under 'v_free_min'. */ + DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask); + return (vm_domainset_iter_phase_first(di, domain)); + } + + return (false); } -void +int vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, - vm_pindex_t pindex, int *domain, int *req, struct pctrie_iter *pages) + vm_pindex_t pindex, int *domain, int *req) { struct domainset_ref *dr; + di->di_flags = *req; + *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) | + VM_ALLOC_NOWAIT; + /* * Object policy takes precedence over thread policy. The policies * are immutable and unsynchronized. Updates can race but pointer @@ -209,36 +264,21 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, dr = &obj->domain; else dr = &curthread->td_domain; + vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex); - di->di_flags = *req; - *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) | - VM_ALLOC_NOWAIT; - vm_domainset_iter_first(di, domain); - if (vm_page_count_min_domain(*domain)) - vm_domainset_iter_page(di, obj, domain, pages); + /* + * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was + * passed? + */ + return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM); } int vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj, int *domain, struct pctrie_iter *pages) { - if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask))) - return (ENOMEM); - - /* If there are more domains to visit we run the iterator. */ - while (--di->di_n != 0) { - vm_domainset_iter_next(di, domain); - if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) && - (!di->di_minskip || !vm_page_count_min_domain(*domain))) - return (0); - } - - /* If we skipped domains below min restart the search. */ - if (di->di_minskip) { - di->di_minskip = false; - vm_domainset_iter_first(di, domain); + if (vm_domainset_iter_next(di, domain)) return (0); - } /* If we visited all domains and this was a NOWAIT we return error. */ if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0) @@ -257,61 +297,43 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj, return (ENOMEM); /* Restart the search. */ - vm_domainset_iter_first(di, domain); - - return (0); + /* XXXOC: Shouldn't we just panic on 'false'? */ + return (vm_domainset_iter_first(di, domain) ? 
0 : ENOMEM); } -static void +static int _vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain, int *flags) { - di->di_flags = *flags; *flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT; - vm_domainset_iter_first(di, domain); - if (vm_page_count_min_domain(*domain)) - vm_domainset_iter_policy(di, domain); + /* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */ + return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM); } -void +int vm_domainset_iter_policy_init(struct vm_domainset_iter *di, struct domainset *ds, int *domain, int *flags) { vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0); - _vm_domainset_iter_policy_init(di, domain, flags); + return (_vm_domainset_iter_policy_init(di, domain, flags)); } -void +int vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di, struct domainset_ref *dr, int *domain, int *flags) { vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0); - _vm_domainset_iter_policy_init(di, domain, flags); + return (_vm_domainset_iter_policy_init(di, domain, flags)); } int vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain) { - if (DOMAINSET_EMPTY(&di->di_valid_mask)) - return (ENOMEM); - - /* If there are more domains to visit we run the iterator. */ - while (--di->di_n != 0) { - vm_domainset_iter_next(di, domain); - if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) && - (!di->di_minskip || !vm_page_count_min_domain(*domain))) - return (0); - } - - /* If we skipped domains below min restart the search. */ - if (di->di_minskip) { - di->di_minskip = false; - vm_domainset_iter_first(di, domain); + if (vm_domainset_iter_next(di, domain)) return (0); - } /* If we visited all domains and this was a NOWAIT we return error. */ if ((di->di_flags & M_WAITOK) == 0) @@ -321,9 +343,8 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain) vm_wait_doms(&di->di_valid_mask, 0); /* Restart the search. */ - vm_domainset_iter_first(di, domain); - - return (0); + /* XXXOC: Shouldn't we just panic on 'false'? */ + return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM); } void @@ -345,12 +366,12 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj, return (EJUSTRETURN); } -void +int vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, - vm_pindex_t pindex, int *domain, int *flags, struct pctrie_iter *pages) + vm_pindex_t pindex, int *domain, int *flags) { - *domain = 0; + return (0); } int @@ -360,20 +381,20 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain) return (EJUSTRETURN); } -void +int vm_domainset_iter_policy_init(struct vm_domainset_iter *di, struct domainset *ds, int *domain, int *flags) { - *domain = 0; + return (0); } -void +int vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di, struct domainset_ref *dr, int *domain, int *flags) { - *domain = 0; + return (0); } void diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h index 0d325a642f40..ef86c8ccb5e4 100644 --- a/sys/vm/vm_domainset.h +++ b/sys/vm/vm_domainset.h @@ -33,23 +33,26 @@ struct pctrie_iter; struct vm_domainset_iter { struct domainset *di_domain; unsigned int *di_iter; + /* Initialized from 'di_domain', initial value after reset. */ domainset_t di_valid_mask; + /* Domains to browse in the current phase. */ + domainset_t di_remain_mask; + /* Domains skipped in phase 1 because under 'v_free_min'. 
*/ + domainset_t di_min_mask; vm_pindex_t di_offset; int di_flags; uint16_t di_policy; - domainid_t di_n; bool di_minskip; }; int vm_domainset_iter_page(struct vm_domainset_iter *, struct vm_object *, int *, struct pctrie_iter *); -void vm_domainset_iter_page_init(struct vm_domainset_iter *, - struct vm_object *, vm_pindex_t, int *, int *, - struct pctrie_iter *); +int vm_domainset_iter_page_init(struct vm_domainset_iter *, + struct vm_object *, vm_pindex_t, int *, int *); int vm_domainset_iter_policy(struct vm_domainset_iter *, int *); -void vm_domainset_iter_policy_init(struct vm_domainset_iter *, +int vm_domainset_iter_policy_init(struct vm_domainset_iter *, struct domainset *, int *, int *); -void vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *, +int vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *, struct domainset_ref *, int *, int *); void vm_domainset_iter_ignore(struct vm_domainset_iter *, int); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 58f8ac16fa0c..2e150b368d71 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -71,11 +71,9 @@ * Page fault handling module. */ -#include <sys/cdefs.h> #include "opt_ktrace.h" #include "opt_vm.h" -#include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/lock.h> @@ -204,7 +202,10 @@ vm_fault_page_release(vm_page_t *mp) * pageout while optimizing fault restarts. */ vm_page_deactivate(m); - vm_page_xunbusy(m); + if (vm_page_xbusied(m)) + vm_page_xunbusy(m); + else + vm_page_sunbusy(m); *mp = NULL; } } @@ -260,6 +261,12 @@ vm_fault_unlock_vp(struct faultstate *fs) } } +static bool +vm_fault_might_be_cow(struct faultstate *fs) +{ + return (fs->object != fs->first_object); +} + static void vm_fault_deallocate(struct faultstate *fs) { @@ -267,7 +274,7 @@ vm_fault_deallocate(struct faultstate *fs) vm_fault_page_release(&fs->m_cow); vm_fault_page_release(&fs->m); vm_object_pip_wakeup(fs->object); - if (fs->object != fs->first_object) { + if (vm_fault_might_be_cow(fs)) { VM_OBJECT_WLOCK(fs->first_object); vm_fault_page_free(&fs->first_m); VM_OBJECT_WUNLOCK(fs->first_object); @@ -329,6 +336,13 @@ vm_fault_dirty(struct faultstate *fs, vm_page_t m) } +static bool +vm_fault_is_read(const struct faultstate *fs) +{ + return ((fs->prot & VM_PROT_WRITE) == 0 && + (fs->fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) == 0); +} + /* * Unlocks fs.first_object and fs.map on success. */ @@ -694,21 +708,18 @@ _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT"); #endif /* - * vm_fault_trap: - * - * Handle a page fault occurring at the given address, - * requiring the given permissions, in the map specified. - * If successful, the page is inserted into the - * associated physical map. + * vm_fault_trap: * - * NOTE: the given address should be truncated to the - * proper page address. + * Helper for the page fault trap handlers, wrapping vm_fault(). + * Issues ktrace(2) tracepoints for the faults. * - * KERN_SUCCESS is returned if the page fault is handled; otherwise, - * a standard error specifying why the fault is fatal is returned. + * If a fault cannot be handled successfully by satisfying the + * required mapping, and the faulted instruction cannot be restarted, + * the signal number and si_code values are returned for trapsignal() + * to deliver. * - * The map in question must be referenced, and remains so. - * Caller may hold no locks. + * Returns Mach error codes, but callers should only check for + * KERN_SUCCESS. 
*/ int vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, @@ -1002,12 +1013,22 @@ vm_fault_relookup(struct faultstate *fs) return (KERN_SUCCESS); } +static bool +vm_fault_can_cow_rename(struct faultstate *fs) +{ + return ( + /* Only one shadow object and no other refs. */ + fs->object->shadow_count == 1 && fs->object->ref_count == 1 && + /* No other ways to look the object up. */ + fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0); +} + static void vm_fault_cow(struct faultstate *fs) { - bool is_first_object_locked; + bool is_first_object_locked, rename_cow; - KASSERT(fs->object != fs->first_object, + KASSERT(vm_fault_might_be_cow(fs), ("source and target COW objects are identical")); /* @@ -1019,21 +1040,29 @@ vm_fault_cow(struct faultstate *fs) * object so that it will go out to swap when needed. */ is_first_object_locked = false; - if ( - /* - * Only one shadow object and no other refs. - */ - fs->object->shadow_count == 1 && fs->object->ref_count == 1 && - /* - * No other ways to look the object up - */ - fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 && - /* - * We don't chase down the shadow chain and we can acquire locks. - */ - (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) && - fs->object == fs->first_object->backing_object && - VM_OBJECT_TRYWLOCK(fs->object)) { + rename_cow = false; + + if (vm_fault_can_cow_rename(fs) && vm_page_xbusied(fs->m)) { + /* + * Check that we don't chase down the shadow chain and + * we can acquire locks. Recheck the conditions for + * rename after the shadow chain is stable after the + * object locking. + */ + is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object); + if (is_first_object_locked && + fs->object == fs->first_object->backing_object) { + if (VM_OBJECT_TRYWLOCK(fs->object)) { + rename_cow = vm_fault_can_cow_rename(fs); + if (!rename_cow) + VM_OBJECT_WUNLOCK(fs->object); + } + } + } + + if (rename_cow) { + vm_page_assert_xbusied(fs->m); + /* * Remove but keep xbusy for replace. fs->m is moved into * fs->first_object and left busy while fs->first_m is @@ -1090,8 +1119,12 @@ vm_fault_cow(struct faultstate *fs) * address space. If OBJ_ONEMAPPING is set after the check, * removing mappings will at worse trigger some unnecessary page * faults. + * + * In the fs->m shared busy case, the xbusy state of + * fs->first_m prevents new mappings of fs->m from + * being created because a parallel fault on this + * shadow chain should wait for xbusy on fs->first_m. */ - vm_page_assert_xbusied(fs->m_cow); if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0) pmap_remove_all(fs->m_cow); } @@ -1171,7 +1204,7 @@ vm_fault_zerofill(struct faultstate *fs) * If there's no object left, fill the page in the top * object with zeros. */ - if (fs->object != fs->first_object) { + if (vm_fault_might_be_cow(fs)) { vm_object_pip_wakeup(fs->object); fs->object = fs->first_object; fs->pindex = fs->first_pindex; @@ -1420,14 +1453,13 @@ vm_fault_getpages(struct faultstate *fs, int *behindp, int *aheadp) * and we could end up trying to pagein and pageout the same page * simultaneously. * - * We can theoretically allow the busy case on a read fault if the page - * is marked valid, but since such pages are typically already pmap'd, - * putting that special case in might be more effort then it is worth. - * We cannot under any circumstances mess around with a shared busied - * page except, perhaps, to pmap it. + * We allow the busy case on a read fault if the page is valid. 
We + * cannot under any circumstances mess around with a shared busied + * page except, perhaps, to pmap it. This is controlled by the + * VM_ALLOC_SBUSY bit in the allocflags argument. */ static void -vm_fault_busy_sleep(struct faultstate *fs) +vm_fault_busy_sleep(struct faultstate *fs, int allocflags) { /* * Reference the page before unlocking and @@ -1435,13 +1467,13 @@ vm_fault_busy_sleep(struct faultstate *fs) * likely to reclaim it. */ vm_page_aflag_set(fs->m, PGA_REFERENCED); - if (fs->object != fs->first_object) { + if (vm_fault_might_be_cow(fs)) { vm_fault_page_release(&fs->first_m); vm_object_pip_wakeup(fs->first_object); } vm_object_pip_wakeup(fs->object); vm_fault_unlock_map(fs); - if (!vm_page_busy_sleep(fs->m, "vmpfw", 0)) + if (!vm_page_busy_sleep(fs->m, "vmpfw", allocflags)) VM_OBJECT_UNLOCK(fs->object); VM_CNT_INC(v_intrans); vm_object_deallocate(fs->first_object); @@ -1487,8 +1519,53 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp) vm_page_iter_init(&pages, fs->object); fs->m = vm_radix_iter_lookup(&pages, fs->pindex); if (fs->m != NULL) { + /* + * If the found page is valid, will be either shadowed + * or mapped read-only, and will not be renamed for + * COW, then busy it in shared mode. This allows + * other faults needing this page to proceed in + * parallel. + * + * Unlocked check for validity, rechecked after busy + * is obtained. + */ + if (vm_page_all_valid(fs->m) && + /* + * No write permissions for the new fs->m mapping, + * or the first object has only one mapping, so + * other writeable COW mappings of fs->m cannot + * appear under us. + */ + (vm_fault_is_read(fs) || vm_fault_might_be_cow(fs)) && + /* + * fs->m cannot be renamed from object to + * first_object. These conditions will be + * re-checked with proper synchronization in + * vm_fault_cow(). + */ + (!vm_fault_can_cow_rename(fs) || + fs->object != fs->first_object->backing_object)) { + if (!vm_page_trysbusy(fs->m)) { + vm_fault_busy_sleep(fs, VM_ALLOC_SBUSY); + return (FAULT_RESTART); + } + + /* + * Now make sure that racily checked + * conditions are still valid. + */ + if (__predict_true(vm_page_all_valid(fs->m) && + (vm_fault_is_read(fs) || + vm_fault_might_be_cow(fs)))) { + VM_OBJECT_UNLOCK(fs->object); + return (FAULT_SOFT); + } + + vm_page_sunbusy(fs->m); + } + if (!vm_page_tryxbusy(fs->m)) { - vm_fault_busy_sleep(fs); + vm_fault_busy_sleep(fs, 0); return (FAULT_RESTART); } @@ -1546,6 +1623,27 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp) return (res); } +/* + * vm_fault: + * + * Handle a page fault occurring at the given address, requiring the + * given permissions, in the map specified. If successful, the page + * is inserted into the associated physical map, and optionally + * referenced and returned in *m_hold. + * + * The given address should be truncated to the proper page address. + * + * KERN_SUCCESS is returned if the page fault is handled; otherwise, a + * Mach error specifying why the fault is fatal is returned. + * + * The map in question must be alive, either being the map for current + * process, or the owner process hold count incremented to prevent + * exit(). + * + * If the thread private TDP_NOFAULTING flag is set, any fault results + * in immediate protection failure. Otherwise the fault is processed, + * and caller may hold no locks. 
+ */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) @@ -1701,10 +1799,15 @@ RetryFault: found: /* - * A valid page has been found and exclusively busied. The - * object lock must no longer be held. + * A valid page has been found and busied. The object lock + * must no longer be held if the page was busied. + * + * Regardless of the busy state of fs.m, fs.first_m is always + * exclusively busied after the first iteration of the loop + * calling vm_fault_object(). This is an ordering point for + * the parallel faults occuring in on the same page. */ - vm_page_assert_xbusied(fs.m); + vm_page_assert_busied(fs.m); VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* @@ -1712,7 +1815,7 @@ found: * top-level object, we have to copy it into a new page owned by the * top-level object. */ - if (fs.object != fs.first_object) { + if (vm_fault_might_be_cow(&fs)) { /* * We only really need to copy if we want to write it. */ @@ -1773,7 +1876,7 @@ found: * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ - vm_page_assert_xbusied(fs.m); + vm_page_assert_busied(fs.m); KASSERT(vm_page_all_valid(fs.m), ("vm_fault: page %p partially invalid", fs.m)); @@ -1805,7 +1908,13 @@ found: (*fs.m_hold) = fs.m; vm_page_wire(fs.m); } - vm_page_xunbusy(fs.m); + + KASSERT(fs.first_object == fs.object || vm_page_xbusied(fs.first_m), + ("first_m must be xbusy")); + if (vm_page_xbusied(fs.m)) + vm_page_xunbusy(fs.m); + else + vm_page_sunbusy(fs.m); fs.m = NULL; /* diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 94df2c2f9a9e..e0f1807a1b32 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -453,7 +453,7 @@ vm_thread_stack_create(struct domainset *ds, int pages) obj = vm_thread_kstack_size_to_obj(pages); if (vm_ndomains > 1) obj->domain.dr_policy = ds; - vm_domainset_iter_page_init(&di, obj, 0, &domain, &req, NULL); + vm_domainset_iter_page_init(&di, obj, 0, &domain, &req); do { /* * Get a kernel virtual address for this thread's kstack. 
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index e7d7b6726d2c..ac327aa37b72 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -323,7 +323,9 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags, start_segind = -1; - vm_domainset_iter_policy_init(&di, ds, &domain, &flags); + if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0) + return (NULL); + do { addr = kmem_alloc_attr_domain(domain, size, flags, low, high, memattr); @@ -417,7 +419,9 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags, start_segind = -1; - vm_domainset_iter_policy_init(&di, ds, &domain, &flags); + if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags)) + return (NULL); + do { addr = kmem_alloc_contig_domain(domain, size, flags, low, high, alignment, boundary, memattr); @@ -517,7 +521,9 @@ kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags) void *addr; int domain; - vm_domainset_iter_policy_init(&di, ds, &domain, &flags); + if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0) + return (NULL); + do { addr = kmem_malloc_domain(domain, size, flags); if (addr != NULL) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index abad5efb8a79..16878604fa11 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2015,8 +2015,9 @@ vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t m; int domain; - vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, - pages); + if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0) + return (NULL); + do { m = vm_page_alloc_domain_iter(object, pindex, domain, req, pages); @@ -2268,7 +2269,9 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, start_segind = -1; - vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, NULL); + if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0) + return (NULL); + do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); @@ -2596,7 +2599,9 @@ vm_page_alloc_noobj(int req) vm_page_t m; int domain; - vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL); + if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0) + return (NULL); + do { m = vm_page_alloc_noobj_domain(domain, req); if (m != NULL) @@ -2615,7 +2620,9 @@ vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low, vm_page_t m; int domain; - vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL); + if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0) + return (NULL); + do { m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low, high, alignment, boundary, memattr); @@ -3334,7 +3341,9 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, ret = ERANGE; - vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL); + if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0) + return (ret); + do { status = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); |
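The iterator rework replaces the di_n countdown with domain masks (di_valid_mask, di_remain_mask, di_min_mask) and two phases: domains above v_free_min are visited first, and domains skipped because they were low on memory are revisited in a second phase, so no domain is ever visited twice. The self-contained sketch below illustrates only that mask-based, two-phase idea in plain C with hypothetical names (iter, low_on_memory, __builtin_ctz for mask scanning); it is not the kernel code, and the visit order differs from the policy-driven order used in the diff.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative two-phase domain iteration using bit masks, mirroring the
 * di_remain_mask/di_min_mask scheme in this diff.  Names are hypothetical;
 * this is not kernel code.
 */
#define	NDOMAINS	4

struct iter {
	unsigned remain;	/* Domains still to visit in the current phase. */
	unsigned deferred;	/* Domains pushed to phase 2 (low on memory). */
	bool	 minskip;	/* Phase 1: skip low-memory domains. */
};

/* Stand-in for vm_page_count_min_domain(): pretend domains 0 and 2 are low. */
static bool
low_on_memory(int domain)
{
	return (domain == 0 || domain == 2);
}

static void
iter_first(struct iter *it, unsigned valid)
{
	it->remain = valid;
	it->deferred = 0;
	it->minskip = true;
}

static bool
iter_next(struct iter *it, int *domain)
{
	for (;;) {
		while (it->remain != 0) {
			int d = __builtin_ctz(it->remain);

			it->remain &= ~(1u << d);
			if (it->minskip && low_on_memory(d)) {
				/* Defer to phase 2 instead of dropping it. */
				it->deferred |= 1u << d;
				continue;
			}
			*domain = d;
			return (true);
		}
		if (!it->minskip)
			return (false);		/* Both phases exhausted. */
		/* Phase 2: revisit only the domains deferred in phase 1. */
		it->minskip = false;
		it->remain = it->deferred;
	}
}

int
main(void)
{
	struct iter it;
	int d;

	iter_first(&it, (1u << NDOMAINS) - 1);
	while (iter_next(&it, &d))
		printf("visiting domain %d\n", d);	/* 1, 3, then 0, 2 */
	return (0);
}
```

Expressing the iteration as "remaining" and "deferred" masks makes the invariant explicit: every eligible domain is visited at most once, with well-provisioned domains always tried before those below the free-page minimum.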