-rw-r--r--	sys/vm/swap_pager.c	 37
-rw-r--r--	sys/vm/vm_object.c	571
-rw-r--r--	sys/vm/vm_object.h	  2
3 files changed, 330 insertions, 280 deletions
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index e83403da0755..3347536e54a6 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -974,15 +974,12 @@ swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject,
* Destination has no swapblk and is not resident, transfer source.
* swp_pager_meta_build() can sleep.
*/
- vm_object_pip_add(srcobject, 1);
VM_OBJECT_WUNLOCK(srcobject);
- vm_object_pip_add(dstobject, 1);
dstaddr = swp_pager_meta_build(dstobject, pindex, addr);
KASSERT(dstaddr == SWAPBLK_NONE,
("Unexpected destination swapblk"));
- vm_object_pip_wakeup(dstobject);
VM_OBJECT_WLOCK(srcobject);
- vm_object_pip_wakeup(srcobject);
+
return (true);
}
@@ -995,8 +992,7 @@ swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject,
* we keep the destination's.
*
* This routine is allowed to sleep. It may sleep allocating metadata
- * indirectly through swp_pager_meta_build() or if paging is still in
- * progress on the source.
+ * indirectly through swp_pager_meta_build().
*
* The source object contains no vm_page_t's (which is just as well)
*
@@ -1019,18 +1015,14 @@ swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
*/
if (destroysource && (srcobject->flags & OBJ_ANON) == 0 &&
srcobject->handle != NULL) {
- vm_object_pip_add(srcobject, 1);
VM_OBJECT_WUNLOCK(srcobject);
- vm_object_pip_add(dstobject, 1);
VM_OBJECT_WUNLOCK(dstobject);
sx_xlock(&sw_alloc_sx);
TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
pager_object_list);
sx_xunlock(&sw_alloc_sx);
VM_OBJECT_WLOCK(dstobject);
- vm_object_pip_wakeup(dstobject);
VM_OBJECT_WLOCK(srcobject);
- vm_object_pip_wakeup(srcobject);
}
/*
@@ -1207,26 +1199,29 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
reqcount = count;
- /*
- * Determine the final number of read-behind pages and
- * allocate them BEFORE releasing the object lock. Otherwise,
- * there can be a problematic race with vm_object_split().
- * Specifically, vm_object_split() might first transfer pages
- * that precede ma[0] in the current object to a new object,
- * and then this function incorrectly recreates those pages as
- * read-behind pages in the current object.
- */
KASSERT(object->type == OBJT_SWAP,
("%s: object not swappable", __func__));
if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
return (VM_PAGER_FAIL);
+ KASSERT(reqcount - 1 <= maxahead,
+ ("page count %d extends beyond swap block", reqcount));
+
+ /*
+ * Do not transfer any pages other than those that are xbusied
+ * when running during a split or collapse operation. This
+ * prevents clustering from re-creating pages which are being
+ * moved into another object.
+ */
+ if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
+ maxahead = reqcount - 1;
+ maxbehind = 0;
+ }
+
/*
* Clip the readahead and readbehind ranges to exclude resident pages.
*/
if (rahead != NULL) {
- KASSERT(reqcount - 1 <= maxahead,
- ("page count %d extends beyond swap block", reqcount));
*rahead = imin(*rahead, maxahead - (reqcount - 1));
pindex = ma[reqcount - 1]->pindex;
msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
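
To see how the new OBJ_SPLIT / OBJ_DEAD clamp interacts with the read-ahead clip above: forcing maxahead to reqcount - 1 makes the later imin() reduce *rahead to zero, so no clustered pages are created while a split or collapse is moving pages into another object. A minimal userspace sketch of that arithmetic (illustrative values only; a local imin() stands in for the kernel helper):

#include <stdio.h>

static int
imin(int a, int b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	int reqcount = 4;	/* busied pages passed in by the caller */
	int maxahead = 32;	/* what swap_pager_haspage() reported */
	int rahead = 16;	/* caller's requested read-ahead */
	int in_transition = 1;	/* OBJ_SPLIT or OBJ_DEAD was set */

	if (in_transition)
		maxahead = reqcount - 1;	/* clustering disabled */

	/* The same clip swap_pager_getpages() applies. */
	rahead = imin(rahead, maxahead - (reqcount - 1));
	printf("rahead = %d\n", rahead);	/* 0 when in transition, else 16 */
	return (0);
}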
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index a3f691390c11..395ba6bad8c2 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -116,8 +116,6 @@ static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
boolean_t *eio);
static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
boolean_t *allclean);
-static void vm_object_qcollapse(vm_object_t object);
-static void vm_object_vndeallocate(vm_object_t object);
static void vm_object_backing_remove(vm_object_t object);
/*
@@ -164,12 +162,18 @@ SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
&object_bypasses,
"VM object bypasses");
+static counter_u64_t object_collapse_waits = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD,
+ &object_collapse_waits,
+ "Number of sleeps for collapse");
+
static void
counter_startup(void)
{
object_collapses = counter_u64_alloc(M_WAITOK);
object_bypasses = counter_u64_alloc(M_WAITOK);
+ object_collapse_waits = counter_u64_alloc(M_WAITOK);
}
SYSINIT(object_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL);
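
The new counter is published next to the existing collapse/bypass counters. Assuming the _vm_stats_object node maps to vm.stats.object, as it does for those counters, it can be read from userland roughly like this (an illustrative sketch, not part of the change):

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t waits;
	size_t len = sizeof(waits);

	/* OID name assumed from SYSCTL_COUNTER_U64(_vm_stats_object, ...). */
	if (sysctlbyname("vm.stats.object.collapse_waits", &waits, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("collapse waits: %ju\n", (uintmax_t)waits);
	return (0);
}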
@@ -376,6 +380,19 @@ vm_object_pip_wakeupn(vm_object_t object, short i)
refcount_releasen(&object->paging_in_progress, i);
}
+/*
+ * Atomically drop the interlock and wait for pip to drain. This protects
+ * from sleep/wakeup races due to identity changes. The lock is not
+ * re-acquired on return.
+ */
+static void
+vm_object_pip_sleep(vm_object_t object, char *waitid)
+{
+
+ refcount_sleep_interlock(&object->paging_in_progress,
+ &object->lock, waitid, PVM);
+}
+
void
vm_object_pip_wait(vm_object_t object, char *waitid)
{
@@ -383,8 +400,7 @@ vm_object_pip_wait(vm_object_t object, char *waitid)
VM_OBJECT_ASSERT_WLOCKED(object);
while (REFCOUNT_COUNT(object->paging_in_progress) > 0) {
- VM_OBJECT_WUNLOCK(object);
- refcount_wait(&object->paging_in_progress, waitid, PVM);
+ vm_object_pip_sleep(object, waitid);
VM_OBJECT_WLOCK(object);
}
}
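
vm_object_pip_sleep() makes the object lock act as the sleep interlock, so the pip check and the sleep happen with no window in which a wakeup can be lost; unlike a condition-variable wait, the lock is not re-taken on return, which is why vm_object_pip_wait() relocks explicitly. A rough userspace analogy of the same check-under-lock, sleep-atomically pattern using POSIX threads (an analogy only, not kernel code):

#include <pthread.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pip_cv = PTHREAD_COND_INITIALIZER;
static int paging_in_progress;

/* Analogue of vm_object_pip_wait(): block until all paging completes. */
static void
pip_wait(void)
{
	pthread_mutex_lock(&obj_lock);
	while (paging_in_progress > 0) {
		/*
		 * pthread_cond_wait() releases obj_lock and sleeps in one
		 * step, so a wakeup issued between the check and the sleep
		 * cannot be lost; this is the role refcount_sleep_interlock()
		 * plays for the object lock in the kernel version.
		 */
		pthread_cond_wait(&pip_cv, &obj_lock);
	}
	pthread_mutex_unlock(&obj_lock);
}

/* Analogue of vm_object_pip_wakeup(). */
static void
pip_wakeup(void)
{
	pthread_mutex_lock(&obj_lock);
	if (--paging_in_progress == 0)
		pthread_cond_broadcast(&pip_cv);
	pthread_mutex_unlock(&obj_lock);
}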
@@ -466,30 +482,17 @@ vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object,
return (object);
}
-
-/*
- * vm_object_reference:
- *
- * Gets another reference to the given object. Note: OBJ_DEAD
- * objects can be referenced during final cleaning.
- */
-void
-vm_object_reference(vm_object_t object)
+static void
+vm_object_reference_vnode(vm_object_t object)
{
struct vnode *vp;
u_int old;
- if (object == NULL)
- return;
-
/*
- * Many places assume exclusive access to objects with a single
- * ref. vm_object_collapse() in particular will directly mainpulate
- * references for objects in this state. vnode objects only need
- * the lock for the first ref to reference the vnode.
+ * vnode objects need the lock for the first reference
+ * to serialize with vnode_object_deallocate().
*/
- if (!refcount_acquire_if_gt(&object->ref_count,
- object->type == OBJT_VNODE ? 0 : 1)) {
+ if (!refcount_acquire_if_gt(&object->ref_count, 0)) {
VM_OBJECT_RLOCK(object);
old = refcount_acquire(&object->ref_count);
if (object->type == OBJT_VNODE && old == 0) {
@@ -501,6 +504,26 @@ vm_object_reference(vm_object_t object)
}
/*
+ * vm_object_reference:
+ *
+ * Acquires a reference to the given object.
+ */
+void
+vm_object_reference(vm_object_t object)
+{
+
+ if (object == NULL)
+ return;
+
+ if (object->type == OBJT_VNODE)
+ vm_object_reference_vnode(object);
+ else
+ refcount_acquire(&object->ref_count);
+ KASSERT((object->flags & OBJ_DEAD) == 0,
+ ("vm_object_reference: Referenced dead object."));
+}
+
+/*
* vm_object_reference_locked:
*
* Gets another reference to the given object.
@@ -516,23 +539,23 @@ vm_object_reference_locked(vm_object_t object)
VM_OBJECT_ASSERT_LOCKED(object);
old = refcount_acquire(&object->ref_count);
if (object->type == OBJT_VNODE && old == 0) {
	vp = object->handle;
	vref(vp);
	}
+ KASSERT((object->flags & OBJ_DEAD) == 0,
+ ("vm_object_reference: Referenced dead object."));
}
/*
* Handle deallocating an object of type OBJT_VNODE.
*/
static void
-vm_object_vndeallocate(vm_object_t object)
+vm_object_deallocate_vnode(vm_object_t object)
{
struct vnode *vp = (struct vnode *) object->handle;
bool last;
KASSERT(object->type == OBJT_VNODE,
- ("vm_object_vndeallocate: not a vnode object"));
- KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
+ ("vm_object_deallocate_vnode: not a vnode object"));
+ KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp"));
/* Object lock to protect handle lookup. */
last = refcount_release(&object->ref_count);
@@ -548,6 +571,53 @@ vm_object_vndeallocate(vm_object_t object)
vrele(vp);
}
+
+/*
+ * We dropped a reference on an object and discovered that it had a
+ * single remaining shadow. This is a sibling of the reference we
+ * dropped. Attempt to collapse the sibling and backing object.
+ */
+static vm_object_t
+vm_object_deallocate_anon(vm_object_t backing_object)
+{
+ vm_object_t object;
+
+ /* Fetch the final shadow. */
+ object = LIST_FIRST(&backing_object->shadow_head);
+ KASSERT(object != NULL && backing_object->shadow_count == 1,
+ ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d",
+ backing_object->ref_count, backing_object->shadow_count));
+ KASSERT((object->flags & (OBJ_TMPFS_NODE | OBJ_ANON)) == OBJ_ANON,
+ ("invalid shadow object %p", object));
+
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ /*
+ * Prevent object from disappearing since we do not have a
+ * reference.
+ */
+ vm_object_pip_add(object, 1);
+ VM_OBJECT_WUNLOCK(backing_object);
+ VM_OBJECT_WLOCK(object);
+ vm_object_pip_wakeup(object);
+ } else
+ VM_OBJECT_WUNLOCK(backing_object);
+
+ /*
+ * Check for a collapse/terminate race with the last reference holder.
+ */
+ if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 ||
+ !refcount_acquire_if_not_zero(&object->ref_count)) {
+ VM_OBJECT_WUNLOCK(object);
+ return (NULL);
+ }
+ backing_object = object->backing_object;
+ if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0)
+ vm_object_collapse(object);
+ VM_OBJECT_WUNLOCK(object);
+
+ return (object);
+}
+
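
The trylock-or-pin dance in vm_object_deallocate_anon() is the usual way to take a second lock against the documented order without holding a reference: if the shadow's lock cannot be taken immediately, pin it with a pip hold, drop the lock already held, and only then block on the shadow's lock. A generic sketch of that pattern with POSIX mutexes and an atomic hold count (hypothetical types and names, purely illustrative):

#include <pthread.h>
#include <stdatomic.h>

/* Hypothetical object with a lock and a "do not free" hold count. */
struct obj {
	pthread_mutex_t	lock;
	atomic_int	hold;
};

/*
 * Caller holds backing->lock and wants shadow->lock too, but the
 * documented order is shadow before backing, so a blocking acquire
 * here could deadlock.  Returns with shadow->lock held and
 * backing->lock released, as in vm_object_deallocate_anon().
 */
static void
lock_shadow(struct obj *backing, struct obj *shadow)
{
	if (pthread_mutex_trylock(&shadow->lock) == 0) {
		/* Uncontended: no ordering problem. */
		pthread_mutex_unlock(&backing->lock);
		return;
	}
	/*
	 * Pin the shadow (analogue of vm_object_pip_add()) so it cannot
	 * be freed once backing->lock is dropped; whoever frees it must
	 * first wait for the hold count to drain.
	 */
	atomic_fetch_add(&shadow->hold, 1);
	pthread_mutex_unlock(&backing->lock);
	pthread_mutex_lock(&shadow->lock);
	atomic_fetch_sub(&shadow->hold, 1);	/* vm_object_pip_wakeup() */
	/* Caller must now re-check that the shadow is still live. */
}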
/*
* vm_object_deallocate:
*
@@ -562,7 +632,7 @@ vm_object_vndeallocate(vm_object_t object)
void
vm_object_deallocate(vm_object_t object)
{
- vm_object_t robject, temp;
+ vm_object_t temp;
bool released;
while (object != NULL) {
@@ -583,7 +653,7 @@ vm_object_deallocate(vm_object_t object)
if (object->type == OBJT_VNODE) {
VM_OBJECT_RLOCK(object);
if (object->type == OBJT_VNODE) {
- vm_object_vndeallocate(object);
+ vm_object_deallocate_vnode(object);
return;
}
VM_OBJECT_RUNLOCK(object);
@@ -594,92 +664,30 @@ vm_object_deallocate(vm_object_t object)
("vm_object_deallocate: object deallocated too many times: %d",
object->type));
- if (refcount_release(&object->ref_count))
- goto doterm;
- if (object->ref_count > 1) {
- VM_OBJECT_WUNLOCK(object);
- return;
- } else if (object->ref_count == 1) {
- if (object->shadow_count == 0 &&
- (object->flags & OBJ_ANON) != 0) {
- vm_object_set_flag(object, OBJ_ONEMAPPING);
- } else if (object->shadow_count == 1) {
- KASSERT((object->flags & OBJ_ANON) != 0,
- ("obj %p with shadow_count > 0 is not anon",
- object));
- robject = LIST_FIRST(&object->shadow_head);
- KASSERT(robject != NULL,
- ("vm_object_deallocate: ref_count: %d, "
- "shadow_count: %d", object->ref_count,
- object->shadow_count));
- KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
- ("shadowed tmpfs v_object %p", object));
- if (!VM_OBJECT_TRYWLOCK(robject)) {
- /*
- * Avoid a potential deadlock.
- */
- refcount_acquire(&object->ref_count);
- VM_OBJECT_WUNLOCK(object);
- /*
- * More likely than not the thread
- * holding robject's lock has lower
- * priority than the current thread.
- * Let the lower priority thread run.
- */
- pause("vmo_de", 1);
- continue;
- }
- /*
- * Collapse object into its shadow unless its
- * shadow is dead. In that case, object will
- * be deallocated by the thread that is
- * deallocating its shadow.
- */
- if ((robject->flags &
- (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON) {
-
- refcount_acquire(&robject->ref_count);
-retry:
- if (REFCOUNT_COUNT(robject->paging_in_progress) > 0) {
- VM_OBJECT_WUNLOCK(object);
- vm_object_pip_wait(robject,
- "objde1");
- temp = robject->backing_object;
- if (object == temp) {
- VM_OBJECT_WLOCK(object);
- goto retry;
- }
- } else if (REFCOUNT_COUNT(object->paging_in_progress) > 0) {
- VM_OBJECT_WUNLOCK(robject);
- VM_OBJECT_WUNLOCK(object);
- refcount_wait(
- &object->paging_in_progress,
- "objde2", PVM);
- VM_OBJECT_WLOCK(robject);
- temp = robject->backing_object;
- if (object == temp) {
- VM_OBJECT_WLOCK(object);
- goto retry;
- }
- } else
- VM_OBJECT_WUNLOCK(object);
-
- if (robject->ref_count == 1) {
- refcount_release(&robject->ref_count);
- object = robject;
- goto doterm;
- }
- object = robject;
- vm_object_collapse(object);
- VM_OBJECT_WUNLOCK(object);
- continue;
- }
- VM_OBJECT_WUNLOCK(robject);
+ /*
+ * If this is not the final reference to an anonymous
+ * object we may need to collapse the shadow chain.
+ */
+ if (!refcount_release(&object->ref_count)) {
+ if (object->ref_count > 1 ||
+ object->shadow_count == 0) {
+ if ((object->flags & OBJ_ANON) != 0 &&
+ object->ref_count == 1)
+ vm_object_set_flag(object,
+ OBJ_ONEMAPPING);
+ VM_OBJECT_WUNLOCK(object);
+ return;
}
- VM_OBJECT_WUNLOCK(object);
- return;
+
+ /* Handle collapsing last ref on anonymous objects. */
+ object = vm_object_deallocate_anon(object);
+ continue;
}
-doterm:
+
+ /*
+ * Handle the final reference to an object. We restart
+ * the loop with the backing object to avoid recursion.
+ */
umtx_shm_object_terminated(object);
temp = object->backing_object;
if (temp != NULL) {
@@ -687,16 +695,11 @@ doterm:
("shadowed tmpfs v_object 2 %p", object));
vm_object_backing_remove(object);
}
- /*
- * Don't double-terminate, we could be in a termination
- * recursion due to the terminate having to sync data
- * to disk.
- */
- if ((object->flags & OBJ_DEAD) == 0) {
- vm_object_set_flag(object, OBJ_DEAD);
- vm_object_terminate(object);
- } else
- VM_OBJECT_WUNLOCK(object);
+
+ KASSERT((object->flags & OBJ_DEAD) == 0,
+ ("vm_object_deallocate: Terminating dead object."));
+ vm_object_set_flag(object, OBJ_DEAD);
+ vm_object_terminate(object);
object = temp;
}
}
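
The restructured loop above replaces the old goto/retry logic: dropping the final reference terminates the object and then continues with its backing object, so an arbitrarily long shadow chain is torn down iteratively rather than by recursion. A stripped-down sketch of that shape (hypothetical struct, no locking):

#include <stddef.h>

/* Hypothetical stand-in for a vm_object with a backing chain. */
struct node {
	int		ref_count;
	struct node	*backing;
};

/*
 * Shape of the new vm_object_deallocate() loop: dropping the final
 * reference to one node also drops the reference it held on its
 * backing node, so the chain is torn down iteratively, not recursively.
 */
static void
release(struct node *obj)
{
	struct node *temp;

	while (obj != NULL) {
		if (--obj->ref_count > 0)
			return;		/* not the final reference */
		temp = obj->backing;
		/* ... terminate obj: free pages, pager data, etc. ... */
		obj = temp;
	}
}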
@@ -734,6 +737,9 @@ vm_object_backing_remove_locked(vm_object_t object)
VM_OBJECT_ASSERT_WLOCKED(object);
VM_OBJECT_ASSERT_WLOCKED(backing_object);
+ KASSERT((object->flags & OBJ_COLLAPSING) == 0,
+ ("vm_object_backing_remove: Removing collapsing object."));
+
if ((object->flags & OBJ_SHADOWLIST) != 0) {
LIST_REMOVE(object, shadow_list);
backing_object->shadow_count--;
@@ -788,6 +794,98 @@ vm_object_backing_insert(vm_object_t object, vm_object_t backing_object)
object->backing_object = backing_object;
}
+/*
+ * Insert an object into a backing_object's shadow list with an additional
+ * reference to the backing_object added.
+ */
+static void
+vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object)
+{
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ if ((backing_object->flags & OBJ_ANON) != 0) {
+ VM_OBJECT_WLOCK(backing_object);
+ KASSERT((backing_object->flags & OBJ_DEAD) == 0,
+ ("shadowing dead anonymous object"));
+ vm_object_reference_locked(backing_object);
+ vm_object_backing_insert_locked(object, backing_object);
+ vm_object_clear_flag(backing_object, OBJ_ONEMAPPING);
+ VM_OBJECT_WUNLOCK(backing_object);
+ } else {
+ vm_object_reference(backing_object);
+ object->backing_object = backing_object;
+ }
+}
+
+/*
+ * Transfer a backing reference from backing_object to object.
+ */
+static void
+vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object)
+{
+ vm_object_t new_backing_object;
+
+ /*
+ * Note that the reference to backing_object->backing_object
+ * moves from within backing_object to within object.
+ */
+ vm_object_backing_remove_locked(object);
+ new_backing_object = backing_object->backing_object;
+ if (new_backing_object == NULL)
+ return;
+ if ((new_backing_object->flags & OBJ_ANON) != 0) {
+ VM_OBJECT_WLOCK(new_backing_object);
+ vm_object_backing_remove_locked(backing_object);
+ vm_object_backing_insert_locked(object, new_backing_object);
+ VM_OBJECT_WUNLOCK(new_backing_object);
+ } else {
+ object->backing_object = new_backing_object;
+ backing_object->backing_object = NULL;
+ }
+}
+
+/*
+ * Wait for a concurrent collapse to settle.
+ */
+static void
+vm_object_collapse_wait(vm_object_t object)
+{
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ while ((object->flags & OBJ_COLLAPSING) != 0) {
+ vm_object_pip_wait(object, "vmcolwait");
+ counter_u64_add(object_collapse_waits, 1);
+ }
+}
+
+/*
+ * Waits for a backing object to clear a pending collapse and returns
+ * it locked if it is an ANON object.
+ */
+static vm_object_t
+vm_object_backing_collapse_wait(vm_object_t object)
+{
+ vm_object_t backing_object;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ for (;;) {
+ backing_object = object->backing_object;
+ if (backing_object == NULL ||
+ (backing_object->flags & OBJ_ANON) == 0)
+ return (NULL);
+ VM_OBJECT_WLOCK(backing_object);
+ if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0)
+ break;
+ VM_OBJECT_WUNLOCK(object);
+ vm_object_pip_sleep(backing_object, "vmbckwait");
+ counter_u64_add(object_collapse_waits, 1);
+ VM_OBJECT_WLOCK(object);
+ }
+ return (backing_object);
+}
/*
* vm_object_terminate_pages removes any remaining pageable pages
@@ -843,9 +941,14 @@ vm_object_terminate_pages(vm_object_t object)
void
vm_object_terminate(vm_object_t object)
{
+
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_DEAD) != 0,
("terminating non-dead obj %p", object));
+ KASSERT((object->flags & OBJ_COLLAPSING) == 0,
+ ("terminating collapsing obj %p", object));
+ KASSERT(object->backing_object == NULL,
+ ("terminating shadow obj %p", object));
/*
* wait for the pageout daemon to be done with the object
@@ -853,11 +956,11 @@ vm_object_terminate(vm_object_t object)
vm_object_pip_wait(object, "objtrm");
KASSERT(!REFCOUNT_COUNT(object->paging_in_progress),
- ("vm_object_terminate: pageout in progress"));
+ ("vm_object_terminate: pageout in progress"));
- KASSERT(object->ref_count == 0,
- ("vm_object_terminate: object with references, ref_count=%d",
- object->ref_count));
+ KASSERT(object->ref_count == 0,
+ ("vm_object_terminate: object with references, ref_count=%d",
+ object->ref_count));
if ((object->flags & OBJ_PG_DTOR) == 0)
vm_object_terminate_pages(object);
@@ -1397,11 +1500,13 @@ void
vm_object_split(vm_map_entry_t entry)
{
vm_page_t m, m_next;
- vm_object_t orig_object, new_object, source;
+ vm_object_t orig_object, new_object, backing_object;
vm_pindex_t idx, offidxstart;
vm_size_t size;
orig_object = entry->object.vm_object;
+ KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0,
+ ("vm_object_split: Splitting object with multiple mappings."));
if ((orig_object->flags & OBJ_ANON) == 0)
return;
if (orig_object->ref_count <= 1)
@@ -1419,35 +1524,25 @@ vm_object_split(vm_map_entry_t entry)
orig_object->cred, ptoa(size));
/*
+ * We must wait for the orig_object to complete any in-progress
+ * collapse so that the swap blocks are stable below. The
+ * additional reference on backing_object by new object will
+ * prevent further collapse operations until split completes.
+ */
+ VM_OBJECT_WLOCK(orig_object);
+ vm_object_collapse_wait(orig_object);
+
+ /*
* At this point, the new object is still private, so the order in
* which the original and new objects are locked does not matter.
*/
VM_OBJECT_WLOCK(new_object);
- VM_OBJECT_WLOCK(orig_object);
new_object->domain = orig_object->domain;
- source = orig_object->backing_object;
- if (source != NULL) {
- if ((source->flags & (OBJ_ANON | OBJ_DEAD)) != 0) {
- VM_OBJECT_WLOCK(source);
- if ((source->flags & OBJ_DEAD) != 0) {
- VM_OBJECT_WUNLOCK(source);
- VM_OBJECT_WUNLOCK(orig_object);
- VM_OBJECT_WUNLOCK(new_object);
- new_object->cred = NULL;
- vm_object_deallocate(new_object);
- VM_OBJECT_WLOCK(orig_object);
- return;
- }
- vm_object_backing_insert_locked(new_object, source);
- vm_object_reference_locked(source); /* for new_object */
- vm_object_clear_flag(source, OBJ_ONEMAPPING);
- VM_OBJECT_WUNLOCK(source);
- } else {
- vm_object_backing_insert(new_object, source);
- vm_object_reference(source);
- }
+ backing_object = orig_object->backing_object;
+ if (backing_object != NULL) {
+ vm_object_backing_insert_ref(new_object, backing_object);
new_object->backing_object_offset =
- orig_object->backing_object_offset + entry->offset;
+ orig_object->backing_object_offset + entry->offset;
}
if (orig_object->cred != NULL) {
crhold(orig_object->cred);
@@ -1455,6 +1550,12 @@ vm_object_split(vm_map_entry_t entry)
("orig_object->charge < 0"));
orig_object->charge -= ptoa(size);
}
+
+ /*
+ * Mark the split operation so that swap_pager_getpages() knows
+ * that the object is in transition.
+ */
+ vm_object_set_flag(orig_object, OBJ_SPLIT);
retry:
m = vm_page_find_least(orig_object, offidxstart);
for (; m != NULL && (idx = m->pindex - offidxstart) < size;
@@ -1523,6 +1624,7 @@ retry:
TAILQ_FOREACH(m, &new_object->memq, listq)
vm_page_xunbusy(m);
}
+ vm_object_clear_flag(orig_object, OBJ_SPLIT);
VM_OBJECT_WUNLOCK(orig_object);
VM_OBJECT_WUNLOCK(new_object);
entry->object.vm_object = new_object;
@@ -1531,12 +1633,8 @@ retry:
VM_OBJECT_WLOCK(new_object);
}
-#define OBSC_COLLAPSE_NOWAIT 0x0002
-#define OBSC_COLLAPSE_WAIT 0x0004
-
static vm_page_t
-vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
- int op)
+vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p)
{
vm_object_t backing_object;
@@ -1546,8 +1644,6 @@ vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
KASSERT(p == NULL || p->object == object || p->object == backing_object,
("invalid ownership %p %p %p", p, object, backing_object));
- if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
- return (next);
/* The page is only NULL when rename fails. */
if (p == NULL) {
VM_OBJECT_WUNLOCK(object);
@@ -1632,8 +1728,8 @@ vm_object_scan_all_shadowed(vm_object_t object)
return (true);
}
-static bool
-vm_object_collapse_scan(vm_object_t object, int op)
+static void
+vm_object_collapse_scan(vm_object_t object)
{
vm_object_t backing_object;
vm_page_t next, p, pp;
@@ -1646,12 +1742,6 @@ vm_object_collapse_scan(vm_object_t object, int op)
backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
/*
- * Initial conditions
- */
- if ((op & OBSC_COLLAPSE_WAIT) != 0)
- vm_object_set_flag(backing_object, OBJ_DEAD);
-
- /*
* Our scan
*/
for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
@@ -1662,12 +1752,16 @@ vm_object_collapse_scan(vm_object_t object, int op)
* Check for busy page
*/
if (vm_page_tryxbusy(p) == 0) {
- next = vm_object_collapse_scan_wait(object, p, next, op);
+ next = vm_object_collapse_scan_wait(object, p);
continue;
}
+ KASSERT(object->backing_object == backing_object,
+ ("vm_object_collapse_scan: backing object mismatch %p != %p",
+ object->backing_object, backing_object));
KASSERT(p->object == backing_object,
- ("vm_object_collapse_scan: object mismatch"));
+ ("vm_object_collapse_scan: object mismatch %p != %p",
+ p->object, backing_object));
if (p->pindex < backing_offset_index ||
new_pindex >= object->size) {
@@ -1689,16 +1783,9 @@ vm_object_collapse_scan(vm_object_t object, int op)
* The page in the parent is busy and possibly not
* (yet) valid. Until its state is finalized by the
* busy bit owner, we can't tell whether it shadows the
- * original page. Therefore, we must either skip it
- * and the original (backing_object) page or wait for
- * its state to be finalized.
- *
- * This is due to a race with vm_fault() where we must
- * unbusy the original (backing_obj) page before we can
- * (re)lock the parent. Hence we can get here.
+ * original page.
*/
- next = vm_object_collapse_scan_wait(object, pp, next,
- op);
+ next = vm_object_collapse_scan_wait(object, pp);
continue;
}
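
The bounds test kept in this hunk decides whether a backing page can be renamed into the parent: new_pindex = p->pindex - backing_offset_index must fall inside the parent object. A small worked example of that index arithmetic (assuming 4 KB pages and made-up offsets):

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumes 4 KB pages */
#define OFF_TO_IDX(off)	((off) >> PAGE_SHIFT)

int
main(void)
{
	unsigned long backing_object_offset = 0x200000UL; /* 2 MB into backing */
	unsigned long backing_offset_index = OFF_TO_IDX(backing_object_offset);
	unsigned long object_size = 256;	/* parent size, in pages */
	unsigned long p_pindex = 700;		/* page index in backing_object */
	unsigned long new_pindex = p_pindex - backing_offset_index;

	/* The same test vm_object_collapse_scan() applies before a rename. */
	if (p_pindex < backing_offset_index || new_pindex >= object_size)
		printf("page %lu lies outside the parent: freed, not renamed\n",
		    p_pindex);
	else
		printf("page %lu renames to pindex %lu in the parent\n",
		    p_pindex, new_pindex);
	return (0);
}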
@@ -1742,10 +1829,7 @@ vm_object_collapse_scan(vm_object_t object, int op)
*/
if (vm_page_rename(p, object, new_pindex)) {
vm_page_xunbusy(p);
- if (pp != NULL)
- vm_page_xunbusy(pp);
- next = vm_object_collapse_scan_wait(object, NULL, next,
- op);
+ next = vm_object_collapse_scan_wait(object, NULL);
continue;
}
@@ -1763,27 +1847,7 @@ vm_object_collapse_scan(vm_object_t object, int op)
#endif
vm_page_xunbusy(p);
}
- return (true);
-}
-
-
-/*
- * this version of collapse allows the operation to occur earlier and
- * when paging_in_progress is true for an object... This is not a complete
- * operation, but should plug 99.9% of the rest of the leaks.
- */
-static void
-vm_object_qcollapse(vm_object_t object)
-{
- vm_object_t backing_object = object->backing_object;
-
- VM_OBJECT_ASSERT_WLOCKED(object);
- VM_OBJECT_ASSERT_WLOCKED(backing_object);
-
- if (backing_object->ref_count != 1)
- return;
-
- vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
+ return;
}
/*
@@ -1801,53 +1865,48 @@ vm_object_collapse(vm_object_t object)
VM_OBJECT_ASSERT_WLOCKED(object);
while (TRUE) {
- /*
- * Verify that the conditions are right for collapse:
- *
- * The object exists and the backing object exists.
- */
- if ((backing_object = object->backing_object) == NULL)
- break;
+ KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON,
+ ("collapsing invalid object"));
/*
- * we check the backing object first, because it is most likely
- * not collapsable.
+ * Wait for the backing_object to finish any pending
+ * collapse so that the caller sees the shortest possible
+ * shadow chain.
*/
- if ((backing_object->flags & OBJ_ANON) == 0)
- break;
- VM_OBJECT_WLOCK(backing_object);
- if ((backing_object->flags & OBJ_DEAD) != 0 ||
- (object->flags & (OBJ_DEAD | OBJ_ANON)) != OBJ_ANON) {
- VM_OBJECT_WUNLOCK(backing_object);
- break;
- }
+ backing_object = vm_object_backing_collapse_wait(object);
+ if (backing_object == NULL)
+ return;
- if (REFCOUNT_COUNT(object->paging_in_progress) > 0 ||
- REFCOUNT_COUNT(backing_object->paging_in_progress) > 0) {
- vm_object_qcollapse(object);
- VM_OBJECT_WUNLOCK(backing_object);
- break;
- }
+ KASSERT(object->ref_count > 0 &&
+ object->ref_count > object->shadow_count,
+ ("collapse with invalid ref %d or shadow %d count.",
+ object->ref_count, object->shadow_count));
+ KASSERT((backing_object->flags &
+ (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
+ ("vm_object_collapse: Backing object already collapsing."));
+ KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
+ ("vm_object_collapse: object is already collapsing."));
/*
- * We know that we can either collapse the backing object (if
- * the parent is the only reference to it) or (perhaps) have
+ * We know that we can either collapse the backing object if
+ * the parent is the only reference to it, or (perhaps) have
* the parent bypass the object if the parent happens to shadow
* all the resident pages in the entire backing object.
- *
- * This is ignoring pager-backed pages such as swap pages.
- * vm_object_collapse_scan fails the shadowing test in this
- * case.
*/
if (backing_object->ref_count == 1) {
+ KASSERT(backing_object->shadow_count == 1,
+ ("vm_object_collapse: shadow_count: %d",
+ backing_object->shadow_count));
vm_object_pip_add(object, 1);
+ vm_object_set_flag(object, OBJ_COLLAPSING);
vm_object_pip_add(backing_object, 1);
+ vm_object_set_flag(backing_object, OBJ_DEAD);
/*
* If there is exactly one reference to the backing
* object, we can collapse it into the parent.
*/
- vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
+ vm_object_collapse_scan(object);
#if VM_NRESERVLEVEL > 0
/*
@@ -1866,31 +1925,24 @@ vm_object_collapse(vm_object_t object)
* the backing_object's and object's locks are
* released and reacquired.
* Since swap_pager_copy() is being asked to
- * destroy the source, it will change the
- * backing_object's type to OBJT_DEFAULT.
+ * destroy backing_object, it will change the
+ * type to OBJT_DEFAULT.
*/
swap_pager_copy(
backing_object,
object,
OFF_TO_IDX(object->backing_object_offset), TRUE);
}
+
/*
* Object now shadows whatever backing_object did.
- * Note that the reference to
- * backing_object->backing_object moves from within
- * backing_object to within object.
*/
- vm_object_backing_remove_locked(object);
- new_backing_object = backing_object->backing_object;
- if (new_backing_object != NULL) {
- VM_OBJECT_WLOCK(new_backing_object);
- vm_object_backing_remove_locked(backing_object);
- vm_object_backing_insert_locked(object,
- new_backing_object);
- VM_OBJECT_WUNLOCK(new_backing_object);
- }
+ vm_object_clear_flag(object, OBJ_COLLAPSING);
+ vm_object_backing_transfer(object, backing_object);
object->backing_object_offset +=
backing_object->backing_object_offset;
+ VM_OBJECT_WUNLOCK(object);
+ vm_object_pip_wakeup(object);
/*
* Discard backing_object.
@@ -1903,17 +1955,17 @@ vm_object_collapse(vm_object_t object)
"backing_object %p was somehow re-referenced during collapse!",
backing_object));
vm_object_pip_wakeup(backing_object);
- backing_object->type = OBJT_DEAD;
- refcount_release(&backing_object->ref_count);
- VM_OBJECT_WUNLOCK(backing_object);
- vm_object_destroy(backing_object);
-
- vm_object_pip_wakeup(object);
+ (void)refcount_release(&backing_object->ref_count);
+ vm_object_terminate(backing_object);
counter_u64_add(object_collapses, 1);
+ VM_OBJECT_WLOCK(object);
} else {
/*
* If we do not entirely shadow the backing object,
* there is nothing we can do so we give up.
+ *
+ * The object lock and backing_object lock must not
+ * be dropped during this sequence.
*/
if (!vm_object_scan_all_shadowed(object)) {
VM_OBJECT_WUNLOCK(backing_object);
@@ -1926,21 +1978,22 @@ vm_object_collapse(vm_object_t object)
* it, since its reference count is at least 2.
*/
vm_object_backing_remove_locked(object);
-
new_backing_object = backing_object->backing_object;
if (new_backing_object != NULL) {
- vm_object_backing_insert(object,
+ vm_object_backing_insert_ref(object,
new_backing_object);
- vm_object_reference(new_backing_object);
object->backing_object_offset +=
- backing_object->backing_object_offset;
+ backing_object->backing_object_offset;
}
/*
* Drop the reference count on backing_object. Since
* its ref_count was at least 2, it will not vanish.
*/
- refcount_release(&backing_object->ref_count);
+ (void)refcount_release(&backing_object->ref_count);
+ KASSERT(backing_object->ref_count >= 1, (
+"backing_object %p was somehow dereferenced during collapse!",
+ backing_object));
VM_OBJECT_WUNLOCK(backing_object);
counter_u64_add(object_bypasses, 1);
}
@@ -2155,7 +2208,7 @@ vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
VM_OBJECT_WLOCK(prev_object);
/*
- * Try to collapse the object first
+ * Try to collapse the object first.
*/
vm_object_collapse(prev_object);
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index b5b6e0c70dc1..780f7c7e83b7 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -190,6 +190,8 @@ struct vm_object {
#define OBJ_SIZEVNLOCK 0x0040 /* lock vnode to check obj size */
#define OBJ_PG_DTOR 0x0080 /* dont reset object, leave that for dtor */
#define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */
+#define OBJ_SPLIT 0x0400 /* object is being split */
+#define OBJ_COLLAPSING 0x0800 /* Parent of collapse. */
#define OBJ_COLORED 0x1000 /* pg_color is defined */
#define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */
#define OBJ_SHADOWLIST 0x4000 /* Object is on the shadow list. */