author     Matthew Dillon <dillon@FreeBSD.org>  1999-09-17 04:56:40 +0000
committer  Matthew Dillon <dillon@FreeBSD.org>  1999-09-17 04:56:40 +0000
commit     90ecac61c08aaa62351a9835b6788470d91c0e72 (patch)
tree       062792ff2a372baca6dc9a49cea43b6d5250b3f4
parent     1ed9e51a4de04accbd9ae52d42c169b9091e95d9 (diff)
download   src-90ecac61c08aaa62351a9835b6788470d91c0e72.tar.gz
           src-90ecac61c08aaa62351a9835b6788470d91c0e72.zip
Reviewed by: Alan Cox <alc@cs.rice.edu>, David Greenman <dg@root.com>
Replace various VM related page count calculations strewn over the VM code
with inlines to aid in readability and to reduce fragility in the code where
modules depend on the same test being performed to properly sleep and wakeup.

Split out a portion of the page deactivation code into an inline in
vm_page.c to support vm_page_dontneed().

Add vm_page_dontneed(), which handles the madvise MADV_DONTNEED feature in a
related commit coming up for vm_map.c/vm_object.c.

This code prevents degenerate cases where an essentially active page may be
rotated through a subset of the paging lists, resulting in premature disposal.
Notes:
    svn path=/head/; revision=51337
-rw-r--r--  sys/sys/vmmeter.h     86
-rw-r--r--  sys/vm/vm_glue.c      15
-rw-r--r--  sys/vm/vm_meter.c      2
-rw-r--r--  sys/vm/vm_page.c      96
-rw-r--r--  sys/vm/vm_page.h       4
-rw-r--r--  sys/vm/vm_pageout.c  187
6 files changed, 274 insertions(+), 116 deletions(-)
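
Editor's illustration (not part of the commit): the core of this change is that every open-coded test of the form "(cnt.v_free_count + cnt.v_cache_count) < threshold" is replaced by a shared inline on struct vmmeter, so all callers sleep and wake on exactly the same condition. The standalone C sketch below mirrors two of the new inlines from the vmmeter.h hunk that follows; the trimmed-down struct, the C99 "static inline" spelling, and the counter values are illustrative assumptions for a userland build, not kernel code.

#include <stdio.h>

struct vmmeter {
        unsigned int v_free_count;   /* pages free */
        unsigned int v_cache_count;  /* pages on the cache queue */
        unsigned int v_cache_min;    /* min pages desired on the cache queue */
        unsigned int v_free_min;     /* minimum free pages desired */
        unsigned int v_free_target;  /* free page target for the pageout daemon */
};

/* Made-up counter values for demonstration only. */
static struct vmmeter cnt = {
        .v_free_count  = 512,
        .v_cache_count = 256,
        .v_cache_min   = 128,
        .v_free_min    = 1024,
        .v_free_target = 4096,
};

/* TRUE when free + cache pages have dropped below the minimum. */
static inline int
vm_page_count_min(void)
{
        return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
}

/*
 * Number of pages still to be freed or cached; a positive value means
 * the free page target has not yet been reached.
 */
static inline int
vm_paging_target(void)
{
        return ((cnt.v_free_target + cnt.v_cache_min) -
            (cnt.v_free_count + cnt.v_cache_count));
}

int
main(void)
{
        /* Before the commit, callers open-coded the comparison: */
        if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)
                printf("open-coded test: below the free-page minimum\n");

        /* After it, every caller shares the same inline test: */
        if (vm_page_count_min())
                printf("vm_page_count_min(): below the free-page minimum\n");

        printf("pages still to free or cache: %d\n", vm_paging_target());
        return (0);
}

With the demo values (768 free + cache pages against a minimum of 1024), both branches print and vm_paging_target() reports a shortfall of 3456 pages; the kernel callers in the diff below use the same tests to decide when to block in VM_WAIT and when to wake the pageout daemon.
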
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
index e382d900f6df..2ae45a378cdc 100644
--- a/sys/sys/vmmeter.h
+++ b/sys/sys/vmmeter.h
@@ -91,9 +91,95 @@ struct vmmeter {
u_int v_cache_max; /* max number of pages in cached obj */
u_int v_pageout_free_min; /* min number pages reserved for kernel */
u_int v_interrupt_free_min; /* reserved number of pages for int code */
+ u_int v_free_severe; /* severe depletion of pages below this pt */
};
#ifdef KERNEL
+
extern struct vmmeter cnt;
+
+/*
+ * Return TRUE if we are under our reserved low-free-pages threshold
+ */
+
+static __inline
+int
+vm_page_count_reserved(void)
+{
+ return (cnt.v_free_reserved > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we are under our severe low-free-pages threshold
+ *
+ * This routine is typically used at the user<->system interface to determine
+ * whether we need to block in order to avoid a low memory deadlock.
+ */
+
+static __inline
+int
+vm_page_count_severe(void)
+{
+ return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we are under our minimum low-free-pages threshold.
+ *
+ * This routine is typically used within the system to determine whether
+ * we can execute potentially very expensive code in terms of memory. It
+ * is also used by the pageout daemon to calculate when to sleep, when
+ * to wake waiters up, and when (after making a pass) to become more
+ * desperate.
+ */
+
+static __inline
+int
+vm_page_count_min(void)
+{
+ return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we have not reached our free page target during
+ * free page recovery operations.
+ */
+
+static __inline
+int
+vm_page_count_target(void)
+{
+ return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return the number of pages we need to free-up or cache
+ * A positive number indicates that we do not have enough free pages.
+ */
+
+static __inline
+int
+vm_paging_target(void)
+{
+ return (
+ (cnt.v_free_target + cnt.v_cache_min) -
+ (cnt.v_free_count + cnt.v_cache_count)
+ );
+}
+
+/*
+ * Return a positive number if the pagedaemon needs to be woken up.
+ */
+
+static __inline
+int
+vm_paging_needed(void)
+{
+ return (
+ (cnt.v_free_reserved + cnt.v_cache_min) >
+ (cnt.v_free_count + cnt.v_cache_count)
+ );
+}
+
#endif
/* systemwide totals computed every five seconds */
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index e53079a43a0f..1d7157c7450a 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -209,19 +209,9 @@ vm_fork(p1, p2, flags)
p1->p_vmspace->vm_refcnt++;
}
- /*
- * Great, so we have a memory-heavy process and the
- * entire machine comes to a screaching halt because
- * nobody can fork/exec anything. What we really need
- * to do is fix the process swapper so it swaps out the right
- * processes.
- */
-#if 0
- while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
- vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN);
+ while (vm_page_count_severe()) {
VM_WAIT;
}
-#endif
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
@@ -339,8 +329,9 @@ scheduler(dummy)
int ppri;
loop:
- while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
+ if (vm_page_count_min()) {
VM_WAIT;
+ goto loop;
}
pp = NULL;
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 4e7f0fb12c3d..6c695624bf52 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -119,6 +119,8 @@ SYSCTL_INT(_vm, VM_V_CACHE_MAX, v_cache_max,
CTLFLAG_RW, &cnt.v_cache_max, 0, "");
SYSCTL_INT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, v_free_severe,
+ CTLFLAG_RW, &cnt.v_free_severe, 0, "");
SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD,
&averunnable, loadavg, "Machine loadaverage history");
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index f6db00ef98a0..533ba37fff6f 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -615,8 +615,7 @@ vm_page_unqueue(m)
(*pq->cnt)--;
pq->lcnt--;
if ((queue - m->pc) == PQ_CACHE) {
- if ((cnt.v_cache_count + cnt.v_free_count) <
- (cnt.v_free_reserved + cnt.v_cache_min))
+ if (vm_paging_needed())
pagedaemon_wakeup();
}
}
@@ -871,9 +870,7 @@ loop:
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
- if (((cnt.v_free_count + cnt.v_cache_count) <
- (cnt.v_free_reserved + cnt.v_cache_min)) ||
- (cnt.v_free_count < cnt.v_pageout_free_min))
+ if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
pagedaemon_wakeup();
splx(s);
@@ -991,6 +988,8 @@ vm_page_asleep(vm_page_t m, char *msg, char *busy) {
* vm_page_activate:
*
* Put the specified page on the active list (if appropriate).
+ * Ensure that act_count is at least ACT_INIT but do not otherwise
+ * mess with it.
*
* The page queues must be locked.
* This routine may not block.
@@ -1050,8 +1049,7 @@ vm_page_free_wakeup()
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
- if (vm_pages_needed &&
- ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
+ if (vm_pages_needed && vm_page_count_min()) {
wakeup(&cnt.v_free_count);
vm_pages_needed = 0;
}
@@ -1261,11 +1259,14 @@ vm_page_unwire(m, activate)
* Move the specified page to the inactive queue. If the page has
* any associated swap, the swap is deallocated.
*
+ * Normally athead is 0 resulting in LRU operation. athead is set
+ * to 1 if we want this page to be 'as if it were placed in the cache',
+ * except without unmapping it from the process address space.
+ *
* This routine may not block.
*/
-void
-vm_page_deactivate(m)
- register vm_page_t m;
+static __inline void
+_vm_page_deactivate(vm_page_t m, int athead)
{
int s;
@@ -1280,7 +1281,10 @@ vm_page_deactivate(m)
if ((m->queue - m->pc) == PQ_CACHE)
cnt.v_reactivated++;
vm_page_unqueue(m);
- TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
+ if (athead)
+ TAILQ_INSERT_HEAD(&vm_page_queue_inactive, m, pageq);
+ else
+ TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
m->queue = PQ_INACTIVE;
vm_page_queues[PQ_INACTIVE].lcnt++;
cnt.v_inactive_count++;
@@ -1288,6 +1292,12 @@ vm_page_deactivate(m)
splx(s);
}
+void
+vm_page_deactivate(vm_page_t m)
+{
+ _vm_page_deactivate(m, 0);
+}
+
/*
* vm_page_cache
*
@@ -1333,6 +1343,70 @@ vm_page_cache(m)
}
/*
+ * vm_page_dontneed
+ *
+ * Cache, deactivate, or do nothing as appropriate. This routine
+ * is typically used by madvise() MADV_DONTNEED.
+ *
+ * Generally speaking we want to move the page into the cache so
+ * it gets reused quickly. However, this can result in a silly syndrome
+ * due to the page recycling too quickly. Small objects will not be
+ * fully cached. On the otherhand, if we move the page to the inactive
+ * queue we wind up with a problem whereby very large objects
+ * unnecessarily blow away our inactive and cache queues.
+ *
+ * The solution is to move the pages based on a fixed weighting. We
+ * either leave them alone, deactivate them, or move them to the cache,
+ * where moving them to the cache has the highest weighting.
+ * By forcing some pages into other queues we eventually force the
+ * system to balance the queues, potentially recovering other unrelated
+ * space from active. The idea is to not force this to happen too
+ * often.
+ */
+
+void
+vm_page_dontneed(m)
+ vm_page_t m;
+{
+ static int dnweight;
+ int dnw;
+ int head;
+
+ dnw = ++dnweight;
+
+ /*
+ * occasionally leave the page alone
+ */
+
+ if ((dnw & 0x01F0) == 0 ||
+ m->queue == PQ_INACTIVE ||
+ m->queue - m->pc == PQ_CACHE
+ ) {
+ if (m->act_count >= ACT_INIT)
+ --m->act_count;
+ return;
+ }
+
+ if (m->dirty == 0)
+ vm_page_test_dirty(m);
+
+ if (m->dirty || (dnw & 0x0070) == 0) {
+ /*
+ * Deactivate the page 3 times out of 32.
+ */
+ head = 0;
+ } else {
+ /*
+ * Cache the page 28 times out of every 32. Note that
+ * the page is deactivated instead of cached, but placed
+ * at the head of the queue instead of the tail.
+ */
+ head = 1;
+ }
+ _vm_page_deactivate(m, head);
+}
+
+/*
* Grab a page, waiting until we are waken up due to the page
* changing state. We keep on waiting, if the page continues
* to be in the object. If the page doesn't exist, allocate it.
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 6ffb8676eebb..2d7e7401dfe2 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -136,7 +136,8 @@ struct vm_page {
};
/*
- * note SWAPBLK_NONE is a flag, basically the high bit.
+ * note: we currently use SWAPBLK_NONE as an absolute value rather than
+ * a flag bit.
*/
#define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */
@@ -391,6 +392,7 @@ void vm_page_activate __P((vm_page_t));
vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
+void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));
static __inline void vm_page_free_zero __P((vm_page_t));
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index bc8784cc587d..d24e51cec6f8 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -219,7 +219,7 @@ vm_pageout_clean(m)
register vm_object_t object;
vm_page_t mc[2*vm_pageout_page_count];
int pageout_count;
- int i, forward_okay, backward_okay, page_base;
+ int ib, is, page_base;
vm_pindex_t pindex = m->pindex;
object = m->object;
@@ -243,11 +243,9 @@ vm_pageout_clean(m)
mc[vm_pageout_page_count] = m;
pageout_count = 1;
page_base = vm_pageout_page_count;
- forward_okay = TRUE;
- if (pindex != 0)
- backward_okay = TRUE;
- else
- backward_okay = FALSE;
+ ib = 1;
+ is = 1;
+
/*
* Scan object for clusterable pages.
*
@@ -258,82 +256,84 @@ vm_pageout_clean(m)
* active page.
* -or-
* 2) we force the issue.
+ *
+ * During heavy mmap/modification loads the pageout
+ * daemon can really fragment the underlying file
+ * due to flushing pages out of order and not trying
+ * align the clusters (which leave sporatic out-of-order
+ * holes). To solve this problem we do the reverse scan
+ * first and attempt to align our cluster, then do a
+ * forward scan if room remains.
*/
- for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) {
+
+more:
+ while (ib && pageout_count < vm_pageout_page_count) {
vm_page_t p;
- /*
- * See if forward page is clusterable.
- */
- if (forward_okay) {
- /*
- * Stop forward scan at end of object.
- */
- if ((pindex + i) > object->size) {
- forward_okay = FALSE;
- goto do_backward;
- }
- p = vm_page_lookup(object, pindex + i);
- if (p) {
- if (((p->queue - p->pc) == PQ_CACHE) ||
- (p->flags & PG_BUSY) || p->busy) {
- forward_okay = FALSE;
- goto do_backward;
- }
- vm_page_test_dirty(p);
- if ((p->dirty & p->valid) != 0 &&
- (p->queue == PQ_INACTIVE) &&
- (p->wire_count == 0) &&
- (p->hold_count == 0)) {
- mc[vm_pageout_page_count + i] = p;
- pageout_count++;
- if (pageout_count == vm_pageout_page_count)
- break;
- } else {
- forward_okay = FALSE;
- }
- } else {
- forward_okay = FALSE;
- }
+ if (ib > pindex) {
+ ib = 0;
+ break;
+ }
+
+ if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
+ ib = 0;
+ break;
+ }
+ if (((p->queue - p->pc) == PQ_CACHE) ||
+ (p->flags & PG_BUSY) || p->busy) {
+ ib = 0;
+ break;
+ }
+ vm_page_test_dirty(p);
+ if ((p->dirty & p->valid) == 0 ||
+ p->queue != PQ_INACTIVE ||
+ p->wire_count != 0 ||
+ p->hold_count != 0) {
+ ib = 0;
+ break;
}
-do_backward:
+ mc[--page_base] = p;
+ ++pageout_count;
+ ++ib;
/*
- * See if backward page is clusterable.
+ * alignment boundary, stop here and switch directions. Do
+ * not clear ib.
*/
- if (backward_okay) {
- /*
- * Stop backward scan at beginning of object.
- */
- if ((pindex - i) == 0) {
- backward_okay = FALSE;
- }
- p = vm_page_lookup(object, pindex - i);
- if (p) {
- if (((p->queue - p->pc) == PQ_CACHE) ||
- (p->flags & PG_BUSY) || p->busy) {
- backward_okay = FALSE;
- continue;
- }
- vm_page_test_dirty(p);
- if ((p->dirty & p->valid) != 0 &&
- (p->queue == PQ_INACTIVE) &&
- (p->wire_count == 0) &&
- (p->hold_count == 0)) {
- mc[vm_pageout_page_count - i] = p;
- pageout_count++;
- page_base--;
- if (pageout_count == vm_pageout_page_count)
- break;
- } else {
- backward_okay = FALSE;
- }
- } else {
- backward_okay = FALSE;
- }
+ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
+ break;
+ }
+
+ while (pageout_count < vm_pageout_page_count &&
+ pindex + is < object->size) {
+ vm_page_t p;
+
+ if ((p = vm_page_lookup(object, pindex + is)) == NULL)
+ break;
+ if (((p->queue - p->pc) == PQ_CACHE) ||
+ (p->flags & PG_BUSY) || p->busy) {
+ break;
}
+ vm_page_test_dirty(p);
+ if ((p->dirty & p->valid) == 0 ||
+ p->queue != PQ_INACTIVE ||
+ p->wire_count != 0 ||
+ p->hold_count != 0) {
+ break;
+ }
+ mc[page_base + pageout_count] = p;
+ ++pageout_count;
+ ++is;
}
/*
+ * If we exhausted our forward scan, continue with the reverse scan
+ * when possible, even past a page boundary. This catches boundary
+ * conditions.
+ */
+ if (ib && pageout_count < vm_pageout_page_count)
+ goto more;
+
+ /*
* we allow reads during pageouts...
*/
return vm_pageout_flush(&mc[page_base], pageout_count, 0);
@@ -397,7 +397,7 @@ vm_pageout_flush(mc, count, flags)
* worked.
*/
pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
- mt->dirty = 0;
+ vm_page_undirty(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
@@ -646,9 +646,7 @@ vm_pageout_scan()
* to the cache.
*/
- page_shortage = (cnt.v_free_target + cnt.v_cache_min) -
- (cnt.v_free_count + cnt.v_cache_count);
- page_shortage += addl_page_shortage_init;
+ page_shortage = vm_paging_target() + addl_page_shortage_init;
/*
* Figure out what to do with dirty pages when they are encountered.
@@ -787,7 +785,7 @@ rescan0:
} else {
swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
- (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min);
+ vm_page_count_min());
}
@@ -1082,15 +1080,11 @@ rescan0:
* in a writeable object, wakeup the sync daemon. And kick swapout
* if we did not get enough free pages.
*/
- if ((cnt.v_cache_count + cnt.v_free_count) <
- (cnt.v_free_target + cnt.v_cache_min) ) {
- if (vnodes_skipped &&
- (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) {
+ if (vm_paging_target() > 0) {
+ if (vnodes_skipped && vm_page_count_min())
(void) speedup_syncer();
- }
#if !defined(NO_SWAPPING)
- if (vm_swap_enabled &&
- (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) {
+ if (vm_swap_enabled && vm_page_count_target()) {
vm_req_vmdaemon();
vm_pageout_req_swapout |= VM_SWAP_NORMAL;
}
@@ -1101,8 +1095,7 @@ rescan0:
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
*/
- if ((vm_swap_size == 0 || swap_pager_full) &&
- ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) {
+ if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
bigproc = NULL;
bigsize = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@@ -1160,8 +1153,10 @@ vm_pageout_page_stats()
static int fullintervalcount = 0;
int page_shortage;
- page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
+ page_shortage =
+ (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
+
if (page_shortage <= 0)
return;
@@ -1253,7 +1248,9 @@ vm_size_t count;
cnt.v_interrupt_free_min;
cnt.v_free_reserved = vm_pageout_page_count +
cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
+ cnt.v_free_severe = cnt.v_free_min / 2;
cnt.v_free_min += cnt.v_free_reserved;
+ cnt.v_free_severe += cnt.v_free_reserved;
return 1;
}
@@ -1326,8 +1323,17 @@ vm_pageout()
while (TRUE) {
int error;
int s = splvm();
- if (!vm_pages_needed ||
- ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) {
+
+ if (vm_pages_needed && vm_page_count_min()) {
+ /*
+ * Still not done, sleep a bit and go again
+ */
+ vm_pages_needed = 0;
+ tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+ } else {
+ /*
+ * Good enough, sleep & handle stats
+ */
vm_pages_needed = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
@@ -1336,9 +1342,6 @@ vm_pageout()
vm_pageout_page_stats();
continue;
}
- } else if (vm_pages_needed) {
- vm_pages_needed = 0;
- tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
}
if (vm_pages_needed)