-rw-r--r--  sys/amd64/amd64/pmap.c               |   2
-rw-r--r--  sys/arm/arm/pmap-v6.c                |   2
-rw-r--r--  sys/arm/arm/pmap.c                   |   2
-rw-r--r--  sys/geom/geom.h                      |   1
-rw-r--r--  sys/geom/geom_io.c                   | 106
-rw-r--r--  sys/geom/geom_vfs.c                  |   8
-rw-r--r--  sys/i386/i386/pmap.c                 |   2
-rw-r--r--  sys/i386/xen/pmap.c                  |   2
-rw-r--r--  sys/ia64/ia64/pmap.c                 |   2
-rw-r--r--  sys/kern/subr_bus_dma.c              |  23
-rw-r--r--  sys/kern/subr_param.c                |   7
-rw-r--r--  sys/kern/vfs_bio.c                   | 963
-rw-r--r--  sys/kern/vfs_cluster.c               | 105
-rw-r--r--  sys/mips/mips/pmap.c                 |   2
-rw-r--r--  sys/powerpc/aim/mmu_oea64.c          |   8
-rw-r--r--  sys/powerpc/powerpc/pmap_dispatch.c  |   2
-rw-r--r--  sys/sparc64/sparc64/pmap.c           |   2
-rw-r--r--  sys/sys/bio.h                        |   9
-rw-r--r--  sys/sys/buf.h                        |  11
-rw-r--r--  sys/sys/systm.h                      |   1
-rw-r--r--  sys/vm/vm.h                          |   2
-rw-r--r--  sys/vm/vm_init.c                     |   7
-rw-r--r--  sys/vm/vm_kern.c                     |   1
23 files changed, 955 insertions(+), 315 deletions(-)
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 55d2cff775a2..1b1c86cc8c0b 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -4235,6 +4235,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
pagecopy((void *)src, (void *)dst);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 0083f29eb769..9d165099e668 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -3312,6 +3312,8 @@ pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
mtx_unlock(&cmtx);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c
index c18783b22185..0875f83b84ca 100644
--- a/sys/arm/arm/pmap.c
+++ b/sys/arm/arm/pmap.c
@@ -4428,6 +4428,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
#endif
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/geom/geom.h b/sys/geom/geom.h
index 351b05dd4834..660bf6e73804 100644
--- a/sys/geom/geom.h
+++ b/sys/geom/geom.h
@@ -205,6 +205,7 @@ struct g_provider {
u_int flags;
#define G_PF_WITHER 0x2
#define G_PF_ORPHAN 0x4
+#define G_PF_ACCEPT_UNMAPPED 0x8
/* Two fields for the implementing class to use */
void *private;
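G_PF_ACCEPT_UNMAPPED is a per-provider capability bit: only providers that advertise it are handed BIO_UNMAPPED bios as-is; everything else gets a transient mapping from the g_down thread (see the geom_io.c change below). A pass-through GEOM class can simply inherit the bit from the provider it consumes. The sketch below is illustrative only; the g_foo_ name is invented.

static void
g_foo_inherit_unmapped(struct g_provider *pp, struct g_consumer *cp)
{

	/*
	 * Forward unmapped bios only if the underlying provider can take
	 * them; otherwise leave the flag clear and let g_down install a
	 * transient mapping before the bio reaches this class.
	 */
	pp->flags |= cp->provider->flags & G_PF_ACCEPT_UNMAPPED;
}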
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index c6a5da8f8a60..6ffc06ededde 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
+ * Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
@@ -8,6 +9,9 @@
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -44,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/stack.h>
+#include <sys/sysctl.h>
#include <sys/errno.h>
#include <geom/geom.h>
@@ -51,6 +56,13 @@ __FBSDID("$FreeBSD$");
#include <sys/devicestat.h>
#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
@@ -180,12 +192,17 @@ g_clone_bio(struct bio *bp)
/*
* BIO_ORDERED flag may be used by disk drivers to enforce
* ordering restrictions, so this flag needs to be cloned.
+ * BIO_UNMAPPED should be inherited, to properly indicate
+ * which way the buffer is passed.
* Other bio flags are not suitable for cloning.
*/
- bp2->bio_flags = bp->bio_flags & BIO_ORDERED;
+ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED);
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
+ bp2->bio_ma = bp->bio_ma;
+ bp2->bio_ma_n = bp->bio_ma_n;
+ bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
/* Inherit classification info from the parent */
bp2->bio_classifier1 = bp->bio_classifier1;
@@ -210,11 +227,15 @@ g_duplicate_bio(struct bio *bp)
struct bio *bp2;
bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
+ bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED;
bp2->bio_parent = bp;
bp2->bio_cmd = bp->bio_cmd;
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
+ bp2->bio_ma = bp->bio_ma;
+ bp2->bio_ma_n = bp->bio_ma_n;
+ bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
bp->bio_children++;
#ifdef KTR
@@ -575,6 +596,83 @@ g_io_deliver(struct bio *bp, int error)
return;
}
+SYSCTL_DECL(_kern_geom);
+
+static long transient_maps;
+SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
+ &transient_maps, 0,
+ "Total count of the transient mapping requests");
+u_int transient_map_retries = 10;
+SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
+ &transient_map_retries, 0,
+ "Max count of retries used before giving up on creating transient map");
+int transient_map_hard_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
+ &transient_map_hard_failures, 0,
+ "Failures to establish the transient mapping due to retry attempts "
+ "exhausted");
+int transient_map_soft_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
+ &transient_map_soft_failures, 0,
+ "Count of retried failures to establish the transient mapping");
+int inflight_transient_maps;
+SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
+ &inflight_transient_maps, 0,
+ "Current count of the active transient maps");
+
+static int
+g_io_transient_map_bio(struct bio *bp)
+{
+ vm_offset_t addr;
+ long size;
+ u_int retried;
+ int rv;
+
+ size = round_page(bp->bio_ma_offset + bp->bio_length);
+ KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
+ addr = 0;
+ retried = 0;
+ atomic_add_long(&transient_maps, 1);
+retry:
+ vm_map_lock(bio_transient_map);
+ if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map),
+ size, &addr)) {
+ vm_map_unlock(bio_transient_map);
+ if (transient_map_retries != 0 &&
+ retried >= transient_map_retries) {
+ g_io_deliver(bp, EDEADLK/* XXXKIB */);
+ CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
+ bp, bp->bio_to->name);
+ atomic_add_int(&transient_map_hard_failures, 1);
+ return (1);
+ } else {
+ /*
+ * Naive attempt to quiesce the I/O to get more
+ * in-flight requests completed and defragment
+ * the bio_transient_map.
+ */
+ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
+ bp, bp->bio_to->name, retried);
+ pause("g_d_tra", hz / 10);
+ retried++;
+ atomic_add_int(&transient_map_soft_failures, 1);
+ goto retry;
+ }
+ }
+ rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+ KASSERT(rv == KERN_SUCCESS,
+ ("vm_map_insert(bio_transient_map) rv %d %jx %lx",
+ rv, (uintmax_t)addr, size));
+ vm_map_unlock(bio_transient_map);
+ atomic_add_int(&inflight_transient_maps, 1);
+ pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
+ bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
+ bp->bio_flags |= BIO_TRANSIENT_MAPPING;
+ bp->bio_flags &= ~BIO_UNMAPPED;
+ return (0);
+}
+
void
g_io_schedule_down(struct thread *tp __unused)
{
@@ -636,6 +734,12 @@ g_io_schedule_down(struct thread *tp __unused)
default:
break;
}
+ if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
+ (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
+ (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
+ if (g_io_transient_map_bio(bp))
+ continue;
+ }
THREAD_NO_SLEEPING();
CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
"len %ld", bp, bp->bio_to->name, bp->bio_offset,
diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c
index bbed550df062..92f1ad2f509b 100644
--- a/sys/geom/geom_vfs.c
+++ b/sys/geom/geom_vfs.c
@@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp)
bip = g_alloc_bio();
bip->bio_cmd = bp->b_iocmd;
bip->bio_offset = bp->b_iooffset;
- bip->bio_data = bp->b_data;
- bip->bio_done = g_vfs_done;
- bip->bio_caller2 = bp;
bip->bio_length = bp->b_bcount;
- if (bp->b_flags & B_BARRIER) {
+ bdata2bio(bp, bip);
+ if ((bp->b_flags & B_BARRIER) != 0) {
bip->bio_flags |= BIO_ORDERED;
bp->b_flags &= ~B_BARRIER;
}
+ bip->bio_done = g_vfs_done;
+ bip->bio_caller2 = bp;
g_io_request(bip, cp);
}
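Both changes above feed the same contract: g_vfs_strategy() now hands GEOM either a mapped bio_data or a page list via bdata2bio(), and g_io_schedule_down() only falls back to a transient mapping when the destination provider did not set G_PF_ACCEPT_UNMAPPED. The following is a hedged sketch of a leaf provider that does set the flag and services bios from bio_ma directly; g_foo, foo_hw_io_virt() and foo_hw_io_phys() are invented placeholders, not existing APIs.

static void
g_foo_create(struct g_geom *gp, off_t mediasize)
{
	struct g_provider *pp;

	pp = g_new_providerf(gp, "foo0");
	pp->mediasize = mediasize;
	pp->sectorsize = DEV_BSIZE;
	pp->flags |= G_PF_ACCEPT_UNMAPPED;	/* we handle bio_ma ourselves */
	g_error_provider(pp, 0);
}

static void
g_foo_start(struct bio *bp)
{
	vm_paddr_t pa;
	off_t done, resid;
	int chunk, i, page_off;

	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE) {
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	if ((bp->bio_flags & BIO_UNMAPPED) == 0) {
		/* Mapped bio: bio_data is a usable kernel address. */
		foo_hw_io_virt(bp->bio_data, bp->bio_offset, bp->bio_length,
		    bp->bio_cmd);
	} else {
		/* Unmapped bio: walk bio_ma[] one physical chunk at a time. */
		resid = bp->bio_length;
		page_off = bp->bio_ma_offset;
		done = 0;
		for (i = 0; resid > 0; i++) {
			chunk = MIN(PAGE_SIZE - page_off, resid);
			pa = VM_PAGE_TO_PHYS(bp->bio_ma[i]) + page_off;
			foo_hw_io_phys(pa, bp->bio_offset + done, chunk,
			    bp->bio_cmd);
			done += chunk;
			resid -= chunk;
			page_off = 0;
		}
	}
	bp->bio_completed = bp->bio_length;
	g_io_deliver(bp, 0);
}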
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 6499986fca02..f4681bfd2256 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -4205,6 +4205,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
mtx_unlock(&sysmaps->lock);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c
index 0f7a80f66243..a9bc1247a399 100644
--- a/sys/i386/xen/pmap.c
+++ b/sys/i386/xen/pmap.c
@@ -3448,6 +3448,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
mtx_unlock(&sysmaps->lock);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/ia64/ia64/pmap.c b/sys/ia64/ia64/pmap.c
index 32566009a72a..883f39ce5e6d 100644
--- a/sys/ia64/ia64/pmap.c
+++ b/sys/ia64/ia64/pmap.c
@@ -2014,6 +2014,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
bcopy(src, dst, PAGE_SIZE);
}
+int unmapped_buf_allowed;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c
index 773d01ae2d07..4d344b48a392 100644
--- a/sys/kern/subr_bus_dma.c
+++ b/sys/kern/subr_bus_dma.c
@@ -126,11 +126,28 @@ static int
_bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
int *nsegs, int flags)
{
- int error;
+ vm_paddr_t paddr;
+ bus_size_t len, tlen;
+ int error, i, ma_offs;
- error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
- bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+ if ((bio->bio_flags & BIO_UNMAPPED) == 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
+ bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+ return (error);
+ }
+ error = 0;
+ tlen = bio->bio_bcount;
+ ma_offs = bio->bio_ma_offset;
+ for (i = 0; tlen > 0; i++, tlen -= len) {
+ len = min(PAGE_SIZE - ma_offs, tlen);
+ paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs;
+ error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+ flags, NULL, nsegs);
+ if (error != 0)
+ break;
+ ma_offs = 0;
+ }
return (error);
}
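A driver that routes its I/O through bus_dmamap_load_bio() gets both cases for free: mapped bios still go through _bus_dmamap_load_buffer(), unmapped bios through per-page _bus_dmamap_load_phys() calls as in the hunk above. A minimal, hedged sketch of that calling pattern follows; foo_softc, foo_hw_submit() and the single pre-created map are illustrative assumptions, not part of this change.

static void
foo_dma_done(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct bio *bp = arg;

	if (error != 0) {
		g_io_deliver(bp, error);
		return;
	}
	/* Program the controller's scatter/gather list from segs[0..nseg). */
	foo_hw_submit(bp, segs, nseg);
}

static void
foo_strategy(struct foo_softc *sc, struct bio *bp)
{
	int error;

	/*
	 * bus_dmamap_load_bio() dispatches on BIO_UNMAPPED internally, so
	 * this call works whether bio_data is mapped or the payload is
	 * described by bio_ma[].  One outstanding request at a time is
	 * assumed here for simplicity.
	 */
	error = bus_dmamap_load_bio(sc->sc_dmat, sc->sc_map, bp,
	    foo_dma_done, bp, BUS_DMA_NOWAIT);
	if (error != 0 && error != EINPROGRESS)
		g_io_deliver(bp, error);
}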
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
index 825a3a032f7a..a2e822c793f6 100644
--- a/sys/kern/subr_param.c
+++ b/sys/kern/subr_param.c
@@ -92,6 +92,7 @@ int maxfiles; /* sys. wide open files limit */
int maxfilesperproc; /* per-proc open files limit */
int msgbufsize; /* size of kernel message buffer */
int nbuf;
+int bio_transient_maxcnt;
int ngroups_max; /* max # groups per process */
int nswbuf;
pid_t pid_max = PID_MAX;
@@ -118,6 +119,9 @@ SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0,
"Maximum memory for swap metadata");
SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0,
"Maximum value of vfs.maxbufspace");
+SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN,
+ &bio_transient_maxcnt, 0,
+ "Maximum number of transient BIOs mappings");
SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0,
"Maximum text size");
SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0,
@@ -266,6 +270,8 @@ init_param1(void)
pid_max = PID_MAX;
else if (pid_max < 300)
pid_max = 300;
+
+ TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed);
}
/*
@@ -322,6 +328,7 @@ init_param2(long physpages)
*/
nbuf = NBUF;
TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
+ TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt);
/*
* The default for maxpipekva is min(1/64 of the kernel address space,
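Together with the counters registered in geom_io.c above and in vfs_bio.c below, these tunables make the feature observable from userland. The following standalone program should run as-is on a kernel with this change; the OID names are exactly the ones this diff registers.

/* cc -o unmapped_stats unmapped_stats.c */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static long
get_num(const char *oid)
{
	union { int i; long l; } v;
	size_t len;

	len = sizeof(v);
	if (sysctlbyname(oid, &v, &len, NULL, 0) == -1)
		return (-1);
	/* Integer OIDs report 4 bytes, long OIDs report sizeof(long). */
	return (len == sizeof(int) ? (long)v.i : v.l);
}

int
main(void)
{

	printf("vfs.unmapped_buf_allowed:           %ld\n",
	    get_num("vfs.unmapped_buf_allowed"));
	printf("vfs.unmapped_bufspace:              %ld\n",
	    get_num("vfs.unmapped_bufspace"));
	printf("kern.bio_transient_maxcnt:          %ld\n",
	    get_num("kern.bio_transient_maxcnt"));
	printf("kern.geom.transient_maps:           %ld\n",
	    get_num("kern.geom.transient_maps"));
	printf("kern.geom.inflight_transient_maps:  %ld\n",
	    get_num("kern.geom.inflight_transient_maps"));
	return (0);
}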
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index d20c8297d601..cded596bab66 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,8 +1,12 @@
/*-
* Copyright (c) 2004 Poul-Henning Kamp
* Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -92,6 +96,7 @@ struct buf_ops buf_ops_bio = {
* carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c.
*/
struct buf *buf; /* buffer header pool */
+caddr_t unmapped_buf;
static struct proc *bufdaemonproc;
@@ -132,6 +137,10 @@ SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
"Virtual memory used for buffers");
#endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+ &unmapped_bufspace, 0,
+ "Amount of unmapped buffers, inclusive in the bufspace");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
"Maximum allowed value of bufspace (including buf_daemon)");
@@ -201,6 +210,10 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
"Number of times getnewbuf has had to restart a buffer aquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+ "Number of times getblk has had to restart a buffer mapping for "
+ "unmapped buffer");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -210,6 +223,9 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
"Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+ &unmapped_buf_allowed, 0,
+ "Permit the use of the unmapped i/o");
/*
* Wakeup point for bufdaemon, as well as indicator of whether it is already
@@ -281,6 +297,9 @@ static struct mtx nblock;
/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
/* Lock for the bufqueues */
static struct mtx bqlock;
@@ -511,7 +530,7 @@ caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
int tuned_nbuf;
- long maxbuf;
+ long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
/*
* physmem_est is in pages. Convert it to kilobytes (assumes
@@ -555,6 +574,52 @@ kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
}
/*
+ * Ideal allocation size for the transient bio submap is 10%
+ * of the maximal space buffer map. This roughly corresponds
+ * to the amount of the buffer mapped for typical UFS load.
+ *
+ * Clip the buffer map to reserve space for the transient
+ * BIOs, if its extent is bigger than 90% of the maximum
+ * buffer map extent on the platform.
+ *
+ * Falling back to maxbuf when maxbcache is unset avoids
+ * trimming the buffer KVA on architectures with ample KVA
+ * space.
+ */
+ if (bio_transient_maxcnt == 0) {
+ maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+ buf_sz = (long)nbuf * BKVASIZE;
+ if (buf_sz < maxbuf_sz / 10 * 9) {
+ /*
+ * There is more KVA than memory. Do not
+ * adjust buffer map size, and assign the rest
+ * of maxbuf to transient map.
+ */
+ biotmap_sz = maxbuf_sz - buf_sz;
+ } else {
+ /*
+ * Buffer map spans all KVA we could afford on
+ * this platform. Give 10% of the buffer map
+ * to the transient bio map.
+ */
+ biotmap_sz = buf_sz / 10;
+ buf_sz -= biotmap_sz;
+ }
+ if (biotmap_sz / INT_MAX > MAXPHYS)
+ bio_transient_maxcnt = INT_MAX;
+ else
+ bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ /*
+ * Artificially limit to 1024 simultaneous in-flight I/Os
+ * using the transient mapping.
+ */
+ if (bio_transient_maxcnt > 1024)
+ bio_transient_maxcnt = 1024;
+ if (tuned_nbuf)
+ nbuf = buf_sz / BKVASIZE;
+ }
+
+ /*
* swbufs are used as temporary holders for I/O, such as paging I/O.
* We have no less then 16 and no more then 256.
*/
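The sizing logic above splits the buffer-map KVA between ordinary buffers and the transient bio submap. A standalone arithmetic sketch of that split follows; the i386-like input values are invented for illustration, and on a real system they come from the platform and from the earlier tuning in kern_vfs_bio_buffer_alloc().

#include <stdio.h>

int
main(void)
{
	/* Illustrative values only; maxbcache nonzero means a capped map. */
	long maxbcache = 200L * 1024 * 1024;	/* buffer-map KVA cap */
	long nbuf = 12000;			/* tuned earlier from RAM size */
	long BKVASIZE = 16384;
	long MAXPHYS = 128 * 1024;
	long maxbuf_sz, buf_sz, biotmap_sz, bio_transient_maxcnt;

	maxbuf_sz = maxbcache;		/* else fall back to maxbuf * BKVASIZE */
	buf_sz = nbuf * BKVASIZE;
	if (buf_sz < maxbuf_sz / 10 * 9) {
		/* Plenty of KVA: leave nbuf alone, give the rest to bios. */
		biotmap_sz = maxbuf_sz - buf_sz;
	} else {
		/* KVA-starved: carve 10% out of the buffer map itself. */
		biotmap_sz = buf_sz / 10;
		buf_sz -= biotmap_sz;
	}
	bio_transient_maxcnt = biotmap_sz / MAXPHYS;
	if (bio_transient_maxcnt > 1024)
		bio_transient_maxcnt = 1024;

	printf("buf_sz %ld (nbuf %ld), biotmap_sz %ld, "
	    "bio_transient_maxcnt %ld\n",
	    buf_sz, buf_sz / BKVASIZE, biotmap_sz, bio_transient_maxcnt);
	return (0);
}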
@@ -607,6 +672,9 @@ bufinit(void)
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_EMPTY]++;
+#endif
}
/*
@@ -675,6 +743,55 @@ bufinit(void)
bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+ unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS);
+}
+
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+ ("mapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase != unmapped_buf,
+ ("mapped buf: b_kvabase was not updated %p", bp));
+ KASSERT(bp->b_data != unmapped_buf,
+ ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+ ("unmapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase == unmapped_buf,
+ ("unmapped buf: corrupted b_kvabase %p", bp));
+ KASSERT(bp->b_data == unmapped_buf,
+ ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define BUF_CHECK_MAPPED(bp) do {} while (0)
+#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+ BUF_CHECK_MAPPED(bp);
+
+ /*
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
}
/*
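bpmap_qenter() keeps the long-standing convention that the low bits of b_data carry the offset of b_offset into the first page, while trunc_page(b_data) is the KVA of the first mapped page. A small standalone demonstration of that encoding, with arbitrary example values:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(PAGE_SIZE - 1)
#define trunc_page(x)	((x) & ~PAGE_MASK)

int
main(void)
{
	uintptr_t kvabase = 0xc0a00000UL;	/* start of the buffer's KVA */
	uint64_t b_offset = 0x12345a00ULL;	/* file offset, not page aligned */
	uintptr_t b_data;

	/* What bpmap_qenter() computes after pmap_qenter(): */
	b_data = trunc_page(kvabase) | (uintptr_t)(b_offset & PAGE_MASK);

	printf("kvabase %#lx\n", (unsigned long)kvabase);
	printf("b_data  %#lx (page part %#lx, offset-in-page %#lx)\n",
	    (unsigned long)b_data, (unsigned long)trunc_page(b_data),
	    (unsigned long)(b_data & PAGE_MASK));
	return (0);
}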
@@ -686,14 +803,26 @@ static void
bfreekva(struct buf *bp)
{
- if (bp->b_kvasize) {
- atomic_add_int(&buffreekvacnt, 1);
- atomic_subtract_long(&bufspace, bp->b_kvasize);
- vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
- (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
- bp->b_kvasize = 0;
- bufspacewakeup();
+ if (bp->b_kvasize == 0)
+ return;
+
+ atomic_add_int(&buffreekvacnt, 1);
+ atomic_subtract_long(&bufspace, bp->b_kvasize);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase,
+ (vm_offset_t)bp->b_kvabase + bp->b_kvasize);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ if ((bp->b_flags & B_KVAALLOC) != 0) {
+ vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc,
+ (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize);
+ }
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
}
+ bp->b_kvasize = 0;
+ bufspacewakeup();
}
/*
@@ -760,6 +889,11 @@ bremfreel(struct buf *bp)
mtx_assert(&bqlock, MA_OWNED);
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+ bp->b_qindex));
+ bq_len[bp->b_qindex]--;
+#endif
bp->b_qindex = QUEUE_NONE;
/*
* If this was a delayed bremfree() we only need to remove the buffer
@@ -1414,7 +1548,8 @@ brelse(struct buf *bp)
}
}
- if ((bp->b_flags & B_INVAL) == 0) {
+ if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(
trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
@@ -1509,11 +1644,17 @@ brelse(struct buf *bp)
bp->b_qindex = QUEUE_DIRTY;
else
bp->b_qindex = QUEUE_CLEAN;
- if (bp->b_flags & B_AGE)
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
- else
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+ if (bp->b_flags & B_AGE) {
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
+ b_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
+ b_freelist);
+ }
}
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
mtx_unlock(&bqlock);
/*
@@ -1604,6 +1745,9 @@ bqrelse(struct buf *bp)
if (bp->b_flags & B_DELWRI) {
bp->b_qindex = QUEUE_DIRTY;
TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
} else {
/*
* The locking of the BO_LOCK for checking of the
@@ -1616,6 +1760,9 @@ bqrelse(struct buf *bp)
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_CLEAN]++;
+#endif
} else {
/*
* We are too low on memory, we have to try to free
@@ -1657,7 +1804,11 @@ vfs_vmio_release(struct buf *bp)
int i;
vm_page_t m;
- pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
@@ -1761,8 +1912,10 @@ vfs_bio_awrite(struct buf *bp)
int nwritten;
int size;
int maxcl;
+ int gbflags;
bo = &vp->v_bufobj;
+ gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
/*
* right now we support clustered writing only to regular files. If
* we find a clusterable block we could be in the middle of a cluster
@@ -1794,7 +1947,7 @@ vfs_bio_awrite(struct buf *bp)
if (ncl != 1) {
BUF_UNLOCK(bp);
nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
- 0);
+ gbflags);
return (nwritten);
}
}
@@ -1811,46 +1964,207 @@ vfs_bio_awrite(struct buf *bp)
return (nwritten);
}
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+ KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_kvabase = (caddr_t)addr;
+ } else if ((gbflags & GB_KVAALLOC) != 0) {
+ KASSERT((gbflags & GB_UNMAPPED) != 0,
+ ("GB_KVAALLOC without GB_UNMAPPED"));
+ bp->b_kvaalloc = (caddr_t)addr;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ }
+ bp->b_kvasize = maxsize;
+}
+
/*
- * getnewbuf:
- *
- * Find and initialize a new buffer header, freeing up existing buffers
- * in the bufqueues as necessary. The new buffer is returned locked.
- *
- * Important: B_INVAL is not set. If the caller wishes to throw the
- * buffer away, the caller must set B_INVAL prior to calling brelse().
- *
- * We block if:
- * We have insufficient buffer headers
- * We have insufficient buffer space
- * buffer_map is too fragmented ( space reservation fails )
- * If we have to flush dirty buffers ( but we try to avoid this )
- *
- * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the buf daemon to do it for us. We attempt to
- * avoid piecemeal wakeups of the pageout daemon.
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
*/
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+ vm_offset_t addr;
+ int rv;
-static struct buf *
-getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
- int gbflags)
+ bfreekva(bp);
+ addr = 0;
+
+ vm_map_lock(buffer_map);
+ if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize,
+ &addr)) {
+ vm_map_unlock(buffer_map);
+ /*
+ * Buffer map is too fragmented. Request the caller
+ * to defragment the map.
+ */
+ atomic_add_int(&bufdefragcnt, 1);
+ return (1);
+ }
+ rv = vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+ KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv));
+ vm_map_unlock(buffer_map);
+ setbufkva(bp, addr, maxsize, gbflags);
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ return (0);
+}
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+ int defrag)
{
struct thread *td;
- struct buf *bp;
- struct buf *nbp;
- int defrag = 0;
- int nqindex;
- static int flushingbufs;
+ char *waitmsg;
+ int fl, flags, norunbuf;
+
+ mtx_assert(&bqlock, MA_OWNED);
+
+ if (defrag) {
+ flags = VFS_BIO_NEED_BUFSPACE;
+ waitmsg = "nbufkv";
+ } else if (bufspace >= hibufspace) {
+ waitmsg = "nbufbs";
+ flags = VFS_BIO_NEED_BUFSPACE;
+ } else {
+ waitmsg = "newbuf";
+ flags = VFS_BIO_NEED_ANY;
+ }
+ mtx_lock(&nblock);
+ needsbuffer |= flags;
+ mtx_unlock(&nblock);
+ mtx_unlock(&bqlock);
+
+ bd_speedup(); /* heeeelp */
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
td = curthread;
+ mtx_lock(&nblock);
+ while (needsbuffer & flags) {
+ if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+ mtx_unlock(&nblock);
+ /*
+ * getblk() is called with a vnode locked, and
+ * some majority of the dirty buffers may as
+ * well belong to the vnode. Flushing the
+ * buffers there would make progress that
+ * cannot be achieved by the buf_daemon, which
+ * cannot lock the vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+ /* play bufdaemon */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_do_flush(vp);
+ td->td_pflags &= norunbuf;
+ mtx_lock(&nblock);
+ if (fl != 0)
+ continue;
+ if ((needsbuffer & flags) == 0)
+ break;
+ }
+ if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
+ waitmsg, slptimeo))
+ break;
+ }
+ mtx_unlock(&nblock);
+}
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+ CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+ "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+ bp->b_kvasize, bp->b_bufsize, qindex);
+ mtx_assert(&bqlock, MA_NOTOWNED);
+
/*
- * We can't afford to block since we might be holding a vnode lock,
- * which may prevent system daemons from running. We deal with
- * low-memory situations by proactively returning memory and running
- * async I/O rather then sync I/O.
+ * Note: we no longer distinguish between VMIO and non-VMIO
+ * buffers.
*/
- atomic_add_int(&getnewbufcalls, 1);
- atomic_subtract_int(&getnewbufrestarts, 1);
+ KASSERT((bp->b_flags & B_DELWRI) == 0,
+ ("delwri buffer %p found in queue %d", bp, qindex));
+
+ if (qindex == QUEUE_CLEAN) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * Get the rest of the buffer freed up. b_kva* is still valid
+ * after this operation.
+ */
+
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 3");
+ KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
+ bp, bp->b_vp, qindex));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+
+ bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
+ ("buf %p still counted as free?", bp));
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+
+ LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+ struct buf *bp, *nbp;
+ int nqindex, qindex, pass;
+
+ KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+ pass = 1;
restart:
atomic_add_int(&getnewbufrestarts, 1);
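The norunbuf mask built in getnewbuf_bufd_help() above is a compact save/restore: after the thread stops impersonating bufdaemon, TDP_BUFNEED is always cleared while TDP_NORUNNINGBUF comes back exactly as it was before. A standalone demonstration of the idiom, using made-up flag values:

#include <stdio.h>

#define TDP_BUFNEED		0x0008	/* illustrative values only */
#define TDP_NORUNNINGBUF	0x0010

static void
demo(int pflags_before)
{
	int pflags, norunbuf;

	pflags = pflags_before;

	/* Save: keep every other bit, plus the old NORUNNINGBUF state. */
	norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
	    (pflags & TDP_NORUNNINGBUF);

	/* "Play bufdaemon": force both flags on while flushing. */
	pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;

	/* Restore: BUFNEED always cleared, NORUNNINGBUF back to old value. */
	pflags &= norunbuf;

	printf("before %#x after %#x\n", pflags_before, pflags);
}

int
main(void)
{

	demo(0);			/* neither flag was set */
	demo(TDP_NORUNNINGBUF);		/* NORUNNINGBUF was already set */
	return (0);
}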
@@ -1860,66 +2174,90 @@ restart:
* that if we are specially marked process, we are allowed to
* dip into our reserves.
*
- * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * for the allocation of the mapped buffer. For unmapped, the
+ * easiest is to start with EMPTY outright.
*
* We start with EMPTYKVA. If the list is empty we backup to EMPTY.
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
+ nbp = NULL;
mtx_lock(&bqlock);
- nqindex = QUEUE_EMPTYKVA;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
+ if (!defrag && unmapped) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
if (nbp == NULL) {
- /*
- * If no EMPTYKVA buffers and we are either
- * defragging or reusing, locate a CLEAN buffer
- * to free or reuse. If bufspace useage is low
- * skip this step so we can allocate a new buffer.
- */
- if (defrag || bufspace >= lobufspace) {
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
- }
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ }
- /*
- * If we could not find or were not allowed to reuse a
- * CLEAN buffer, check to see if it is ok to use an EMPTY
- * buffer. We can only use an EMPTY buffer if allocating
- * its KVA would not otherwise run us out of buffer space.
- */
- if (nbp == NULL && defrag == 0 &&
- bufspace + maxsize < hibufspace) {
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
- }
+ /*
+ * If no EMPTYKVA buffers and we are either defragging or
+ * reusing, locate a CLEAN buffer to free or reuse. If
+ * bufspace usage is low, skip this step so we can allocate a
+ * new buffer.
+ */
+ if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * If we could not find or were not allowed to reuse a CLEAN
+ * buffer, check to see if it is ok to use an EMPTY buffer.
+ * We can only use an EMPTY buffer if allocating its KVA would
+ * not otherwise run us out of buffer space. No KVA is needed
+ * for the unmapped allocation.
+ */
+ if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+ metadata)) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+
+ /*
+ * All available buffers might be clean, retry ignoring the
+ * lobufspace as the last resort.
+ */
+ if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
* Run scan, possibly freeing data and/or kva mappings on the fly
* depending.
*/
-
while ((bp = nbp) != NULL) {
- int qindex = nqindex;
+ qindex = nqindex;
/*
- * Calculate next bp ( we can only use it if we do not block
- * or do other fancy things ).
+ * Calculate next bp (we can only use it if we do not
+ * block or do other fancy things).
*/
if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
- switch(qindex) {
+ switch (qindex) {
case QUEUE_EMPTY:
nqindex = QUEUE_EMPTYKVA;
- if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ if (nbp != NULL)
break;
/* FALLTHROUGH */
case QUEUE_EMPTYKVA:
nqindex = QUEUE_CLEAN;
- if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ if (nbp != NULL)
break;
/* FALLTHROUGH */
case QUEUE_CLEAN:
+ if (metadata && pass == 1) {
+ pass = 2;
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(
+ &bufqueues[QUEUE_EMPTY]);
+ }
/*
* nbp is NULL.
*/
@@ -1952,22 +2290,9 @@ restart:
}
BO_UNLOCK(bp->b_bufobj);
}
- CTR6(KTR_BUF,
- "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
- "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
- bp->b_kvasize, bp->b_bufsize, qindex);
-
- /*
- * Sanity Checks
- */
- KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
-
- /*
- * Note: we no longer distinguish between VMIO and non-VMIO
- * buffers.
- */
- KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
if (bp->b_bufobj != NULL)
BO_LOCK(bp->b_bufobj);
@@ -1975,68 +2300,13 @@ restart:
if (bp->b_bufobj != NULL)
BO_UNLOCK(bp->b_bufobj);
mtx_unlock(&bqlock);
-
- if (qindex == QUEUE_CLEAN) {
- if (bp->b_flags & B_VMIO) {
- bp->b_flags &= ~B_ASYNC;
- vfs_vmio_release(bp);
- }
- if (bp->b_vp)
- brelvp(bp);
- }
-
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
- *
- * Get the rest of the buffer freed up. b_kva* is still
- * valid after this operation.
*/
- if (bp->b_rcred != NOCRED) {
- crfree(bp->b_rcred);
- bp->b_rcred = NOCRED;
- }
- if (bp->b_wcred != NOCRED) {
- crfree(bp->b_wcred);
- bp->b_wcred = NOCRED;
- }
- if (!LIST_EMPTY(&bp->b_dep))
- buf_deallocate(bp);
- if (bp->b_vflags & BV_BKGRDINPROG)
- panic("losing buffer 3");
- KASSERT(bp->b_vp == NULL,
- ("bp: %p still has vnode %p. qindex: %d",
- bp, bp->b_vp, qindex));
- KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
- ("bp: %p still on a buffer list. xflags %X",
- bp, bp->b_xflags));
-
- if (bp->b_bufsize)
- allocbuf(bp, 0);
-
- bp->b_flags = 0;
- bp->b_ioflags = 0;
- bp->b_xflags = 0;
- KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
- ("buf %p still counted as free?", bp));
- bp->b_vflags = 0;
- bp->b_vp = NULL;
- bp->b_blkno = bp->b_lblkno = 0;
- bp->b_offset = NOOFFSET;
- bp->b_iodone = 0;
- bp->b_error = 0;
- bp->b_resid = 0;
- bp->b_bcount = 0;
- bp->b_npages = 0;
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_bufobj = NULL;
- bp->b_pin_count = 0;
- bp->b_fsprivate1 = NULL;
- bp->b_fsprivate2 = NULL;
- bp->b_fsprivate3 = NULL;
-
- LIST_INIT(&bp->b_dep);
+ getnewbuf_reuse_bp(bp, qindex);
+ mtx_assert(&bqlock, MA_NOTOWNED);
/*
* If we are defragging then free the buffer.
@@ -2060,6 +2330,9 @@ restart:
goto restart;
}
+ if (metadata)
+ break;
+
/*
* If we are overcomitted then recover the buffer and its
* KVM space. This occurs in rare situations when multiple
@@ -2077,6 +2350,59 @@ restart:
flushingbufs = 0;
break;
}
+ return (bp);
+}
+
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_map is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ *
+ * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
+ * Instead we ask the buf daemon to do it for us. We attempt to
+ * avoid piecemeal wakeups of the pageout daemon.
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+ int gbflags)
+{
+ struct buf *bp;
+ int defrag, metadata;
+
+ KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ if (!unmapped_buf_allowed)
+ gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ defrag = 0;
+ if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+ vp->v_type == VCHR)
+ metadata = 1;
+ else
+ metadata = 0;
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+ * async I/O rather than sync I/O.
+ */
+ atomic_add_int(&getnewbufcalls, 1);
+ atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+ bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+ GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+ if (bp != NULL)
+ defrag = 0;
/*
* If we exhausted our list, sleep as appropriate. We may have to
@@ -2084,65 +2410,23 @@ restart:
*
* Generally we are sleeping due to insufficient buffer space.
*/
-
if (bp == NULL) {
- int flags, norunbuf;
- char *waitmsg;
- int fl;
-
- if (defrag) {
- flags = VFS_BIO_NEED_BUFSPACE;
- waitmsg = "nbufkv";
- } else if (bufspace >= hibufspace) {
- waitmsg = "nbufbs";
- flags = VFS_BIO_NEED_BUFSPACE;
- } else {
- waitmsg = "newbuf";
- flags = VFS_BIO_NEED_ANY;
- }
- mtx_lock(&nblock);
- needsbuffer |= flags;
- mtx_unlock(&nblock);
- mtx_unlock(&bqlock);
-
- bd_speedup(); /* heeeelp */
- if (gbflags & GB_NOWAIT_BD)
- return (NULL);
-
- mtx_lock(&nblock);
- while (needsbuffer & flags) {
- if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
- mtx_unlock(&nblock);
- /*
- * getblk() is called with a vnode
- * locked, and some majority of the
- * dirty buffers may as well belong to
- * the vnode. Flushing the buffers
- * there would make a progress that
- * cannot be achieved by the
- * buf_daemon, that cannot lock the
- * vnode.
- */
- norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
- (td->td_pflags & TDP_NORUNNINGBUF);
- /* play bufdaemon */
- td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
- fl = buf_do_flush(vp);
- td->td_pflags &= norunbuf;
- mtx_lock(&nblock);
- if (fl != 0)
- continue;
- if ((needsbuffer & flags) == 0)
- break;
- }
- if (msleep(&needsbuffer, &nblock,
- (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
- mtx_unlock(&nblock);
- return (NULL);
- }
- }
- mtx_unlock(&nblock);
+ mtx_assert(&bqlock, MA_OWNED);
+ getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+ mtx_assert(&bqlock, MA_NOTOWNED);
+ } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+ mtx_assert(&bqlock, MA_NOTOWNED);
+
+ bfreekva(bp);
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_kvabase = bp->b_data = unmapped_buf;
+ bp->b_kvasize = maxsize;
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
} else {
+ mtx_assert(&bqlock, MA_NOTOWNED);
+
/*
* We finally have a valid bp. We aren't quite out of the
* woods, we still have to reserve kva space. In order
@@ -2151,39 +2435,47 @@ restart:
*/
maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
- if (maxsize != bp->b_kvasize) {
- vm_offset_t addr = 0;
- int rv;
-
- bfreekva(bp);
-
- vm_map_lock(buffer_map);
- if (vm_map_findspace(buffer_map,
- vm_map_min(buffer_map), maxsize, &addr)) {
- /*
- * Buffer map is too fragmented.
- * We must defragment the map.
- */
- atomic_add_int(&bufdefragcnt, 1);
- vm_map_unlock(buffer_map);
+ if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+ B_KVAALLOC)) == B_UNMAPPED) {
+ if (allocbufkva(bp, maxsize, gbflags)) {
defrag = 1;
bp->b_flags |= B_INVAL;
brelse(bp);
goto restart;
}
- rv = vm_map_insert(buffer_map, NULL, 0, addr,
- addr + maxsize, VM_PROT_ALL, VM_PROT_ALL,
- MAP_NOFAULT);
- KASSERT(rv == KERN_SUCCESS,
- ("vm_map_insert(buffer_map) rv %d", rv));
- vm_map_unlock(buffer_map);
- bp->b_kvabase = (caddr_t)addr;
- bp->b_kvasize = maxsize;
- atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+ /*
+ * If the reused buffer has KVA allocated,
+ * reassign b_kvaalloc to b_kvabase.
+ */
+ bp->b_kvabase = bp->b_kvaalloc;
+ bp->b_flags &= ~B_KVAALLOC;
+ atomic_subtract_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+ GB_KVAALLOC)) {
+ /*
+ * The case of a reused buffer that already has KVA
+ * mapped, but the request is for an unmapped buffer
+ * with KVA allocated.
+ */
+ bp->b_kvaalloc = bp->b_kvabase;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace,
+ bp->b_kvasize);
atomic_add_int(&bufreusecnt, 1);
}
- bp->b_saveaddr = bp->b_kvabase;
- bp->b_data = bp->b_saveaddr;
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ }
}
return (bp);
}
@@ -2594,6 +2886,90 @@ vfs_setdirty_locked_object(struct buf *bp)
}
/*
+ * Allocate the KVA mapping for an existing buffer. It handles the
+ * cases of both B_UNMAPPED buffer, and buffer with the preallocated
+ * KVA which is not mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+ struct buf *scratch_bp;
+ int bsize, maxsize, need_mapping, need_kva;
+ off_t offset;
+
+ need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+ (gbflags & GB_UNMAPPED) == 0;
+ need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+ (gbflags & GB_KVAALLOC) != 0;
+ if (!need_mapping && !need_kva)
+ return;
+
+ BUF_CHECK_UNMAPPED(bp);
+
+ if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+ /*
+ * Buffer is not mapped, but the KVA was already
+ * reserved at the time of the instantiation. Use the
+ * allocated space.
+ */
+ bp->b_flags &= ~B_KVAALLOC;
+ KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+ bp->b_kvabase = bp->b_kvaalloc;
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ goto has_addr;
+ }
+
+ /*
+ * Calculate the amount of the address space we would reserve
+ * if the buffer was mapped.
+ */
+ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+ offset = blkno * bsize;
+ maxsize = size + (offset & PAGE_MASK);
+ maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ /*
+ * Request defragmentation. getnewbuf() returns the
+ * allocated space to us via the scratch buffer KVA.
+ */
+ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+ (GB_UNMAPPED | GB_KVAALLOC));
+ if (scratch_bp == NULL) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+ }
+ atomic_add_int(&mappingrestarts, 1);
+ goto mapping_loop;
+ }
+ KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+ ("scratch bp !B_KVAALLOC %p", scratch_bp));
+ setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+ scratch_bp->b_kvasize, gbflags);
+
+ /* Get rid of the scratch buffer. */
+ scratch_bp->b_kvasize = 0;
+ scratch_bp->b_flags |= B_INVAL;
+ scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ brelse(scratch_bp);
+ }
+ if (!need_mapping)
+ return;
+
+has_addr:
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ bpmap_qenter(bp);
+}
+
+/*
* getblk:
*
* Get a block given a specified block and offset into a file/device.
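bp_unmapped_get_kva() is what makes GB_UNMAPPED buffers transparent to later callers: a plain getblk() on a block that is cached unmapped either reuses the preallocated B_KVAALLOC range or allocates fresh KVA, pmap_qenter()s the pages and clears B_UNMAPPED. Below is a kernel-context sketch of that caller pattern; my_fs_read_block() and my_fs_inspect() are invented names, and vp is assumed to be locked as getblk() requires.

static int
my_fs_read_block(struct vnode *vp, daddr_t lbn, int size, int want_mapping)
{
	struct buf *bp;
	int error;

	/* Read the block without creating a kernel mapping for its data. */
	error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp);
	if (error != 0)
		return (error);
	brelse(bp);

	if (!want_mapping)
		return (0);

	/*
	 * Somebody now needs the bytes in the kernel: re-request the same
	 * block without GB_UNMAPPED.  bp_unmapped_get_kva() maps it and
	 * clears B_UNMAPPED, so b_data is usable again.
	 */
	bp = getblk(vp, lbn, size, 0, 0, 0);
	if ((bp->b_flags & B_CACHE) != 0)
		my_fs_inspect(bp->b_data, size);	/* hypothetical helper */
	brelse(bp);
	return (0);
}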
@@ -2635,12 +3011,17 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
{
struct buf *bp;
struct bufobj *bo;
- int error;
+ int bsize, error, maxsize, vmio;
+ off_t offset;
CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
ASSERT_VOP_LOCKED(vp, "getblk");
if (size > MAXBSIZE)
panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+ if (!unmapped_buf_allowed)
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
bo = &vp->v_bufobj;
loop:
@@ -2743,12 +3124,18 @@ loop:
}
/*
+ * Handle the case of unmapped buffer which should
+ * become mapped, or the buffer for which KVA
+ * reservation is requested.
+ */
+ bp_unmapped_get_kva(bp, blkno, size, flags);
+
+ /*
* If the size is inconsistant in the VMIO case, we can resize
* the buffer. This might lead to B_CACHE getting set or
* cleared. If the size has not changed, B_CACHE remains
* unchanged from its previous state.
*/
-
if (bp->b_bcount != size)
allocbuf(bp, size);
@@ -2789,9 +3176,6 @@ loop:
}
bp->b_flags &= ~B_DONE;
} else {
- int bsize, maxsize, vmio;
- off_t offset;
-
/*
* Buffer is not in-core, create new buffer. The buffer
* returned by getnewbuf() is locked. Note that the returned
@@ -2807,7 +3191,13 @@ loop:
bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
offset = blkno * bsize;
vmio = vp->v_object != NULL;
- maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+ if (vmio) {
+ maxsize = size + (offset & PAGE_MASK);
+ } else {
+ maxsize = size;
+ /* Do not allow non-VMIO unmapped buffers. */
+ flags &= ~GB_UNMAPPED;
+ }
maxsize = imax(maxsize, bsize);
bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
@@ -2863,6 +3253,7 @@ loop:
KASSERT(bp->b_bufobj->bo_object == NULL,
("ARGH! has b_bufobj->bo_object %p %p\n",
bp, bp->b_bufobj->bo_object));
+ BUF_CHECK_MAPPED(bp);
}
allocbuf(bp, size);
@@ -3038,10 +3429,14 @@ allocbuf(struct buf *bp, int size)
if (desiredpages < bp->b_npages) {
vm_page_t m;
- pmap_qremove((vm_offset_t)trunc_page(
- (vm_offset_t)bp->b_data) +
- (desiredpages << PAGE_SHIFT),
- (bp->b_npages - desiredpages));
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove((vm_offset_t)trunc_page(
+ (vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT),
+ (bp->b_npages - desiredpages));
+ } else
+ BUF_CHECK_UNMAPPED(bp);
VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (i = desiredpages; i < bp->b_npages; i++) {
/*
@@ -3147,21 +3542,12 @@ allocbuf(struct buf *bp, int size)
VM_OBJECT_WUNLOCK(obj);
/*
- * Step 3, fixup the KVM pmap. Remember that
- * bp->b_data is relative to bp->b_offset, but
- * bp->b_offset may be offset into the first page.
+ * Step 3, fixup the KVM pmap.
*/
-
- bp->b_data = (caddr_t)
- trunc_page((vm_offset_t)bp->b_data);
- pmap_qenter(
- (vm_offset_t)bp->b_data,
- bp->b_pages,
- bp->b_npages
- );
-
- bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
- (vm_offset_t)(bp->b_offset & PAGE_MASK));
+ if ((bp->b_flags & B_UNMAPPED) == 0)
+ bpmap_qenter(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
}
}
if (newbsize < bp->b_bufsize)
@@ -3171,21 +3557,38 @@ allocbuf(struct buf *bp, int size)
return 1;
}
+extern int inflight_transient_maps;
+
void
biodone(struct bio *bp)
{
struct mtx *mtxp;
void (*done)(struct bio *);
+ vm_offset_t start, end;
+ int transient;
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
bp->bio_flags |= BIO_DONE;
+ if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+ start = trunc_page((vm_offset_t)bp->bio_data);
+ end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+ transient = 1;
+ } else {
+ transient = 0;
+ start = end = 0;
+ }
done = bp->bio_done;
if (done == NULL)
wakeup(bp);
mtx_unlock(mtxp);
if (done != NULL)
done(bp);
+ if (transient) {
+ pmap_qremove(start, OFF_TO_IDX(end - start));
+ vm_map_remove(bio_transient_map, start, end);
+ atomic_add_int(&inflight_transient_maps, -1);
+ }
}
/*
@@ -3288,7 +3691,7 @@ dev_strategy(struct cdev *dev, struct buf *bp)
bip->bio_offset = bp->b_iooffset;
bip->bio_length = bp->b_bcount;
bip->bio_bcount = bp->b_bcount; /* XXX: remove */
- bip->bio_data = bp->b_data;
+ bdata2bio(bp, bip);
bip->bio_done = bufdonebio;
bip->bio_caller2 = bp;
bip->bio_dev = dev;
@@ -3442,9 +3845,11 @@ bufdone_finish(struct buf *bp)
}
vm_object_pip_wakeupn(obj, 0);
VM_OBJECT_WUNLOCK(obj);
- if (bogus)
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
+ }
}
/*
@@ -3487,8 +3892,12 @@ vfs_unbusy_pages(struct buf *bp)
if (!m)
panic("vfs_unbusy_pages: page missing\n");
bp->b_pages[i] = m;
- pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
- bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
}
vm_object_pip_subtract(obj, 1);
vm_page_io_finish(m);
@@ -3653,9 +4062,11 @@ vfs_busy_pages(struct buf *bp, int clear_modify)
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
VM_OBJECT_WUNLOCK(obj);
- if (bogus)
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
+ }
}
/*
@@ -3777,6 +4188,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
vm_page_t p;
int index;
+ BUF_CHECK_MAPPED(bp);
+
to = round_page(to);
from = round_page(from);
index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
@@ -3808,6 +4221,8 @@ vm_hold_free_pages(struct buf *bp, int newbsize)
vm_page_t p;
int index, newnpages;
+ BUF_CHECK_MAPPED(bp);
+
from = round_page((vm_offset_t)bp->b_data + newbsize);
newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
if (bp->b_npages > newnpages)
@@ -4009,6 +4424,30 @@ bunpin_wait(struct buf *bp)
mtx_unlock(mtxp);
}
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ KASSERT(unmapped_buf_allowed, ("unmapped"));
+ bip->bio_ma = bp->b_pages;
+ bip->bio_ma_n = bp->b_npages;
+ bip->bio_data = unmapped_buf;
+ bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bip->bio_flags |= BIO_UNMAPPED;
+ KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+ PAGE_SIZE == bp->b_npages,
+ ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset,
+ bip->bio_length, bip->bio_ma_n));
+ } else {
+ bip->bio_data = bp->b_data;
+ bip->bio_ma = NULL;
+ }
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 28aa4ff8eaf9..91a044319185 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -61,11 +61,11 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
-static struct cluster_save *
- cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
-static struct buf *
- cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
- daddr_t blkno, long size, int run, struct buf *fbp);
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+ struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+ daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+ struct buf *fbp);
static void cluster_callback(struct buf *);
static int write_behind = 1;
@@ -97,6 +97,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
error = 0;
bo = &vp->v_bufobj;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
/*
* Try to limit the amount of read-ahead by a few
@@ -112,7 +114,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
/*
* get the requested block
*/
- *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
origblkno = lblkno;
/*
@@ -203,7 +205,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
if (ncontig < nblks)
nblks = ncontig;
bp = cluster_rbuild(vp, filesize, lblkno,
- blkno, size, nblks, bp);
+ blkno, size, nblks, gbflags, bp);
lblkno += (bp->b_bufsize / size);
} else {
bp->b_flags |= B_RAM;
@@ -247,14 +249,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
if (ncontig) {
ncontig = min(ncontig + 1, racluster);
rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
- size, ncontig, NULL);
+ size, ncontig, gbflags, NULL);
lblkno += (rbp->b_bufsize / size);
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
continue;
}
} else {
- rbp = getblk(vp, lblkno, size, 0, 0, 0);
+ rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
lblkno += 1;
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
@@ -293,14 +295,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
* and then parcel them up into logical blocks in the buffer hash table.
*/
static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
- struct vnode *vp;
- u_quad_t filesize;
- daddr_t lbn;
- daddr_t blkno;
- long size;
- int run;
- struct buf *fbp;
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
struct bufobj *bo;
struct buf *bp, *tbp;
@@ -324,7 +320,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
tbp = fbp;
tbp->b_iocmd = BIO_READ;
} else {
- tbp = getblk(vp, lbn, size, 0, 0, 0);
+ tbp = getblk(vp, lbn, size, 0, 0, gbflags);
if (tbp->b_flags & B_CACHE)
return tbp;
tbp->b_flags |= B_ASYNC | B_RAM;
@@ -345,9 +341,14 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
- bp->b_data = (char *)((vm_offset_t)bp->b_data |
- ((vm_offset_t)tbp->b_data & PAGE_MASK));
bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ } else {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ }
bp->b_iocmd = BIO_READ;
bp->b_iodone = cluster_callback;
bp->b_blkno = blkno;
@@ -371,7 +372,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
break;
}
- tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);
+ tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+ (gbflags & GB_UNMAPPED));
/* Don't wait around for locked bufs. */
if (tbp == NULL)
@@ -493,8 +495,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
bp->b_bufsize, bp->b_kvasize);
bp->b_kvasize = bp->b_bufsize;
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *)bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
return (bp);
}
@@ -517,7 +521,10 @@ cluster_callback(bp)
if (bp->b_ioflags & BIO_ERROR)
error = bp->b_error;
- pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+ bp->b_npages);
+ }
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
@@ -559,7 +566,8 @@ cluster_callback(bp)
*/
static __inline int
-cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
{
int r = 0;
@@ -570,7 +578,7 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
start_lbn -= len;
/* FALLTHROUGH */
case 1:
- r = cluster_wbuild(vp, size, start_lbn, len, 0);
+ r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
/* FALLTHROUGH */
default:
/* FALLTHROUGH */
@@ -598,6 +606,9 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
int lblocksize;
int async;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
if (vp->v_type == VREG) {
async = DOINGASYNC(vp);
lblocksize = vp->v_mount->mnt_stat.f_iosize;
@@ -637,13 +648,13 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async && seqcount > 0) {
cluster_wbuild_wb(vp, lblocksize,
- vp->v_cstart, cursize);
+ vp->v_cstart, cursize, gbflags);
}
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
- buflist = cluster_collectbufs(vp, bp);
+ buflist = cluster_collectbufs(vp, bp, gbflags);
endbp = &buflist->bs_children
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
@@ -662,7 +673,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
if (seqcount > 1) {
cluster_wbuild_wb(vp,
lblocksize, vp->v_cstart,
- cursize);
+ cursize, gbflags);
}
} else {
/*
@@ -710,8 +721,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
* update daemon handle it.
*/
bdwrite(bp);
- if (seqcount > 1)
- cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+ vp->v_clen + 1, gbflags);
+ }
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else if (vm_page_count_severe()) {
@@ -746,6 +759,9 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
int totalwritten = 0;
int dbsize = btodb(size);
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
bo = &vp->v_bufobj;
while (len > 0) {
/*
@@ -824,10 +840,16 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
- bp->b_data = (char *)((vm_offset_t)bp->b_data |
- ((vm_offset_t)tbp->b_data & PAGE_MASK));
- bp->b_flags |= B_CLUSTER |
- (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+ if ((gbflags & GB_UNMAPPED) == 0 ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ }
+ bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+ B_NEEDCOMMIT));
bp->b_iodone = cluster_callback;
pbgetvp(vp, bp);
/*
@@ -954,8 +976,10 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
tbp, b_cluster.cluster_entry);
}
finishcluster:
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *) bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
if (bp->b_bufsize > bp->b_kvasize)
panic(
"cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
@@ -976,9 +1000,7 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
* Plus add one additional buffer.
*/
static struct cluster_save *
-cluster_collectbufs(vp, last_bp)
- struct vnode *vp;
- struct buf *last_bp;
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
{
struct cluster_save *buflist;
struct buf *bp;
@@ -991,7 +1013,8 @@ cluster_collectbufs(vp, last_bp)
buflist->bs_nchildren = 0;
buflist->bs_children = (struct buf **) (buflist + 1);
for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
- (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+ (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+ gbflags, &bp);
buflist->bs_children[i] = bp;
if (bp->b_blkno == bp->b_lblkno)
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
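The vfs_cluster.c hunks above repeat one guard: KVA is only instantiated or torn down when the cluster buffer actually carries a mapping. A minimal sketch of that pattern, factored into helpers whose names (cluster_qenter, cluster_qremove) are illustrative and not part of this change:

static void
cluster_qenter(struct buf *bp)
{

	/* A B_UNMAPPED cluster buffer has no KVA to populate. */
	if ((bp->b_flags & B_UNMAPPED) != 0)
		return;
	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
	    bp->b_pages, bp->b_npages);
}

static void
cluster_qremove(struct buf *bp)
{

	if ((bp->b_flags & B_UNMAPPED) != 0)
		return;
	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
}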
diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c
index 4fe6ebe68225..5c94a39c60fa 100644
--- a/sys/mips/mips/pmap.c
+++ b/sys/mips/mips/pmap.c
@@ -2576,6 +2576,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
}
}
+int unmapped_buf_allowed;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 90466e8a485e..66194df8bed8 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -648,6 +648,14 @@ moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart,
moea64_kenter(mmup, pa, pa);
}
ENABLE_TRANS(msr);
+
+ /*
+ * Allow user to override unmapped_buf_allowed for testing.
+ * XXXKIB Only direct map implementation was tested.
+ */
+ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
+ &unmapped_buf_allowed))
+ unmapped_buf_allowed = hw_direct_map;
}
void
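The tunable fetch above is the usual loader-tunable override idiom: honor vfs.unmapped_buf_allowed when the administrator set it, otherwise fall back to a platform default. A sketch under that assumption (the helper name and the platform_default parameter are illustrative only):

static void
unmapped_buf_allowed_init(int platform_default)
{

	/*
	 * TUNABLE_INT_FETCH() returns non-zero only when the loader
	 * environment variable exists, so the platform default applies
	 * otherwise.
	 */
	if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
	    &unmapped_buf_allowed))
		unmapped_buf_allowed = platform_default;
}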
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
index 42f1a3975b5a..7fd98f4b9b29 100644
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -574,3 +574,5 @@ pmap_mmu_install(char *name, int prio)
return (FALSE);
}
+
+int unmapped_buf_allowed;
diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c
index 97a085a171a3..2a24eb5f5064 100644
--- a/sys/sparc64/sparc64/pmap.c
+++ b/sys/sparc64/sparc64/pmap.c
@@ -1918,6 +1918,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
}
}
+int unmapped_buf_allowed;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index c016ee67b904..7678f5aa4f13 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -55,10 +55,13 @@
#define BIO_DONE 0x02
#define BIO_ONQUEUE 0x04
#define BIO_ORDERED 0x08
+#define BIO_UNMAPPED 0x10
+#define BIO_TRANSIENT_MAPPING 0x20
#ifdef _KERNEL
struct disk;
struct bio;
+struct vm_map;
/* Empty classifier tag, to prevent further classification. */
#define BIO_NOTCLASSIFIED (void *)(~0UL)
@@ -78,6 +81,9 @@ struct bio {
off_t bio_offset; /* Offset into file. */
long bio_bcount; /* Valid bytes in buffer. */
caddr_t bio_data; /* Memory, superblocks, indirect etc. */
+ struct vm_page **bio_ma; /* Or unmapped. */
+ int bio_ma_offset; /* Offset in the first page of bio_ma. */
+ int bio_ma_n; /* Number of pages in bio_ma. */
int bio_error; /* Errno for BIO_ERROR. */
long bio_resid; /* Remaining I/O in bytes. */
void (*bio_done)(struct bio *);
@@ -121,6 +127,9 @@ struct bio_queue_head {
struct bio *insert_point;
};
+extern struct vm_map *bio_transient_map;
+extern int bio_transient_maxcnt;
+
void biodone(struct bio *bp);
void biofinish(struct bio *bp, struct devstat *stat, int error);
int biowait(struct bio *bp, const char *wchan);
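A bio now describes its payload either as a mapped kernel address (bio_data) or as a page run (bio_ma, bio_ma_offset, bio_ma_n) selected by BIO_UNMAPPED in bio_flags. A small sketch of the implied geometry; the helper name is illustrative and it assumes bio_ma_n follows the usual length/offset relationship:

static int
bio_page_count(const struct bio *bp)
{

	/* Mapped bios are addressed through bio_data, not bio_ma. */
	if ((bp->bio_flags & BIO_UNMAPPED) == 0)
		return (0);
	/*
	 * The payload starts bio_ma_offset bytes into bio_ma[0] and
	 * covers bio_length bytes, so this should match bio_ma_n.
	 */
	return ((int)howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE));
}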
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 87f2b764444a..cc8029e37919 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -117,6 +117,7 @@ struct buf {
long b_bufsize; /* Allocated buffer size. */
long b_runningbufspace; /* when I/O is running, pipelining */
caddr_t b_kvabase; /* base kva for buffer */
+ caddr_t b_kvaalloc; /* allocated kva for B_KVAALLOC */
int b_kvasize; /* size of kva for buffer */
daddr_t b_lblkno; /* Logical block number. */
struct vnode *b_vp; /* Device vnode. */
@@ -202,8 +203,8 @@ struct buf {
#define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
-#define B_00000800 0x00000800 /* Available flag. */
-#define B_00001000 0x00001000 /* Available flag. */
+#define B_UNMAPPED 0x00000800 /* KVA is not mapped. */
+#define B_KVAALLOC 0x00001000 /* But allocated. */
#define B_INVAL 0x00002000 /* Does not contain valid info. */
#define B_BARRIER 0x00004000 /* Write this and all preceding first. */

#define B_NOCACHE 0x00008000 /* Do not cache block after use. */
@@ -453,7 +454,9 @@ buf_countdeps(struct buf *bp, int i)
*/
#define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */
#define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */
-#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */
+#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */
+#define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */
+#define GB_KVAALLOC 0x0010 /* But allocate KVA. */
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
@@ -470,11 +473,13 @@ extern struct buf *swbuf; /* Swap I/O buffer headers. */
extern int nswbuf; /* Number of swap I/O buffer headers. */
extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */
extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */
+extern caddr_t unmapped_buf;
void runningbufwakeup(struct buf *);
void waitrunningbufspace(void);
caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
void bufinit(void);
+void bdata2bio(struct buf *bp, struct bio *bip);
void bwillwrite(void);
int buf_dirty_count_severe(void);
void bremfree(struct buf *);
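With GB_UNMAPPED a caller asks getblk() not to map the buffer, and B_UNMAPPED on the returned buffer records that b_data is not a usable mapping (the cluster code above points it at the shared unmapped_buf placeholder). A hedged sketch of such a caller; the function name and the trivial handling are illustrative:

static struct buf *
example_getblk(struct vnode *vp, daddr_t lbn, int size)
{
	struct buf *bp;
	int flags;

	/* Only ask for an unmapped buffer where the pmap supports it. */
	flags = unmapped_buf_allowed ? GB_UNMAPPED : 0;
	bp = getblk(vp, lbn, size, 0, 0, flags);
	if (bp != NULL && (bp->b_flags & B_UNMAPPED) != 0) {
		/*
		 * b_data must not be dereferenced here; the payload is
		 * reached through b_pages (or remapped on demand).
		 */
	}
	return (bp);
}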
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index 12337de5e544..61b467936ec3 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -138,6 +138,7 @@ extern char **kenvp;
extern const void *zero_region; /* address space maps to a zeroed page */
+extern int unmapped_buf_allowed;
extern int iosize_max_clamp;
#define IOSIZE_MAX (iosize_max_clamp ? INT_MAX : SSIZE_MAX)
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
index 132c10e02bed..106c510ceb7c 100644
--- a/sys/vm/vm.h
+++ b/sys/vm/vm.h
@@ -136,6 +136,8 @@ struct kva_md_info {
vm_offset_t clean_eva;
vm_offset_t pager_sva;
vm_offset_t pager_eva;
+ vm_offset_t bio_transient_sva;
+ vm_offset_t bio_transient_eva;
};
extern struct kva_md_info kmi;
diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c
index 08c9b037afcf..b350e96020f2 100644
--- a/sys/vm/vm_init.c
+++ b/sys/vm/vm_init.c
@@ -184,10 +184,15 @@ again:
panic("startup: table size inconsistency");
clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva,
- (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE);
+ (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS +
+ (long)bio_transient_maxcnt * MAXPHYS, TRUE);
buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva,
&kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE);
buffer_map->system_map = 1;
+ bio_transient_map = kmem_suballoc(clean_map, &kmi->bio_transient_sva,
+ &kmi->bio_transient_eva, (long)bio_transient_maxcnt * MAXPHYS,
+ FALSE);
+ bio_transient_map->system_map = 1;
pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva,
(long)nswbuf * MAXPHYS, FALSE);
pager_map->system_map = 1;
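clean_map now has to span three sub-maps: buffer_map, bio_transient_map and pager_map are all carved out of it. A sketch of the sizing arithmetic above (the parameters deliberately shadow the globals, purely for illustration):

static vm_size_t
clean_map_span(long nbuf, long nswbuf, int bio_transient_maxcnt)
{

	/* buffer_map + bio_transient_map + pager_map, as suballocated above. */
	return ((vm_size_t)nbuf * BKVASIZE +
	    (vm_size_t)bio_transient_maxcnt * MAXPHYS +
	    (vm_size_t)nswbuf * MAXPHYS);
}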
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 64a2ebb53053..7c7ccc1d7e14 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -90,6 +90,7 @@ vm_map_t kmem_map;
vm_map_t exec_map;
vm_map_t pipe_map;
vm_map_t buffer_map;
+vm_map_t bio_transient_map;
const void *zero_region;
CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);