Diffstat (limited to 'sys')
-rw-r--r--  sys/conf/files             |   3
-rw-r--r--  sys/conf/kern.mk           |   1
-rw-r--r--  sys/kern/kern_mbuf.c       | 392
-rw-r--r--  sys/kern/subr_bus_dma.c    |  71
-rw-r--r--  sys/kern/subr_sglist.c     | 145
-rw-r--r--  sys/kern/uipc_mbuf.c       | 272
-rw-r--r--  sys/kern/uipc_sockbuf.c    |  45
-rw-r--r--  sys/kern/uipc_socket.c     |   6
-rw-r--r--  sys/net/bpf.c              |   1
-rw-r--r--  sys/net/bpf_buffer.c       |  11
-rw-r--r--  sys/net/if.h               |   1
-rw-r--r--  sys/netinet/ip_output.c    |  31
-rw-r--r--  sys/netinet/tcp_pcap.c     |   8
-rw-r--r--  sys/netinet/tcp_usrreq.c   |   3
-rw-r--r--  sys/netinet6/ip6_output.c  |  31
-rw-r--r--  sys/sys/mbuf.h             | 119
-rw-r--r--  sys/sys/sglist.h           |   7
17 files changed, 1101 insertions, 46 deletions
diff --git a/sys/conf/files b/sys/conf/files
index c62a9e285e47..0b3fb72d3ae0 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4255,7 +4255,8 @@ netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
-netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
+netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
+ compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
netinet/tcp_subr.c optional inet | inet6
diff --git a/sys/conf/kern.mk b/sys/conf/kern.mk
index 881ec0d7487f..a8cf270ed13e 100644
--- a/sys/conf/kern.mk
+++ b/sys/conf/kern.mk
@@ -76,6 +76,7 @@ CWARNEXTRA?= -Wno-uninitialized
# GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for
# the few files that are already known to generate cast-qual warnings.
NO_WCAST_QUAL= -Wno-cast-qual
+NO_WNONNULL= -Wno-nonnull
.endif
.endif
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
index 0543d4b75c8e..dee14a8e1b27 100644
--- a/sys/kern/kern_mbuf.c
+++ b/sys/kern/kern_mbuf.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
+#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@@ -281,6 +282,7 @@ uma_zone_t zone_pack;
uma_zone_t zone_jumbop;
uma_zone_t zone_jumbo9;
uma_zone_t zone_jumbo16;
+uma_zone_t zone_extpgs;
/*
* Local prototypes.
@@ -298,6 +300,9 @@ static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
+_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
+ "mbuf_ext_pgs size mismatch");
+
/*
* Initialize FreeBSD Network buffer allocation.
*/
@@ -379,6 +384,15 @@ mbuf_init(void *dummy)
uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
+ zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
+ sizeof(struct mbuf_ext_pgs),
+#ifdef INVARIANTS
+ trash_ctor, trash_dtor, trash_init, trash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_CACHE, 0);
+
/*
* Hook event handler for low-memory situation, used to
* drain protocols and push data back to the caches (UMA
@@ -824,6 +838,380 @@ mb_reclaim(uma_zone_t zone __unused, int pending __unused)
}
/*
+ * Free "count" units of I/O from an mbuf chain. They could be held
+ * in EXT_PGS or just as a normal mbuf. This code is intended to be
+ * called in an error path (I/O error, closed connection, etc).
+ */
+void
+mb_free_notready(struct mbuf *m, int count)
+{
+ int i;
+
+ for (i = 0; i < count && m != NULL; i++) {
+ if ((m->m_flags & M_EXT) != 0 &&
+ m->m_ext.ext_type == EXT_PGS) {
+ m->m_ext.ext_pgs->nrdy--;
+ if (m->m_ext.ext_pgs->nrdy != 0)
+ continue;
+ }
+ m = m_free(m);
+ }
+ KASSERT(i == count, ("Removed only %d items from %p", i, m));
+}
+
+/*
+ * Compress an unmapped mbuf into a simple mbuf when it holds a small
+ * amount of data. This is used as a DOS defense to avoid having
+ * small packets tie up wired pages, an ext_pgs structure, and an
+ * mbuf. Since this converts the existing mbuf in place, it can only
+ * be used if there are no other references to 'm'.
+ */
+int
+mb_unmapped_compress(struct mbuf *m)
+{
+ volatile u_int *refcnt;
+ struct mbuf m_temp;
+
+ /*
+ * Assert that 'm' does not have a packet header. If 'm' had
+ * a packet header, it would only be able to hold MHLEN bytes
+ * and m_data would have to be initialized differently.
+ */
+ KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
+ m->m_ext.ext_type == EXT_PGS,
+ ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
+ KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
+
+ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+ refcnt = &m->m_ext.ext_count;
+ } else {
+ KASSERT(m->m_ext.ext_cnt != NULL,
+ ("%s: no refcounting pointer on %p", __func__, m));
+ refcnt = m->m_ext.ext_cnt;
+ }
+
+ if (*refcnt != 1)
+ return (EBUSY);
+
+ /*
+ * Copy mbuf header and m_ext portion of 'm' to 'm_temp' to
+ * create a "fake" EXT_PGS mbuf that can be used with
+ * m_copydata() as well as the ext_free callback.
+ */
+ memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
+ m_temp.m_next = NULL;
+ m_temp.m_nextpkt = NULL;
+
+ /* Turn 'm' into a "normal" mbuf. */
+ m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
+ m->m_data = m->m_dat;
+
+ /* Copy data from template's ext_pgs. */
+ m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
+
+ /* Free the backing pages. */
+ m_temp.m_ext.ext_free(&m_temp);
+
+ /* Finally, free the ext_pgs struct. */
+ uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
+ return (0);
+}
+
+/*
+ * These next few routines are used to permit downgrading an unmapped
+ * mbuf to a chain of mapped mbufs. This is used when an interface
+ * doesn't support unmapped mbufs or if checksums need to be
+ * computed in software.
+ *
+ * Each unmapped mbuf is converted to a chain of mbufs. First, any
+ * TLS header data is stored in a regular mbuf. Second, each page of
+ * unmapped data is stored in an mbuf with an EXT_SFBUF external
+ * cluster. These mbufs use an sf_buf to provide a valid KVA for the
+ * associated physical page. They also hold a reference on the
+ * original EXT_PGS mbuf to ensure the physical page doesn't go away.
+ * Finally, any TLS trailer data is stored in a regular mbuf.
+ *
+ * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
+ * mbufs. It frees the associated sf_buf and releases its reference
+ * on the original EXT_PGS mbuf.
+ *
+ * _mb_unmapped_to_ext() is a helper function that converts a single
+ * unmapped mbuf into a chain of mbufs.
+ *
+ * mb_unmapped_to_ext() is the public function that walks an mbuf
+ * chain converting any unmapped mbufs to mapped mbufs. It returns
+ * the new chain of mapped mbufs on success. On failure it frees
+ * the original mbuf chain and returns NULL.
+ */
+static void
+mb_unmapped_free_mext(struct mbuf *m)
+{
+ struct sf_buf *sf;
+ struct mbuf *old_m;
+
+ sf = m->m_ext.ext_arg1;
+ sf_buf_free(sf);
+
+ /* Drop the reference on the backing EXT_PGS mbuf. */
+ old_m = m->m_ext.ext_arg2;
+ mb_free_ext(old_m);
+}
+
+static struct mbuf *
+_mb_unmapped_to_ext(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ struct mbuf *m_new, *top, *prev, *mref;
+ struct sf_buf *sf;
+ vm_page_t pg;
+ int i, len, off, pglen, pgoff, seglen, segoff;
+ volatile u_int *refcnt;
+ u_int ref_inc = 0;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ len = m->m_len;
+ KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
+ __func__, m));
+
+ /* See if this is the mbuf that holds the embedded refcount. */
+ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+ refcnt = &m->m_ext.ext_count;
+ mref = m;
+ } else {
+ KASSERT(m->m_ext.ext_cnt != NULL,
+ ("%s: no refcounting pointer on %p", __func__, m));
+ refcnt = m->m_ext.ext_cnt;
+ mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
+ }
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ top = NULL;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ m_new->m_len = seglen;
+ prev = top = m_new;
+ memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
+ seglen);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ if (top == NULL) {
+ top = prev = m_new;
+ } else {
+ prev->m_next = m_new;
+ prev = m_new;
+ }
+ sf = sf_buf_alloc(pg, SFB_NOWAIT);
+ if (sf == NULL)
+ goto fail;
+
+ ref_inc++;
+ m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
+ mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
+ m_new->m_data += segoff;
+ m_new->m_len = seglen;
+
+ pgoff = 0;
+ };
+ if (len != 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d)", off, len,
+ ext_pgs->trail_len));
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ if (top == NULL)
+ top = m_new;
+ else
+ prev->m_next = m_new;
+ m_new->m_len = len;
+ memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
+ }
+
+ if (ref_inc != 0) {
+ /*
+ * Obtain an additional reference on the old mbuf for
+ * each created EXT_SFBUF mbuf. They will be dropped
+ * in mb_unmapped_free_mext().
+ */
+ if (*refcnt == 1)
+ *refcnt += ref_inc;
+ else
+ atomic_add_int(refcnt, ref_inc);
+ }
+ m_free(m);
+ return (top);
+
+fail:
+ if (ref_inc != 0) {
+ /*
+ * Obtain an additional reference on the old mbuf for
+ * each created EXT_SFBUF mbuf. They will be
+ * immediately dropped when these mbufs are freed
+ * below.
+ */
+ if (*refcnt == 1)
+ *refcnt += ref_inc;
+ else
+ atomic_add_int(refcnt, ref_inc);
+ }
+ m_free(m);
+ m_freem(top);
+ return (NULL);
+}
+
+struct mbuf *
+mb_unmapped_to_ext(struct mbuf *top)
+{
+ struct mbuf *m, *next, *prev = NULL;
+
+ prev = NULL;
+ for (m = top; m != NULL; m = next) {
+ /* m might be freed, so cache the next pointer. */
+ next = m->m_next;
+ if (m->m_flags & M_NOMAP) {
+ if (prev != NULL) {
+ /*
+ * Remove 'm' from the new chain so
+ * that the 'top' chain terminates
+ * before 'm' in case 'top' is freed
+ * due to an error.
+ */
+ prev->m_next = NULL;
+ }
+ m = _mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ m_freem(top);
+ m_freem(next);
+ return (NULL);
+ }
+ if (prev == NULL) {
+ top = m;
+ } else {
+ prev->m_next = m;
+ }
+
+ /*
+ * Replaced one mbuf with a chain, so we must
+ * find the end of chain.
+ */
+ prev = m_last(m);
+ } else {
+ if (prev != NULL) {
+ prev->m_next = m;
+ }
+ prev = m;
+ }
+ }
+ return (top);
+}
+
+/*
+ * Allocate an empty EXT_PGS mbuf. The ext_free routine is
+ * responsible for freeing any pages backing this mbuf when it is
+ * freed.
+ */
+struct mbuf *
+mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
+{
+ struct mbuf *m;
+ struct mbuf_ext_pgs *ext_pgs;
+
+ if (pkthdr)
+ m = m_gethdr(how, MT_DATA);
+ else
+ m = m_get(how, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+ ext_pgs = uma_zalloc(zone_extpgs, how);
+ if (ext_pgs == NULL) {
+ m_free(m);
+ return (NULL);
+ }
+ ext_pgs->npgs = 0;
+ ext_pgs->nrdy = 0;
+ ext_pgs->first_pg_off = 0;
+ ext_pgs->last_pg_len = 0;
+ ext_pgs->hdr_len = 0;
+ ext_pgs->trail_len = 0;
+ ext_pgs->tls = NULL;
+ ext_pgs->so = NULL;
+ m->m_data = NULL;
+ m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
+ m->m_ext.ext_type = EXT_PGS;
+ m->m_ext.ext_flags = EXT_FLAG_EMBREF;
+ m->m_ext.ext_count = 1;
+ m->m_ext.ext_pgs = ext_pgs;
+ m->m_ext.ext_size = 0;
+ m->m_ext.ext_free = ext_free;
+ return (m);
+}
+
+#ifdef INVARIANT_SUPPORT
+void
+mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
+{
+
+ /*
+ * NB: This expects a non-empty buffer (npgs > 0 and
+ * last_pg_len > 0).
+ */
+ KASSERT(ext_pgs->npgs > 0,
+ ("ext_pgs with no valid pages: %p", ext_pgs));
+ KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
+ ("ext_pgs with too many pages: %p", ext_pgs));
+ KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
+ ("ext_pgs with too many ready pages: %p", ext_pgs));
+ KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
+ ("ext_pgs with too large page offset: %p", ext_pgs));
+ KASSERT(ext_pgs->last_pg_len > 0,
+ ("ext_pgs with zero last page length: %p", ext_pgs));
+ KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
+ ("ext_pgs with too large last page length: %p", ext_pgs));
+ if (ext_pgs->npgs == 1) {
+ KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
+ PAGE_SIZE, ("ext_pgs with single page too large: %p",
+ ext_pgs));
+ }
+ KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
+ ("ext_pgs with too large header length: %p", ext_pgs));
+ KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
+ ("ext_pgs with too large header length: %p", ext_pgs));
+}
+#endif
+
+/*
* Clean up after mbufs with M_EXT storage attached to them if the
* reference count hits 1.
*/
@@ -888,6 +1276,10 @@ mb_free_ext(struct mbuf *m)
uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
uma_zfree(zone_mbuf, mref);
break;
+ case EXT_PGS:
+ uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
+ uma_zfree(zone_mbuf, mref);
+ break;
case EXT_SFBUF:
case EXT_NET_DRV:
case EXT_MOD_TYPE:
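
As a rough sketch (not part of this commit), the hdr -> pages -> trailer walk that _mb_unmapped_to_ext() performs above, and that the busdma, sglist, and uio code below repeats, looks like the following. It assumes the struct mbuf_ext_pgs layout and the mbuf_ext_pg_len() helper added to sys/sys/mbuf.h later in this diff; ext_pgs_count_segs() is a hypothetical name used only for illustration.

#include <sys/param.h>
#include <sys/mbuf.h>

/*
 * Hypothetical sketch: count how many distinct segments (header bytes,
 * individual pages, trailer bytes) the window [off, off + len) of an
 * EXT_PGS buffer touches, using the same traversal as the code above.
 */
static int
ext_pgs_count_segs(struct mbuf_ext_pgs *ext_pgs, int off, int len)
{
	int i, nsegs, pglen, pgoff, seglen;

	nsegs = 0;
	if (ext_pgs->hdr_len != 0) {
		if (off >= ext_pgs->hdr_len) {
			/* The window starts past the TLS-style header. */
			off -= ext_pgs->hdr_len;
		} else {
			seglen = MIN(ext_pgs->hdr_len - off, len);
			off = 0;
			len -= seglen;
			nsegs++;
		}
	}
	/* Only the first page may start at a non-zero offset. */
	pgoff = ext_pgs->first_pg_off;
	for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
		if (off >= pglen) {
			off -= pglen;
			pgoff = 0;
			continue;
		}
		seglen = MIN(pglen - off, len);
		off = 0;
		len -= seglen;
		nsegs++;
		pgoff = 0;
	}
	/* Any remaining bytes must come from the trailer. */
	if (len > 0)
		nsegs++;
	return (nsegs);
}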
diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c
index dbfd66bd2300..b050c00dfde2 100644
--- a/sys/kern/subr_bus_dma.c
+++ b/sys/kern/subr_bus_dma.c
@@ -111,6 +111,67 @@ _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
}
/*
+ * Load an unmapped mbuf
+ */
+static int
+_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ int error, i, off, len, pglen, pgoff, seglen, segoff;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+
+ len = m->m_len;
+ error = 0;
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = _bus_dmamap_load_buffer(dmat, map,
+ &ext_pgs->hdr[segoff], seglen, kernel_pmap,
+ flags, segs, nsegs);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ error = _bus_dmamap_load_phys(dmat, map,
+ ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs);
+ pgoff = 0;
+ };
+ if (len != 0 && error == 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d)", off, len,
+ ext_pgs->trail_len));
+ error = _bus_dmamap_load_buffer(dmat, map,
+ &ext_pgs->trail[off], len, kernel_pmap, flags, segs,
+ nsegs);
+ }
+ return (error);
+}
+
+/*
* Load an mbuf chain.
*/
static int
@@ -123,9 +184,13 @@ _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
error = 0;
for (m = m0; m != NULL && error == 0; m = m->m_next) {
if (m->m_len > 0) {
- error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
- m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
- segs, nsegs);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
+ map, m, segs, nsegs, flags);
+ else
+ error = _bus_dmamap_load_buffer(dmat, map,
+ m->m_data, m->m_len, kernel_pmap,
+ flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
}
}
CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
diff --git a/sys/kern/subr_sglist.c b/sys/kern/subr_sglist.c
index ff0020383931..07f3c5b18ba6 100644
--- a/sys/kern/subr_sglist.c
+++ b/sys/kern/subr_sglist.c
@@ -219,6 +219,75 @@ sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len)
}
/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS buffer.
+ */
+int
+sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
+{
+ vm_paddr_t nextaddr, paddr;
+ size_t seglen, segoff;
+ int i, nsegs, pglen, pgoff;
+
+ if (len == 0)
+ return (0);
+
+ nsegs = 0;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = MIN(seglen, len);
+ off = 0;
+ len -= seglen;
+ nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen);
+ }
+ }
+ nextaddr = 0;
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = MIN(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ if (paddr != nextaddr)
+ nsegs++;
+ nextaddr = paddr + seglen;
+ pgoff = 0;
+ };
+ if (len != 0) {
+ seglen = MIN(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ nsegs += sglist_count(&ext_pgs->trail[off], seglen);
+ }
+ KASSERT(len == 0, ("len != 0"));
+ return (nsegs);
+}
+
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS mbuf.
+ */
+int
+sglist_count_mb_ext_pgs(struct mbuf *m)
+{
+
+ MBUF_EXT_PGS_ASSERT(m);
+ return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t),
+ m->m_len));
+}
+
+/*
* Allocate a scatter/gather list along with 'nsegs' segments. The
* 'mflags' parameters are the same as passed to malloc(9). The caller
* should use sglist_free() to free this list.
@@ -320,6 +389,76 @@ sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
}
/*
+ * Append the segments to describe an EXT_PGS buffer to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+ size_t off, size_t len)
+{
+ size_t seglen, segoff;
+ vm_paddr_t paddr;
+ int error, i, pglen, pgoff;
+
+ error = 0;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = MIN(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = sglist_append(sg,
+ &ext_pgs->hdr[segoff], seglen);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = MIN(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ error = sglist_append_phys(sg, paddr, seglen);
+ pgoff = 0;
+ };
+ if (error == 0 && len > 0) {
+ seglen = MIN(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ error = sglist_append(sg,
+ &ext_pgs->trail[off], seglen);
+ }
+ if (error == 0)
+ KASSERT(len == 0, ("len != 0"));
+ return (error);
+}
+
+/*
+ * Append the segments to describe an EXT_PGS mbuf to a scatter/gather
+ * list. If there are insufficient segments, then this fails with
+ * EFBIG.
+ */
+int
+sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
+{
+
+ /* for now, all unmapped mbufs are assumed to be EXT_PGS */
+ MBUF_EXT_PGS_ASSERT(m);
+ return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs,
+ mtod(m, vm_offset_t), m->m_len));
+}
+
+/*
* Append the segments that describe a single mbuf chain to a
* scatter/gather list. If there are insufficient segments, then this
* fails with EFBIG.
@@ -338,7 +477,11 @@ sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
SGLIST_SAVE(sg, save);
for (m = m0; m != NULL; m = m->m_next) {
if (m->m_len > 0) {
- error = sglist_append(sg, m->m_data, m->m_len);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = sglist_append_mb_ext_pgs(sg, m);
+ else
+ error = sglist_append(sg, m->m_data,
+ m->m_len);
if (error) {
SGLIST_RESTORE(sg, save);
return (error);
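
With sglist_count_mb_ext_pgs() and the M_NOMAP-aware sglist_append_mbuf() added above, a driver can size and then fill a scatter/gather list for a chain that may carry unmapped mbufs. A rough sketch under those assumptions; chain_to_sglist() is a hypothetical helper, not an API added by this commit.

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/sglist.h>

/*
 * Hypothetical sketch: size and fill an sglist for a chain that may
 * include unmapped (M_NOMAP/EXT_PGS) mbufs.
 */
static struct sglist *
chain_to_sglist(struct mbuf *m0, int mflags)
{
	struct sglist *sg;
	struct mbuf *m;
	int nsegs;

	nsegs = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		if ((m->m_flags & M_NOMAP) != 0)
			nsegs += sglist_count_mb_ext_pgs(m);
		else
			nsegs += sglist_count(m->m_data, m->m_len);
	}
	sg = sglist_alloc(nsegs, mflags);
	if (sg == NULL)
		return (NULL);
	/* sglist_append_mbuf() now handles both mapped and unmapped mbufs. */
	if (sglist_append_mbuf(sg, m0) != 0) {
		sglist_free(sg);
		return (NULL);
	}
	return (sg);
}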
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index ba3efec8e85c..4b97c15856ba 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -49,7 +49,11 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/uio.h>
+#include <sys/vmmeter.h>
#include <sys/sdt.h>
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
"struct mbuf *", "mbufinfo_t *",
@@ -202,7 +206,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m)
else
bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
n->m_flags |= M_EXT;
- n->m_flags |= m->m_flags & M_RDONLY;
+ n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
@@ -246,7 +250,8 @@ m_demote(struct mbuf *m0, int all, int flags)
__func__, m, m0));
if (m->m_flags & M_PKTHDR)
m_demote_pkthdr(m);
- m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
+ m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
+ M_NOMAP | flags);
}
}
@@ -376,7 +381,8 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from)
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
- to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ to->m_flags = (from->m_flags & M_COPYFLAGS) |
+ (to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr; /* especially tags */
@@ -414,7 +420,8 @@ m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
- to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ to->m_flags = (from->m_flags & M_COPYFLAGS) |
+ (to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr;
@@ -579,6 +586,30 @@ nospace:
return (NULL);
}
+static void
+m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+ struct iovec iov;
+ struct uio uio;
+ int error;
+
+ KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
+ KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
+ KASSERT(off < m->m_len,
+ ("m_copyfromunmapped: len exceeds mbuf length"));
+ iov.iov_base = cp;
+ iov.iov_len = len;
+ uio.uio_resid = len;
+ uio.uio_iov = &iov;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_rw = UIO_READ;
+ error = m_unmappedtouio(m, off, &uio, len);
+ KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
+ len));
+}
+
/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
@@ -600,7 +631,10 @@ m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
while (len > 0) {
KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
count = min(m->m_len - off, len);
- bcopy(mtod(m, caddr_t) + off, cp, count);
+ if ((m->m_flags & M_NOMAP) != 0)
+ m_copyfromunmapped(m, off, count, cp);
+ else
+ bcopy(mtod(m, caddr_t) + off, cp, count);
len -= count;
cp += count;
off = 0;
@@ -695,6 +729,7 @@ m_cat(struct mbuf *m, struct mbuf *n)
m = m->m_next;
while (n) {
if (!M_WRITABLE(m) ||
+ (n->m_flags & M_NOMAP) != 0 ||
M_TRAILINGSPACE(m) < n->m_len) {
/* just join the two chains */
m->m_next = n;
@@ -812,6 +847,9 @@ m_pullup(struct mbuf *n, int len)
int count;
int space;
+ KASSERT((n->m_flags & M_NOMAP) == 0,
+ ("%s: unmapped mbuf %p", __func__, n));
+
/*
* If first mbuf has no cluster, and has room for len bytes
* without shifting current data, pullup into it,
@@ -1365,6 +1403,41 @@ nospace:
}
/*
+ * Return the number of fragments an mbuf will use. This is usually
+ * used as a proxy for the number of scatter/gather elements needed by
+ * a DMA engine to access an mbuf. In general mapped mbufs are
+ * assumed to be backed by physically contiguous buffers that only
+ * need a single fragment. Unmapped mbufs, on the other hand, can
+ * span disjoint physical pages.
+ */
+static int
+frags_per_mbuf(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ int frags;
+
+ if ((m->m_flags & M_NOMAP) == 0)
+ return (1);
+
+ /*
+ * The header and trailer are counted as a single fragment
+ * each when present.
+ *
+ * XXX: This overestimates the number of fragments by assuming
+ * all the backing physical pages are disjoint.
+ */
+ ext_pgs = m->m_ext.ext_pgs;
+ frags = 0;
+ if (ext_pgs->hdr_len != 0)
+ frags++;
+ frags += ext_pgs->npgs;
+ if (ext_pgs->trail_len != 0)
+ frags++;
+
+ return (frags);
+}
+
+/*
* Defragment an mbuf chain, returning at most maxfrags separate
* mbufs+clusters. If this is not possible NULL is returned and
* the original mbuf chain is left in its present (potentially
@@ -1384,7 +1457,7 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
*/
curfrags = 0;
for (m = m0; m != NULL; m = m->m_next)
- curfrags++;
+ curfrags += frags_per_mbuf(m);
/*
* First, try to collapse mbufs. Note that we always collapse
* towards the front so we don't need to deal with moving the
@@ -1399,12 +1472,13 @@ again:
break;
if (M_WRITABLE(m) &&
n->m_len < M_TRAILINGSPACE(m)) {
- bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
- n->m_len);
+ m_copydata(n, 0, n->m_len,
+ mtod(m, char *) + m->m_len);
m->m_len += n->m_len;
m->m_next = n->m_next;
+ curfrags -= frags_per_mbuf(n);
m_free(n);
- if (--curfrags <= maxfrags)
+ if (curfrags <= maxfrags)
return m0;
} else
m = n;
@@ -1421,15 +1495,18 @@ again:
m = m_getcl(how, MT_DATA, 0);
if (m == NULL)
goto bad;
- bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
- bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
- n2->m_len);
+ m_copydata(n, 0, n->m_len, mtod(m, char *));
+ m_copydata(n2, 0, n2->m_len,
+ mtod(m, char *) + n->m_len);
m->m_len = n->m_len + n2->m_len;
m->m_next = n2->m_next;
*prev = m;
+ curfrags += 1; /* For the new cluster */
+ curfrags -= frags_per_mbuf(n);
+ curfrags -= frags_per_mbuf(n2);
m_free(n);
m_free(n2);
- if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
+ if (curfrags <= maxfrags)
return m0;
/*
* Still not there, try the normal collapse
@@ -1530,6 +1607,111 @@ nospace:
#endif
/*
+ * Free pages from mbuf_ext_pgs, assuming they were allocated via
+ * vm_page_alloc() and aren't associated with any object. Complement
+ * to allocator from m_uiotombuf_nomap().
+ */
+void
+mb_free_mext_pgs(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int wire_adj;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ wire_adj = 0;
+ for (int i = 0; i < ext_pgs->npgs; i++) {
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ /*
+ * Note: page is not locked, as it has no
+ * object and is not on any queues.
+ */
+ vm_page_free_toq(pg);
+ wire_adj++;
+ }
+ if (wire_adj)
+ vm_wire_sub(wire_adj);
+}
+
+static struct mbuf *
+m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
+{
+ struct mbuf *m, *mb, *prev;
+ struct mbuf_ext_pgs *pgs;
+ vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
+ int error, length, i, needed, wire_adj = 0;
+ ssize_t total;
+ int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP;
+
+ /*
+ * len can be zero or an arbitrary large value bound by
+ * the total data supplied by the uio.
+ */
+ if (len > 0)
+ total = MIN(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ if (maxseg == 0)
+ maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
+
+ /*
+ * Allocate the pages
+ */
+ m = NULL;
+ while (total > 0) {
+ mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR),
+ mb_free_mext_pgs);
+ if (mb == NULL)
+ goto failed;
+ if (m == NULL)
+ m = mb;
+ else
+ prev->m_next = mb;
+ prev = mb;
+ pgs = mb->m_ext.ext_pgs;
+ needed = length = MIN(maxseg, total);
+ for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
+retry_page:
+ pg_array[i] = vm_page_alloc(NULL, 0, pflags);
+ if (pg_array[i] == NULL) {
+ if (wire_adj)
+ vm_wire_add(wire_adj);
+ wire_adj = 0;
+ if (how & M_NOWAIT) {
+ goto failed;
+ } else {
+ vm_wait(NULL);
+ goto retry_page;
+ }
+ }
+ wire_adj++;
+ pg_array[i]->flags &= ~PG_ZERO;
+ pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
+ pgs->npgs++;
+ }
+ pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1);
+ MBUF_EXT_PGS_ASSERT_SANITY(pgs);
+ vm_wire_add(wire_adj);
+ wire_adj = 0;
+ total -= length;
+ error = uiomove_fromphys(pg_array, 0, length, uio);
+ if (error != 0)
+ goto failed;
+ mb->m_len = length;
+ mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs;
+ if (flags & M_PKTHDR)
+ m->m_pkthdr.len += length;
+ }
+ return (m);
+
+failed:
+ m_freem(m);
+ return (NULL);
+}
+
+/*
* Copy the contents of uio into a properly sized mbuf chain.
*/
struct mbuf *
@@ -1540,6 +1722,9 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
ssize_t total;
int progress = 0;
+ if (flags & M_NOMAP)
+ return (m_uiotombuf_nomap(uio, how, len, align, flags));
+
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
@@ -1586,6 +1771,62 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
}
/*
+ * Copy data from an unmapped mbuf into a uio limited by len if set.
+ */
+int
+m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int error, i, off, pglen, pgoff, seglen, segoff;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ error = 0;
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ off += m_off;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = uiomove(&ext_pgs->hdr[segoff], seglen, uio);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ error = uiomove_fromphys(&pg, segoff, seglen, uio);
+ pgoff = 0;
+ };
+ if (len != 0 && error == 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
+ ext_pgs->trail_len, m_off));
+ error = uiomove(&ext_pgs->trail[off], len, uio);
+ }
+ return (error);
+}
+
+/*
* Copy an mbuf chain into a uio limited by len if set.
*/
int
@@ -1603,7 +1844,10 @@ m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
for (; m != NULL; m = m->m_next) {
length = min(m->m_len, total - progress);
- error = uiomove(mtod(m, void *), length, uio);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = m_unmappedtouio(m, 0, uio, length);
+ else
+ error = uiomove(mtod(m, void *), length, uio);
if (error)
return (error);
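
The uipc_mbuf.c additions cover both copy directions: m_uiotombuf() with M_NOMAP builds an unmapped chain via m_uiotombuf_nomap(), and m_mbuftouio() copies it back out through m_unmappedtouio(). A minimal sketch under those definitions; copyin_unmapped() and copyout_chain() are hypothetical names.

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/uio.h>

/*
 * Hypothetical sketch: with M_NOMAP, m_uiotombuf() backs the chain
 * with wired pages attached to EXT_PGS mbufs, which mb_free_mext_pgs()
 * releases when the chain is freed.
 */
static struct mbuf *
copyin_unmapped(struct uio *uio, int how, int space)
{
	return (m_uiotombuf(uio, how, space, 0, M_PKTHDR | M_NOMAP));
}

/*
 * Hypothetical sketch: m_mbuftouio() now handles M_NOMAP mbufs in the
 * chain transparently.
 */
static int
copyout_chain(struct uio *uio, const struct mbuf *m, int len)
{
	return (m_mbuftouio(uio, m, len));
}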
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index daf890019804..558df3f8712f 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -89,28 +89,57 @@ sbm_clrprotoflags(struct mbuf *m, int flags)
}
/*
- * Mark ready "count" mbufs starting with "m".
+ * Mark ready "count" units of I/O starting with "m". Most mbufs
+ * count as a single unit of I/O except for EXT_PGS-backed mbufs which
+ * can be backed by multiple pages.
*/
int
-sbready(struct sockbuf *sb, struct mbuf *m, int count)
+sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
+ struct mbuf *m;
u_int blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
+ KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
+ m = m0;
blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
- for (int i = 0; i < count; i++, m = m->m_next) {
+ while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
+ if ((m->m_flags & M_EXT) != 0 &&
+ m->m_ext.ext_type == EXT_PGS) {
+ if (count < m->m_ext.ext_pgs->nrdy) {
+ m->m_ext.ext_pgs->nrdy -= count;
+ count = 0;
+ break;
+ }
+ count -= m->m_ext.ext_pgs->nrdy;
+ m->m_ext.ext_pgs->nrdy = 0;
+ } else
+ count--;
+
m->m_flags &= ~(M_NOTREADY | blocker);
if (blocker)
sb->sb_acc += m->m_len;
+ m = m->m_next;
}
- if (!blocker)
+ /*
+ * If the first mbuf is still not fully ready because only
+ * some of its backing pages were readied, no further progress
+ * can be made.
+ */
+ if (m0 == m) {
+ MPASS(m->m_flags & M_NOTREADY);
+ return (EINPROGRESS);
+ }
+
+ if (!blocker) {
return (EINPROGRESS);
+ }
/* This one was blocking all the queue. */
for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
@@ -1030,12 +1059,11 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
M_WRITABLE(n) &&
((sb->sb_flags & SB_NOCOALESCE) == 0) &&
!(m->m_flags & M_NOTREADY) &&
- !(n->m_flags & M_NOTREADY) &&
+ !(n->m_flags & (M_NOTREADY | M_NOMAP)) &&
m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
- bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
- (unsigned)m->m_len);
+ m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
n->m_len += m->m_len;
sb->sb_ccc += m->m_len;
if (sb->sb_fnrdy == NULL)
@@ -1046,6 +1074,9 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
m = m_free(m);
continue;
}
+ if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) &&
+ (m->m_flags & M_NOTREADY) == 0)
+ (void)mb_unmapped_compress(m);
if (n)
n->m_next = m;
else
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 05be168e8a84..ac7345b93d7d 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1982,7 +1982,11 @@ dontblock:
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
SOCKBUF_UNLOCK(&so->so_rcv);
- error = uiomove(mtod(m, char *) + moff, (int)len, uio);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = m_unmappedtouio(m, moff, uio, (int)len);
+ else
+ error = uiomove(mtod(m, char *) + moff,
+ (int)len, uio);
SOCKBUF_LOCK(&so->so_rcv);
if (error) {
/*
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 6ad51bfd21e8..2bec4654bfd7 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -2369,6 +2369,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
* Note that we cut corners here; we only setup what's
* absolutely needed--this mbuf should never go anywhere else.
*/
+ mb.m_flags = 0;
mb.m_next = m;
mb.m_data = data;
mb.m_len = dlen;
diff --git a/sys/net/bpf_buffer.c b/sys/net/bpf_buffer.c
index 4a1310f4ce4f..60318727ff14 100644
--- a/sys/net/bpf_buffer.c
+++ b/sys/net/bpf_buffer.c
@@ -119,19 +119,10 @@ bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
{
const struct mbuf *m;
u_char *dst;
- u_int count;
m = (struct mbuf *)src;
dst = (u_char *)buf + offset;
- while (len > 0) {
- if (m == NULL)
- panic("bpf_mcopy");
- count = min(m->m_len, len);
- bcopy(mtod(m, void *), dst, count);
- m = m->m_next;
- dst += count;
- len -= count;
- }
+ m_copydata(m, 0, len, dst);
}
/*
diff --git a/sys/net/if.h b/sys/net/if.h
index d6e032e36f7b..3c22a408f45c 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -246,6 +246,7 @@ struct if_data {
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */
+#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index 2a7eb7f56286..223262003086 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -691,11 +691,30 @@ sendit:
m->m_pkthdr.csum_flags |= CSUM_IP;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
@@ -831,11 +850,23 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m0 = mb_unmapped_to_ext(m0);
+ if (m0 == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
+ m0 = mb_unmapped_to_ext(m0);
+ if (m0 == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
diff --git a/sys/netinet/tcp_pcap.c b/sys/netinet/tcp_pcap.c
index 5cb807c8c43b..66c18dcc856f 100644
--- a/sys/netinet/tcp_pcap.c
+++ b/sys/netinet/tcp_pcap.c
@@ -311,6 +311,7 @@ tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
if (mhead->m_flags & M_EXT) {
switch (mhead->m_ext.ext_type) {
case EXT_SFBUF:
+ case EXT_PGS:
/* Don't mess around with these. */
tcp_pcap_m_freem(mhead);
continue;
@@ -383,8 +384,11 @@ tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
__func__, n->m_flags));
n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
n->m_len = m->m_len;
- bcopy(M_START(m), n->m_dat,
- m->m_len + M_LEADINGSPACE_NOWRITE(m));
+ if (m->m_flags & M_NOMAP)
+ m_copydata(m, 0, m->m_len, n->m_data);
+ else
+ bcopy(M_START(m), n->m_dat,
+ m->m_len + M_LEADINGSPACE_NOWRITE(m));
}
else {
/*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index dee08a2180c4..024f7da06960 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1190,8 +1190,7 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
- for (int i = 0; i < count; i++)
- m = m_free(m);
+ mb_free_notready(m, count);
return (ECONNRESET);
}
tp = intotcpcb(inp);
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index f1e36c9f1966..6ec682645d92 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -963,11 +963,30 @@ passout:
*/
if (sw_csum & CSUM_DELAY_DATA_IPV6) {
sw_csum &= ~CSUM_DELAY_DATA_IPV6;
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
+ } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP_IPV6) {
sw_csum &= ~CSUM_SCTP_IPV6;
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
}
#endif
@@ -1055,11 +1074,23 @@ passout:
* XXX-BZ handle the hw offloading case. Need flags.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ in6_ifstat_inc(ifp, ifs6_out_fragfail);
+ error = ENOBUFS;
+ goto bad;
+ }
in6_delayed_cksum(m, plen, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ in6_ifstat_inc(ifp, ifs6_out_fragfail);
+ error = ENOBUFS;
+ goto bad;
+ }
sctp_delayed_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
}
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index 1f9d9a7d515f..a6ddc72aca15 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -227,7 +227,15 @@ struct m_ext {
volatile u_int ext_count;
volatile u_int *ext_cnt;
};
- char *ext_buf; /* start of buffer */
+ union {
+ /*
+ * If ext_type == EXT_PGS, 'ext_pgs' points to a
+ * structure describing the buffer. Otherwise,
+ * 'ext_buf' points to the start of the buffer.
+ */
+ struct mbuf_ext_pgs *ext_pgs;
+ char *ext_buf;
+ };
uint32_t ext_size; /* size of buffer, for ext_free */
uint32_t ext_type:8, /* type of external storage */
ext_flags:24; /* external storage mbuf flags */
@@ -293,6 +301,92 @@ struct mbuf {
};
};
+struct socket;
+
+/*
+ * TLS records for TLS 1.0-1.2 can have the following header lengths:
+ * - 5 (AES-CBC with implicit IV)
+ * - 21 (AES-CBC with explicit IV)
+ * - 13 (AES-GCM with 8 byte explicit IV)
+ */
+#define MBUF_PEXT_HDR_LEN 24
+
+/*
+ * TLS records for TLS 1.0-1.2 can have the following maximum trailer
+ * lengths:
+ * - 16 (AES-GCM)
+ * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
+ * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
+ * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
+ */
+#define MBUF_PEXT_TRAIL_LEN 64
+
+#ifdef __LP64__
+#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t))
+#else
+#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t))
+#endif
+
+#define MBUF_PEXT_MAX_BYTES \
+ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
+
+/*
+ * This struct is 256 bytes in size and is arranged so that the most
+ * common case (accessing the first 4 pages of a 16KB TLS record) will
+ * fit in a single 64 byte cacheline.
+ */
+struct mbuf_ext_pgs {
+ uint8_t npgs; /* Number of attached pages */
+ uint8_t nrdy; /* Pages with I/O pending */
+ uint8_t hdr_len; /* TLS header length */
+ uint8_t trail_len; /* TLS trailer length */
+ uint16_t first_pg_off; /* Offset into 1st page */
+ uint16_t last_pg_len; /* Length of last page */
+ vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */
+ char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */
+ void *tls; /* TLS session */
+#if defined(__i386__) || \
+ (defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE))
+ /*
+ * i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is
+ * a 4 byte remainder from the space allocated for pa[].
+ */
+ uint32_t pad;
+#endif
+ union {
+ char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */
+ struct {
+ struct socket *so;
+ void *mbuf;
+ uint64_t seqno;
+ STAILQ_ENTRY(mbuf_ext_pgs) stailq;
+ };
+ };
+};
+
+#ifdef _KERNEL
+static inline int
+mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff)
+{
+ KASSERT(pgoff == 0 || pidx == 0,
+ ("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs));
+ if (pidx == ext_pgs->npgs - 1) {
+ return (ext_pgs->last_pg_len);
+ } else {
+ return (PAGE_SIZE - pgoff);
+ }
+}
+
+#ifdef INVARIANT_SUPPORT
+void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
+#endif
+#ifdef INVARIANTS
+#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs))
+#else
+#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs)
+#endif
+#endif
+
/*
* mbuf flags of global significance and layer crossing.
* Those of only protocol/layer specific significance are to be mapped
@@ -307,7 +401,7 @@ struct mbuf {
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
-#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
+#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
@@ -348,7 +442,7 @@ struct mbuf {
*/
#define M_FLAG_BITS \
"\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
- "\7M_PROMISC\10M_VLANTAG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
+ "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
#define M_FLAG_PROTOBITS \
"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
@@ -420,6 +514,7 @@ struct mbuf {
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
+#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */
@@ -464,6 +559,11 @@ struct mbuf {
"\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
"\30EXT_FLAG_EXP4"
+#define MBUF_EXT_PGS_ASSERT(m) \
+ KASSERT((((m)->m_flags & M_EXT) != 0) && \
+ ((m)->m_ext.ext_type == EXT_PGS), \
+ ("%s: m %p !M_EXT or !EXT_PGS", __func__, m))
+
/*
* Flags indicating checksum, segmentation and other offload work to be
* done, or already done, by hardware or lower layers. It is split into
@@ -566,6 +666,7 @@ struct mbuf {
#define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k"
#define MBUF_TAG_MEM_NAME "mbuf_tag"
#define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt"
+#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs"
#ifdef _KERNEL
@@ -590,9 +691,15 @@ extern uma_zone_t zone_pack;
extern uma_zone_t zone_jumbop;
extern uma_zone_t zone_jumbo9;
extern uma_zone_t zone_jumbo16;
+extern uma_zone_t zone_extpgs;
void mb_dupcl(struct mbuf *, struct mbuf *);
void mb_free_ext(struct mbuf *);
+void mb_free_mext_pgs(struct mbuf *);
+struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t);
+int mb_unmapped_compress(struct mbuf *m);
+struct mbuf *mb_unmapped_to_ext(struct mbuf *m);
+void mb_free_notready(struct mbuf *m, int count);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
@@ -627,6 +734,7 @@ struct mbuf *m_getm2(struct mbuf *, int, int, short, int);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
int m_mbuftouio(struct uio *, const struct mbuf *, int);
+int m_unmappedtouio(const struct mbuf *, int, struct uio *, int);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
int m_pkthdr_init(struct mbuf *, int);
struct mbuf *m_prepend(struct mbuf *, int, int);
@@ -881,7 +989,7 @@ m_extrefcnt(struct mbuf *m)
* be both the local data payload, or an external buffer area, depending on
* whether M_EXT is set).
*/
-#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \
+#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \
(!(((m)->m_flags & M_EXT)) || \
(m_extrefcnt(m) == 1)))
@@ -904,7 +1012,8 @@ m_extrefcnt(struct mbuf *m)
* handling external storage, packet-header mbufs, and regular data mbufs.
*/
#define M_START(m) \
- (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
+ (((m)->m_flags & M_NOMAP) ? NULL : \
+ ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \
&(m)->m_dat[0])
diff --git a/sys/sys/sglist.h b/sys/sys/sglist.h
index 5674416c07af..f11c74a4e2ae 100644
--- a/sys/sys/sglist.h
+++ b/sys/sys/sglist.h
@@ -57,6 +57,7 @@ struct sglist {
struct bio;
struct mbuf;
+struct mbuf_ext_pgs;
struct uio;
static __inline void
@@ -87,6 +88,9 @@ sglist_hold(struct sglist *sg)
struct sglist *sglist_alloc(int nsegs, int mflags);
int sglist_append(struct sglist *sg, void *buf, size_t len);
int sglist_append_bio(struct sglist *sg, struct bio *bp);
+int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+ size_t off, size_t len);
+int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m);
int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
size_t len);
@@ -101,6 +105,9 @@ struct sglist *sglist_build(void *buf, size_t len, int mflags);
struct sglist *sglist_clone(struct sglist *sg, int mflags);
int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
int sglist_count(void *buf, size_t len);
+int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off,
+ size_t len);
+int sglist_count_mb_ext_pgs(struct mbuf *m);
int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
void sglist_free(struct sglist *sg);
int sglist_join(struct sglist *first, struct sglist *second);
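
Taken together, the if.h, ip_output.c, and ip6_output.c hunks establish a simple transmit-side contract: a chain containing unmapped mbufs may only be handed to an interface that advertises IFCAP_NOMAP; otherwise (or when a checksum must be computed in software) the chain is first downgraded with mb_unmapped_to_ext(). A minimal sketch of that contract, assuming the IFCAP_NOMAP flag and mb_unmapped_to_ext() from this diff; tx_prepare() is a hypothetical name.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <net/if.h>
#include <net/if_var.h>

/*
 * Hypothetical sketch: downgrade an unmapped chain before handing it
 * to an interface that cannot DMA unmapped mbufs.  On failure the
 * original chain has already been freed by mb_unmapped_to_ext().
 */
static int
tx_prepare(struct ifnet *ifp, struct mbuf **mp)
{
	if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
		*mp = mb_unmapped_to_ext(*mp);
		if (*mp == NULL)
			return (ENOBUFS);
	}
	return (0);
}

Receive-side consumers do not need an equivalent step: as the uipc_socket.c hunk shows, they copy out of unmapped mbufs with m_unmappedtouio() instead of uiomove().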