-rw-r--r--  share/man/man4/Makefile            |    1
-rw-r--r--  share/man/man4/netmap.4            |  300
-rw-r--r--  sys/dev/netmap/head.diff           |  654
-rw-r--r--  sys/dev/netmap/if_em_netmap.h      |  383
-rw-r--r--  sys/dev/netmap/if_igb_netmap.h     |  378
-rw-r--r--  sys/dev/netmap/if_lem_netmap.h     |  344
-rw-r--r--  sys/dev/netmap/if_re_netmap.h      |  415
-rw-r--r--  sys/dev/netmap/ixgbe_netmap.h      |  376
-rw-r--r--  sys/dev/netmap/netmap.c            | 1762
-rw-r--r--  sys/dev/netmap/netmap_kern.h       |  221
-rw-r--r--  sys/net/netmap.h                   |  281
-rw-r--r--  sys/net/netmap_user.h              |   98
-rw-r--r--  tools/tools/README                 |    1
-rw-r--r--  tools/tools/netmap/Makefile        |   25
-rw-r--r--  tools/tools/netmap/README          |   11
-rw-r--r--  tools/tools/netmap/bridge.c        |  456
-rw-r--r--  tools/tools/netmap/click-test.cfg  |   19
-rw-r--r--  tools/tools/netmap/pcap.c          |  761
-rw-r--r--  tools/tools/netmap/pkt-gen.c       | 1021
19 files changed, 7507 insertions, 0 deletions
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index e5fa955f6cc9..0d5a780970e4 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -253,6 +253,7 @@ MAN= aac.4 \
net80211.4 \
netgraph.4 \
netintro.4 \
+ netmap.4 \
${_nfe.4} \
${_nfsmb.4} \
ng_async.4 \
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4
new file mode 100644
index 000000000000..8b646f9fa070
--- /dev/null
+++ b/share/man/man4/netmap.4
@@ -0,0 +1,300 @@
+.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" This document is derived in part from the enet man page (enet.4)
+.\" distributed with 4.3BSD Unix.
+.\"
+.\" $FreeBSD$
+.\" $Id: netmap.4 9662 2011-11-16 13:18:06Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $
+.\"
+.Dd November 16, 2011
+.Dt NETMAP 4
+.Os
+.Sh NAME
+.Nm netmap
+.Nd a framework for fast packet I/O
+.Sh SYNOPSIS
+.Cd device netmap
+.Sh DESCRIPTION
+.Nm
+is a framework for fast and safe access to network devices
+(reaching 14.88 Mpps at less than 1 GHz).
+.Nm
+uses memory mapped buffers and metadata
+(buffer indexes and lengths) to communicate with the kernel,
+which is in charge of validating information through
+.Pa ioctl()
+and
+.Pa select()/poll().
+.Nm
+can exploit the parallelism in multiqueue devices and
+multicore systems.
+.Pp
+.Nm
+requires explicit support in device drivers.
+For a list of supported devices, see the end of this manual page.
+.Sh OPERATION
+.Nm
+clients must first open the special device
+.Pa /dev/netmap ,
+and then issue an
+.Pa ioctl(...,NIOCREGIF,...)
+to bind the file descriptor to a network device.
+.Pp
+When a device is put in
+.Nm
+mode, its data path is disconnected from the host stack.
+The processes owning the file descriptor
+can exchange packets with the device, or with the host stack,
+through an mmapped memory region that contains pre-allocated
+buffers and metadata.
+.Pp
+Non-blocking I/O is done with special
+.Pa ioctl()'s ,
+whereas the file descriptor can be passed to
+.Pa select()/poll()
+to be notified about incoming packets or available transmit buffers.
+.Ss Data structures
+All data structures for all devices in
+.Nm
+mode are in a memory
+region shared by the kernel and all processes
+that open
+.Pa /dev/netmap
+(NOTE: visibility may be restricted in future implementations).
+All references between the shared data structures
+are relative (offsets or indexes). Some macros help convert
+them into actual pointers.
+.Pp
+The data structures in shared memory are the following:
+.Pp
+.Bl -tag -width XXX
+.It Dv struct netmap_if (one per interface)
+indicates the number of rings supported by an interface, their
+sizes, and the offsets of the
+.Pa netmap_rings
+associated with the interface.
+The offset of a
+.Pa struct netmap_if
+in the shared memory region is indicated by the
+.Pa nr_offset
+field in the structure returned by the
+.Pa NIOCREGIF
+(see below).
+.Bd -literal
+struct netmap_if {
+ char ni_name[IFNAMSIZ]; /* name of the interface. */
+ const u_int ni_num_queues; /* number of hw ring pairs */
+ const ssize_t ring_ofs[]; /* offset of tx and rx rings */
+};
+.Ed
+.It Dv struct netmap_ring (one per ring)
+contains the index of the current read or write slot (cur),
+the number of slots available for reception or transmission (avail),
+and an array of
+.Pa slots
+describing the buffers.
+There is one ring pair for each of the N hardware ring pairs
+supported by the card (numbered 0..N-1), plus
+one ring pair (numbered N) for packets from/to the host stack.
+.Bd -literal
+struct netmap_ring {
+ const ssize_t buf_ofs;
+ const uint32_t num_slots; /* number of slots in the ring. */
+ uint32_t avail; /* number of usable slots */
+ uint32_t cur; /* 'current' index for the user side */
+
+ const uint16_t nr_buf_size;
+ uint16_t flags;
+ struct netmap_slot slot[0]; /* array of slots. */
+}
+.Ed
+.It Dv struct netmap_slot (one per packet)
+contains the metadata for a packet: a buffer index (buf_idx),
+a buffer length (len), and some flags.
+.Bd -literal
+struct netmap_slot {
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* packet length */
+ uint16_t flags; /* buf changed, etc. */
+#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
+#define NS_REPORT 0x0002 /* tell hw to report results
+ * e.g. by generating an interrupt
+ */
+};
+.Ed
+.It Dv packet buffers
+are fixed-size (approximately 2 KB) buffers allocated by the kernel
+that contain packet data. Buffer addresses are computed through
+macros.
+.El
+.Pp
+Some macros support access to objects in the shared memory
+region. In particular:
+.Bd -literal
+struct netmap_if *nifp;
+...
+struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
+struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
+int i = txring->slot[txring->cur].buf_idx;
+char *buf = NETMAP_BUF(txring, i);
+.Ed
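+.Pp
+As an example, the following sketch (assuming the file descriptor
+has already been bound with NIOCREGIF and the region mapped with
+mmap(), as described below) walks the slots of a receive ring
+using these macros:
+.Bd -literal
+struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);
+
+while (rxring->avail > 0) {
+	int i = rxring->cur;
+	char *buf = NETMAP_BUF(rxring, rxring->slot[i].buf_idx);
+
+	/* consume rxring->slot[i].len bytes of data at buf */
+	rxring->cur = NETMAP_RING_NEXT(rxring, i);
+	rxring->avail--;
+}
+.Ed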
+.Ss IOCTLS
+.Pp
+.Nm
+supports a few ioctl() commands to synchronize the state of the rings
+between the kernel and the user processes, plus others
+to query and configure the interface.
+The former do not require any argument, whereas the latter
+use a
+.Pa struct nmreq
+defined as follows:
+.Bd -literal
+struct nmreq {
+ char nr_name[IFNAMSIZ];
+ uint32_t nr_offset; /* nifp offset in the shared region */
+ uint32_t nr_memsize; /* size of the shared region */
+ uint32_t nr_numdescs; /* descriptors per queue */
+ uint16_t nr_numqueues;
+ uint16_t nr_ringid; /* ring(s) we care about */
+#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */
+#define NETMAP_SW_RING 0x2000 /* we process the sw ring */
+#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
+#define NETMAP_RING_MASK 0xfff /* the actual ring number */
+};
+
+.Ed
+A device descriptor obtained through
+.Pa /dev/netmap
+also accepts the ioctl commands supported by network devices.
+.Pp
+The netmap-specific
+.Xr ioctl 2
+command codes below are defined in
+.In net/netmap.h
+and are:
+.Bl -tag -width XXXX
+.It Dv NIOCGINFO
+returns information about the interface named in nr_name.
+On return, nr_memsize indicates the size of the shared netmap
+memory region (this is device-independent),
+nr_numdescs indicates how many slots are in each ring, and
+nr_numqueues indicates the number of rings supported by the hardware.
+.Pp
+If the device does not support netmap, the ioctl returns EINVAL.
+.It Dv NIOCREGIF
+puts the interface named in nr_name into netmap mode, disconnecting
+it from the host stack, and/or defines which rings are controlled
+through this file descriptor.
+On return, it gives the same info as NIOCGINFO, and nr_ringid
+indicates the identity of the rings controlled through the file
+descriptor.
+.Pp
+Possible values for nr_ringid are
+.Bl -tag -width XXXXX
+.It 0
+default, all hardware rings
+.It NETMAP_SW_RING
+the ``host rings'' connecting to the host stack
+.It NETMAP_HW_RING + i
+the i-th hardware ring
+.El
+.Pp
+By default, a
+.Nm poll
+or
+.Nm select
+call pushes out any pending packets on the transmit ring, even if
+no write events are specified.
+The feature can be disabled by or-ing
+.Nm NETMAP_NO_TX_POLL
+into nr_ringid.
+Normally you should keep this feature enabled, unless you are using
+separate file descriptors for the send and receive rings, because
+otherwise packets are pushed out only when NIOCTXSYNC is called
+or the send queue is full.
+.Pp
+.Pa NIOCREGIF
+can be used multiple times to change the association of a
+file descriptor to a ring pair, always within the same device.
+.It Dv NIOCUNREGIF
+brings an interface back to normal mode.
+.It Dv NIOCTXSYNC
+tells the hardware of new packets to transmit, and updates the
+number of slots available for transmission.
+.It Dv NIOCRXSYNC
+tells the hardware of consumed packets, and asks for newly available
+packets.
+.El
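+.Pp
+As an illustration, the following sketch (error handling mostly
+omitted; fd is a descriptor obtained by opening
+.Pa /dev/netmap )
+queries an interface with
+.Dv NIOCGINFO :
+.Bd -literal
+struct nmreq req;
+
+bzero(&req, sizeof(req));
+strcpy(req.nr_name, "ix0");
+if (ioctl(fd, NIOCGINFO, &req) == -1)
+	... the interface has no netmap support (EINVAL) ...
+/*
+ * req.nr_memsize, req.nr_numdescs and req.nr_numqueues now describe
+ * the shared memory region and the ring geometry of "ix0".
+ */
+.Ed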
+.Ss SYSTEM CALLS
+.Nm
+uses
+.Nm select
+and
+.Nm poll
+to wake up processes when significant events occur.
+.Sh EXAMPLES
+The following code implements a traffic generator
+.Pp
+.Bd -literal -compact
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+struct netmap_if *nifp;
+struct netmap_ring *ring;
+struct nmreq nmr;
+struct pollfd fds;
+
+fd = open("/dev/netmap", O_RDWR);
+bzero(&nmr, sizeof(nmr));
+strcpy(nmr.nr_name, "ix0");
+ioctl(fd, NIOCREGIF, &nmr);
+p = mmap(0, nmr.nr_memsize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+nifp = NETMAP_IF(p, nmr.nr_offset);
+ring = NETMAP_TXRING(nifp, 0);
+fds.fd = fd;
+fds.events = POLLOUT;
+for (;;) {
+	poll(&fds, 1, -1);
+	while (ring->avail-- > 0) {
+		i = ring->cur;
+		buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+		... prepare packet in buf ...
+		ring->slot[i].len = ... packet length ...
+		ring->cur = NETMAP_RING_NEXT(ring, i);
+	}
+}
+.Ed
+.Sh SUPPORTED INTERFACES
+.Nm
+supports the following interfaces:
+.Xr em 4 ,
+.Xr ixgbe 4 ,
+.Xr re 4 .
+.Sh AUTHORS
+The
+.Nm
+framework has been designed and implemented by
+.An Luigi Rizzo
+and
+.An Matteo Landi
+in 2011 at the Universita` di Pisa.
diff --git a/sys/dev/netmap/head.diff b/sys/dev/netmap/head.diff
new file mode 100644
index 000000000000..51a8e34e74d1
--- /dev/null
+++ b/sys/dev/netmap/head.diff
@@ -0,0 +1,654 @@
+Index: conf/NOTES
+===================================================================
+--- conf/NOTES (revision 227552)
++++ conf/NOTES (working copy)
+@@ -799,6 +799,12 @@
+ # option. DHCP requires bpf.
+ device bpf
+
++# The `netmap' device implements memory-mapped access to network
++# devices from userspace, enabling wire-speed packet capture and
++# generation even at 10Gbit/s. Requires support in the device
++# driver. Supported drivers are ixgbe, e1000, re.
++device netmap
++
+ # The `disc' device implements a minimal network interface,
+ # which throws away all packets sent and never receives any. It is
+ # included for testing and benchmarking purposes.
+Index: conf/files
+===================================================================
+--- conf/files (revision 227552)
++++ conf/files (working copy)
+@@ -1507,6 +1507,7 @@
+ dev/my/if_my.c optional my
+ dev/ncv/ncr53c500.c optional ncv
+ dev/ncv/ncr53c500_pccard.c optional ncv pccard
++dev/netmap/netmap.c optional netmap
+ dev/nge/if_nge.c optional nge
+ dev/nxge/if_nxge.c optional nxge
+ dev/nxge/xgehal/xgehal-device.c optional nxge
+Index: conf/options
+===================================================================
+--- conf/options (revision 227552)
++++ conf/options (working copy)
+@@ -689,6 +689,7 @@
+
+ # various 'device presence' options.
+ DEV_BPF opt_bpf.h
++DEV_NETMAP opt_global.h
+ DEV_MCA opt_mca.h
+ DEV_CARP opt_carp.h
+ DEV_SPLASH opt_splash.h
+Index: dev/e1000/if_igb.c
+===================================================================
+--- dev/e1000/if_igb.c (revision 227552)
++++ dev/e1000/if_igb.c (working copy)
+@@ -369,6 +369,9 @@
+ &igb_rx_process_limit, 0,
+ "Maximum number of received packets to process at a time, -1 means unlimited");
+
++#ifdef DEV_NETMAP
++#include <dev/netmap/if_igb_netmap.h>
++#endif /* DEV_NETMAP */
+ /*********************************************************************
+ * Device identification routine
+ *
+@@ -664,6 +667,9 @@
+ adapter->led_dev = led_create(igb_led_func, adapter,
+ device_get_nameunit(dev));
+
++#ifdef DEV_NETMAP
++ igb_netmap_attach(adapter);
++#endif /* DEV_NETMAP */
+ INIT_DEBUGOUT("igb_attach: end");
+
+ return (0);
+@@ -742,6 +748,9 @@
+
+ callout_drain(&adapter->timer);
+
++#ifdef DEV_NETMAP
++ netmap_detach(adapter->ifp);
++#endif /* DEV_NETMAP */
+ igb_free_pci_resources(adapter);
+ bus_generic_detach(dev);
+ if_free(ifp);
+@@ -3212,6 +3221,10 @@
+ struct adapter *adapter = txr->adapter;
+ struct igb_tx_buffer *txbuf;
+ int i;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp),
++ NR_TX, txr->me, 0);
++#endif
+
+ /* Clear the old descriptor contents */
+ IGB_TX_LOCK(txr);
+@@ -3231,6 +3244,13 @@
+ m_freem(txbuf->m_head);
+ txbuf->m_head = NULL;
+ }
++#ifdef DEV_NETMAP
++ if (slot) {
++ netmap_load_map(txr->txtag, txbuf->map,
++ NMB(slot), adapter->rx_mbuf_sz);
++ slot++;
++ }
++#endif /* DEV_NETMAP */
+ /* clear the watch index */
+ txbuf->next_eop = -1;
+ }
+@@ -3626,6 +3646,19 @@
+
+ IGB_TX_LOCK_ASSERT(txr);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ struct netmap_adapter *na = NA(ifp);
++
++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
++ IGB_TX_UNLOCK(txr);
++ IGB_CORE_LOCK(adapter);
++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
++ IGB_CORE_UNLOCK(adapter);
++ IGB_TX_LOCK(txr); // the caller is supposed to own the lock
++ return FALSE;
++ }
++#endif /* DEV_NETMAP */
+ if (txr->tx_avail == adapter->num_tx_desc) {
+ txr->queue_status = IGB_QUEUE_IDLE;
+ return FALSE;
+@@ -3949,6 +3982,10 @@
+ bus_dma_segment_t pseg[1], hseg[1];
+ struct lro_ctrl *lro = &rxr->lro;
+ int rsize, nsegs, error = 0;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp),
++ NR_RX, rxr->me, 0);
++#endif
+
+ adapter = rxr->adapter;
+ dev = adapter->dev;
+@@ -3974,6 +4011,18 @@
+ struct mbuf *mh, *mp;
+
+ rxbuf = &rxr->rx_buffers[j];
++#ifdef DEV_NETMAP
++ if (slot) {
++ netmap_load_map(rxr->ptag,
++ rxbuf->pmap, NMB(slot),
++ adapter->rx_mbuf_sz);
++ /* Update descriptor */
++ rxr->rx_base[j].read.pkt_addr =
++ htole64(vtophys(NMB(slot)));
++ slot++;
++ continue;
++ }
++#endif /* DEV_NETMAP */
+ if (rxr->hdr_split == FALSE)
+ goto skip_head;
+
+@@ -4436,6 +4485,19 @@
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ struct netmap_adapter *na = NA(ifp);
++
++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET);
++ IGB_RX_UNLOCK(rxr);
++ IGB_CORE_LOCK(adapter);
++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET);
++ IGB_CORE_UNLOCK(adapter);
++ return (0);
++ }
++#endif /* DEV_NETMAP */
++
+ /* Main clean loop */
+ for (i = rxr->next_to_check; count != 0;) {
+ struct mbuf *sendmp, *mh, *mp;
+Index: dev/e1000/if_lem.c
+===================================================================
+--- dev/e1000/if_lem.c (revision 227552)
++++ dev/e1000/if_lem.c (working copy)
+@@ -316,6 +316,10 @@
+ /* Global used in WOL setup with multiport cards */
+ static int global_quad_port_a = 0;
+
++#ifdef DEV_NETMAP
++#include <dev/netmap/if_lem_netmap.h>
++#endif /* DEV_NETMAP */
++
+ /*********************************************************************
+ * Device identification routine
+ *
+@@ -646,6 +650,9 @@
+ adapter->led_dev = led_create(lem_led_func, adapter,
+ device_get_nameunit(dev));
+
++#ifdef DEV_NETMAP
++ lem_netmap_attach(adapter);
++#endif /* DEV_NETMAP */
+ INIT_DEBUGOUT("lem_attach: end");
+
+ return (0);
+@@ -724,6 +731,9 @@
+ callout_drain(&adapter->timer);
+ callout_drain(&adapter->tx_fifo_timer);
+
++#ifdef DEV_NETMAP
++ netmap_detach(ifp);
++#endif /* DEV_NETMAP */
+ lem_free_pci_resources(adapter);
+ bus_generic_detach(dev);
+ if_free(ifp);
+@@ -2637,6 +2647,9 @@
+ lem_setup_transmit_structures(struct adapter *adapter)
+ {
+ struct em_buffer *tx_buffer;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_TX, 0, 0);
++#endif
+
+ /* Clear the old ring contents */
+ bzero(adapter->tx_desc_base,
+@@ -2650,6 +2663,15 @@
+ bus_dmamap_unload(adapter->txtag, tx_buffer->map);
+ m_freem(tx_buffer->m_head);
+ tx_buffer->m_head = NULL;
++#ifdef DEV_NETMAP
++ if (slot) {
++ /* reload the map for netmap mode */
++ netmap_load_map(adapter->txtag,
++ tx_buffer->map, NMB(slot),
++ NA(adapter->ifp)->buff_size);
++ slot++;
++ }
++#endif /* DEV_NETMAP */
+ tx_buffer->next_eop = -1;
+ }
+
+@@ -2951,6 +2973,12 @@
+
+ EM_TX_LOCK_ASSERT(adapter);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET);
++ return;
++ }
++#endif /* DEV_NETMAP */
+ if (adapter->num_tx_desc_avail == adapter->num_tx_desc)
+ return;
+
+@@ -3181,6 +3209,9 @@
+ {
+ struct em_buffer *rx_buffer;
+ int i, error;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_RX, 0, 0);
++#endif
+
+ /* Reset descriptor ring */
+ bzero(adapter->rx_desc_base,
+@@ -3200,6 +3231,18 @@
+
+ /* Allocate new ones. */
+ for (i = 0; i < adapter->num_rx_desc; i++) {
++#ifdef DEV_NETMAP
++ if (slot) {
++ netmap_load_map(adapter->rxtag,
++ rx_buffer->map, NMB(slot),
++ NA(adapter->ifp)->buff_size);
++ /* Update descriptor */
++ adapter->rx_desc_base[i].buffer_addr =
++ htole64(vtophys(NMB(slot)));
++ slot++;
++ continue;
++ }
++#endif /* DEV_NETMAP */
+ error = lem_get_buf(adapter, i);
+ if (error)
+ return (error);
+@@ -3407,6 +3450,14 @@
+ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ selwakeuppri(&NA(ifp)->rx_rings[0].si, PI_NET);
++ EM_RX_UNLOCK(adapter);
++ return (0);
++ }
++#endif /* DEV_NETMAP */
++
+ if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
+ if (done != NULL)
+ *done = rx_sent;
+Index: dev/e1000/if_em.c
+===================================================================
+--- dev/e1000/if_em.c (revision 227552)
++++ dev/e1000/if_em.c (working copy)
+@@ -399,6 +399,10 @@
+ /* Global used in WOL setup with multiport cards */
+ static int global_quad_port_a = 0;
+
++#ifdef DEV_NETMAP
++#include <dev/netmap/if_em_netmap.h>
++#endif /* DEV_NETMAP */
++
+ /*********************************************************************
+ * Device identification routine
+ *
+@@ -714,6 +718,9 @@
+
+ adapter->led_dev = led_create(em_led_func, adapter,
+ device_get_nameunit(dev));
++#ifdef DEV_NETMAP
++ em_netmap_attach(adapter);
++#endif /* DEV_NETMAP */
+
+ INIT_DEBUGOUT("em_attach: end");
+
+@@ -785,6 +792,10 @@
+ ether_ifdetach(adapter->ifp);
+ callout_drain(&adapter->timer);
+
++#ifdef DEV_NETMAP
++ netmap_detach(ifp);
++#endif /* DEV_NETMAP */
++
+ em_free_pci_resources(adapter);
+ bus_generic_detach(dev);
+ if_free(ifp);
+@@ -3213,6 +3224,10 @@
+ struct adapter *adapter = txr->adapter;
+ struct em_buffer *txbuf;
+ int i;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp),
++ NR_TX, txr->me, 0);
++#endif
+
+ /* Clear the old descriptor contents */
+ EM_TX_LOCK(txr);
+@@ -3232,6 +3247,16 @@
+ m_freem(txbuf->m_head);
+ txbuf->m_head = NULL;
+ }
++#ifdef DEV_NETMAP
++ if (slot) {
++ /* reload the map for netmap mode */
++ netmap_load_map(txr->txtag,
++ txbuf->map, NMB(slot),
++ adapter->rx_mbuf_sz);
++ slot++;
++ }
++#endif /* DEV_NETMAP */
++
+ /* clear the watch index */
+ txbuf->next_eop = -1;
+ }
+@@ -3682,6 +3707,12 @@
+ struct ifnet *ifp = adapter->ifp;
+
+ EM_TX_LOCK_ASSERT(txr);
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ selwakeuppri(&NA(ifp)->tx_rings[txr->me].si, PI_NET);
++ return (FALSE);
++ }
++#endif /* DEV_NETMAP */
+
+ /* No work, make sure watchdog is off */
+ if (txr->tx_avail == adapter->num_tx_desc) {
+@@ -3978,6 +4009,33 @@
+ if (++j == adapter->num_rx_desc)
+ j = 0;
+ }
++#ifdef DEV_NETMAP
++ {
++ /* slot is NULL if we are not in netmap mode */
++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp),
++ NR_RX, rxr->me, rxr->next_to_check);
++ /*
++ * we need to restore all buffer addresses in the ring as they might
++ * be in the wrong state if we are exiting from netmap mode.
++ */
++ for (j = 0; j != adapter->num_rx_desc; ++j) {
++ void *addr;
++ rxbuf = &rxr->rx_buffers[j];
++ if (rxbuf->m_head == NULL && !slot)
++ continue;
++ addr = slot ? NMB(slot) : rxbuf->m_head->m_data;
++ // XXX load or reload ?
++ netmap_load_map(rxr->rxtag, rxbuf->map, addr, adapter->rx_mbuf_sz);
++ /* Update descriptor */
++ rxr->rx_base[j].buffer_addr = htole64(vtophys(addr));
++ bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD);
++ if (slot)
++ slot++;
++ }
++ /* Setup our descriptor indices */
++ NA(adapter->ifp)->rx_rings[rxr->me].nr_hwcur = rxr->next_to_check;
++ }
++#endif /* DEV_NETMAP */
+
+ fail:
+ rxr->next_to_refresh = i;
+@@ -4247,6 +4305,14 @@
+
+ EM_RX_LOCK(rxr);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ selwakeuppri(&NA(ifp)->rx_rings[rxr->me].si, PI_NET);
++ EM_RX_UNLOCK(rxr);
++ return (0);
++ }
++#endif /* DEV_NETMAP */
++
+ for (i = rxr->next_to_check, processed = 0; count != 0;) {
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+Index: dev/re/if_re.c
+===================================================================
+--- dev/re/if_re.c (revision 227552)
++++ dev/re/if_re.c (working copy)
+@@ -291,6 +291,10 @@
+ static void re_setwol (struct rl_softc *);
+ static void re_clrwol (struct rl_softc *);
+
++#ifdef DEV_NETMAP
++#include <dev/netmap/if_re_netmap.h>
++#endif /* DEV_NETMAP */
++
+ #ifdef RE_DIAG
+ static int re_diag (struct rl_softc *);
+ #endif
+@@ -1583,6 +1587,9 @@
+ */
+ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
+
++#ifdef DEV_NETMAP
++ re_netmap_attach(sc);
++#endif /* DEV_NETMAP */
+ #ifdef RE_DIAG
+ /*
+ * Perform hardware diagnostic on the original RTL8169.
+@@ -1778,6 +1785,9 @@
+ bus_dma_tag_destroy(sc->rl_ldata.rl_stag);
+ }
+
++#ifdef DEV_NETMAP
++ netmap_detach(ifp);
++#endif /* DEV_NETMAP */
+ if (sc->rl_parent_tag)
+ bus_dma_tag_destroy(sc->rl_parent_tag);
+
+@@ -1952,6 +1962,9 @@
+ sc->rl_ldata.rl_tx_desc_cnt * sizeof(struct rl_desc));
+ for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++)
+ sc->rl_ldata.rl_tx_desc[i].tx_m = NULL;
++#ifdef DEV_NETMAP
++ re_netmap_tx_init(sc);
++#endif /* DEV_NETMAP */
+ /* Set EOR. */
+ desc = &sc->rl_ldata.rl_tx_list[sc->rl_ldata.rl_tx_desc_cnt - 1];
+ desc->rl_cmdstat |= htole32(RL_TDESC_CMD_EOR);
+@@ -1979,6 +1992,9 @@
+ if ((error = re_newbuf(sc, i)) != 0)
+ return (error);
+ }
++#ifdef DEV_NETMAP
++ re_netmap_rx_init(sc);
++#endif /* DEV_NETMAP */
+
+ /* Flush the RX descriptors */
+
+@@ -2035,6 +2051,12 @@
+ RL_LOCK_ASSERT(sc);
+
+ ifp = sc->rl_ifp;
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ selwakeuppri(&NA(ifp)->rx_rings->si, PI_NET);
++ return 0;
++ }
++#endif /* DEV_NETMAP */
+ if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0)
+ jumbo = 1;
+ else
+@@ -2276,6 +2298,12 @@
+ return;
+
+ ifp = sc->rl_ifp;
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET);
++ return;
++ }
++#endif /* DEV_NETMAP */
+ /* Invalidate the TX descriptor list */
+ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
+ sc->rl_ldata.rl_tx_list_map,
+@@ -2794,6 +2822,20 @@
+
+ sc = ifp->if_softc;
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ struct netmap_kring *kring = &NA(ifp)->tx_rings[0];
++ if (sc->rl_ldata.rl_tx_prodidx != kring->nr_hwcur) {
++ /* kick the tx unit */
++ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
++#ifdef RE_TX_MODERATION
++ CSR_WRITE_4(sc, RL_TIMERCNT, 1);
++#endif
++ sc->rl_watchdog_timer = 5;
++ }
++ return;
++ }
++#endif /* DEV_NETMAP */
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING || (sc->rl_flags & RL_FLAG_LINK) == 0)
+ return;
+Index: dev/ixgbe/ixgbe.c
+===================================================================
+--- dev/ixgbe/ixgbe.c (revision 227552)
++++ dev/ixgbe/ixgbe.c (working copy)
+@@ -313,6 +313,10 @@
+ static int fdir_pballoc = 1;
+ #endif
+
++#ifdef DEV_NETMAP
++#include <dev/netmap/ixgbe_netmap.h>
++#endif /* DEV_NETMAP */
++
+ /*********************************************************************
+ * Device identification routine
+ *
+@@ -578,6 +582,9 @@
+
+ ixgbe_add_hw_stats(adapter);
+
++#ifdef DEV_NETMAP
++ ixgbe_netmap_attach(adapter);
++#endif /* DEV_NETMAP */
+ INIT_DEBUGOUT("ixgbe_attach: end");
+ return (0);
+ err_late:
+@@ -652,6 +659,9 @@
+
+ ether_ifdetach(adapter->ifp);
+ callout_drain(&adapter->timer);
++#ifdef DEV_NETMAP
++ netmap_detach(adapter->ifp);
++#endif /* DEV_NETMAP */
+ ixgbe_free_pci_resources(adapter);
+ bus_generic_detach(dev);
+ if_free(adapter->ifp);
+@@ -1719,6 +1729,7 @@
+ if (++i == adapter->num_tx_desc)
+ i = 0;
+
++ // XXX should we sync each buffer ?
+ txbuf->m_head = NULL;
+ txbuf->eop_index = -1;
+ }
+@@ -2813,6 +2824,10 @@
+ struct adapter *adapter = txr->adapter;
+ struct ixgbe_tx_buf *txbuf;
+ int i;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp),
++ NR_TX, txr->me, 0);
++#endif
+
+ /* Clear the old ring contents */
+ IXGBE_TX_LOCK(txr);
+@@ -2832,6 +2847,13 @@
+ m_freem(txbuf->m_head);
+ txbuf->m_head = NULL;
+ }
++#ifdef DEV_NETMAP
++ if (slot) {
++ netmap_load_map(txr->txtag, txbuf->map,
++ NMB(slot), adapter->rx_mbuf_sz);
++ slot++;
++ }
++#endif /* DEV_NETMAP */
+ /* Clear the EOP index */
+ txbuf->eop_index = -1;
+ }
+@@ -3310,6 +3332,20 @@
+
+ mtx_assert(&txr->tx_mtx, MA_OWNED);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ struct netmap_adapter *na = NA(ifp);
++
++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
++ IXGBE_TX_UNLOCK(txr);
++ IXGBE_CORE_LOCK(adapter);
++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
++ IXGBE_CORE_UNLOCK(adapter);
++ IXGBE_TX_LOCK(txr); // the caller is supposed to own the lock
++ return (FALSE);
++ }
++#endif /* DEV_NETMAP */
++
+ if (txr->tx_avail == adapter->num_tx_desc) {
+ txr->queue_status = IXGBE_QUEUE_IDLE;
+ return FALSE;
+@@ -3698,6 +3734,10 @@
+ bus_dma_segment_t pseg[1], hseg[1];
+ struct lro_ctrl *lro = &rxr->lro;
+ int rsize, nsegs, error = 0;
++#ifdef DEV_NETMAP
++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp),
++ NR_RX, rxr->me, 0);
++#endif /* DEV_NETMAP */
+
+ adapter = rxr->adapter;
+ ifp = adapter->ifp;
+@@ -3721,6 +3761,18 @@
+ struct mbuf *mh, *mp;
+
+ rxbuf = &rxr->rx_buffers[j];
++#ifdef DEV_NETMAP
++ if (slot) {
++ netmap_load_map(rxr->ptag,
++ rxbuf->pmap, NMB(slot),
++ adapter->rx_mbuf_sz);
++ /* Update descriptor */
++ rxr->rx_base[j].read.pkt_addr =
++ htole64(vtophys(NMB(slot)));
++ slot++;
++ continue;
++ }
++#endif /* DEV_NETMAP */
+ /*
+ ** Don't allocate mbufs if not
+ ** doing header split, its wasteful
+@@ -4148,6 +4200,18 @@
+
+ IXGBE_RX_LOCK(rxr);
+
++#ifdef DEV_NETMAP
++ if (ifp->if_capenable & IFCAP_NETMAP) {
++ struct netmap_adapter *na = NA(ifp);
++
++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET);
++ IXGBE_RX_UNLOCK(rxr);
++ IXGBE_CORE_LOCK(adapter);
++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET);
++ IXGBE_CORE_UNLOCK(adapter);
++ return (0);
++ }
++#endif /* DEV_NETMAP */
+ for (i = rxr->next_to_check; count != 0;) {
+ struct mbuf *sendmp, *mh, *mp;
+ u32 rsc, ptype;
diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h
new file mode 100644
index 000000000000..0e220e755d68
--- /dev/null
+++ b/sys/dev/netmap/if_em_netmap.h
@@ -0,0 +1,383 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: if_em_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * netmap changes for if_em.
+ */
+
+#include <net/netmap.h>
+#include <sys/selinfo.h>
+#include <vm/vm.h>
+#include <vm/pmap.h> /* vtophys ? */
+#include <dev/netmap/netmap_kern.h>
+
+static void em_netmap_block_tasks(struct adapter *);
+static void em_netmap_unblock_tasks(struct adapter *);
+static int em_netmap_reg(struct ifnet *, int onoff);
+static int em_netmap_txsync(void *, u_int, int);
+static int em_netmap_rxsync(void *, u_int, int);
+static void em_netmap_lock_wrapper(void *, int, u_int);
+
+static void
+em_netmap_attach(struct adapter *adapter)
+{
+ struct netmap_adapter na;
+
+ bzero(&na, sizeof(na));
+
+ na.ifp = adapter->ifp;
+ na.separate_locks = 1;
+ na.num_tx_desc = adapter->num_tx_desc;
+ na.num_rx_desc = adapter->num_rx_desc;
+ na.nm_txsync = em_netmap_txsync;
+ na.nm_rxsync = em_netmap_rxsync;
+ na.nm_lock = em_netmap_lock_wrapper;
+ na.nm_register = em_netmap_reg;
+ /*
+ * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode
+ * we allocate the buffers on the first register. So we must
+ * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set.
+ */
+ na.buff_size = MCLBYTES;
+ netmap_attach(&na, adapter->num_queues);
+}
+
+
+/*
+ * wrapper to export locks to the generic code
+ */
+static void
+em_netmap_lock_wrapper(void *_a, int what, u_int queueid)
+{
+ struct adapter *adapter = _a;
+
+ ASSERT(queueid < adapter->num_queues);
+ switch (what) {
+ case NETMAP_CORE_LOCK:
+ EM_CORE_LOCK(adapter);
+ break;
+ case NETMAP_CORE_UNLOCK:
+ EM_CORE_UNLOCK(adapter);
+ break;
+ case NETMAP_TX_LOCK:
+ EM_TX_LOCK(&adapter->tx_rings[queueid]);
+ break;
+ case NETMAP_TX_UNLOCK:
+ EM_TX_UNLOCK(&adapter->tx_rings[queueid]);
+ break;
+ case NETMAP_RX_LOCK:
+ EM_RX_LOCK(&adapter->rx_rings[queueid]);
+ break;
+ case NETMAP_RX_UNLOCK:
+ EM_RX_UNLOCK(&adapter->rx_rings[queueid]);
+ break;
+ }
+}
+
+
+static void
+em_netmap_block_tasks(struct adapter *adapter)
+{
+ if (adapter->msix > 1) { /* MSIX */
+ int i;
+ struct tx_ring *txr = adapter->tx_rings;
+ struct rx_ring *rxr = adapter->rx_rings;
+
+ for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) {
+ taskqueue_block(txr->tq);
+ taskqueue_drain(txr->tq, &txr->tx_task);
+ taskqueue_block(rxr->tq);
+ taskqueue_drain(rxr->tq, &rxr->rx_task);
+ }
+ } else { /* legacy */
+ taskqueue_block(adapter->tq);
+ taskqueue_drain(adapter->tq, &adapter->link_task);
+ taskqueue_drain(adapter->tq, &adapter->que_task);
+ }
+}
+
+
+static void
+em_netmap_unblock_tasks(struct adapter *adapter)
+{
+ if (adapter->msix > 1) {
+ struct tx_ring *txr = adapter->tx_rings;
+ struct rx_ring *rxr = adapter->rx_rings;
+ int i;
+
+ for (i = 0; i < adapter->num_queues; i++) {
+ taskqueue_unblock(txr->tq);
+ taskqueue_unblock(rxr->tq);
+ }
+ } else { /* legacy */
+ taskqueue_unblock(adapter->tq);
+ }
+}
+
+/*
+ * register-unregister routine
+ */
+static int
+em_netmap_reg(struct ifnet *ifp, int onoff)
+{
+ struct adapter *adapter = ifp->if_softc;
+ struct netmap_adapter *na = NA(ifp);
+ int error = 0;
+
+ if (na == NULL)
+ return EINVAL; /* no netmap support here */
+
+ em_disable_intr(adapter);
+
+ /* Tell the stack that the interface is no longer active */
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ em_netmap_block_tasks(adapter);
+
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* save if_transmit for later restore.
+ * XXX also if_start and if_qflush ?
+ */
+ na->if_transmit = ifp->if_transmit;
+ ifp->if_transmit = netmap_start;
+
+ em_init_locked(adapter);
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ } else {
+fail:
+ /* restore if_transmit */
+ ifp->if_transmit = na->if_transmit;
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ em_init_locked(adapter); /* also enable intr */
+
+ }
+ em_netmap_unblock_tasks(adapter);
+ return (error);
+}
+
+/*
+ * Reconcile hardware and user view of the transmit ring, see
+ * ixgbe.c for details.
+ */
+static int
+em_netmap_txsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->tx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ /* generate an interrupt approximately every half ring */
+ int report_frequency = kring->nkr_num_slots >> 1;
+
+ k = ring->cur;
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ EM_TX_LOCK(txr);
+ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ /* record completed transmissions TODO
+ *
+ * instead of using TDH, we could read the transmitted status bit.
+ */
+ j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+ if (j >= kring->nkr_num_slots) { /* XXX can happen */
+ D("TDH wrap %d", j);
+ j -= kring->nkr_num_slots;
+ }
+ int delta = j - txr->next_to_clean;
+ if (delta) {
+ /* new transmissions were completed, increment
+ ring->nr_hwavail. */
+ if (delta < 0)
+ delta += kring->nkr_num_slots;
+ txr->next_to_clean = j;
+ kring->nr_hwavail += delta;
+ }
+
+ /* update avail to what the hardware knows */
+ ring->avail = kring->nr_hwavail;
+
+ j = kring->nr_hwcur;
+ if (j != k) { /* we have packets to send */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct e1000_tx_desc *curr = &txr->tx_base[j];
+ struct em_buffer *txbuf = &txr->tx_buffers[j];
+ int flags = ((slot->flags & NS_REPORT) ||
+ j == 0 || j == report_frequency) ?
+ E1000_TXD_CMD_RS : 0;
+ void *addr = NMB(slot);
+ int len = slot->len;
+ if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
+ if (do_lock)
+ EM_TX_UNLOCK(txr);
+ return netmap_ring_reinit(kring);
+ }
+
+ slot->flags &= ~NS_REPORT;
+ curr->upper.data = 0;
+ curr->lower.data =
+ htole32(
+ adapter->txd_cmd |
+ (E1000_TXD_CMD_EOP | flags) |
+ slot->len);
+ if (slot->flags & NS_BUF_CHANGED) {
+ curr->buffer_addr = htole64(vtophys(addr));
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(txr->txtag, txbuf->map,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(txr->txtag, txbuf->map,
+ BUS_DMASYNC_PREWRITE);
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwcur = ring->cur;
+
+ /* decrease avail by number of sent packets */
+ ring->avail -= n;
+ kring->nr_hwavail = ring->avail;
+
+ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+ E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me),
+ ring->cur);
+ }
+ if (do_lock)
+ EM_TX_UNLOCK(txr);
+ return 0;
+}
+
+/*
+ * Reconcile kernel and user view of the receive ring, see ixgbe.c
+ */
+static int
+em_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ k = ring->cur;
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ EM_RX_LOCK(rxr);
+ /* XXX check sync modes */
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+ /* acknowledge all the received packets. */
+ j = rxr->next_to_check;
+ for (n = 0; ; n++) {
+ struct e1000_rx_desc *curr = &rxr->rx_base[j];
+
+ if ((curr->status & E1000_RXD_STAT_DD) == 0)
+ break;
+ ring->slot[j].len = le16toh(curr->length);
+ bus_dmamap_sync(rxr->tag, rxr->rx_buffers[j].map,
+ BUS_DMASYNC_POSTREAD);
+ j = (j == lim) ? 0 : j + 1;
+ }
+ if (n) {
+ rxr->next_to_check = j;
+ kring->nr_hwavail += n;
+ }
+
+ /* skip past packets that userspace has already processed:
+ * making them available for reception.
+ * advance nr_hwcur and issue a bus_dmamap_sync on the
+ * buffers so it is safe to write to them.
+	 * Also decrease nr_hwavail accordingly.
+ */
+ j = kring->nr_hwcur;
+ if (j != k) { /* userspace has read some packets. */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct e1000_rx_desc *curr = &rxr->rx_base[j];
+ struct em_buffer *rxbuf = &rxr->rx_buffers[j];
+ void *addr = NMB(slot);
+
+ if (addr == netmap_buffer_base) { /* bad buf */
+ if (do_lock)
+ EM_RX_UNLOCK(rxr);
+ return netmap_ring_reinit(kring);
+ }
+
+ curr->status = 0;
+ if (slot->flags & NS_BUF_CHANGED) {
+ curr->buffer_addr = htole64(vtophys(addr));
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(rxr->rxtag, rxbuf->map,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(rxr->rxtag, rxbuf->map,
+ BUS_DMASYNC_PREREAD);
+
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwavail -= n;
+ kring->nr_hwcur = ring->cur;
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+ /*
+ * IMPORTANT: we must leave one free slot in the ring,
+ * so move j back by one unit
+ */
+ j = (j == 0) ? lim : j - 1;
+ E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j);
+ }
+ /* tell userspace that there are new packets */
+ ring->avail = kring->nr_hwavail ;
+ if (do_lock)
+ EM_RX_UNLOCK(rxr);
+ return 0;
+}
diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h
new file mode 100644
index 000000000000..0c147063b211
--- /dev/null
+++ b/sys/dev/netmap/if_igb_netmap.h
@@ -0,0 +1,378 @@
+/*
+ * Copyright (C) 2011 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: if_igb_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * netmap modifications for igb
+ * contribured by Ahmed Kooli
+ */
+
+#include <net/netmap.h>
+#include <sys/selinfo.h>
+#include <vm/vm.h>
+#include <vm/pmap.h> /* vtophys ? */
+#include <dev/netmap/netmap_kern.h>
+
+static int igb_netmap_reg(struct ifnet *, int onoff);
+static int igb_netmap_txsync(void *, u_int, int);
+static int igb_netmap_rxsync(void *, u_int, int);
+static void igb_netmap_lock_wrapper(void *, int, u_int);
+
+
+static void
+igb_netmap_attach(struct adapter *adapter)
+{
+ struct netmap_adapter na;
+
+ bzero(&na, sizeof(na));
+
+ na.ifp = adapter->ifp;
+ na.separate_locks = 1;
+ na.num_tx_desc = adapter->num_tx_desc;
+ na.num_rx_desc = adapter->num_rx_desc;
+ na.nm_txsync = igb_netmap_txsync;
+ na.nm_rxsync = igb_netmap_rxsync;
+ na.nm_lock = igb_netmap_lock_wrapper;
+ na.nm_register = igb_netmap_reg;
+ /*
+ * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode
+ * we allocate the buffers on the first register. So we must
+ * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set.
+ */
+ na.buff_size = MCLBYTES;
+ netmap_attach(&na, adapter->num_queues);
+}
+
+
+/*
+ * wrapper to export locks to the generic code
+ */
+static void
+igb_netmap_lock_wrapper(void *_a, int what, u_int queueid)
+{
+ struct adapter *adapter = _a;
+
+ ASSERT(queueid < adapter->num_queues);
+ switch (what) {
+ case NETMAP_CORE_LOCK:
+ IGB_CORE_LOCK(adapter);
+ break;
+ case NETMAP_CORE_UNLOCK:
+ IGB_CORE_UNLOCK(adapter);
+ break;
+ case NETMAP_TX_LOCK:
+ IGB_TX_LOCK(&adapter->tx_rings[queueid]);
+ break;
+ case NETMAP_TX_UNLOCK:
+ IGB_TX_UNLOCK(&adapter->tx_rings[queueid]);
+ break;
+ case NETMAP_RX_LOCK:
+ IGB_RX_LOCK(&adapter->rx_rings[queueid]);
+ break;
+ case NETMAP_RX_UNLOCK:
+ IGB_RX_UNLOCK(&adapter->rx_rings[queueid]);
+ break;
+ }
+}
+
+
+/*
+ * support for netmap register/unregister. We are already under the core lock.
+ * only called on the first init or the last unregister.
+ */
+static int
+igb_netmap_reg(struct ifnet *ifp, int onoff)
+{
+ struct adapter *adapter = ifp->if_softc;
+ struct netmap_adapter *na = NA(ifp);
+ int error = 0;
+
+ if (!na)
+ return EINVAL;
+
+ igb_disable_intr(adapter);
+
+ /* Tell the stack that the interface is no longer active */
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* save if_transmit to restore it later */
+ na->if_transmit = ifp->if_transmit;
+ ifp->if_transmit = netmap_start;
+
+ igb_init_locked(adapter);
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ } else {
+fail:
+ /* restore if_transmit */
+ ifp->if_transmit = na->if_transmit;
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ igb_init_locked(adapter); /* also enables intr */
+ }
+ return (error);
+}
+
+
+/*
+ * Reconcile kernel and user view of the transmit ring.
+ *
+ * Userspace has filled tx slots up to cur (excluded).
+ * The last unused slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available
+ * (using the special value -1 to indicate idle transmit ring).
+ * The function must first update avail to what the kernel
+ * knows, subtract the newly used slots (cur - nr_hwcur)
+ * from both avail and nr_hwavail, and set nr_hwcur = cur
+ * issuing a dmamap_sync on all slots.
+ *
+ * Check parameters in the struct netmap_ring.
+ * We don't use avail, only check for bogus values.
+ * Make sure cur is valid, and same goes for buffer indexes and lengths.
+ * To avoid races, read the values once, and never use those from
+ * the ring afterwards.
+ */
+static int
+igb_netmap_txsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->tx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ /* generate an interrupt approximately every half ring */
+ int report_frequency = kring->nkr_num_slots >> 1;
+
+ k = ring->cur; /* ring is not protected by any lock */
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ IGB_TX_LOCK(txr);
+ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ /* record completed transmissions. TODO
+ *
+	 * Instead of reading from the TDH register, we could try to check
+	 * the status bit of the transmitted descriptors.
+ */
+ j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+ if (j >= kring->nkr_num_slots) /* XXX can it happen ? */
+ j -= kring->nkr_num_slots;
+ int delta = j - txr->next_to_clean;
+ if (delta) {
+ /* new tx were completed */
+ if (delta < 0)
+ delta += kring->nkr_num_slots;
+ txr->next_to_clean = j;
+ kring->nr_hwavail += delta;
+ }
+
+ /* update avail to what the hardware knows */
+ ring->avail = kring->nr_hwavail;
+
+ j = kring->nr_hwcur;
+ if (j != k) { /* we have new packets to send */
+ u32 olinfo_status = 0;
+ n = 0;
+
+ /* 82575 needs the queue index added */
+ if (adapter->hw.mac.type == e1000_82575)
+ olinfo_status |= txr->me << 4;
+
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct igb_tx_buffer *txbuf = &txr->tx_buffers[j];
+ union e1000_adv_tx_desc *curr =
+ (union e1000_adv_tx_desc *)&txr->tx_base[j];
+ void *addr = NMB(slot);
+ int flags = ((slot->flags & NS_REPORT) ||
+ j == 0 || j == report_frequency) ?
+ E1000_ADVTXD_DCMD_RS : 0;
+ int len = slot->len;
+
+ if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
+ if (do_lock)
+ IGB_TX_UNLOCK(txr);
+ return netmap_ring_reinit(kring);
+ }
+
+ slot->flags &= ~NS_REPORT;
+ curr->read.buffer_addr = htole64(vtophys(addr));
+ curr->read.olinfo_status =
+ htole32(olinfo_status |
+ (len<< E1000_ADVTXD_PAYLEN_SHIFT));
+ curr->read.cmd_type_len =
+ htole32(len | E1000_ADVTXD_DTYP_DATA |
+ E1000_ADVTXD_DCMD_IFCS |
+ E1000_ADVTXD_DCMD_DEXT |
+ E1000_ADVTXD_DCMD_EOP | flags);
+ if (slot->flags & NS_BUF_CHANGED) {
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(txr->txtag, txbuf->map,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(txr->txtag, txbuf->map,
+ BUS_DMASYNC_PREWRITE);
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwcur = k;
+
+ /* decrease avail by number of sent packets */
+ ring->avail -= n;
+ kring->nr_hwavail = ring->avail;
+
+ /* Set the watchdog */
+ txr->queue_status = IGB_QUEUE_WORKING;
+ txr->watchdog_time = ticks;
+
+ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+ E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), k);
+ }
+ if (do_lock)
+ IGB_TX_UNLOCK(txr);
+ return 0;
+}
+
+
+/*
+ * Reconcile kernel and user view of the receive ring.
+ *
+ * Userspace has read rx slots up to cur (excluded).
+ * The last unread slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available.
+ * We must subtract the newly consumed slots (cur - nr_hwcur)
+ * from nr_hwavail, clearing the descriptors for the next
+ * read, tell the hardware that they are available,
+ * and set nr_hwcur = cur and avail = nr_hwavail.
+ * issuing a dmamap_sync on all slots.
+ */
+static int
+igb_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ k = ring->cur; /* ring is not protected by any lock */
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ IGB_RX_LOCK(rxr);
+
+ /* Sync the ring. */
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+ j = rxr->next_to_check;
+ for (n = 0; ; n++) {
+ union e1000_adv_rx_desc *curr = &rxr->rx_base[j];
+ uint32_t staterr = le32toh(curr->wb.upper.status_error);
+
+ if ((staterr & E1000_RXD_STAT_DD) == 0)
+ break;
+ ring->slot[j].len = le16toh(curr->wb.upper.length);
+
+ bus_dmamap_sync(rxr->ptag,
+ rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD);
+ j = (j == lim) ? 0 : j + 1;
+ }
+ if (n) {
+ rxr->next_to_check = j;
+ kring->nr_hwavail += n;
+ if (kring->nr_hwavail >= lim - 10) {
+ ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail);
+ }
+ }
+
+ /* skip past packets that userspace has already processed,
+ * making them available for reception.
+ * advance nr_hwcur and issue a bus_dmamap_sync on the
+ * buffers so it is safe to write to them.
+	 * Also decrease nr_hwavail accordingly.
+ */
+ j = kring->nr_hwcur;
+ if (j != k) { /* userspace has read some packets. */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = ring->slot + j;
+ union e1000_adv_rx_desc *curr = &rxr->rx_base[j];
+ struct igb_rx_buf *rxbuf = rxr->rx_buffers + j;
+ void *addr = NMB(slot);
+
+ if (addr == netmap_buffer_base) { /* bad buf */
+ if (do_lock)
+ IGB_RX_UNLOCK(rxr);
+ return netmap_ring_reinit(kring);
+ }
+
+ curr->wb.upper.status_error = 0;
+ curr->read.pkt_addr = htole64(vtophys(addr));
+ if (slot->flags & NS_BUF_CHANGED) {
+ netmap_reload_map(rxr->ptag, rxbuf->pmap,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
+ BUS_DMASYNC_PREREAD);
+
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwavail -= n;
+ kring->nr_hwcur = ring->cur;
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+ /* IMPORTANT: we must leave one free slot in the ring,
+ * so move j back by one unit
+ */
+ j = (j == 0) ? lim : j - 1;
+ E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j);
+ }
+ /* tell userspace that there are new packets */
+ ring->avail = kring->nr_hwavail ;
+ if (do_lock)
+ IGB_RX_UNLOCK(rxr);
+ return 0;
+}
diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h
new file mode 100644
index 000000000000..a8f34989bcc4
--- /dev/null
+++ b/sys/dev/netmap/if_lem_netmap.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: if_lem_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * netmap support for if_lem.c
+ */
+
+#include <net/netmap.h>
+#include <sys/selinfo.h>
+#include <vm/vm.h>
+#include <vm/pmap.h> /* vtophys ? */
+#include <dev/netmap/netmap_kern.h>
+
+static int lem_netmap_reg(struct ifnet *, int onoff);
+static int lem_netmap_txsync(void *, u_int, int);
+static int lem_netmap_rxsync(void *, u_int, int);
+static void lem_netmap_lock_wrapper(void *, int, u_int);
+
+
+SYSCTL_NODE(_dev, OID_AUTO, lem, CTLFLAG_RW, 0, "lem card");
+
+static void
+lem_netmap_attach(struct adapter *adapter)
+{
+ struct netmap_adapter na;
+
+ bzero(&na, sizeof(na));
+
+ na.ifp = adapter->ifp;
+ na.separate_locks = 1;
+ na.num_tx_desc = adapter->num_tx_desc;
+ na.num_rx_desc = adapter->num_rx_desc;
+ na.nm_txsync = lem_netmap_txsync;
+ na.nm_rxsync = lem_netmap_rxsync;
+ na.nm_lock = lem_netmap_lock_wrapper;
+ na.nm_register = lem_netmap_reg;
+ na.buff_size = MCLBYTES;
+ netmap_attach(&na, 1);
+}
+
+
+static void
+lem_netmap_lock_wrapper(void *_a, int what, u_int ringid)
+{
+ struct adapter *adapter = _a;
+
+ /* only one ring here so ignore the ringid */
+ switch (what) {
+ case NETMAP_CORE_LOCK:
+ EM_CORE_LOCK(adapter);
+ break;
+ case NETMAP_CORE_UNLOCK:
+ EM_CORE_UNLOCK(adapter);
+ break;
+ case NETMAP_TX_LOCK:
+ EM_TX_LOCK(adapter);
+ break;
+ case NETMAP_TX_UNLOCK:
+ EM_TX_UNLOCK(adapter);
+ break;
+ case NETMAP_RX_LOCK:
+ EM_RX_LOCK(adapter);
+ break;
+ case NETMAP_RX_UNLOCK:
+ EM_RX_UNLOCK(adapter);
+ break;
+ }
+}
+
+
+/*
+ * Reconcile kernel and user view of the transmit ring. see ixgbe.c
+ */
+static int
+lem_netmap_txsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->tx_rings[0];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ /* generate an interrupt approximately every half ring */
+ int report_frequency = kring->nkr_num_slots >> 1;
+
+ k = ring->cur;
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ EM_TX_LOCK(adapter);
+ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ /* record completed transmissions TODO
+ *
+ * instead of using TDH, we could read the transmitted status bit.
+ */
+ j = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
+ if (j >= kring->nkr_num_slots) { /* can it happen ? */
+ D("bad TDH %d", j);
+ j -= kring->nkr_num_slots;
+ }
+ int delta = j - adapter->next_tx_to_clean;
+ if (delta) {
+ if (delta < 0)
+ delta += kring->nkr_num_slots;
+ adapter->next_tx_to_clean = j;
+ kring->nr_hwavail += delta;
+ }
+
+ /* update avail to what the hardware knows */
+ ring->avail = kring->nr_hwavail;
+
+ j = kring->nr_hwcur;
+ if (j != k) { /* we have new packets to send */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct e1000_tx_desc *curr = &adapter->tx_desc_base[j];
+ struct em_buffer *txbuf = &adapter->tx_buffer_area[j];
+ void *addr = NMB(slot);
+ int flags = ((slot->flags & NS_REPORT) ||
+ j == 0 || j == report_frequency) ?
+ E1000_TXD_CMD_RS : 0;
+ int len = slot->len;
+
+ if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
+ if (do_lock)
+ EM_TX_UNLOCK(adapter);
+ return netmap_ring_reinit(kring);
+ }
+
+ curr->upper.data = 0;
+ /* always interrupt. XXX make it conditional */
+ curr->lower.data =
+ htole32( adapter->txd_cmd | len |
+ (E1000_TXD_CMD_EOP | flags) );
+ if (slot->flags & NS_BUF_CHANGED) {
+ curr->buffer_addr = htole64(vtophys(addr));
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(adapter->txtag, txbuf->map,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(adapter->txtag, txbuf->map,
+ BUS_DMASYNC_PREWRITE);
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwcur = ring->cur;
+
+ /* decrease avail by number of sent packets */
+ ring->avail -= n;
+ kring->nr_hwavail = ring->avail;
+
+ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), ring->cur);
+ }
+ if (do_lock)
+ EM_TX_UNLOCK(adapter);
+ return 0;
+}
+
+
+/*
+ * Reconcile kernel and user view of the receive ring. see ixgbe.c
+ */
+static int
+lem_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->rx_rings[0];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ k = ring->cur;
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ EM_RX_LOCK(adapter);
+ /* XXX check sync modes */
+ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+ /* acknowledge all the received packets. */
+ j = adapter->next_rx_desc_to_check;
+ for (n = 0; ; n++) {
+ struct e1000_rx_desc *curr = &adapter->rx_desc_base[j];
+ int len = le16toh(adapter->rx_desc_base[j].length) - 4; // CRC
+
+ if ((curr->status & E1000_RXD_STAT_DD) == 0)
+ break;
+
+ if (len < 0) {
+ D("bogus pkt size at %d", j);
+ len = 0;
+ }
+ ring->slot[j].len = len;
+ bus_dmamap_sync(adapter->rxtag, adapter->rx_buffer_area[j].map,
+ BUS_DMASYNC_POSTREAD);
+ j = (j == lim) ? 0 : j + 1;
+ }
+ if (n) {
+ adapter->next_rx_desc_to_check = j;
+ kring->nr_hwavail += n;
+ }
+
+ /* skip past packets that userspace has already processed,
+ * making them available for reception. We don't need to set
+ * the length as it is the same for all slots.
+ */
+ j = kring->nr_hwcur;
+ if (j != k) { /* userspace has read some packets. */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct e1000_rx_desc *curr = &adapter->rx_desc_base[j];
+ struct em_buffer *rxbuf = &adapter->rx_buffer_area[j];
+ void *addr = NMB(slot);
+
+ if (addr == netmap_buffer_base) { /* bad buf */
+ if (do_lock)
+ EM_RX_UNLOCK(adapter);
+ return netmap_ring_reinit(kring);
+ }
+ curr = &adapter->rx_desc_base[j];
+ curr->status = 0;
+ if (slot->flags & NS_BUF_CHANGED) {
+ curr->buffer_addr = htole64(vtophys(addr));
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(adapter->rxtag, rxbuf->map,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(adapter->rxtag, rxbuf->map,
+ BUS_DMASYNC_PREREAD);
+
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwavail -= n;
+ kring->nr_hwcur = ring->cur;
+ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+ /*
+ * IMPORTANT: we must leave one free slot in the ring,
+ * so move j back by one unit
+ */
+ j = (j == 0) ? lim : j - 1;
+ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), j);
+ }
+
+ /* tell userspace that there are new packets */
+ ring->avail = kring->nr_hwavail ;
+ if (do_lock)
+ EM_RX_UNLOCK(adapter);
+ return 0;
+}
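+
+/*
+ * Receive-side accounting in the function above, with hypothetical
+ * numbers: if userspace moved cur from 20 to 30 it has consumed 10
+ * slots; the refill loop re-arms descriptors 20..29 and nr_hwavail
+ * drops by 10. RDT is then written as 29 rather than 30, so that one
+ * slot always stays free and the NIC can never completely fill the ring.
+ */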
+
+
+/*
+ * Register/unregister routine
+ */
+static int
+lem_netmap_reg(struct ifnet *ifp, int onoff)
+{
+ struct adapter *adapter = ifp->if_softc;
+ struct netmap_adapter *na = NA(ifp);
+ int error = 0;
+
+ if (!na)
+ return EINVAL;
+
+ lem_disable_intr(adapter);
+
+ /* Tell the stack that the interface is no longer active */
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ /* lem_netmap_block_tasks(adapter); */
+#ifndef EM_LEGACY_IRQ
+ taskqueue_block(adapter->tq);
+ taskqueue_drain(adapter->tq, &adapter->rxtx_task);
+ taskqueue_drain(adapter->tq, &adapter->link_task);
+#endif /* !EM_LEGACY_IRQ */
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* save if_transmit to restore it when exiting.
+ * XXX what about if_start and if_qflush ?
+ */
+ na->if_transmit = ifp->if_transmit;
+ ifp->if_transmit = netmap_start;
+
+ lem_init_locked(adapter);
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ } else {
+fail:
+ /* restore non-netmap mode */
+ ifp->if_transmit = na->if_transmit;
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ lem_init_locked(adapter); /* also enables intr */
+ }
+
+#ifndef EM_LEGACY_IRQ
+ taskqueue_unblock(adapter->tq);
+#endif /* !EM_LEGACY_IRQ */
+
+ return (error);
+}
diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h
new file mode 100644
index 000000000000..efccf3a795bc
--- /dev/null
+++ b/sys/dev/netmap/if_re_netmap.h
@@ -0,0 +1,415 @@
+/*
+ * Copyright (C) 2011 Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: if_re_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * netmap support for if_re
+ */
+
+#include <net/netmap.h>
+#include <sys/selinfo.h>
+#include <vm/vm.h>
+#include <vm/pmap.h> /* vtophys ? */
+#include <dev/netmap/netmap_kern.h>
+
+static int re_netmap_reg(struct ifnet *, int onoff);
+static int re_netmap_txsync(void *, u_int, int);
+static int re_netmap_rxsync(void *, u_int, int);
+static void re_netmap_lock_wrapper(void *, int, u_int);
+
+static void
+re_netmap_attach(struct rl_softc *sc)
+{
+ struct netmap_adapter na;
+
+ bzero(&na, sizeof(na));
+
+ na.ifp = sc->rl_ifp;
+ na.separate_locks = 0;
+ na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt;
+ na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt;
+ na.nm_txsync = re_netmap_txsync;
+ na.nm_rxsync = re_netmap_rxsync;
+ na.nm_lock = re_netmap_lock_wrapper;
+ na.nm_register = re_netmap_reg;
+ na.buff_size = MCLBYTES;
+ netmap_attach(&na, 1);
+}
+
+
+/*
+ * wrapper to export locks to the generic code
+ * This driver uses a single lock, so tx/rx lock requests should never be seen here.
+ */
+static void
+re_netmap_lock_wrapper(void *_a, int what, u_int queueid)
+{
+ struct rl_softc *adapter = _a;
+
+ switch (what) {
+ case NETMAP_CORE_LOCK:
+ RL_LOCK(adapter);
+ break;
+ case NETMAP_CORE_UNLOCK:
+ RL_UNLOCK(adapter);
+ break;
+
+ case NETMAP_TX_LOCK:
+ case NETMAP_RX_LOCK:
+ case NETMAP_TX_UNLOCK:
+ case NETMAP_RX_UNLOCK:
+ D("invalid lock call %d, no tx/rx locks here", what);
+ break;
+ }
+}
+
+
+/*
+ * support for netmap register/unregister. We are already under the core lock.
+ * Only called on the first register or the last unregister.
+ */
+static int
+re_netmap_reg(struct ifnet *ifp, int onoff)
+{
+ struct rl_softc *adapter = ifp->if_softc;
+ struct netmap_adapter *na = NA(ifp);
+ int error = 0;
+
+ if (!na)
+ return EINVAL;
+ /* Tell the stack that the interface is no longer active */
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ re_stop(adapter);
+
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* save if_transmit and restore it */
+ na->if_transmit = ifp->if_transmit;
+ /* XXX if_start and if_qflush ??? */
+ ifp->if_transmit = netmap_start;
+
+ re_init_locked(adapter);
+
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ } else {
+fail:
+ /* restore if_transmit */
+ ifp->if_transmit = na->if_transmit;
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ re_init_locked(adapter); /* also enables intr */
+ }
+ return (error);
+
+}
+
+
+/*
+ * Reconcile kernel and user view of the transmit ring.
+ *
+ * Userspace has filled tx slots up to cur (excluded).
+ * The last unused slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available
+ * (using the special value -1 to indicate idle transmit ring).
+ * The function must first update avail to what the kernel
+ * knows (translating the -1 to nkr_num_slots - 1),
+ * subtract the newly used slots (cur - nr_hwcur)
+ * from both avail and nr_hwavail, and set nr_hwcur = cur
+ * issuing a dmamap_sync on all slots.
+ */
+static int
+re_netmap_txsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct rl_softc *sc = a;
+ struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc;
+ struct netmap_adapter *na = NA(sc->rl_ifp);
+ struct netmap_kring *kring = &na->tx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ k = ring->cur;
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ RL_LOCK(sc);
+
+ /* Sync the TX descriptor list */
+ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
+ sc->rl_ldata.rl_tx_list_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+ /* record completed transmissions */
+ for (n = 0, j = sc->rl_ldata.rl_tx_considx;
+ j != sc->rl_ldata.rl_tx_prodidx;
+ n++, j = RL_TX_DESC_NXT(sc, j)) {
+ uint32_t cmdstat =
+ le32toh(sc->rl_ldata.rl_tx_list[j].rl_cmdstat);
+ if (cmdstat & RL_TDESC_STAT_OWN)
+ break;
+ }
+ if (n > 0) {
+ sc->rl_ldata.rl_tx_considx = j;
+ sc->rl_ldata.rl_tx_free += n;
+ kring->nr_hwavail += n;
+ }
+
+ /* update avail to what the hardware knows */
+ ring->avail = kring->nr_hwavail;
+
+ /* we trust prodidx, not hwcur */
+ j = kring->nr_hwcur = sc->rl_ldata.rl_tx_prodidx;
+ if (j != k) { /* we have new packets to send */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[j];
+ int cmd = slot->len | RL_TDESC_CMD_EOF |
+ RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ;
+ void *addr = NMB(slot);
+ int len = slot->len;
+
+ if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
+ if (do_lock)
+ RL_UNLOCK(sc);
+ return netmap_ring_reinit(kring);
+ }
+
+ if (j == lim) /* mark end of ring */
+ cmd |= RL_TDESC_CMD_EOR;
+
+ if (slot->flags & NS_BUF_CHANGED) {
+ uint64_t paddr = vtophys(addr);
+ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
+ desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(sc->rl_ldata.rl_tx_mtag,
+ txd[j].tx_dmamap, addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+ slot->flags &= ~NS_REPORT;
+ desc->rl_cmdstat = htole32(cmd);
+ bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag,
+ txd[j].tx_dmamap, BUS_DMASYNC_PREWRITE);
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ sc->rl_ldata.rl_tx_prodidx = kring->nr_hwcur = ring->cur;
+
+ /* decrease avail by number of sent packets */
+ ring->avail -= n;
+ kring->nr_hwavail = ring->avail;
+
+ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
+ sc->rl_ldata.rl_tx_list_map,
+ BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD);
+
+ /* start ? */
+ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
+ }
+ if (do_lock)
+ RL_UNLOCK(sc);
+ return 0;
+}
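+
+/*
+ * A short illustration of the reconciliation contract above, using
+ * hypothetical values: with nkr_num_slots = 256, nr_hwcur = 100 and
+ * ring->cur = 110, userspace has filled 10 new slots; txsync programs
+ * descriptors 100..109, advances nr_hwcur to 110 and subtracts 10 from
+ * both ring->avail and nr_hwavail. With nr_hwcur = 250 and cur = 4 the
+ * count wraps: (4 - 250 + 256) = 10 slots.
+ */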
+
+
+/*
+ * Reconcile kernel and user view of the receive ring.
+ *
+ * Userspace has read rx slots up to cur (excluded).
+ * The last unread slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available.
+ * We must subtract the newly consumed slots (cur - nr_hwcur)
+ * from nr_hwavail, clearing the descriptors for the next
+ * read, tell the hardware that they are available,
+ * and set nr_hwcur = cur and avail = nr_hwavail.
+ * issuing a dmamap_sync on all slots.
+ */
+static int
+re_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct rl_softc *sc = a;
+ struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc;
+ struct netmap_adapter *na = NA(sc->rl_ifp);
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ k = ring->cur;
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ RL_LOCK(sc);
+ /* XXX check sync modes */
+ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag,
+ sc->rl_ldata.rl_rx_list_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+ /*
+ * The device uses all the buffers in the ring, so we need
+ * another termination condition in addition to RL_RDESC_STAT_OWN
+ * cleared (all buffers could have it cleared). The easiest one
+ * is to limit the number of slots reported to 'lim'.
+ */
+ j = sc->rl_ldata.rl_rx_prodidx;
+ for (n = kring->nr_hwavail; n < lim ; n++) {
+ struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[j];
+ uint32_t rxstat = le32toh(cur_rx->rl_cmdstat);
+ uint32_t total_len;
+
+ if ((rxstat & RL_RDESC_STAT_OWN) != 0)
+ break;
+ total_len = rxstat & sc->rl_rxlenmask;
+ /* XXX subtract crc */
+ total_len = (total_len < 4) ? 0 : total_len - 4;
+ kring->ring->slot[j].len = total_len;
+ /* sync was in re_newbuf() */
+ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
+ rxd[j].rx_dmamap, BUS_DMASYNC_POSTREAD);
+ j = RL_RX_DESC_NXT(sc, j);
+ }
+ if (n != kring->nr_hwavail) {
+ sc->rl_ldata.rl_rx_prodidx = j;
+ sc->rl_ifp->if_ipackets += n - kring->nr_hwavail;
+ kring->nr_hwavail = n;
+ }
+
+ /* skip past packets that userspace has already processed,
+ * making them available for reception.
+ * advance nr_hwcur and issue a bus_dmamap_sync on the
+ * buffers so it is safe to write to them,
+ * and decrease nr_hwavail accordingly.
+ */
+ j = kring->nr_hwcur;
+ if (j != k) { /* userspace has read some packets. */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = ring->slot + j;
+ struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[j];
+ int cmd = na->buff_size | RL_RDESC_CMD_OWN;
+ void *addr = NMB(slot);
+
+ if (addr == netmap_buffer_base) { /* bad buf */
+ if (do_lock)
+ RL_UNLOCK(sc);
+ return netmap_ring_reinit(kring);
+ }
+
+ if (j == lim) /* mark end of ring */
+ cmd |= RL_RDESC_CMD_EOR;
+
+ desc->rl_cmdstat = htole32(cmd);
+ slot->flags &= ~NS_REPORT;
+ if (slot->flags & NS_BUF_CHANGED) {
+ uint64_t paddr = vtophys(addr);
+ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
+ desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
+ netmap_reload_map(sc->rl_ldata.rl_rx_mtag,
+ rxd[j].rx_dmamap, addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
+ rxd[j].rx_dmamap, BUS_DMASYNC_PREREAD);
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwavail -= n;
+ kring->nr_hwcur = k;
+ /* Flush the RX DMA ring */
+
+ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag,
+ sc->rl_ldata.rl_rx_list_map,
+ BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD);
+ }
+ /* tell userspace that there are new packets */
+ ring->avail = kring->nr_hwavail ;
+ if (do_lock)
+ RL_UNLOCK(sc);
+ return 0;
+}
+
+static void
+re_netmap_tx_init(struct rl_softc *sc)
+{
+ struct rl_txdesc *txd;
+ struct rl_desc *desc;
+ int i;
+ struct netmap_adapter *na = NA(sc->rl_ifp);
+ struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0);
+
+ /* slot is NULL if we are not in netmap mode */
+ if (!slot)
+ return;
+ /* in netmap mode, overwrite addresses and maps */
+ txd = sc->rl_ldata.rl_tx_desc;
+ desc = sc->rl_ldata.rl_tx_list;
+
+ for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) {
+ void *addr = NMB(slot+i);
+ uint64_t paddr = vtophys(addr);
+
+ desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
+ desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
+ netmap_load_map(sc->rl_ldata.rl_tx_mtag,
+ txd[i].tx_dmamap, addr, na->buff_size);
+ }
+}
+
+static void
+re_netmap_rx_init(struct rl_softc *sc)
+{
+ /* slot is NULL if we are not in netmap mode */
+ struct netmap_adapter *na = NA(sc->rl_ifp);
+ struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0);
+ struct rl_desc *desc = sc->rl_ldata.rl_rx_list;
+ uint32_t cmdstat;
+ int i;
+
+ if (!slot)
+ return;
+
+ for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) {
+ void *addr = NMB(slot+i);
+ uint64_t paddr = vtophys(addr);
+
+ desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
+ desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
+ cmdstat = slot[i].len = na->buff_size; // XXX
+ if (i == sc->rl_ldata.rl_rx_desc_cnt - 1)
+ cmdstat |= RL_RDESC_CMD_EOR;
+ desc[i].rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN);
+
+ netmap_reload_map(sc->rl_ldata.rl_rx_mtag,
+ sc->rl_ldata.rl_rx_desc[i].rx_dmamap,
+ addr, na->buff_size);
+ }
+}
diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h
new file mode 100644
index 000000000000..a4d5491d67f1
--- /dev/null
+++ b/sys/dev/netmap/ixgbe_netmap.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: ixgbe_netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * netmap modifications for ixgbe
+ */
+
+#include <net/netmap.h>
+#include <sys/selinfo.h>
+// #include <vm/vm.h>
+// #include <vm/pmap.h> /* vtophys ? */
+#include <dev/netmap/netmap_kern.h>
+
+static int ixgbe_netmap_reg(struct ifnet *, int onoff);
+static int ixgbe_netmap_txsync(void *, u_int, int);
+static int ixgbe_netmap_rxsync(void *, u_int, int);
+static void ixgbe_netmap_lock_wrapper(void *, int, u_int);
+
+
+SYSCTL_NODE(_dev, OID_AUTO, ixgbe, CTLFLAG_RW, 0, "ixgbe card");
+
+static void
+ixgbe_netmap_attach(struct adapter *adapter)
+{
+ struct netmap_adapter na;
+
+ bzero(&na, sizeof(na));
+
+ na.ifp = adapter->ifp;
+ na.separate_locks = 1;
+ na.num_tx_desc = adapter->num_tx_desc;
+ na.num_rx_desc = adapter->num_rx_desc;
+ na.nm_txsync = ixgbe_netmap_txsync;
+ na.nm_rxsync = ixgbe_netmap_rxsync;
+ na.nm_lock = ixgbe_netmap_lock_wrapper;
+ na.nm_register = ixgbe_netmap_reg;
+ /*
+ * adapter->rx_mbuf_sz is set by SIOCSIFMTU, but in netmap mode
+ * we allocate the buffers on the first register. So we must
+ * disallow a SIOCSIFMTU when if_capenable & IFCAP_NETMAP is set.
+ */
+ na.buff_size = MCLBYTES;
+ netmap_attach(&na, adapter->num_queues);
+}
+
+
+/*
+ * wrapper to export locks to the generic code
+ */
+static void
+ixgbe_netmap_lock_wrapper(void *_a, int what, u_int queueid)
+{
+ struct adapter *adapter = _a;
+
+ ASSERT(queueid < adapter->num_queues);
+ switch (what) {
+ case NETMAP_CORE_LOCK:
+ IXGBE_CORE_LOCK(adapter);
+ break;
+ case NETMAP_CORE_UNLOCK:
+ IXGBE_CORE_UNLOCK(adapter);
+ break;
+ case NETMAP_TX_LOCK:
+ IXGBE_TX_LOCK(&adapter->tx_rings[queueid]);
+ break;
+ case NETMAP_TX_UNLOCK:
+ IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]);
+ break;
+ case NETMAP_RX_LOCK:
+ IXGBE_RX_LOCK(&adapter->rx_rings[queueid]);
+ break;
+ case NETMAP_RX_UNLOCK:
+ IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]);
+ break;
+ }
+}
+
+
+/*
+ * support for netmap register/unregister. We are already under the core lock.
+ * Only called on the first register or the last unregister.
+ */
+static int
+ixgbe_netmap_reg(struct ifnet *ifp, int onoff)
+{
+ struct adapter *adapter = ifp->if_softc;
+ struct netmap_adapter *na = NA(ifp);
+ int error = 0;
+
+ if (!na)
+ return EINVAL;
+
+ ixgbe_disable_intr(adapter);
+
+ /* Tell the stack that the interface is no longer active */
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* save if_transmit to restore it later */
+ na->if_transmit = ifp->if_transmit;
+ ifp->if_transmit = netmap_start;
+
+ ixgbe_init_locked(adapter);
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ } else {
+fail:
+ /* restore if_transmit */
+ ifp->if_transmit = na->if_transmit;
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ ixgbe_init_locked(adapter); /* also enables intr */
+ }
+ return (error);
+}
+
+
+/*
+ * Reconcile kernel and user view of the transmit ring.
+ *
+ * Userspace has filled tx slots up to cur (excluded).
+ * The last unused slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available
+ * (using the special value -1 to indicate idle transmit ring).
+ * The function must first update avail to what the kernel
+ * knows, subtract the newly used slots (cur - nr_hwcur)
+ * from both avail and nr_hwavail, and set nr_hwcur = cur
+ * issuing a dmamap_sync on all slots.
+ *
+ * Check parameters in the struct netmap_ring.
+ * We don't use avail, only check for bogus values.
+ * Make sure cur is valid, and same goes for buffer indexes and lengths.
+ * To avoid races, read the values once, and never use those from
+ * the ring afterwards.
+ */
+static int
+ixgbe_netmap_txsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->tx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n = 0, lim = kring->nkr_num_slots - 1;
+
+ /* generate an interrupt approximately every half ring */
+ int report_frequency = kring->nkr_num_slots >> 1;
+
+ k = ring->cur; /* ring is not protected by any lock */
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ IXGBE_TX_LOCK(txr);
+ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ /* update avail to what the hardware knows */
+ ring->avail = kring->nr_hwavail;
+
+ j = kring->nr_hwcur;
+ if (j != k) { /* we have new packets to send */
+ while (j != k) {
+ struct netmap_slot *slot = &ring->slot[j];
+ struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[j];
+ union ixgbe_adv_tx_desc *curr = &txr->tx_base[j];
+ void *addr = NMB(slot);
+ int flags = ((slot->flags & NS_REPORT) ||
+ j == 0 || j == report_frequency) ?
+ IXGBE_TXD_CMD_RS : 0;
+ int len = slot->len;
+
+ if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
+ if (do_lock)
+ IXGBE_TX_UNLOCK(txr);
+ return netmap_ring_reinit(kring);
+ }
+
+ slot->flags &= ~NS_REPORT;
+ curr->read.buffer_addr = htole64(vtophys(addr));
+ curr->read.olinfo_status = 0;
+ curr->read.cmd_type_len =
+ htole32(txr->txd_cmd | len |
+ (IXGBE_ADVTXD_DTYP_DATA |
+ IXGBE_ADVTXD_DCMD_IFCS |
+ IXGBE_TXD_CMD_EOP | flags) );
+ if (slot->flags & NS_BUF_CHANGED) {
+ /* buffer has changed, unload and reload map */
+ netmap_reload_map(txr->txtag, txbuf->map,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(txr->txtag, txbuf->map,
+ BUS_DMASYNC_PREWRITE);
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwcur = k;
+
+ /* decrease avail by number of sent packets */
+ ring->avail -= n;
+ kring->nr_hwavail = ring->avail;
+
+ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+ IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), k);
+ }
+
+ if (n == 0 || kring->nr_hwavail < 1) {
+ /* record completed transmissions. TODO
+ *
+ * The datasheet discourages the use of TDH to find out the
+ * number of sent packets; the right way to do so is to check
+ * the DD bit inside the status of a packet descriptor. On the
+ * other hand, we avoid setting the `report status' bit for
+ * *all* outgoing packets (a kind of interrupt mitigation),
+ * so the DD bit is not guaranteed to be set for all
+ * the packets: that is why, for the moment, we continue to use
+ * TDH.
+ */
+ j = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr));
+ if (j >= kring->nkr_num_slots) { /* XXX can happen */
+ D("TDH wrap %d", j);
+ j -= kring->nkr_num_slots;
+ }
+ int delta = j - txr->next_to_clean;
+ if (delta) {
+ /* new transmissions were completed, increment
+ kring->nr_hwavail. */
+ if (delta < 0)
+ delta += kring->nkr_num_slots;
+ txr->next_to_clean = j;
+ kring->nr_hwavail += delta;
+ ring->avail = kring->nr_hwavail;
+ }
+ }
+
+ if (do_lock)
+ IXGBE_TX_UNLOCK(txr);
+ return 0;
+}
+
+
+/*
+ * Reconcile kernel and user view of the receive ring.
+ *
+ * Userspace has read rx slots up to cur (excluded).
+ * The last unread slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available.
+ * We must subtract the newly consumed slots (cur - nr_hwcur)
+ * from nr_hwavail, clearing the descriptors for the next
+ * read, tell the hardware that they are available,
+ * and set nr_hwcur = cur and avail = nr_hwavail.
+ * issuing a dmamap_sync on all slots.
+ */
+static int
+ixgbe_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
+{
+ struct adapter *adapter = a;
+ struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct netmap_adapter *na = NA(adapter->ifp);
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, n, lim = kring->nkr_num_slots - 1;
+
+ k = ring->cur; /* ring is not protected by any lock */
+ if ( (kring->nr_kflags & NR_REINIT) || k > lim)
+ return netmap_ring_reinit(kring);
+
+ if (do_lock)
+ IXGBE_RX_LOCK(rxr);
+ /* XXX check sync modes */
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+ j = rxr->next_to_check;
+ for (n = 0; ; n++) {
+ union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j];
+ uint32_t staterr = le32toh(curr->wb.upper.status_error);
+
+ if ((staterr & IXGBE_RXD_STAT_DD) == 0)
+ break;
+ ring->slot[j].len = le16toh(curr->wb.upper.length);
+ bus_dmamap_sync(rxr->ptag,
+ rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD);
+ j = (j == lim) ? 0 : j + 1;
+ }
+ if (n) {
+ rxr->next_to_check = j;
+ kring->nr_hwavail += n;
+ if (kring->nr_hwavail >= lim - 10) {
+ ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail);
+ }
+ }
+
+ /* skip past packets that userspace has already processed,
+ * making them available for reception.
+ * advance nr_hwcur and issue a bus_dmamap_sync on the
+ * buffers so it is safe to write to them,
+ * and decrease nr_hwavail accordingly.
+ */
+ j = kring->nr_hwcur;
+ if (j != k) { /* userspace has read some packets. */
+ n = 0;
+ while (j != k) {
+ struct netmap_slot *slot = ring->slot + j;
+ union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j];
+ struct ixgbe_rx_buf *rxbuf = rxr->rx_buffers + j;
+ void *addr = NMB(slot);
+
+ if (addr == netmap_buffer_base) { /* bad buf */
+ if (do_lock)
+ IXGBE_RX_UNLOCK(rxr);
+ return netmap_ring_reinit(kring);
+ }
+
+ curr->wb.upper.status_error = 0;
+ curr->read.pkt_addr = htole64(vtophys(addr));
+ if (slot->flags & NS_BUF_CHANGED) {
+ netmap_reload_map(rxr->ptag, rxbuf->pmap,
+ addr, na->buff_size);
+ slot->flags &= ~NS_BUF_CHANGED;
+ }
+
+ bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
+ BUS_DMASYNC_PREREAD);
+
+ j = (j == lim) ? 0 : j + 1;
+ n++;
+ }
+ kring->nr_hwavail -= n;
+ kring->nr_hwcur = ring->cur;
+ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+ BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+ /* IMPORTANT: we must leave one free slot in the ring,
+ * so move j back by one unit
+ */
+ j = (j == 0) ? lim : j - 1;
+ IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), j);
+ }
+ /* tell userspace that there are new packets */
+ ring->avail = kring->nr_hwavail ;
+ if (do_lock)
+ IXGBE_RX_UNLOCK(rxr);
+ return 0;
+}
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
new file mode 100644
index 000000000000..7645a4e6e32b
--- /dev/null
+++ b/sys/dev/netmap/netmap.c
@@ -0,0 +1,1762 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: netmap.c 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * This module supports memory mapped access to network devices,
+ * see netmap(4).
+ *
+ * The module uses a large, memory pool allocated by the kernel
+ * and accessible as mmapped memory by multiple userspace threads/processes.
+ * The memory pool contains packet buffers and "netmap rings",
+ * i.e. user-accessible copies of the interface's queues.
+ *
+ * Access to the network card works like this:
+ * 1. a process/thread issues one or more open() on /dev/netmap, to create
+ * select()able file descriptor on which events are reported.
+ * 2. on each descriptor, the process issues an ioctl() to identify
+ * the interface that should report events to the file descriptor.
+ * 3. on each descriptor, the process issues an mmap() request to
+ * map the shared memory region within the process' address space.
+ * The list of interesting queues is indicated by a location in
+ * the shared memory region.
+ * 4. using the functions in the netmap(4) userspace API, a process
+ * can look up the occupation state of a queue, access memory buffers,
+ * and retrieve received packets or enqueue packets to transmit.
+ * 5. using some ioctl()s the process can synchronize the userspace view
+ * of the queue with the actual status in the kernel. This includes both
+ * receiving the notification of new packets, and transmitting new
+ * packets on the output interface.
+ * 6. select() or poll() can be used to wait for events on individual
+ * transmit or receive queues (or all queues for a given interface).
+ */
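+
+/*
+ * A minimal userspace sketch of the sequence above (illustration only;
+ * it assumes the NETMAP_IF()/NETMAP_TXRING() helpers from
+ * <net/netmap_user.h>, omits all error handling, and "em0" is just an
+ * example interface name):
+ *
+ *	struct nmreq req;
+ *	int fd = open("/dev/netmap", O_RDWR);
+ *
+ *	bzero(&req, sizeof(req));
+ *	strcpy(req.nr_name, "em0");		// step 2: bind to a NIC
+ *	ioctl(fd, NIOCREGIF, &req);		// switch it to netmap mode
+ *	char *mem = mmap(0, req.nr_memsize,	// step 3: map rings/buffers
+ *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
+ *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
+ *	// steps 4-6: fill ring->slot[ring->cur], advance cur, decrement
+ *	// avail, then poll(POLLOUT) or ioctl(fd, NIOCTXSYNC) to transmit.
+ */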
+
+#include <sys/cdefs.h> /* prerequisite */
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/module.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/conf.h> /* cdevsw struct */
+#include <sys/uio.h> /* uio struct */
+#include <sys/sockio.h>
+#include <sys/socketvar.h> /* struct socket */
+#include <sys/malloc.h>
+#include <sys/mman.h> /* PROT_EXEC */
+#include <sys/poll.h>
+#include <vm/vm.h> /* vtophys */
+#include <vm/pmap.h> /* vtophys */
+#include <sys/socket.h> /* sockaddrs */
+#include <machine/bus.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <net/if.h>
+#include <net/bpf.h> /* BIOCIMMEDIATE */
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+
+MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
+
+/*
+ * lock and unlock for the netmap memory allocator
+ */
+#define NMA_LOCK() mtx_lock(&netmap_mem_d->nm_mtx);
+#define NMA_UNLOCK() mtx_unlock(&netmap_mem_d->nm_mtx);
+
+/*
+ * Default amount of memory pre-allocated by the module.
+ * We start with a large size and then shrink our demand
+ * according to what is available when the module is loaded.
+ * At the moment the block is contiguous, but we can easily
+ * restrict our demand to smaller units (16..64k)
+ */
+#define NETMAP_MEMORY_SIZE (64 * 1024 * PAGE_SIZE)
+static void * netmap_malloc(size_t size, const char *msg);
+static void netmap_free(void *addr, const char *msg);
+
+/*
+ * Allocator for a pool of packet buffers. For each buffer we have
+ * one entry in the bitmap to signal the state. Allocation scans
+ * the bitmap, but since this is done only on attach, we are not
+ * too worried about performance
+ * XXX if we need to allocate small blocks, a translation
+ * table is used both for kernel virtual address and physical
+ * addresses.
+ */
+struct netmap_buf_pool {
+ u_int total_buffers; /* total buffers. */
+ u_int free;
+ u_int bufsize;
+ char *base; /* buffer base address */
+ uint32_t *bitmap; /* one bit per buffer, 1 means free */
+};
+struct netmap_buf_pool nm_buf_pool;
+/* XXX move these two vars back into netmap_buf_pool */
+u_int netmap_total_buffers;
+char *netmap_buffer_base;
+
+/* user-controlled variables */
+int netmap_verbose;
+
+static int no_timestamp; /* don't timestamp on rxsync */
+
+SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
+SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
+ CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
+SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
+ CTLFLAG_RW, &no_timestamp, 0, "no_timestamp");
+SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers,
+ CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
+SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
+ CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
+
+/*
+ * Allocate n buffers from the pool and fill the slots.
+ * Buffer 0 is the 'junk' buffer.
+ */
+static void
+netmap_new_bufs(struct netmap_buf_pool *p, struct netmap_slot *slot, u_int n)
+{
+ uint32_t bi = 0; /* index in the bitmap */
+ uint32_t mask, j, i = 0; /* slot counter */
+
+ if (n > p->free) {
+ D("only %d out of %d buffers available", i, n);
+ return;
+ }
+ /* termination is guaranteed by p->free */
+ while (i < n && p->free > 0) {
+ uint32_t cur = p->bitmap[bi];
+ if (cur == 0) { /* bitmask is fully used */
+ bi++;
+ continue;
+ }
+ /* locate a slot */
+ for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ;
+ p->bitmap[bi] &= ~mask; /* slot in use */
+ p->free--;
+ slot[i].buf_idx = bi*32+j;
+ slot[i].len = p->bufsize;
+ slot[i].flags = NS_BUF_CHANGED;
+ i++;
+ }
+ ND("allocated %d buffers, %d available", n, p->free);
+}
+
+
+static void
+netmap_free_buf(struct netmap_buf_pool *p, uint32_t i)
+{
+ uint32_t pos, mask;
+ if (i >= p->total_buffers) {
+ D("invalid free index %d", i);
+ return;
+ }
+ pos = i / 32;
+ mask = 1 << (i % 32);
+ if (p->bitmap[pos] & mask) {
+ D("slot %d already free", i);
+ return;
+ }
+ p->bitmap[pos] |= mask;
+ p->free++;
+}
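+
+/*
+ * Bitmap bookkeeping, by way of example: buffer index 70 lives in
+ * bitmap word 70 / 32 = 2, bit 70 % 32 = 6, so netmap_free_buf(p, 70)
+ * sets (1 << 6) in p->bitmap[2], and netmap_new_bufs() clears that bit
+ * again when the buffer is handed out (buf_idx = bi * 32 + j).
+ */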
+
+
+/* Descriptor of the memory objects handled by our memory allocator. */
+struct netmap_mem_obj {
+ TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the
+ chain. */
+ int nmo_used; /* flag set on used memory objects. */
+ size_t nmo_size; /* size of the memory area reserved for the
+ object. */
+ void *nmo_data; /* pointer to the memory area. */
+};
+
+/* Wrap our memory objects to make them ``chainable``. */
+TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj);
+
+
+/* Descriptor of our custom memory allocator. */
+struct netmap_mem_d {
+ struct mtx nm_mtx; /* lock used to handle the chain of memory
+ objects. */
+ struct netmap_mem_obj_h nm_molist; /* list of memory objects */
+ size_t nm_size; /* total amount of memory used for rings etc. */
+ size_t nm_totalsize; /* total amount of allocated memory
+ (the difference is used for buffers) */
+ size_t nm_buf_start; /* offset of packet buffers.
+ This is page-aligned. */
+ size_t nm_buf_len; /* total memory for buffers */
+ void *nm_buffer; /* pointer to the whole pre-allocated memory
+ area. */
+};
+
+
+/* Structure associated to each thread which registered an interface. */
+struct netmap_priv_d {
+ struct netmap_if *np_nifp; /* netmap interface descriptor. */
+
+ struct ifnet *np_ifp; /* device for which we hold a reference */
+ int np_ringid; /* from the ioctl */
+ u_int np_qfirst, np_qlast; /* range of rings to scan */
+ uint16_t np_txpoll;
+};
+
+
+static struct cdev *netmap_dev; /* /dev/netmap character device. */
+static struct netmap_mem_d *netmap_mem_d; /* Our memory allocator. */
+
+
+static d_mmap_t netmap_mmap;
+static d_ioctl_t netmap_ioctl;
+static d_poll_t netmap_poll;
+
+#ifdef NETMAP_KEVENT
+static d_kqfilter_t netmap_kqfilter;
+#endif
+
+static struct cdevsw netmap_cdevsw = {
+ .d_version = D_VERSION,
+ .d_name = "netmap",
+ .d_mmap = netmap_mmap,
+ .d_ioctl = netmap_ioctl,
+ .d_poll = netmap_poll,
+#ifdef NETMAP_KEVENT
+ .d_kqfilter = netmap_kqfilter,
+#endif
+};
+
+#ifdef NETMAP_KEVENT
+static int netmap_kqread(struct knote *, long);
+static int netmap_kqwrite(struct knote *, long);
+static void netmap_kqdetach(struct knote *);
+
+static struct filterops netmap_read_filterops = {
+ .f_isfd = 1,
+ .f_attach = NULL,
+ .f_detach = netmap_kqdetach,
+ .f_event = netmap_kqread,
+};
+
+static struct filterops netmap_write_filterops = {
+ .f_isfd = 1,
+ .f_attach = NULL,
+ .f_detach = netmap_kqdetach,
+ .f_event = netmap_kqwrite,
+};
+
+/*
+ * support for the kevent() system call.
+ *
+ * This is the kevent filter, and is executed each time a new event
+ * is triggered on the device. This function executes some operation
+ * depending on the received filter.
+ *
+ * The implementation should test the filters and should implement
+ * the filter operations we are interested in (a full list is in <sys/event.h>).
+ *
+ * On a match we should:
+ * - set kn->kn_fop
+ * - set kn->kn_hook
+ * - call knlist_add() to deliver the event to the application.
+ *
+ * Return 0 if the event should be delivered to the application.
+ */
+static int
+netmap_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ /* declare variables needed to read/write */
+
+ switch(kn->kn_filter) {
+ case EVFILT_READ:
+ if (netmap_verbose)
+ D("%s kqfilter: EVFILT_READ" ifp->if_xname);
+
+ /* read operations */
+ kn->kn_fop = &netmap_read_filterops;
+ break;
+
+ case EVFILT_WRITE:
+ if (netmap_verbose)
+ D("%s kqfilter: EVFILT_WRITE" ifp->if_xname);
+
+ /* write operations */
+ kn->kn_fop = &netmap_write_filterops;
+ break;
+
+ default:
+ if (netmap_verbose)
+ D("%s kqfilter: invalid filter" ifp->if_xname);
+ return(EINVAL);
+ }
+
+ kn->kn_hook = 0;
+ knlist_add(&netmap_sc->tun_rsel.si_note, kn, 0);
+
+ return (0);
+}
+#endif /* NETMAP_KEVENT */
+
+/*
+ * File descriptor's private data destructor.
+ *
+ * Call nm_register(ifp,0) to stop netmap mode on the interface and
+ * revert to normal operation. We expect that np_ifp has not gone away.
+ */
+static void
+netmap_dtor(void *data)
+{
+ struct netmap_priv_d *priv = data;
+ struct ifnet *ifp = priv->np_ifp;
+ struct netmap_adapter *na = NA(ifp);
+ struct netmap_if *nifp = priv->np_nifp;
+
+ if (0)
+ printf("%s starting for %p ifp %p\n", __FUNCTION__, priv,
+ priv ? priv->np_ifp : NULL);
+
+ na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
+
+ na->refcount--;
+ if (na->refcount <= 0) { /* last instance */
+ u_int i;
+
+ D("deleting last netmap instance for %s", ifp->if_xname);
+ /*
+ * there is a race here with *_netmap_task() and
+ * netmap_poll(), which don't run under NETMAP_CORE_LOCK.
+ * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
+ * (aka NETMAP_DELETING(na)) are a unique marker that the
+ * device is dying.
+ * Before destroying stuff we sleep a bit, and then complete
+ * the job. NIOCREG should realize the condition and
+ * loop until they can continue; the other routines
+ * should check the condition at entry and quit if
+ * they cannot run.
+ */
+ na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
+ tsleep(na, 0, "NIOCUNREG", 4);
+ na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
+ na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
+ /* Wake up any sleeping threads. netmap_poll will
+ * then return POLLERR
+ */
+ for (i = 0; i < na->num_queues + 2; i++) {
+ selwakeuppri(&na->tx_rings[i].si, PI_NET);
+ selwakeuppri(&na->rx_rings[i].si, PI_NET);
+ }
+ /* release all buffers */
+ NMA_LOCK();
+ for (i = 0; i < na->num_queues + 1; i++) {
+ int j, lim;
+ struct netmap_ring *ring;
+
+ ND("tx queue %d", i);
+ ring = na->tx_rings[i].ring;
+ lim = na->tx_rings[i].nkr_num_slots;
+ for (j = 0; j < lim; j++)
+ netmap_free_buf(&nm_buf_pool,
+ ring->slot[j].buf_idx);
+
+ ND("rx queue %d", i);
+ ring = na->rx_rings[i].ring;
+ lim = na->rx_rings[i].nkr_num_slots;
+ for (j = 0; j < lim; j++)
+ netmap_free_buf(&nm_buf_pool,
+ ring->slot[j].buf_idx);
+ }
+ NMA_UNLOCK();
+ netmap_free(na->tx_rings[0].ring, "shadow rings");
+ wakeup(na);
+ }
+ netmap_free(nifp, "nifp");
+
+ na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
+
+ if_rele(ifp);
+
+ bzero(priv, sizeof(*priv)); /* XXX for safety */
+ free(priv, M_DEVBUF);
+}
+
+
+
+/*
+ * Create and return a new ``netmap_if`` object, and possibly also
+ * rings and packet buffers.
+ *
+ * Return NULL on failure.
+ */
+static void *
+netmap_if_new(const char *ifname, struct netmap_adapter *na)
+{
+ struct netmap_if *nifp;
+ struct netmap_ring *ring;
+ char *buff;
+ u_int i, len, ofs;
+ u_int n = na->num_queues + 1; /* shorthand, include stack queue */
+
+ /*
+ * the descriptor is followed inline by an array of offsets
+ * to the tx and rx rings in the shared memory region.
+ */
+ len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t);
+ nifp = netmap_malloc(len, "nifp");
+ if (nifp == NULL)
+ return (NULL);
+
+ /* initialize base fields */
+ *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues;
+ strncpy(nifp->ni_name, ifname, IFNAMSIZ);
+
+ (na->refcount)++; /* XXX atomic ? we are under lock */
+ if (na->refcount > 1)
+ goto final;
+
+ /*
+ * If this is the first instance, allocate the shadow rings and
+ * buffers for this card (one for each hw queue, one for the host).
+ * The rings are contiguous, but have variable size.
+ * The entire block is reachable at
+ * na->tx_rings[0].ring
+ */
+
+ len = n * (2 * sizeof(struct netmap_ring) +
+ (na->num_tx_desc + na->num_rx_desc) *
+ sizeof(struct netmap_slot) );
+ buff = netmap_malloc(len, "shadow rings");
+ if (buff == NULL) {
+ D("failed to allocate %d bytes for %s shadow ring",
+ len, ifname);
+error:
+ (na->refcount)--;
+ netmap_free(nifp, "nifp, rings failed");
+ return (NULL);
+ }
+ /* do we have the buffers ? we need num_tx_desc buffers for
+ * each tx ring and num_rx_desc buffers for each rx ring. */
+ len = n * (na->num_tx_desc + na->num_rx_desc);
+ NMA_LOCK();
+ if (nm_buf_pool.free < len) {
+ NMA_UNLOCK();
+ netmap_free(buff, "not enough bufs");
+ goto error;
+ }
+ /*
+ * in the kring, store the pointers to the shared rings
+ * and initialize the rings. We are under NMA_LOCK().
+ */
+ ofs = 0;
+ for (i = 0; i < n; i++) {
+ struct netmap_kring *kring;
+ int numdesc;
+
+ /* Transmit rings */
+ kring = &na->tx_rings[i];
+ numdesc = na->num_tx_desc;
+ bzero(kring, sizeof(*kring));
+ kring->na = na;
+
+ ring = kring->ring = (struct netmap_ring *)(buff + ofs);
+ *(ssize_t *)(uintptr_t)&ring->buf_ofs =
+ nm_buf_pool.base - (char *)ring;
+ ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs);
+ *(int *)(int *)(uintptr_t)&ring->num_slots =
+ kring->nkr_num_slots = numdesc;
+
+ /*
+ * IMPORTANT:
+ * Always keep one slot empty, so we can detect new
+ * transmissions comparing cur and nr_hwcur (they are
+ * the same only if there are no new transmissions).
+ */
+ ring->avail = kring->nr_hwavail = numdesc - 1;
+ ring->cur = kring->nr_hwcur = 0;
+ netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc);
+
+ ofs += sizeof(struct netmap_ring) +
+ numdesc * sizeof(struct netmap_slot);
+
+ /* Receive rings */
+ kring = &na->rx_rings[i];
+ numdesc = na->num_rx_desc;
+ bzero(kring, sizeof(*kring));
+ kring->na = na;
+
+ ring = kring->ring = (struct netmap_ring *)(buff + ofs);
+ *(ssize_t *)(uintptr_t)&ring->buf_ofs =
+ nm_buf_pool.base - (char *)ring;
+ ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs);
+ *(int *)(int *)(uintptr_t)&ring->num_slots =
+ kring->nkr_num_slots = numdesc;
+ ring->cur = kring->nr_hwcur = 0;
+ ring->avail = kring->nr_hwavail = 0; /* empty */
+ netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc);
+ ofs += sizeof(struct netmap_ring) +
+ numdesc * sizeof(struct netmap_slot);
+ }
+ NMA_UNLOCK();
+ for (i = 0; i < n+1; i++) {
+ // XXX initialize the selrecord structs.
+ }
+final:
+ /*
+ * fill the slots for the rx and tx queues. They contain the offset
+ * between the ring and nifp, so the information is usable in
+ * userspace to reach the ring from the nifp.
+ */
+ for (i = 0; i < n; i++) {
+ char *base = (char *)nifp;
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
+ (char *)na->tx_rings[i].ring - base;
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] =
+ (char *)na->rx_rings[i].ring - base;
+ }
+ return (nifp);
+}
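+
+/*
+ * The ring_ofs[] values stored above let userspace locate the rings
+ * without knowing the allocator layout. A sketch of the lookup (user
+ * programs would normally use the wrapper macros in <net/netmap_user.h>,
+ * and the buffer address assumes fixed NETMAP_BUF_SIZE buffers):
+ *
+ *	struct netmap_ring *txr = (struct netmap_ring *)
+ *	    ((char *)nifp + nifp->ring_ofs[i]);
+ *	struct netmap_ring *rxr = (struct netmap_ring *)
+ *	    ((char *)nifp + nifp->ring_ofs[i + nifp->ni_num_queues + 1]);
+ *	char *buf = (char *)txr + txr->buf_ofs +
+ *	    txr->slot[txr->cur].buf_idx * NETMAP_BUF_SIZE;
+ */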
+
+
+/*
+ * mmap(2) support for the "netmap" device.
+ *
+ * Expose all the memory previously allocated by our custom memory
+ * allocator: this way the user has only to issue a single mmap(2), and
+ * can work on all the data structures flawlessly.
+ *
+ * Return 0 on success, -1 otherwise.
+ */
+static int
+#if __FreeBSD_version < 900000
+netmap_mmap(__unused struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr,
+ int nprot)
+#else
+netmap_mmap(__unused struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, __unused vm_memattr_t *memattr)
+#endif
+{
+ if (nprot & PROT_EXEC)
+ return (-1); // XXX -1 or EINVAL ?
+ ND("request for offset 0x%x", (uint32_t)offset);
+ *paddr = vtophys(netmap_mem_d->nm_buffer) + offset;
+
+ return (0);
+}
+
+
+/*
+ * handler for synchronization of the queues from/to the host
+ */
+static void
+netmap_sync_to_host(struct netmap_adapter *na)
+{
+ struct netmap_kring *kring = &na->tx_rings[na->num_queues];
+ struct netmap_ring *ring = kring->ring;
+ struct mbuf *head = NULL, *tail = NULL, *m;
+ u_int n, lim = kring->nkr_num_slots - 1;
+
+ na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0);
+
+ /* Take packets from hwcur to cur and pass them up.
+ * In case of no buffers we give up. At the end of the loop,
+ * the queue is drained in all cases.
+ */
+ for (n = kring->nr_hwcur; n != ring->cur;) {
+ struct netmap_slot *slot = &ring->slot[n];
+
+ n = (n == lim) ? 0 : n + 1;
+ if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) {
+ D("bad pkt at %d len %d", n, slot->len);
+ continue;
+ }
+ m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL);
+
+ if (m == NULL)
+ break;
+ if (tail)
+ tail->m_nextpkt = m;
+ else
+ head = m;
+ tail = m;
+ m->m_nextpkt = NULL;
+ }
+ kring->nr_hwcur = ring->cur;
+ kring->nr_hwavail = ring->avail = lim;
+ na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
+
+ /* send packets up, outside the lock */
+ while ((m = head) != NULL) {
+ head = head->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.rcvif = na->ifp;
+ if (netmap_verbose & NM_VERB_HOST)
+ D("sending up pkt %p size %d", m, m->m_pkthdr.len);
+ (na->ifp->if_input)(na->ifp, m);
+ }
+}
+
+/*
+ * This routine also does the selrecord if called from the poll handler
+ * (we know because td != NULL).
+ */
+static void
+netmap_sync_from_host(struct netmap_adapter *na, struct thread *td)
+{
+ struct netmap_kring *kring = &na->rx_rings[na->num_queues];
+ struct netmap_ring *ring = kring->ring;
+ int delta;
+
+ na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0);
+
+ /* skip past packets processed by userspace,
+ * and then sync cur/avail with hwcur/hwavail
+ */
+ delta = ring->cur - kring->nr_hwcur;
+ if (delta < 0)
+ delta += kring->nkr_num_slots;
+ kring->nr_hwavail -= delta;
+ kring->nr_hwcur = ring->cur;
+ ring->avail = kring->nr_hwavail;
+ if (ring->avail == 0 && td)
+ selrecord(td, &kring->si);
+ if (ring->avail && (netmap_verbose & NM_VERB_HOST))
+ D("%d pkts from stack", ring->avail);
+ na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
+}
+
+
+/*
+ * get a refcounted reference to an interface.
+ * Return ENXIO if the interface does not exist, EINVAL if netmap
+ * is not supported by the interface.
+ * If successful, hold a reference.
+ */
+static int
+get_ifp(const char *name, struct ifnet **ifp)
+{
+ *ifp = ifunit_ref(name);
+ if (*ifp == NULL)
+ return (ENXIO);
+ /* can do this if the capability exists and if_pspare[0]
+ * points to the netmap descriptor.
+ */
+ if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp))
+ return 0; /* valid pointer, we hold the refcount */
+ if_rele(*ifp);
+ return EINVAL; // not NETMAP capable
+}
+
+
+/*
+ * Error routine called when txsync/rxsync detects an error.
+ * Can't do much more than resetting cur = hwcur, avail = hwavail.
+ * Return 1 on reinit.
+ */
+int
+netmap_ring_reinit(struct netmap_kring *kring)
+{
+ struct netmap_ring *ring = kring->ring;
+ u_int i, lim = kring->nkr_num_slots - 1;
+ int errors = 0;
+
+ D("called for %s", kring->na->ifp->if_xname);
+ if (ring->cur > lim)
+ errors++;
+ for (i = 0; i <= lim; i++) {
+ u_int idx = ring->slot[i].buf_idx;
+ u_int len = ring->slot[i].len;
+ if (idx < 2 || idx >= netmap_total_buffers) {
+ if (!errors++)
+ D("bad buffer at slot %d idx %d len %d ", i, idx, len);
+ ring->slot[i].buf_idx = 0;
+ ring->slot[i].len = 0;
+ } else if (len > NETMAP_BUF_SIZE) {
+ ring->slot[i].len = 0;
+ if (!errors++)
+ D("bad len %d at slot %d idx %d",
+ len, i, idx);
+ }
+ }
+ if (errors) {
+ int pos = kring - kring->na->tx_rings;
+ int n = kring->na->num_queues + 2;
+
+ D("total %d errors", errors);
+ errors++;
+ D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
+ kring->na->ifp->if_xname,
+ pos < n ? "TX" : "RX", pos < n ? pos : pos - n,
+ ring->cur, kring->nr_hwcur,
+ ring->avail, kring->nr_hwavail);
+ ring->cur = kring->nr_hwcur;
+ ring->avail = kring->nr_hwavail;
+ ring->flags |= NR_REINIT;
+ kring->na->flags |= NR_REINIT;
+ }
+ return (errors ? 1 : 0);
+}
+
+/*
+ * Clean the reinit flag for our rings.
+ * XXX at the moment, clear for all rings
+ */
+static void
+netmap_clean_reinit(struct netmap_adapter *na)
+{
+ //struct netmap_kring *kring;
+ u_int i;
+
+ na->flags &= ~NR_REINIT;
+ D("--- NR_REINIT reset on %s", na->ifp->if_xname);
+ for (i = 0; i < na->num_queues + 1; i++) {
+ na->tx_rings[i].ring->flags &= ~NR_REINIT;
+ na->rx_rings[i].ring->flags &= ~NR_REINIT;
+ }
+}
+
+/*
+ * Set the ring ID. For devices with a single queue, a request
+ * for all rings is the same as a single ring.
+ */
+static int
+netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
+{
+ struct ifnet *ifp = priv->np_ifp;
+ struct netmap_adapter *na = NA(ifp);
+ void *adapter = na->ifp->if_softc; /* shorthand */
+ u_int i = ringid & NETMAP_RING_MASK;
+ /* first time we don't lock */
+ int need_lock = (priv->np_qfirst != priv->np_qlast);
+
+ if ( (ringid & NETMAP_HW_RING) && i >= na->num_queues) {
+ D("invalid ring id %d", i);
+ return (EINVAL);
+ }
+ if (need_lock)
+ na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
+ priv->np_ringid = ringid;
+ if (ringid & NETMAP_SW_RING) {
+ priv->np_qfirst = na->num_queues;
+ priv->np_qlast = na->num_queues + 1;
+ } else if (ringid & NETMAP_HW_RING) {
+ priv->np_qfirst = i;
+ priv->np_qlast = i + 1;
+ } else {
+ priv->np_qfirst = 0;
+ priv->np_qlast = na->num_queues;
+ }
+ priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
+ if (need_lock)
+ na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
+ if (ringid & NETMAP_SW_RING)
+ D("ringid %s set to SW RING", ifp->if_xname);
+ else if (ringid & NETMAP_HW_RING)
+ D("ringid %s set to HW RING %d", ifp->if_xname,
+ priv->np_qfirst);
+ else
+ D("ringid %s set to all %d HW RINGS", ifp->if_xname,
+ priv->np_qlast);
+ return 0;
+}
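+
+/*
+ * Examples of the nr_ringid encoding handled above (hypothetical values):
+ * 0 selects all hardware rings, (NETMAP_HW_RING | 2) selects hardware
+ * ring 2 only, NETMAP_SW_RING selects the ring connected to the host
+ * stack, and or-ing in NETMAP_NO_TX_POLL clears np_txpoll so that
+ * poll()/select() does not implicitly sync the transmit rings.
+ */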
+
+/*
+ * ioctl(2) support for the "netmap" device.
+ *
+ * The following is the list of accepted commands:
+ * - NIOCGINFO
+ * - SIOCGIFADDR just for convenience
+ * - NIOCREGIF
+ * - NIOCUNREGIF
+ * - NIOCTXSYNC
+ * - NIOCRXSYNC
+ *
+ * Return 0 on success, errno otherwise.
+ */
+static int
+netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data,
+ __unused int fflag, __unused struct thread *td)
+{
+ struct netmap_priv_d *priv = NULL;
+ struct ifnet *ifp;
+ struct nmreq *nmr = (struct nmreq *) data;
+ struct netmap_adapter *na;
+ void *adapter;
+ int error;
+ u_int i;
+ struct netmap_if *nifp;
+
+ error = devfs_get_cdevpriv((void **)&priv);
+ if (error != ENOENT && error != 0)
+ return (error);
+
+ error = 0; /* Could be ENOENT */
+ switch (cmd) {
+ case NIOCGINFO: /* return capabilities etc */
+ /* memsize is always valid */
+ nmr->nr_memsize = netmap_mem_d->nm_totalsize;
+ nmr->nr_offset = 0;
+ nmr->nr_numrings = 0;
+ nmr->nr_numslots = 0;
+ if (nmr->nr_name[0] == '\0') /* just get memory info */
+ break;
+ error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */
+ if (error)
+ break;
+ na = NA(ifp); /* retrieve netmap_adapter */
+ nmr->nr_numrings = na->num_queues;
+ nmr->nr_numslots = na->num_tx_desc;
+ if_rele(ifp); /* return the refcount */
+ break;
+
+ case NIOCREGIF:
+ if (priv != NULL) /* thread already registered */
+ return netmap_set_ringid(priv, nmr->nr_ringid);
+ /* find the interface and a reference */
+ error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
+ if (error)
+ break;
+ na = NA(ifp); /* retrieve netmap adapter */
+ adapter = na->ifp->if_softc; /* shorthand */
+ /*
+ * Allocate the private per-thread structure.
+ * XXX perhaps we can use a blocking malloc ?
+ */
+ priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (priv == NULL) {
+ error = ENOMEM;
+ if_rele(ifp); /* return the refcount */
+ break;
+ }
+
+
+ for (i = 10; i > 0; i--) {
+ na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
+ if (!NETMAP_DELETING(na))
+ break;
+ na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
+ tsleep(na, 0, "NIOCREGIF", hz/10);
+ }
+ if (i == 0) {
+ D("too many NIOCREGIF attempts, give up");
+ error = EINVAL;
+ free(priv, M_DEVBUF);
+ if_rele(ifp); /* return the refcount */
+ break;
+ }
+
+ priv->np_ifp = ifp; /* store the reference */
+ error = netmap_set_ringid(priv, nmr->nr_ringid);
+ if (error)
+ goto error;
+ priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na);
+ if (nifp == NULL) { /* allocation failed */
+ error = ENOMEM;
+ } else if (ifp->if_capenable & IFCAP_NETMAP) {
+ /* was already set */
+ } else {
+ /* Otherwise set the card in netmap mode
+ * and make it use the shared buffers.
+ */
+ error = na->nm_register(ifp, 1); /* mode on */
+ if (error) {
+ /*
+ * do something similar to netmap_dtor().
+ */
+ netmap_free(na->tx_rings[0].ring, "rings, reg.failed");
+ free(na->tx_rings, M_DEVBUF);
+ na->tx_rings = na->rx_rings = NULL;
+ na->refcount--;
+ netmap_free(nifp, "nifp, rings failed");
+ nifp = NULL;
+ }
+ }
+
+ if (error) { /* reg. failed, release priv and ref */
+error:
+ na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
+ free(priv, M_DEVBUF);
+ if_rele(ifp); /* return the refcount */
+ break;
+ }
+
+ na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
+ error = devfs_set_cdevpriv(priv, netmap_dtor);
+
+ if (error != 0) {
+ /* could not assign the private storage for the
+ * thread, call the destructor explicitly.
+ */
+ netmap_dtor(priv);
+ break;
+ }
+
+ /* return the offset of the netmap_if object */
+ nmr->nr_numrings = na->num_queues;
+ nmr->nr_numslots = na->num_tx_desc;
+ nmr->nr_memsize = netmap_mem_d->nm_totalsize;
+ nmr->nr_offset =
+ ((char *) nifp - (char *) netmap_mem_d->nm_buffer);
+ break;
+
+ case NIOCUNREGIF:
+ if (priv == NULL)
+ return (ENXIO);
+
+ /* the interface is unregistered inside the
+ destructor of the private data. */
+ devfs_clear_cdevpriv();
+ break;
+
+ case NIOCTXSYNC:
+ case NIOCRXSYNC:
+ if (priv == NULL)
+ return (ENXIO);
+ ifp = priv->np_ifp; /* we have a reference */
+ na = NA(ifp); /* retrieve netmap adapter */
+ adapter = ifp->if_softc; /* shorthand */
+
+ if (na->flags & NR_REINIT)
+ netmap_clean_reinit(na);
+
+ if (priv->np_qfirst == na->num_queues) {
+ /* queues to/from host */
+ if (cmd == NIOCTXSYNC)
+ netmap_sync_to_host(na);
+ else
+ netmap_sync_from_host(na, NULL);
+ return error;
+ }
+
+ for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
+ if (cmd == NIOCTXSYNC) {
+ struct netmap_kring *kring = &na->tx_rings[i];
+ if (netmap_verbose & NM_VERB_TXSYNC)
+ D("sync tx ring %d cur %d hwcur %d",
+ i, kring->ring->cur,
+ kring->nr_hwcur);
+ na->nm_txsync(adapter, i, 1 /* do lock */);
+ if (netmap_verbose & NM_VERB_TXSYNC)
+ D("after sync tx ring %d cur %d hwcur %d",
+ i, kring->ring->cur,
+ kring->nr_hwcur);
+ } else {
+ na->nm_rxsync(adapter, i, 1 /* do lock */);
+ microtime(&na->rx_rings[i].ring->ts);
+ }
+ }
+
+ break;
+
+ case BIOCIMMEDIATE:
+ case BIOCGHDRCMPLT:
+ case BIOCSHDRCMPLT:
+ case BIOCSSEESENT:
+ D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
+ break;
+
+ default:
+ {
+ /*
+ * allow device calls
+ */
+ struct socket so;
+ bzero(&so, sizeof(so));
+ error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
+ if (error)
+ break;
+ so.so_vnet = ifp->if_vnet;
+ // so->so_proto not null.
+ error = ifioctl(&so, cmd, data, td);
+ if_rele(ifp);
+ }
+ }
+
+ return (error);
+}
+
+
+/*
+ * select(2) and poll(2) handlers for the "netmap" device.
+ *
+ * Can be called for one or more queues.
+ * Return the event mask corresponding to ready events.
+ * If there are no ready events, do a selrecord on either individual
+ * selfd or on the global one.
+ * Device-dependent parts (locking and sync of tx/rx rings)
+ * are done through callbacks.
+ */
+static int
+netmap_poll(__unused struct cdev *dev, int events, struct thread *td)
+{
+ struct netmap_priv_d *priv = NULL;
+ struct netmap_adapter *na;
+ struct ifnet *ifp;
+ struct netmap_kring *kring;
+ u_int i, check_all, want_tx, want_rx, revents = 0;
+ void *adapter;
+
+ if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
+ return POLLERR;
+
+ ifp = priv->np_ifp;
+ // XXX check for deleting() ?
+ if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
+ return POLLERR;
+
+ if (netmap_verbose & 0x8000)
+ D("device %s events 0x%x", ifp->if_xname, events);
+ want_tx = events & (POLLOUT | POLLWRNORM);
+ want_rx = events & (POLLIN | POLLRDNORM);
+
+ adapter = ifp->if_softc;
+ na = NA(ifp); /* retrieve netmap adapter */
+
+ /* pending reinit, report up as a poll error. Pending
+ * reads and writes are lost.
+ */
+ if (na->flags & NR_REINIT) {
+ netmap_clean_reinit(na);
+ revents |= POLLERR;
+ }
+ /* how many queues we are scanning */
+ i = priv->np_qfirst;
+ if (i == na->num_queues) { /* from/to host */
+ if (priv->np_txpoll || want_tx) {
+ /* push any packets up, then we are always ready */
+ kring = &na->tx_rings[i];
+ netmap_sync_to_host(na);
+ revents |= want_tx;
+ }
+ if (want_rx) {
+ kring = &na->rx_rings[i];
+ if (kring->ring->avail == 0)
+ netmap_sync_from_host(na, td);
+ if (kring->ring->avail > 0) {
+ revents |= want_rx;
+ }
+ }
+ return (revents);
+ }
+
+ /*
+ * check_all is set if the card has more than one queue and
+ * the client is polling all of them. If true, we sleep on
+ * the "global" selfd, otherwise we sleep on individual selfd
+ * (we can only sleep on one of them per direction).
+ * The interrupt routine in the driver should always wake on
+ * the individual selfd, and also on the global one if the card
+ * has more than one ring.
+ *
+ * If the card has only one lock, we just use that.
+ * If the card has separate ring locks, we just use those
+ * unless we are doing check_all, in which case the whole
+ * loop is wrapped by the global lock.
+ * We acquire locks only when necessary: if poll is called
+ * when buffers are available, we can just return without locks.
+ *
+ * rxsync() is only called if we run out of buffers on a POLLIN.
+ * txsync() is called if we run out of buffers on POLLOUT, or
+ * there are pending packets to send. The latter can be disabled
+ * passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
+ */
+ check_all = (i + 1 != priv->np_qlast);
+
+ /*
+ * core_lock indicates what to do with the core lock.
+ * The core lock is used when either the card has no individual
+ * locks, or it has individual locks but we are checking all
+ * rings so we need the core lock to avoid missing wakeup events.
+ *
+ * It has three possible states:
+ * NO_CL we don't need to use the core lock, e.g.
+ * because we are protected by individual locks.
+ * NEED_CL we need the core lock. In this case, when we
+ * call the lock routine, move to LOCKED_CL
+ * to remember to release the lock once done.
+ * LOCKED_CL core lock is set, so we need to release it.
+ */
+ enum {NO_CL, NEED_CL, LOCKED_CL };
+ int core_lock = (check_all || !na->separate_locks) ?
+ NEED_CL:NO_CL;
+ /*
+ * We start with a lock free round which is good if we have
+ * data available. If this fails, then lock and call the sync
+ * routines.
+ */
+ for (i = priv->np_qfirst; want_rx && i < priv->np_qlast; i++) {
+ kring = &na->rx_rings[i];
+ if (kring->ring->avail > 0) {
+ revents |= want_rx;
+ want_rx = 0; /* also breaks the loop */
+ }
+ }
+ for (i = priv->np_qfirst; want_tx && i < priv->np_qlast; i++) {
+ kring = &na->tx_rings[i];
+ if (kring->ring->avail > 0) {
+ revents |= want_tx;
+ want_tx = 0; /* also breaks the loop */
+ }
+ }
+
+ /*
+ * If we need to push packets out (priv->np_txpoll) or want_tx is
+ * still set, we do need to run the txsync calls (on all rings,
+ * to avoid that the tx rings stall).
+ */
+ if (priv->np_txpoll || want_tx) {
+ for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
+ kring = &na->tx_rings[i];
+ if (!want_tx && kring->ring->cur == kring->nr_hwcur)
+ continue;
+ if (core_lock == NEED_CL) {
+ na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
+ core_lock = LOCKED_CL;
+ }
+ if (na->separate_locks)
+ na->nm_lock(adapter, NETMAP_TX_LOCK, i);
+ if (netmap_verbose & NM_VERB_TXSYNC)
+ D("send %d on %s %d",
+ kring->ring->cur,
+ ifp->if_xname, i);
+ if (na->nm_txsync(adapter, i, 0 /* no lock */))
+ revents |= POLLERR;
+
+ if (want_tx) {
+ if (kring->ring->avail > 0) {
+ /* stop at the first ring. We don't risk
+ * starvation.
+ */
+ revents |= want_tx;
+ want_tx = 0;
+ } else if (!check_all)
+ selrecord(td, &kring->si);
+ }
+ if (na->separate_locks)
+ na->nm_lock(adapter, NETMAP_TX_UNLOCK, i);
+ }
+ }
+
+ /*
+ * now if want_rx is still set we need to lock and rxsync.
+ * Do it on all rings because otherwise we starve.
+ */
+ if (want_rx) {
+ for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
+ kring = &na->rx_rings[i];
+ if (core_lock == NEED_CL) {
+ na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
+ core_lock = LOCKED_CL;
+ }
+ if (na->separate_locks)
+ na->nm_lock(adapter, NETMAP_RX_LOCK, i);
+
+ if (na->nm_rxsync(adapter, i, 0 /* no lock */))
+ revents |= POLLERR;
+ if (no_timestamp == 0 ||
+ kring->ring->flags & NR_TIMESTAMP)
+ microtime(&kring->ring->ts);
+
+ if (kring->ring->avail > 0)
+ revents |= want_rx;
+ else if (!check_all)
+ selrecord(td, &kring->si);
+ if (na->separate_locks)
+ na->nm_lock(adapter, NETMAP_RX_UNLOCK, i);
+ }
+ }
+ if (check_all && revents == 0) {
+ i = na->num_queues + 1; /* the global queue */
+ if (want_tx)
+ selrecord(td, &na->tx_rings[i].si);
+ if (want_rx)
+ selrecord(td, &na->rx_rings[i].si);
+ }
+ if (core_lock == LOCKED_CL)
+ na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
+
+ return (revents);
+}
+
+/*------- driver support routines ------*/
+
+/*
+ * Initialize a ``netmap_adapter`` object created by a driver on attach.
+ * We allocate a block of memory with room for a struct netmap_adapter
+ * plus two sets of N+2 struct netmap_kring (where N is the number
+ * of hardware rings):
+ * krings 0..N-1 are for the hardware queues.
+ * kring N is for the host stack queue
+ * kring N+1 is only used for the selinfo for all queues.
+ * Return 0 on success, ENOMEM otherwise.
+ */
+int
+netmap_attach(struct netmap_adapter *na, int num_queues)
+{
+ int n = num_queues + 2;
+ int size = sizeof(*na) + 2 * n * sizeof(struct netmap_kring);
+ void *buf;
+ struct ifnet *ifp = na->ifp;
+
+ if (ifp == NULL) {
+ D("ifp not set, giving up");
+ return EINVAL;
+ }
+ na->refcount = 0;
+ na->num_queues = num_queues;
+
+ buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (buf) {
+ ifp->if_pspare[0] = buf;
+ na->tx_rings = (void *)((char *)buf + sizeof(*na));
+ na->rx_rings = na->tx_rings + n;
+ bcopy(na, buf, sizeof(*na));
+ ifp->if_capabilities |= IFCAP_NETMAP;
+ }
+ D("%s for %s", buf ? "ok" : "failed", ifp->if_xname);
+
+ return (buf ? 0 : ENOMEM);
+}
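
As an illustration of how a driver is expected to use this routine (a sketch only, not part of this change; the softc layout, callback names and queue count below are hypothetical):

    /* hypothetical driver attach path, e.g. called from foo_attach() */
    static void
    foo_netmap_attach(struct foo_softc *sc)
    {
    	struct netmap_adapter na;

    	bzero(&na, sizeof(na));
    	na.ifp = sc->ifp;
    	na.separate_locks = 0;			/* only a core lock */
    	na.num_tx_desc = sc->num_tx_desc;
    	na.num_rx_desc = sc->num_rx_desc;
    	na.buff_size = NETMAP_BUF_SIZE;
    	na.nm_register = foo_netmap_reg;	/* netmap mode on/off */
    	na.nm_txsync = foo_netmap_txsync;
    	na.nm_rxsync = foo_netmap_rxsync;
    	na.nm_lock = foo_netmap_lock;
    	netmap_attach(&na, 1);			/* one tx/rx queue pair */
    }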
+
+
+/*
+ * Free the allocated memory linked to the given ``netmap_adapter``
+ * object.
+ */
+void
+netmap_detach(struct ifnet *ifp)
+{
+ u_int i;
+ struct netmap_adapter *na = NA(ifp);
+
+ if (!na)
+ return;
+
+ for (i = 0; i < na->num_queues + 2; i++) {
+ knlist_destroy(&na->tx_rings[i].si.si_note);
+ knlist_destroy(&na->rx_rings[i].si.si_note);
+ }
+ bzero(na, sizeof(*na));
+ ifp->if_pspare[0] = NULL;
+ free(na, M_DEVBUF);
+}
+
+
+/*
+ * intercept packets coming from the network stack and present
+ * them to netmap as incoming packets on a separate ring.
+ * We are not locked when called.
+ */
+int
+netmap_start(struct ifnet *ifp, struct mbuf *m)
+{
+ struct netmap_adapter *na = NA(ifp);
+ u_int i, len, n = na->num_queues;
+ int error = EBUSY;
+ struct netmap_kring *kring = &na->rx_rings[n];
+ struct netmap_slot *slot;
+
+ len = m->m_pkthdr.len;
+ if (netmap_verbose & NM_VERB_HOST)
+ D("%s packet %d len %d from the stack", ifp->if_xname,
+ kring->nr_hwcur + kring->nr_hwavail, len);
+ na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
+ if (kring->nr_hwavail >= (int)kring->nkr_num_slots - 1) {
+ D("stack ring %s full\n", ifp->if_xname);
+ goto done; /* no space */
+ }
+ if (len > na->buff_size) {
+ D("drop packet size %d > %d", len, na->buff_size);
+ goto done; /* too long for us */
+ }
+
+ /* compute the insert position */
+ i = kring->nr_hwcur + kring->nr_hwavail;
+ if (i >= kring->nkr_num_slots)
+ i -= kring->nkr_num_slots;
+ slot = &kring->ring->slot[i];
+ m_copydata(m, 0, len, NMB(slot));
+ slot->len = len;
+ kring->nr_hwavail++;
+ if (netmap_verbose & NM_VERB_HOST)
+ D("wake up host ring %s %d", na->ifp->if_xname, na->num_queues);
+ selwakeuppri(&kring->si, PI_NET);
+ error = 0;
+done:
+ na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
+
+ /* release the mbuf in either cases of success or failure. As an
+ * alternative, put the mbuf in a free list and free the list
+ * only when really necessary.
+ */
+ m_freem(m);
+
+ return (error);
+}
+
+
+/*
+ * netmap_reset() is called by the driver routines when reinitializing
+ * a ring. The driver is in charge of locking to protect the kring.
+ * If netmap mode is not set just return NULL.
+ * Otherwise set NR_REINIT (in the ring and in na) to signal
+ * that a ring has been reinitialized,
+ * set cur = hwcur = 0 and avail = hwavail = num_slots - 1 .
+ * IT IS IMPORTANT to leave one slot free even in the tx ring because
+ * we rely on cur=hwcur only for empty rings.
+ * These are good defaults but can be overridden later in the device
+ * specific code if, after a reinit, the ring does not start from 0
+ * (e.g. if_em.c does this).
+ *
+ * XXX we shouldn't be touching the ring, but there is a
+ * race anyways and this is our best option.
+ *
+ * XXX setting na->flags makes the syscall code faster, as there is
+ * only one place to check. On the other hand, we will need a better
+ * way to notify multiple threads that rings have been reset.
+ * One way is to increment na->rst_count at each ring reset.
+ * Each thread in its own priv structure will keep a matching counter,
+ * and on a reset will acknowledge and clean its own rings.
+ */
+struct netmap_slot *
+netmap_reset(struct netmap_adapter *na, enum txrx tx, int n,
+ u_int new_cur)
+{
+ struct netmap_kring *kring;
+ struct netmap_ring *ring;
+ struct netmap_slot *slot;
+ u_int i;
+
+ if (na == NULL)
+ return NULL; /* no netmap support here */
+ if (!(na->ifp->if_capenable & IFCAP_NETMAP))
+ return NULL; /* nothing to reinitialize */
+ kring = tx == NR_TX ? na->tx_rings + n : na->rx_rings + n;
+ ring = kring->ring;
+ if (tx == NR_TX) {
+ /*
+ * The last argument is the new value of next_to_clean.
+ *
+ * In the TX ring, we have P pending transmissions (from
+ * next_to_clean to nr_hwcur) followed by nr_hwavail free slots.
+ * Generally we can use all the slots in the ring so
+ * P = ring_size - nr_hwavail hence (modulo ring_size):
+ * next_to_clean == nr_hwcur + nr_hwavail
+ *
+ * If, upon a reset, nr_hwavail == ring_size and next_to_clean
+ * does not change we have nothing to report. Otherwise some
+ * pending packets may be lost, and newly injected packets may be too.
+ */
+ /* if hwcur does not change, nothing to report.
+ * otherwise remember the change so perhaps we can
+ * shift the block at the next reinit
+ */
+ if (new_cur == kring->nr_hwcur &&
+ kring->nr_hwavail == kring->nkr_num_slots - 1) {
+ /* all ok */
+ D("+++ NR_REINIT ok on %s TX[%d]", na->ifp->if_xname, n);
+ } else {
+ D("+++ NR_REINIT set on %s TX[%d]", na->ifp->if_xname, n);
+ }
+ ring->flags |= NR_REINIT;
+ na->flags |= NR_REINIT;
+ ring->avail = kring->nr_hwavail = kring->nkr_num_slots - 1;
+ ring->cur = kring->nr_hwcur = new_cur;
+ } else {
+ /*
+ * The last argument is the next free slot.
+ * In the RX ring we have nr_hwavail full buffers starting
+ * from nr_hwcur.
+ * If nr_hwavail == 0 and nr_hwcur does not change we are ok
+ * otherwise we might be in trouble as the buffers are
+ * changing.
+ */
+ if (new_cur == kring->nr_hwcur && kring->nr_hwavail == 0) {
+ /* all ok */
+ D("+++ NR_REINIT ok on %s RX[%d]", na->ifp->if_xname, n);
+ } else {
+ D("+++ NR_REINIT set on %s RX[%d]", na->ifp->if_xname, n);
+ }
+ ring->flags |= NR_REINIT;
+ na->flags |= NR_REINIT;
+ ring->avail = kring->nr_hwavail = 0; /* no data */
+ ring->cur = kring->nr_hwcur = new_cur;
+ }
+
+ slot = ring->slot;
+ /*
+ * Check that buffer indexes are correct. If we find a
+ * bogus value we are a bit in trouble because we cannot
+ * recover easily. Best we can do is (probably) persistently
+ * reset the ring.
+ */
+ for (i = 0; i < kring->nkr_num_slots; i++) {
+ if (slot[i].buf_idx >= netmap_total_buffers) {
+ D("invalid buf_idx %d at slot %d", slot[i].buf_idx, i);
+ slot[i].buf_idx = 0; /* XXX reset */
+ }
+ /* XXX we don't really need to set the length */
+ slot[i].len = 0;
+ }
+ /* wakeup possible waiters, both on the ring and on the global
+ * selfd. Perhaps a bit early now but the device specific
+ * routine is locked so hopefully we won't have a race.
+ */
+ selwakeuppri(&kring->si, PI_NET);
+ selwakeuppri(&kring[na->num_queues + 1 - n].si, PI_NET);
+ return kring->ring->slot;
+}
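
For context, a driver's ring-initialization code would typically call netmap_reset() as sketched below (an illustrative fragment with hypothetical names; a real driver also reprograms its hardware descriptors here):

    /* hypothetical fragment of a driver's TX ring (re)initialization */
    struct netmap_adapter *na = NA(sc->ifp);
    struct netmap_slot *slot;
    u_int j;

    slot = netmap_reset(na, NR_TX, ring_nr, 0);
    if (slot != NULL) {			/* interface is in netmap mode */
    	for (j = 0; j < na->num_tx_desc; j++) {
    		void *addr = NMB(slot + j);	/* netmap buffer */

    		/* attach the netmap buffer to the NIC descriptor */
    		netmap_load_map(sc->txtag, sc->tx_buffers[j].map,
    		    addr, na->buff_size);
    		/* ... store the bus address in the hw descriptor ... */
    	}
    }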
+
+static void
+ns_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs,
+ __unused int nseg, __unused int error)
+{
+}
+
+/* unload a bus_dmamap and create a new one. Used when the
+ * buffer in the slot is changed.
+ * XXX buflen is probably not needed, buffers have constant size.
+ */
+void
+netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map,
+ void *buf, bus_size_t buflen)
+{
+ bus_addr_t paddr;
+ bus_dmamap_unload(tag, map);
+ bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr,
+ BUS_DMA_NOWAIT);
+}
+
+void
+netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map,
+ void *buf, bus_size_t buflen)
+{
+ bus_addr_t paddr;
+ bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr,
+ BUS_DMA_NOWAIT);
+}
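
In a driver's txsync/rxsync callbacks the reload variant pairs with the NS_BUF_CHANGED flag described in netmap.h: when userspace has swapped the buffer attached to a slot, the map must be reloaded before the NIC can use it. A hypothetical fragment (names are illustrative):

    /* inside a driver's nm_txsync handler, for slot j of the ring */
    struct netmap_slot *slot = &ring->slot[j];
    void *addr = NMB(slot);

    if (slot->flags & NS_BUF_CHANGED) {
    	/* the buffer index changed: reload the dma map */
    	netmap_reload_map(sc->txtag, sc->tx_buffers[j].map,
    	    addr, na->buff_size);
    	slot->flags &= ~NS_BUF_CHANGED;
    }
    /* ... then fill the NIC descriptor with addr and slot->len ... */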
+
+/*------ netmap memory allocator -------*/
+/*
+ * Request for a chunk of memory.
+ *
+ * Memory objects are arranged in a list, which we walk until we find
+ * an object with enough free space.
+ * This first-fit scan sounds inefficient, but since allocation is
+ * done only once, we can afford it.
+ *
+ * Return NULL on failure.
+ */
+static void *
+netmap_malloc(size_t size, __unused const char *msg)
+{
+ struct netmap_mem_obj *mem_obj, *new_mem_obj;
+ void *ret = NULL;
+
+ NMA_LOCK();
+ TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) {
+ if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size)
+ continue;
+
+ new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
+ M_WAITOK | M_ZERO);
+ TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next);
+
+ new_mem_obj->nmo_used = 1;
+ new_mem_obj->nmo_size = size;
+ new_mem_obj->nmo_data = mem_obj->nmo_data;
+ memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size);
+
+ mem_obj->nmo_size -= size;
+ mem_obj->nmo_data = (char *) mem_obj->nmo_data + size;
+ if (mem_obj->nmo_size == 0) {
+ TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj,
+ nmo_next);
+ free(mem_obj, M_NETMAP);
+ }
+
+ ret = new_mem_obj->nmo_data;
+
+ break;
+ }
+ NMA_UNLOCK();
+ ND("%s: %d bytes at %p", msg, size, ret);
+
+ return (ret);
+}
+
+/*
+ * Return the memory to the allocator.
+ *
+ * While freeing a memory object, we try to merge adjacent chunks in
+ * order to reduce memory fragmentation.
+ */
+static void
+netmap_free(void *addr, const char *msg)
+{
+ size_t size;
+ struct netmap_mem_obj *cur, *prev, *next;
+
+ if (addr == NULL) {
+ D("NULL addr for %s", msg);
+ return;
+ }
+
+ NMA_LOCK();
+ TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) {
+ if (cur->nmo_data == addr && cur->nmo_used)
+ break;
+ }
+ if (cur == NULL) {
+ NMA_UNLOCK();
+ D("invalid addr %s %p", msg, addr);
+ return;
+ }
+
+ size = cur->nmo_size;
+ cur->nmo_used = 0;
+
+ /* merge current chunk of memory with the previous one,
+ if present. */
+ prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next);
+ if (prev && prev->nmo_used == 0) {
+ TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next);
+ prev->nmo_size += cur->nmo_size;
+ free(cur, M_NETMAP);
+ cur = prev;
+ }
+
+ /* merge with the next one */
+ next = TAILQ_NEXT(cur, nmo_next);
+ if (next && next->nmo_used == 0) {
+ TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next);
+ cur->nmo_size += next->nmo_size;
+ free(next, M_NETMAP);
+ }
+ NMA_UNLOCK();
+ ND("freed %s %d bytes at %p", msg, size, addr);
+}
+
+
+/*
+ * Initialize the memory allocator.
+ *
+ * Create the descriptor for the memory, allocate the pool of memory
+ * and initialize the list of memory objects with a single chunk
+ * containing the whole pre-allocated memory marked as free.
+ *
+ * Start with a large size, then halve as needed if we fail to
+ * allocate the block. While halving, always add one extra page
+ * because buffers 0 and 1 are used for special purposes.
+ * Return 0 on success, errno otherwise.
+ */
+static int
+netmap_memory_init(void)
+{
+ struct netmap_mem_obj *mem_obj;
+ void *buf = NULL;
+ int i, n, sz = NETMAP_MEMORY_SIZE;
+ int extra_sz = 0; // space for rings and two spare buffers
+
+ for (; !buf && sz >= 1<<20; sz >>=1) {
+ extra_sz = sz/200;
+ extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
+ buf = contigmalloc(sz + extra_sz,
+ M_NETMAP,
+ M_WAITOK | M_ZERO,
+ 0, /* low address */
+ -1UL, /* high address */
+ PAGE_SIZE, /* alignment */
+ 0 /* boundary */
+ );
+ }
+ if (buf == NULL)
+ return (ENOMEM);
+ sz += extra_sz;
+ netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP,
+ M_WAITOK | M_ZERO);
+ mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL,
+ MTX_DEF);
+ TAILQ_INIT(&netmap_mem_d->nm_molist);
+ netmap_mem_d->nm_buffer = buf;
+ netmap_mem_d->nm_totalsize = sz;
+
+ /*
+ * A buffer takes 2k, a slot takes 8 bytes + ring overhead,
+ * so the ratio is 200:1. In other words, we can use 1/200 of
+ * the memory for the rings, and the rest for the buffers,
+ * and be sure we never run out.
+ */
+ netmap_mem_d->nm_size = sz/200;
+ netmap_mem_d->nm_buf_start =
+ (netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
+ netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start;
+
+ nm_buf_pool.base = netmap_mem_d->nm_buffer;
+ nm_buf_pool.base += netmap_mem_d->nm_buf_start;
+ netmap_buffer_base = nm_buf_pool.base;
+ D("netmap_buffer_base %p (offset %d)",
+ netmap_buffer_base, netmap_mem_d->nm_buf_start);
+ /* number of buffers, they all start as free */
+
+ netmap_total_buffers = nm_buf_pool.total_buffers =
+ netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE;
+ nm_buf_pool.bufsize = NETMAP_BUF_SIZE;
+
+ D("Have %d MB, use %dKB for rings, %d buffers at %p",
+ (sz >> 20), (netmap_mem_d->nm_size >> 10),
+ nm_buf_pool.total_buffers, nm_buf_pool.base);
+
+ /* allocate and initialize the bitmap. Entry 0 is considered
+ * always busy (used as default when there are no buffers left).
+ */
+ n = (nm_buf_pool.total_buffers + 31) / 32;
+ nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP,
+ M_WAITOK | M_ZERO);
+ nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */
+ for (i = 1; i < n; i++)
+ nm_buf_pool.bitmap[i] = ~0;
+ nm_buf_pool.free = nm_buf_pool.total_buffers - 2;
+
+ mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
+ M_WAITOK | M_ZERO);
+ TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
+ mem_obj->nmo_used = 0;
+ mem_obj->nmo_size = netmap_mem_d->nm_size;
+ mem_obj->nmo_data = netmap_mem_d->nm_buffer;
+
+ return (0);
+}
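
As a worked example of the split computed above (figures are illustrative, not taken from this change): with a 64 MB arena, the ring region is 64 MB / 200 ≈ 328 KB rounded up to a page boundary, and the remaining ~63.7 MB holds about 32,600 buffers of NETMAP_BUF_SIZE (2 KB) each; since each buffer only needs an 8-byte slot plus a share of the ring headers, the 1/200 reservation cannot be exhausted before the buffers are.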
+
+
+/*
+ * Finalize the memory allocator.
+ *
+ * Free all the memory objects contained inside the list, and deallocate
+ * the pool of memory; finally free the memory allocator descriptor.
+ */
+static void
+netmap_memory_fini(void)
+{
+ struct netmap_mem_obj *mem_obj;
+
+ while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) {
+ mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist);
+ TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
+ if (mem_obj->nmo_used == 1) {
+ printf("netmap: leaked %d bytes at %p\n",
+ mem_obj->nmo_size,
+ mem_obj->nmo_data);
+ }
+ free(mem_obj, M_NETMAP);
+ }
+ contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP);
+ // XXX mutex_destroy(nm_mtx);
+ free(netmap_mem_d, M_NETMAP);
+}
+
+
+/*
+ * Module loader.
+ *
+ * Create the /dev/netmap device and initialize all global
+ * variables.
+ *
+ * Return 0 on success, errno on failure.
+ */
+static int
+netmap_init(void)
+{
+ int error;
+
+
+ error = netmap_memory_init();
+ if (error != 0) {
+ printf("netmap: unable to initialize the memory allocator.");
+ return (error);
+ }
+ printf("netmap: loaded module with %d Mbytes\n",
+ netmap_mem_d->nm_totalsize >> 20);
+
+ netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
+ "netmap");
+
+ return (0);
+}
+
+
+/*
+ * Module unloader.
+ *
+ * Free all the memory, and destroy the ``/dev/netmap`` device.
+ */
+static void
+netmap_fini(void)
+{
+ destroy_dev(netmap_dev);
+
+ netmap_memory_fini();
+
+ printf("netmap: unloaded module.\n");
+}
+
+
+/*
+ * Kernel entry point.
+ *
+ * Initialize/finalize the module and return.
+ *
+ * Return 0 on success, errno on failure.
+ */
+static int
+netmap_loader(__unused struct module *module, int event, __unused void *arg)
+{
+ int error = 0;
+
+ switch (event) {
+ case MOD_LOAD:
+ error = netmap_init();
+ break;
+
+ case MOD_UNLOAD:
+ netmap_fini();
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+
+DEV_MODULE(netmap, netmap_loader, NULL);
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
new file mode 100644
index 000000000000..5434609c447b
--- /dev/null
+++ b/sys/dev/netmap/netmap_kern.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: netmap_kern.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * The header contains the definitions of constants and function
+ * prototypes used only in kernelspace.
+ */
+
+#ifndef _NET_NETMAP_KERN_H_
+#define _NET_NETMAP_KERN_H_
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_NETMAP);
+#endif
+
+#define ND(format, ...)
+#define D(format, ...) \
+ do { \
+ struct timeval __xxts; \
+ microtime(&__xxts); \
+ printf("%03d.%06d %s [%d] " format "\n",\
+ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+struct netmap_adapter;
+
+/*
+ * private, kernel view of a ring.
+ *
+ * XXX 20110627-todo
+ * The index in the NIC and netmap ring is offset by nkr_hwofs slots.
+ * This is so that, on a reset, buffers owned by userspace are not
+ * modified by the kernel. In particular:
+ * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides
+ * with the next empty buffer as known by the hardware (next_to_check or so).
+ * TX rings: hwcur + hwofs coincides with next_to_send
+ */
+struct netmap_kring {
+ struct netmap_ring *ring;
+ u_int nr_hwcur;
+ int nr_hwavail;
+ u_int nr_kflags;
+ u_int nkr_num_slots;
+
+ u_int nkr_hwofs; /* offset between NIC and netmap ring */
+ struct netmap_adapter *na; // debugging
+ struct selinfo si; /* poll/select wait queue */
+};
+
+/*
+ * This struct is part of and extends the 'struct adapter' (or
+ * equivalent) device descriptor. It contains all fields needed to
+ * support netmap operation.
+ */
+struct netmap_adapter {
+ int refcount; /* number of user-space descriptors using this
+ interface, which is equal to the number of
+ struct netmap_if objs in the mapped region. */
+
+ int separate_locks; /* set if the interface supports different
+ locks for rx, tx and core. */
+
+ u_int num_queues; /* number of tx/rx queue pairs: this is
+ a duplicate field needed to simplify the
+ signature of ``netmap_detach``. */
+
+ u_int num_tx_desc; /* number of descriptors in each queue */
+ u_int num_rx_desc;
+ u_int buff_size;
+
+ u_int flags; /* NR_REINIT */
+ /* tx_rings and rx_rings are private but allocated
+ * as a contiguous chunk of memory. Each array has
+ * N+1 entries, for the adapter queues and for the host queue.
+ */
+ struct netmap_kring *tx_rings; /* array of TX rings. */
+ struct netmap_kring *rx_rings; /* array of RX rings. */
+
+ /* copy of if_qflush and if_transmit pointers, to intercept
+ * packets from the network stack when netmap is active.
+ * XXX probably if_qflush is not necessary.
+ */
+ void (*if_qflush)(struct ifnet *);
+ int (*if_transmit)(struct ifnet *, struct mbuf *);
+
+ /* references to the ifnet and device routines, used by
+ * the generic netmap functions.
+ */
+ struct ifnet *ifp; /* adapter is ifp->if_softc */
+
+ int (*nm_register)(struct ifnet *, int onoff);
+ void (*nm_lock)(void *, int what, u_int ringid);
+ int (*nm_txsync)(void *, u_int ring, int lock);
+ int (*nm_rxsync)(void *, u_int ring, int lock);
+};
+
+/*
+ * The combination of "enable" (ifp->if_capabilities &IFCAP_NETMAP)
+ * and refcount gives the status of the interface, namely:
+ *
+ * enable refcount Status
+ *
+ * FALSE 0 normal operation
+ * FALSE != 0 -- (impossible)
+ * TRUE 1 netmap mode
+ * TRUE 0 being deleted.
+ */
+
+#define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \
+ ( (_na)->ifp->if_capenable & IFCAP_NETMAP) )
+
+/*
+ * parameters for (*nm_lock)(adapter, what, index)
+ */
+enum {
+ NETMAP_NO_LOCK = 0,
+ NETMAP_CORE_LOCK, NETMAP_CORE_UNLOCK,
+ NETMAP_TX_LOCK, NETMAP_TX_UNLOCK,
+ NETMAP_RX_LOCK, NETMAP_RX_UNLOCK,
+};
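
A driver's nm_lock callback just maps these requests onto its own mutexes. For a device with a single lock (separate_locks == 0) it could look roughly like the following hypothetical sketch:

    static void
    foo_netmap_lock(void *arg, int what, __unused u_int ringid)
    {
    	struct foo_softc *sc = arg;	/* the adapter softc */

    	switch (what) {
    	case NETMAP_CORE_LOCK:
    	case NETMAP_TX_LOCK:
    	case NETMAP_RX_LOCK:
    		mtx_lock(&sc->core_mtx);
    		break;
    	case NETMAP_CORE_UNLOCK:
    	case NETMAP_TX_UNLOCK:
    	case NETMAP_RX_UNLOCK:
    		mtx_unlock(&sc->core_mtx);
    		break;
    	}
    }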
+
+/*
+ * The following are support routines used by individual drivers to
+ * support netmap operation.
+ *
+ * netmap_attach() initializes a struct netmap_adapter, allocating the
+ * struct netmap_ring's and the struct selinfo.
+ *
+ * netmap_detach() frees the memory allocated by netmap_attach().
+ *
+ * netmap_start() replaces the if_transmit routine of the interface,
+ * and is used to intercept packets coming from the stack.
+ *
+ * netmap_load_map/netmap_reload_map are helper routines to set/reset
+ * the dmamap for a packet buffer
+ *
+ * netmap_reset() is a helper routine to be called in the driver
+ * when reinitializing a ring.
+ */
+int netmap_attach(struct netmap_adapter *, int);
+void netmap_detach(struct ifnet *);
+int netmap_start(struct ifnet *, struct mbuf *);
+enum txrx { NR_RX = 0, NR_TX = 1 };
+struct netmap_slot *netmap_reset(struct netmap_adapter *na,
+ enum txrx tx, int n, u_int new_cur);
+void netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map,
+ void *buf, bus_size_t buflen);
+void netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map,
+ void *buf, bus_size_t buflen);
+int netmap_ring_reinit(struct netmap_kring *);
+
+/*
+ * XXX eventually, get rid of netmap_total_buffers and netmap_buffer_base
+ * in favour of the structure
+ */
+// struct netmap_buf_pool;
+// extern struct netmap_buf_pool nm_buf_pool;
+extern u_int netmap_total_buffers;
+extern char *netmap_buffer_base;
+extern int netmap_verbose; // XXX debugging
+enum { /* verbose flags */
+ NM_VERB_ON = 1, /* generic verbose */
+ NM_VERB_HOST = 0x2, /* verbose host stack */
+ NM_VERB_RXSYNC = 0x10, /* verbose on rxsync/txsync */
+ NM_VERB_TXSYNC = 0x20,
+ NM_VERB_RXINTR = 0x100, /* verbose on rx/tx intr (driver) */
+ NM_VERB_TXINTR = 0x200,
+ NM_VERB_NIC_RXSYNC = 0x1000, /* verbose on rx/tx intr (driver) */
+ NM_VERB_NIC_TXSYNC = 0x2000,
+};
+
+/*
+ * return a pointer to the struct netmap adapter from the ifp
+ */
+#define NA(_ifp) ((struct netmap_adapter *)(_ifp)->if_pspare[0])
+
+
+/*
+ * return the address of a buffer.
+ * XXX this is a special version with hardwired 2k bufs
+ * On error return netmap_buffer_base which is detected as a bad pointer.
+ */
+static inline char *
+NMB(struct netmap_slot *slot)
+{
+ uint32_t i = slot->buf_idx;
+ return (i >= netmap_total_buffers) ? netmap_buffer_base :
+#if NETMAP_BUF_SIZE == 2048
+ netmap_buffer_base + (i << 11);
+#else
+ netmap_buffer_base + (i *NETMAP_BUF_SIZE);
+#endif
+}
+
+#endif /* _NET_NETMAP_KERN_H_ */
diff --git a/sys/net/netmap.h b/sys/net/netmap.h
new file mode 100644
index 000000000000..be9c686a49ed
--- /dev/null
+++ b/sys/net/netmap.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. Neither the name of the authors nor the names of their contributors
+ * may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * This header contains the definitions of the constants and the
+ * structures needed by the ``netmap'' module, both kernel and
+ * userspace.
+ */
+
+#ifndef _NET_NETMAP_H_
+#define _NET_NETMAP_H_
+
+/*
+ * --- Netmap data structures ---
+ *
+ * The data structures used by netmap are shown below. Those in
+ * capital letters are in an mmap()ed area shared with userspace,
+ * while others are private to the kernel.
+ * Shared structures do not contain pointers but only relative
+ * offsets, so that addressing is portable between kernel and userspace.
+ *
+ * The 'softc' of each interface is extended with a struct netmap_adapter
+ * containing information to support netmap operation. In addition to
+ * the fixed fields, it has two pointers to reach the arrays of
+ * 'struct netmap_kring' which in turn reaches the various
+ * struct netmap_ring, shared with userspace.
+
+
+ softc
++----------------+
+| standard fields|
+| if_pspare[0] ----------+
++----------------+ |
+ |
++----------------+<------+
+|(netmap_adapter)|
+| | netmap_kring
+| tx_rings *--------------------------------->+-------------+
+| | netmap_kring | ring *---------> ...
+| rx_rings *---------->+--------------+ | nr_hwcur |
++----------------+ | ring *-------+ | nr_hwavail |
+ | nr_hwcur | | | selinfo |
+ | nr_hwavail | | +-------------+
+ | selinfo | | | ... |
+ +--------------+ | (na_num_rings+1 entries)
+ | .... | | | |
+ (na_num_rings+1 entries) +-------------+
+ | | |
+ +--------------+ |
+ | NETMAP_RING
+ +---->+-------------+
+ / | cur |
+ NETMAP_IF (nifp, one per file desc.) / | avail |
+ +---------------+ / | buf_ofs |
+ | ni_num_queues | / +=============+
+ | | / | buf_idx | slot[0]
+ | | / | len, flags |
+ | | / +-------------+
+ +===============+ / | buf_idx | slot[1]
+ | txring_ofs[0] | (rel.to nifp)--' | len, flags |
+ | txring_ofs[1] | +-------------+
+ (num_rings+1 entries) (nr_num_slots entries)
+ | txring_ofs[n] | | buf_idx | slot[n-1]
+ +---------------+ | len, flags |
+ | rxring_ofs[0] | +-------------+
+ | rxring_ofs[1] |
+ (num_rings+1 entries)
+ | rxring_ofs[n] |
+ +---------------+
+
+ * The NETMAP_RING is the shadow ring that mirrors the NIC rings.
+ * Each slot has the index of a buffer, its length and some flags.
+ * In user space, the buffer address is computed as
+ * (char *)ring + buf_ofs + index*MAX_BUF_SIZE
+ * In the kernel, buffers do not necessarily need to be contiguous,
+ * and the virtual and physical addresses are derived through
+ * a lookup table. When userspace wants to use a different buffer
+ * in a location, it must set the NS_BUF_CHANGED flag to make
+ * sure that the kernel updates the hardware ring and
+ * other fields (bus_dmamap, etc.) as needed.
+ *
+ * Normally the driver is not requested to report the result of
+ * transmissions (this can dramatically speed up operation).
+ * However the user may request to report completion by setting
+ * NS_REPORT.
+ */
+struct netmap_slot {
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* packet length, to be copied to/from the hw ring */
+ uint16_t flags; /* buf changed, etc. */
+#define NS_BUF_CHANGED 0x0001 /* must resync the map, buffer changed */
+#define NS_REPORT 0x0002 /* ask the hardware to report results
+ * e.g. by generating an interrupt
+ */
+};
+
+/*
+ * Netmap representation of a TX or RX ring (also known as "queue").
+ * This is a queue implemented as a fixed-size circular array.
+ * At the software level, two fields are important: avail and cur.
+ *
+ * In TX rings:
+ * avail indicates the number of slots available for transmission.
+ * It is decremented by the application when it appends a
+ * packet, and set to nr_hwavail (see below) on a
+ * NIOCTXSYNC to reflect the actual state of the queue
+ * (keeping track of completed transmissions).
+ * cur indicates the empty slot to use for the next packet
+ * to send (i.e. the "tail" of the queue).
+ * It is incremented by the application.
+ *
+ * The kernel side of netmap uses two additional fields in its own
+ * private ring structure, netmap_kring:
+ * nr_hwcur is a copy of nr_cur on an NIOCTXSYNC.
+ * nr_hwavail is the number of slots known as available by the
+ * hardware. It is updated on an INTR (inc by the
+ * number of packets sent) and on a NIOCTXSYNC
+ * (decrease by nr_cur - nr_hwcur)
+ * A special case, nr_hwavail is -1 if the transmit
+ * side is idle (no pending transmits).
+ *
+ * In RX rings:
+ * avail is the number of packets available (possibly 0).
+ * It is decremented by the software when it consumes
+ * a packet, and set to nr_hwavail on a NIOCRXSYNC
+ * cur indicates the first slot that contains a packet
+ * (the "head" of the queue).
+ * It is incremented by the software when it consumes
+ * a packet.
+ *
+ * The kernel side of netmap uses two additional fields in the kring:
+ * nr_hwcur is a copy of nr_cur on an NIOCRXSYNC
+ * nr_hwavail is the number of packets available. It is updated
+ * on INTR (inc by the number of new packets arrived)
+ * and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur).
+ *
+ * DATA OWNERSHIP/LOCKING:
+ * The netmap_ring is owned by the user program and it is only
+ * accessed or modified in the upper half of the kernel during
+ * a system call.
+ *
+ * The netmap_kring is only modified by the upper half of the kernel.
+ */
+struct netmap_ring {
+ /*
+ * nr_buf_base_ofs is meant to be used through macros.
+ * It contains the offset of the buffer region from this
+ * descriptor.
+ */
+ const ssize_t buf_ofs;
+ const uint32_t num_slots; /* number of slots in the ring. */
+ uint32_t avail; /* number of usable slots */
+ uint32_t cur; /* 'current' r/w position */
+
+ const uint16_t nr_buf_size;
+ uint16_t flags;
+ /*
+ * When a ring is reinitialized, the kernel sets kflags.
+ * On exit from a syscall, if the flag is found set, we
+ * also reinitialize the nr_* variables. The kflag is then
+ * unconditionally copied to nr_flags and cleared.
+ */
+#define NR_REINIT 0x0001 /* ring reinitialized! */
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+
+ struct timeval ts; /* time of last *sync() */
+
+ /* the slots follow. This struct has variable size */
+ struct netmap_slot slot[0]; /* array of slots. */
+};
+
+
+/*
+ * Netmap representation of an interface and its queue(s).
+ * There is one netmap_if for each file descriptor on which we want
+ * to select/poll. We assume that each interface has the same number
+ * of receive and transmit queues.
+ * select/poll operates on one or all pairs depending on the value of
+ * nr_ringid passed on the ioctl.
+ */
+struct netmap_if {
+ char ni_name[IFNAMSIZ]; /* name of the interface. */
+ const u_int ni_version; /* API version, currently unused */
+ const u_int ni_num_queues; /* number of queue pairs (TX/RX). */
+ const u_int ni_rx_queues; /* if zero, use ni_num_queues */
+ /*
+ * the following array contains the offset of
+ * each netmap ring from this structure. The first num_queues+1
+ * refer to the tx rings, the next n+1 refer to the rx rings.
+ * The area is filled up by the kernel on NIOCREGIF,
+ * and then only read by userspace code.
+ * entries 0..ni_num_queues-1 indicate the hardware queues,
+ * entry ni_num_queues is the queue from/to the stack.
+ */
+ const ssize_t ring_ofs[0];
+};
+
+#ifndef IFCAP_NETMAP /* this should go in net/if.h */
+#define IFCAP_NETMAP 0x100000
+#endif
+
+#ifndef NIOCREGIF
+/*
+ * ioctl names and related fields
+ *
+ * NIOCGINFO takes a struct ifreq, the interface name is the input,
+ * the outputs are number of queues and number of descriptor
+ * for each queue (useful to set number of threads etc.).
+ *
+ * NIOCREGIF takes an interface name within a struct ifreq,
+ * and activates netmap mode on the interface (if possible).
+ *
+ * NIOCUNREGIF unregisters the interface associated to the fd.
+ *
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ * whose identity is set in NIOCREGIF through nr_ringid
+ */
+
+/*
+ * struct nmreq overlays a struct ifreq
+ */
+struct nmreq {
+ char nr_name[IFNAMSIZ];
+ uint32_t nr_version; /* API version (unused) */
+ uint32_t nr_offset; /* nifp offset in the shared region */
+ uint32_t nr_memsize; /* size of the shared region */
+ uint32_t nr_numslots; /* descriptors per queue */
+ uint16_t nr_numrings;
+ uint16_t nr_ringid; /* ring(s) we care about */
+#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */
+#define NETMAP_SW_RING 0x2000 /* we process the sw ring */
+#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
+#define NETMAP_RING_MASK 0xfff /* the ring number */
+};
+
+/*
+ * default buf size is 2048, but it may make sense to have
+ * it shorter for better cache usage.
+ */
+
+#define NETMAP_BUF_SIZE (2048)
+#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */
+#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */
+#define NIOCUNREGIF _IO('i', 147) /* interface unregister */
+#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
+#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
+#endif /* !NIOCREGIF */
+
+#endif /* _NET_NETMAP_H_ */
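
To make the cur/avail conventions above concrete, a minimal userspace transmit loop over ring 0 might look like the sketch below. It is hypothetical: fd is assumed to be a descriptor already registered with NIOCREGIF, nifp the mapped netmap_if, pkt/pkt_len/tosend are placeholders, the usual includes mirror those of bridge.c further down, and the accessor macros come from netmap_user.h, which follows.

    struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);

    while (tosend > 0) {
    	if (txring->avail == 0) {	/* no free slots, wait for some */
    		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
    		poll(&pfd, 1, -1);
    		continue;
    	}
    	while (txring->avail > 0 && tosend > 0) {
    		u_int i = txring->cur;
    		struct netmap_slot *slot = &txring->slot[i];
    		char *buf = NETMAP_BUF(txring, slot->buf_idx);

    		memcpy(buf, pkt, pkt_len);	/* or swap buf_idx instead */
    		slot->len = pkt_len;
    		txring->cur = NETMAP_RING_NEXT(txring, i);
    		txring->avail--;
    		tosend--;
    	}
    	ioctl(fd, NIOCTXSYNC, NULL);	/* tell the kernel about the new slots */
    }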
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
new file mode 100644
index 000000000000..c9443b89e43f
--- /dev/null
+++ b/sys/net/netmap_user.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. Neither the name of the authors nor the names of their contributors
+ * may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: netmap_user.h 9495 2011-10-18 15:28:23Z luigi $
+ *
+ * This header contains the macros used to manipulate netmap structures
+ * and packets in userspace. See netmap(4) for more information.
+ *
+ * The address of the struct netmap_if, say nifp, is determined
+ * by the value returned from ioctl(.., NIOCREGIF, ...) and the mmap
+ * region:
+ * ioctl(fd, NIOCREGIF, &req);
+ * mem = mmap(0, ... );
+ * nifp = NETMAP_IF(mem, req.nr_offset);
+ * (so simple, we could just do it manually)
+ *
+ * From there:
+ * struct netmap_ring *NETMAP_TXRING(nifp, index)
+ * struct netmap_ring *NETMAP_RXRING(nifp, index)
+ * we can access ring->cur, ring->avail, ring->flags
+ *
+ * ring->slot[i] gives us the i-th slot (we can access
+ * directly len, flags, buf_idx)
+ *
+ * char *buf = NETMAP_BUF(ring, index) returns a pointer to
+ * the i-th buffer
+ *
+ * Since rings are circular, we have macros to compute the next index
+ * i = NETMAP_RING_NEXT(ring, i);
+ */
+
+#ifndef _NET_NETMAP_USER_H_
+#define _NET_NETMAP_USER_H_
+
+#define NETMAP_IF(b, o) (struct netmap_if *)((char *)(b) + (o))
+
+#define NETMAP_TXRING(nifp, index) \
+ ((struct netmap_ring *)((char *)(nifp) + \
+ (nifp)->ring_ofs[index] ) )
+
+#define NETMAP_RXRING(nifp, index) \
+ ((struct netmap_ring *)((char *)(nifp) + \
+ (nifp)->ring_ofs[index + (nifp)->ni_num_queues+1] ) )
+
+#if NETMAP_BUF_SIZE != 2048
+#error cannot handle odd size
+#define NETMAP_BUF(ring, index) \
+ ((char *)(ring) + (ring)->buf_ofs + ((index)*NETMAP_BUF_SIZE))
+#else
+#define NETMAP_BUF(ring, index) \
+ ((char *)(ring) + (ring)->buf_ofs + ((index)<<11))
+#endif
+
+#define NETMAP_RING_NEXT(r, i) \
+ ((i)+1 == (r)->num_slots ? 0 : (i) + 1 )
+
+/*
+ * Return 1 if the given tx ring is empty.
+ *
+ * @r netmap_ring descriptor pointer.
+ * Special case, a negative value in hwavail indicates that the
+ * transmit queue is idle.
+ * XXX revise
+ */
+#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1)
+
+#endif /* _NET_NETMAP_USER_H_ */
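
Putting the pieces together, the setup sequence sketched in the comment above expands to roughly the following hedged example (no error handling; "em0" is just a placeholder interface name; the same steps appear in netmap_open() in bridge.c below):

    struct nmreq req;
    struct netmap_if *nifp;
    void *mem;
    int fd;

    fd = open("/dev/netmap", O_RDWR);
    bzero(&req, sizeof(req));
    strncpy(req.nr_name, "em0", sizeof(req.nr_name));
    ioctl(fd, NIOCGINFO, &req);		/* fills nr_memsize, nr_numrings, ... */
    ioctl(fd, NIOCREGIF, &req);		/* switch em0 to netmap mode */
    mem = mmap(0, req.nr_memsize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
    nifp = NETMAP_IF(mem, req.nr_offset);
    /* rings are now reachable with NETMAP_TXRING(nifp, i) and
     * NETMAP_RXRING(nifp, i); use NIOCTXSYNC/NIOCRXSYNC or poll(2)
     * to exchange slots with the kernel, then NIOCUNREGIF and
     * munmap() to tear everything down.
     */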
diff --git a/tools/tools/README b/tools/tools/README
index 253b2e08f739..9c3db2fc5364 100644
--- a/tools/tools/README
+++ b/tools/tools/README
@@ -50,6 +50,7 @@ mfc Merge a directory from HEAD to a branch where it does not
mid Create a Message-ID database for mailing lists.
mwl Tools specific to the Marvell 88W8363 support
ncpus Count the number of processors
+netmap Test applications for netmap(4)
notescheck Check for missing devices and options in NOTES files.
npe Tools specific to the Intel IXP4XXX NPE device
nxge A diagnostic tool for the nxge(4) driver
diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile
new file mode 100644
index 000000000000..4b682e52a311
--- /dev/null
+++ b/tools/tools/netmap/Makefile
@@ -0,0 +1,25 @@
+#
+# $FreeBSD$
+#
+# For multiple programs using a single source file each,
+# we can just define 'progs' and create custom targets.
+PROGS = pkt-gen bridge testpcap libnetmap.so
+
+CLEANFILES = $(PROGS) pcap.o
+NO_MAN=
+CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys
+CFLAGS += -Wextra
+
+LDFLAGS += -lpthread -lpcap
+
+.include <bsd.prog.mk>
+.include <bsd.lib.mk>
+
+all: $(PROGS)
+
+testpcap: pcap.c libnetmap.so
+ $(CC) $(CFLAGS) -L. -lnetmap -o ${.TARGET} pcap.c
+
+libnetmap.so: pcap.c
+ $(CC) $(CFLAGS) -fpic -c ${.ALLSRC}
+ $(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o}
diff --git a/tools/tools/netmap/README b/tools/tools/netmap/README
new file mode 100644
index 000000000000..9a1ba6096188
--- /dev/null
+++ b/tools/tools/netmap/README
@@ -0,0 +1,11 @@
+$FreeBSD$
+
+This directory contains examples that use netmap
+
+ pkt-gen a packet sink/source using the netmap API
+
+ bridge a two-port jumper wire, also using the native API
+
+ testpcap a jumper wire using libnetmap (or libpcap)
+
+ click* various click examples
diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c
new file mode 100644
index 000000000000..2385a0811fb5
--- /dev/null
+++ b/tools/tools/netmap/bridge.c
@@ -0,0 +1,456 @@
+/*
+ * (C) 2011 Luigi Rizzo, Matteo Landi
+ *
+ * BSD license
+ *
+ * A netmap client to bridge two network interfaces
+ * (or one interface and the host stack).
+ *
+ * $FreeBSD$
+ */
+
+#include <errno.h>
+#include <signal.h> /* signal */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h> /* strcmp */
+#include <fcntl.h> /* open */
+#include <unistd.h> /* close */
+
+#include <sys/endian.h> /* le64toh */
+#include <sys/mman.h> /* PROT_* */
+#include <sys/ioctl.h> /* ioctl */
+#include <machine/param.h>
+#include <sys/poll.h>
+#include <sys/socket.h> /* sockaddr.. */
+#include <arpa/inet.h> /* ntohs */
+
+#include <net/if.h> /* ifreq */
+#include <net/ethernet.h>
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+
+#include <netinet/in.h> /* sockaddr_in */
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+int verbose = 0;
+
+/* debug support */
+#define ND(format, ...) {}
+#define D(format, ...) do { \
+ if (!verbose) break; \
+ struct timeval _xxts; \
+ gettimeofday(&_xxts, NULL); \
+ fprintf(stderr, "%03d.%06d %s [%d] " format "\n", \
+ (int)_xxts.tv_sec %1000, (int)_xxts.tv_usec, \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+
+char *version = "$Id: bridge.c 9642 2011-11-07 21:39:47Z luigi $";
+
+static int do_abort = 0;
+
+/*
+ * info on a ring we handle
+ */
+struct my_ring {
+ const char *ifname;
+ int fd;
+ char *mem; /* userspace mmap address */
+ u_int memsize;
+ u_int queueid;
+ u_int begin, end; /* first..last+1 rings to check */
+ struct netmap_if *nifp;
+ struct netmap_ring *tx, *rx; /* shortcuts */
+
+ uint32_t if_flags;
+ uint32_t if_reqcap;
+ uint32_t if_curcap;
+};
+
+static void
+sigint_h(__unused int sig)
+{
+ do_abort = 1;
+ signal(SIGINT, SIG_DFL);
+}
+
+
+static int
+do_ioctl(struct my_ring *me, int what)
+{
+ struct ifreq ifr;
+ int error;
+
+ bzero(&ifr, sizeof(ifr));
+ strncpy(ifr.ifr_name, me->ifname, sizeof(ifr.ifr_name));
+ switch (what) {
+ case SIOCSIFFLAGS:
+ ifr.ifr_flagshigh = me->if_flags >> 16;
+ ifr.ifr_flags = me->if_flags & 0xffff;
+ break;
+ case SIOCSIFCAP:
+ ifr.ifr_reqcap = me->if_reqcap;
+ ifr.ifr_curcap = me->if_curcap;
+ break;
+ }
+ error = ioctl(me->fd, what, &ifr);
+ if (error) {
+ D("ioctl error %d", what);
+ return error;
+ }
+ switch (what) {
+ case SIOCGIFFLAGS:
+ me->if_flags = (ifr.ifr_flagshigh << 16) |
+ (0xffff & ifr.ifr_flags);
+ if (verbose)
+ D("flags are 0x%x", me->if_flags);
+ break;
+
+ case SIOCGIFCAP:
+ me->if_reqcap = ifr.ifr_reqcap;
+ me->if_curcap = ifr.ifr_curcap;
+ if (verbose)
+ D("curcap are 0x%x", me->if_curcap);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * open a device. if me->mem is null then do an mmap.
+ */
+static int
+netmap_open(struct my_ring *me, int ringid)
+{
+ int fd, err, l;
+ struct nmreq req;
+
+ me->fd = fd = open("/dev/netmap", O_RDWR);
+ if (fd < 0) {
+ D("Unable to open /dev/netmap");
+ return (-1);
+ }
+ bzero(&req, sizeof(req));
+ strncpy(req.nr_name, me->ifname, sizeof(req.nr_name));
+ req.nr_ringid = ringid;
+ err = ioctl(fd, NIOCGINFO, &req);
+ if (err) {
+ D("cannot get info on %s", me->ifname);
+ goto error;
+ }
+ me->memsize = l = req.nr_memsize;
+ if (verbose)
+ D("memsize is %d MB", l>>20);
+ err = ioctl(fd, NIOCREGIF, &req);
+ if (err) {
+ D("Unable to register %s", me->ifname);
+ goto error;
+ }
+
+ if (me->mem == NULL) {
+ me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+ if (me->mem == MAP_FAILED) {
+ D("Unable to mmap");
+ me->mem = NULL;
+ goto error;
+ }
+ }
+
+ me->nifp = NETMAP_IF(me->mem, req.nr_offset);
+ me->queueid = ringid;
+ if (ringid & NETMAP_SW_RING) {
+ me->begin = req.nr_numrings;
+ me->end = me->begin + 1;
+ } else if (ringid & NETMAP_HW_RING) {
+ me->begin = ringid & NETMAP_RING_MASK;
+ me->end = me->begin + 1;
+ } else {
+ me->begin = 0;
+ me->end = req.nr_numrings;
+ }
+ me->tx = NETMAP_TXRING(me->nifp, me->begin);
+ me->rx = NETMAP_RXRING(me->nifp, me->begin);
+ return (0);
+error:
+ close(me->fd);
+ return -1;
+}
+
+
+static int
+netmap_close(struct my_ring *me)
+{
+ D("");
+ if (me->mem)
+ munmap(me->mem, me->memsize);
+ ioctl(me->fd, NIOCUNREGIF, NULL);
+ close(me->fd);
+ return (0);
+}
+
+
+/*
+ * move up to 'limit' pkts from rxring to txring swapping buffers.
+ */
+static int
+process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
+ u_int limit, const char *msg)
+{
+ u_int j, k, m = 0;
+
+ /* print a warning if any of the ring flags is set (e.g. NR_REINIT) */
+ if (rxring->flags || txring->flags)
+ D("%s rxflags %x txflags %x",
+ msg, rxring->flags, txring->flags);
+ j = rxring->cur; /* RX */
+ k = txring->cur; /* TX */
+ if (rxring->avail < limit)
+ limit = rxring->avail;
+ if (txring->avail < limit)
+ limit = txring->avail;
+ m = limit;
+ while (limit-- > 0) {
+ struct netmap_slot *rs = &rxring->slot[j];
+ struct netmap_slot *ts = &txring->slot[k];
+ uint32_t pkt;
+
+ /* swap packets */
+ if (ts->buf_idx < 2 || rs->buf_idx < 2) {
+ D("wrong index rx[%d] = %d -> tx[%d] = %d",
+ j, rs->buf_idx, k, ts->buf_idx);
+ sleep(2);
+ }
+ pkt = ts->buf_idx;
+ ts->buf_idx = rs->buf_idx;
+ rs->buf_idx = pkt;
+
+ /* copy the packet length. */
+ if (rs->len < 14 || rs->len > 2048)
+ D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k);
+ else if (verbose > 1)
+ D("send len %d rx[%d] -> tx[%d]", rs->len, j, k);
+ ts->len = rs->len;
+
+ /* report the buffer change. */
+ ts->flags |= NS_BUF_CHANGED;
+ rs->flags |= NS_BUF_CHANGED;
+ j = NETMAP_RING_NEXT(rxring, j);
+ k = NETMAP_RING_NEXT(txring, k);
+ }
+ rxring->avail -= m;
+ txring->avail -= m;
+ rxring->cur = j;
+ txring->cur = k;
+ if (verbose && m > 0)
+ D("sent %d packets to %p", m, txring);
+
+ return (m);
+}
+
+/* move packets from src to destination */
+static int
+move(struct my_ring *src, struct my_ring *dst, u_int limit)
+{
+ struct netmap_ring *txring, *rxring;
+ u_int m = 0, si = src->begin, di = dst->begin;
+ const char *msg = (src->queueid & NETMAP_SW_RING) ?
+ "host->net" : "net->host";
+
+ while (si < src->end && di < dst->end) {
+ rxring = NETMAP_RXRING(src->nifp, si);
+ txring = NETMAP_TXRING(dst->nifp, di);
+ ND("txring %p rxring %p", txring, rxring);
+ if (rxring->avail == 0) {
+ si++;
+ continue;
+ }
+ if (txring->avail == 0) {
+ di++;
+ continue;
+ }
+ m += process_rings(rxring, txring, limit, msg);
+ }
+
+ return (m);
+}
+
+/*
+ * how many packets on this set of queues?
+ */
+static int
+howmany(struct my_ring *me, int tx)
+{
+ u_int i, tot = 0;
+
+ ND("me %p begin %d end %d", me, me->begin, me->end);
+ for (i = me->begin; i < me->end; i++) {
+ struct netmap_ring *ring = tx ?
+ NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i);
+ tot += ring->avail;
+ }
+ if (0 && verbose && tot && !tx)
+ D("ring %s %s %s has %d avail at %d",
+ me->ifname, tx ? "tx": "rx",
+ me->end > me->nifp->ni_num_queues ?
+ "host":"net",
+ tot, NETMAP_TXRING(me->nifp, me->begin)->cur);
+ return tot;
+}
+
+/*
+ * bridge [-v] if1 [if2]
+ *
+ * If only one name, or the two interfaces are the same,
+ * bridges userland and the adapter. Otherwise bridge
+ * two interfaces.
+ */
+int
+main(int argc, char **argv)
+{
+ struct pollfd pollfd[2];
+ int i;
+ u_int burst = 1024;
+ struct my_ring me[2];
+
+ fprintf(stderr, "%s %s built %s %s\n",
+ argv[0], version, __DATE__, __TIME__);
+
+ bzero(me, sizeof(me));
+
+ while (argc > 1 && !strcmp(argv[1], "-v")) {
+ verbose++;
+ argv++;
+ argc--;
+ }
+
+ if (argc < 2 || argc > 4) {
+ D("Usage: %s IFNAME1 [IFNAME2 [BURST]]", argv[0]);
+ return (1);
+ }
+
+ /* setup netmap interface #1. */
+ me[0].ifname = argv[1];
+ if (argc == 2 || !strcmp(argv[1], argv[2])) {
+ D("same interface, endpoint 0 goes to host");
+ i = NETMAP_SW_RING;
+ me[1].ifname = argv[1];
+ } else {
+ /* two different interfaces. Take all rings on if1 */
+ i = 0; // all hw rings
+ me[1].ifname = argv[2];
+ }
+ if (netmap_open(me, i))
+ return (1);
+ me[1].mem = me[0].mem; /* copy the pointer, so only one mmap */
+ if (netmap_open(me+1, 0))
+ return (1);
+
+ /* if bridging two interfaces, set promisc mode */
+ if (i != NETMAP_SW_RING) {
+ do_ioctl(me, SIOCGIFFLAGS);
+ if ((me[0].if_flags & IFF_UP) == 0) {
+ D("%s is down, bringing up...", me[0].ifname);
+ me[0].if_flags |= IFF_UP;
+ }
+ me[0].if_flags |= IFF_PPROMISC;
+ do_ioctl(me, SIOCSIFFLAGS);
+
+ do_ioctl(me+1, SIOCGIFFLAGS);
+ me[1].if_flags |= IFF_PPROMISC;
+ do_ioctl(me+1, SIOCSIFFLAGS);
+
+ /* also disable checksums etc. */
+ do_ioctl(me, SIOCGIFCAP);
+ me[0].if_reqcap = me[0].if_curcap;
+ me[0].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE);
+ do_ioctl(me+0, SIOCSIFCAP);
+ }
+ do_ioctl(me+1, SIOCGIFFLAGS);
+ if ((me[1].if_flags & IFF_UP) == 0) {
+ D("%s is down, bringing up...", me[1].ifname);
+ me[1].if_flags |= IFF_UP;
+ }
+ do_ioctl(me+1, SIOCSIFFLAGS);
+
+ do_ioctl(me+1, SIOCGIFCAP);
+ me[1].if_reqcap = me[1].if_curcap;
+ me[1].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE);
+ do_ioctl(me+1, SIOCSIFCAP);
+ if (argc > 3)
+ burst = atoi(argv[3]); /* packets burst size. */
+
+ /* setup poll(2) variables. */
+ memset(pollfd, 0, sizeof(pollfd));
+ for (i = 0; i < 2; i++) {
+ pollfd[i].fd = me[i].fd;
+ pollfd[i].events = (POLLIN);
+ }
+
+ D("Wait 2 secs for link to come up...");
+ sleep(2);
+ D("Ready to go, %s 0x%x/%d <-> %s 0x%x/%d.",
+ me[0].ifname, me[0].queueid, me[0].nifp->ni_num_queues,
+ me[1].ifname, me[1].queueid, me[1].nifp->ni_num_queues);
+
+ /* main loop */
+ signal(SIGINT, sigint_h);
+ while (!do_abort) {
+ int n0, n1, ret;
+ pollfd[0].events = pollfd[1].events = 0;
+ pollfd[0].revents = pollfd[1].revents = 0;
+ n0 = howmany(me, 0);
+ n1 = howmany(me + 1, 0);
+ if (n0)
+ pollfd[1].events |= POLLOUT;
+ else
+ pollfd[0].events |= POLLIN;
+ if (n1)
+ pollfd[0].events |= POLLOUT;
+ else
+ pollfd[1].events |= POLLIN;
+ ret = poll(pollfd, 2, 2500);
+ if (ret <= 0 || verbose)
+ D("poll %s [0] ev %x %x rx %d@%d tx %d,"
+ " [1] ev %x %x rx %d@%d tx %d",
+ ret <= 0 ? "timeout" : "ok",
+ pollfd[0].events,
+ pollfd[0].revents,
+ howmany(me, 0),
+ me[0].rx->cur,
+ howmany(me, 1),
+ pollfd[1].events,
+ pollfd[1].revents,
+ howmany(me+1, 0),
+ me[1].rx->cur,
+ howmany(me+1, 1)
+ );
+ if (ret < 0)
+ continue;
+ if (pollfd[0].revents & POLLERR) {
+ D("error on fd0, rxcur %d@%d",
+ me[0].rx->avail, me[0].rx->cur);
+ }
+ if (pollfd[1].revents & POLLERR) {
+ D("error on fd1, rxcur %d@%d",
+ me[1].rx->avail, me[1].rx->cur);
+ }
+ if (pollfd[0].revents & POLLOUT) {
+ move(me + 1, me, burst);
+ // XXX we don't need the ioctl here
+ // ioctl(me[0].fd, NIOCTXSYNC, NULL);
+ }
+ if (pollfd[1].revents & POLLOUT) {
+ move(me, me + 1, burst);
+ // XXX we don't need the ioctl here
+ // ioctl(me[1].fd, NIOCTXSYNC, NULL);
+ }
+ }
+ D("exiting");
+ netmap_close(me + 1);
+ netmap_close(me + 0);
+
+ return (0);
+}
diff --git a/tools/tools/netmap/click-test.cfg b/tools/tools/netmap/click-test.cfg
new file mode 100644
index 000000000000..fc5759f88b1e
--- /dev/null
+++ b/tools/tools/netmap/click-test.cfg
@@ -0,0 +1,19 @@
+//
+// $FreeBSD$
+//
+// A sample test configuration for click
+//
+//
+// create a switch
+
+myswitch :: EtherSwitch;
+
+// two input devices
+
+c0 :: FromDevice(ix0, PROMISC true);
+c1 :: FromDevice(ix1, PROMISC true);
+
+// and now pass packets around
+
+c0[0] -> [0]myswitch[0] -> Queue(10000) -> ToDevice(ix0);
+c1[0] -> [1]myswitch[1] -> Queue(10000) -> ToDevice(ix1);
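+
+// How this might be run (illustrative, not part of the original config;
+// it assumes a userlevel click binary and two interfaces named ix0/ix1):
+//
+// click click-test.cfg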
diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c
new file mode 100644
index 000000000000..f010b839bfb2
--- /dev/null
+++ b/tools/tools/netmap/pcap.c
@@ -0,0 +1,761 @@
+/*
+ * (C) 2011 Luigi Rizzo
+ *
+ * BSD license
+ *
+ * A simple library that maps some pcap functions onto netmap.
+ * It is not 100% complete, but it is enough to let tcpdump, trafshow
+ * and other apps work.
+ *
+ * $FreeBSD$
+ */
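+
+/*
+ * A rough usage sketch (not from the original notes; file names and flags
+ * are only indicative): build this file as a shared object and preload it,
+ * so that dynamically linked pcap applications pick up these wrappers:
+ *
+ * cc -O2 -fPIC -shared pcap.c -o libnetmap-pcap.so
+ * LD_PRELOAD=./libnetmap-pcap.so tcpdump -ni ix0
+ *
+ * When __PIC__ is not defined, the file instead compiles into the small
+ * pcap-based bridge test program found at the bottom.
+ */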
+
+#include <errno.h>
+#include <signal.h> /* signal */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h> /* strcmp */
+#include <fcntl.h> /* open */
+#include <unistd.h> /* close */
+
+#include <sys/endian.h> /* le64toh */
+#include <sys/mman.h> /* PROT_* */
+#include <sys/ioctl.h> /* ioctl */
+#include <machine/param.h>
+#include <sys/poll.h>
+#include <sys/socket.h> /* sockaddr.. */
+#include <arpa/inet.h> /* ntohs */
+
+#include <net/if.h> /* ifreq */
+#include <net/ethernet.h>
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+
+#include <netinet/in.h> /* sockaddr_in */
+
+#include <sys/socket.h>
+#include <ifaddrs.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+char *version = "$Id$";
+int verbose = 0;
+
+/* debug support */
+#define ND(format, ...) do {} while (0)
+#define D(format, ...) do { \
+ if (verbose) \
+ fprintf(stderr, "--- %s [%d] " format "\n", \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+
+/*
+ * We redefine here a number of structures that are in pcap.h
+ * so we can compile this file without the system header.
+ */
+#ifndef PCAP_ERRBUF_SIZE
+#define PCAP_ERRBUF_SIZE 128
+
+/*
+ * Each packet is accompanied by a header including the timestamp,
+ * captured size and actual size.
+ */
+struct pcap_pkthdr {
+ struct timeval ts; /* time stamp */
+ uint32_t caplen; /* length of portion present */
+ uint32_t len; /* length this packet (off wire) */
+};
+
+typedef struct pcap_if pcap_if_t;
+
+/*
+ * Representation of an interface address.
+ */
+struct pcap_addr {
+ struct pcap_addr *next;
+ struct sockaddr *addr; /* address */
+ struct sockaddr *netmask; /* netmask for the above */
+ struct sockaddr *broadaddr; /* broadcast addr for the above */
+ struct sockaddr *dstaddr; /* P2P dest. address for the above */
+};
+
+struct pcap_if {
+ struct pcap_if *next;
+ char *name; /* name to hand to "pcap_open_live()" */
+ char *description; /* textual description of interface, or NULL */
+ struct pcap_addr *addresses;
+ uint32_t flags; /* PCAP_IF_ interface flags */
+};
+
+/*
+ * We do not support stats (yet)
+ */
+struct pcap_stat {
+ u_int ps_recv; /* number of packets received */
+ u_int ps_drop; /* number of packets dropped */
+ u_int ps_ifdrop; /* drops by interface XXX not yet supported */
+#ifdef WIN32
+ u_int bs_capt; /* number of packets that reach the app. */
+#endif /* WIN32 */
+};
+
+typedef void pcap_t;
+typedef enum {
+ PCAP_D_INOUT = 0,
+ PCAP_D_IN,
+ PCAP_D_OUT
+} pcap_direction_t;
+
+
+
+typedef void (*pcap_handler)(u_char *user,
+ const struct pcap_pkthdr *h, const u_char *bytes);
+
+char errbuf[PCAP_ERRBUF_SIZE];
+
+pcap_t *pcap_open_live(const char *device, int snaplen,
+ int promisc, int to_ms, char *errbuf);
+
+int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf);
+void pcap_close(pcap_t *p);
+int pcap_get_selectable_fd(pcap_t *p);
+int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user);
+int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf);
+int pcap_setdirection(pcap_t *p, pcap_direction_t d);
+char *pcap_lookupdev(char *errbuf);
+int pcap_inject(pcap_t *p, const void *buf, size_t size);
+int pcap_fileno(pcap_t *p);
+
+struct eproto {
+ const char *s;
+ u_short p;
+};
+#endif /* !PCAP_ERRBUF_SIZE */
+
+#ifdef __PIC__
+/*
+ * build as a shared library
+ */
+
+char pcap_version[] = "libnetmap version 0.3";
+
+/*
+ * Our equivalent of pcap_t
+ */
+struct my_ring {
+ struct nmreq nmr;
+
+ int fd;
+ char *mem; /* userspace mmap address */
+ u_int memsize;
+ u_int queueid;
+ u_int begin, end; /* first..last+1 rings to check */
+ struct netmap_if *nifp;
+
+ int snaplen;
+ char *errbuf;
+ int promisc;
+ int to_ms;
+
+ struct pcap_pkthdr hdr;
+
+ uint32_t if_flags;
+ uint32_t if_reqcap;
+ uint32_t if_curcap;
+
+ struct pcap_stat st;
+
+ char msg[PCAP_ERRBUF_SIZE];
+};
+
+
+static int
+do_ioctl(struct my_ring *me, int what)
+{
+ struct ifreq ifr;
+ int error;
+
+ bzero(&ifr, sizeof(ifr));
+ strncpy(ifr.ifr_name, me->nmr.nr_name, sizeof(ifr.ifr_name));
+ switch (what) {
+ case SIOCSIFFLAGS:
+ D("call SIOCSIFFLAGS 0x%x", me->if_flags);
+ ifr.ifr_flagshigh = (me->if_flags >> 16) & 0xffff;
+ ifr.ifr_flags = me->if_flags & 0xffff;
+ break;
+ case SIOCSIFCAP:
+ ifr.ifr_reqcap = me->if_reqcap;
+ ifr.ifr_curcap = me->if_curcap;
+ break;
+ }
+ error = ioctl(me->fd, what, &ifr);
+ if (error) {
+ D("ioctl 0x%x error %d", what, error);
+ return error;
+ }
+ switch (what) {
+ case SIOCSIFFLAGS:
+ case SIOCGIFFLAGS:
+ me->if_flags = (ifr.ifr_flagshigh << 16) |
+ (0xffff & ifr.ifr_flags);
+ D("flags are L 0x%x H 0x%x 0x%x",
+ (uint16_t)ifr.ifr_flags,
+ (uint16_t)ifr.ifr_flagshigh, me->if_flags);
+ break;
+
+ case SIOCGIFCAP:
+ me->if_reqcap = ifr.ifr_reqcap;
+ me->if_curcap = ifr.ifr_curcap;
+ D("curcap are 0x%x", me->if_curcap);
+ break;
+ }
+ return 0;
+}
+
+
+/*
+ * open a device. If me->mem is NULL, do the mmap; otherwise reuse the existing mapping.
+ */
+static int
+netmap_open(struct my_ring *me, int ringid)
+{
+ int fd, err, l;
+ u_int i;
+ struct nmreq req;
+
+ me->fd = fd = open("/dev/netmap", O_RDWR);
+ if (fd < 0) {
+ D("Unable to open /dev/netmap");
+ return (-1);
+ }
+ bzero(&req, sizeof(req));
+ strncpy(req.nr_name, me->nmr.nr_name, sizeof(req.nr_name));
+ req.nr_ringid = ringid;
+ err = ioctl(fd, NIOCGINFO, &req);
+ if (err) {
+ D("cannot get info on %s", me->nmr.nr_name);
+ goto error;
+ }
+ me->memsize = l = req.nr_memsize;
+ ND("memsize is %d MB", l>>20);
+ err = ioctl(fd, NIOCREGIF, &req);
+ if (err) {
+ D("Unable to register %s", me->nmr.nr_name);
+ goto error;
+ }
+
+ if (me->mem == NULL) {
+ me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+ if (me->mem == MAP_FAILED) {
+ D("Unable to mmap");
+ me->mem = NULL;
+ goto error;
+ }
+ }
+
+ me->nifp = NETMAP_IF(me->mem, req.nr_offset);
+ me->queueid = ringid;
+ if (ringid & NETMAP_SW_RING) {
+ me->begin = req.nr_numrings;
+ me->end = me->begin + 1;
+ } else if (ringid & NETMAP_HW_RING) {
+ me->begin = ringid & NETMAP_RING_MASK;
+ me->end = me->begin + 1;
+ } else {
+ me->begin = 0;
+ me->end = req.nr_numrings;
+ }
+ /* request timestamps for packets */
+ for (i = me->begin; i < me->end; i++) {
+ struct netmap_ring *ring = NETMAP_RXRING(me->nifp, i);
+ ring->flags = NR_TIMESTAMP;
+ }
+ //me->tx = NETMAP_TXRING(me->nifp, 0);
+ return (0);
+error:
+ close(me->fd);
+ return -1;
+}
+
+/*
+ * There is a set of functions that tcpdump expects to find even if they
+ * are probably not used
+ */
+struct eproto eproto_db[] = {
+ { "ip", ETHERTYPE_IP },
+ { "arp", ETHERTYPE_ARP },
+ { (char *)0, 0 }
+};
+
+
+int
+pcap_findalldevs(pcap_if_t **alldevsp, __unused char *errbuf)
+{
+ struct ifaddrs *i_head, *i;
+ pcap_if_t *top = NULL, *cur;
+ struct pcap_addr *tail = NULL;
+ int l;
+
+ D("listing all devs");
+ *alldevsp = NULL;
+ i_head = NULL;
+
+ if (getifaddrs(&i_head)) {
+ D("cannot get if addresses");
+ return -1;
+ }
+ for (i = i_head; i; i = i->ifa_next) {
+ //struct ifaddrs *ifa;
+ struct pcap_addr *pca;
+ //struct sockaddr *sa;
+
+ D("got interface %s", i->ifa_name);
+ if (!top || strcmp(top->name, i->ifa_name)) {
+ /* new interface */
+ l = sizeof(*top) + strlen(i->ifa_name) + 1;
+ cur = calloc(1, l);
+ if (cur == NULL) {
+ D("no space for if descriptor");
+ continue;
+ }
+ cur->name = (char *)(cur + 1);
+ //cur->flags = i->ifa_flags;
+ strcpy(cur->name, i->ifa_name);
+ cur->description = NULL;
+ cur->next = top;
+ top = cur;
+ tail = NULL;
+ }
+ /* now deal with addresses */
+ D("%s addr family %d len %d %s %s",
+ top->name,
+ i->ifa_addr->sa_family, i->ifa_addr->sa_len,
+ i->ifa_netmask ? "Netmask" : "",
+ i->ifa_broadaddr ? "Broadcast" : "");
+ l = sizeof(struct pcap_addr) +
+ (i->ifa_addr ? i->ifa_addr->sa_len:0) +
+ (i->ifa_netmask ? i->ifa_netmask->sa_len:0) +
+ (i->ifa_broadaddr? i->ifa_broadaddr->sa_len:0);
+ pca = calloc(1, l);
+ if (pca == NULL) {
+ D("no space for if addr");
+ continue;
+ }
+#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len))
+ pca->addr = (struct sockaddr *)(pca + 1);
+ bcopy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len);
+ if (i->ifa_netmask) {
+ pca->netmask = SA_NEXT(pca->addr);
+ bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len);
+ if (i->ifa_broadaddr) {
+ pca->broadaddr = SA_NEXT(pca->netmask);
+ bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len);
+ }
+ }
+ if (tail == NULL) {
+ top->addresses = pca;
+ } else {
+ tail->next = pca;
+ }
+ tail = pca;
+
+ }
+ freeifaddrs(i_head);
+ *alldevsp = top;
+ return 0;
+}
+
+void pcap_freealldevs(__unused pcap_if_t *alldevs)
+{
+ D("unimplemented");
+}
+
+char *
+pcap_lookupdev(char *buf)
+{
+ D("%s", buf);
+ strcpy(buf, "/dev/netmap");
+ return buf;
+}
+
+pcap_t *
+pcap_create(const char *source, char *errbuf)
+{
+ D("src %s (call open liveted)", source);
+ return pcap_open_live(source, 0, 1, 100, errbuf);
+}
+
+int
+pcap_activate(pcap_t *p)
+{
+ D("pcap %p running", p);
+ return 0;
+}
+
+int
+pcap_can_set_rfmon(__unused pcap_t *p)
+{
+ D("");
+ return 0; /* no we can't */
+}
+
+int
+pcap_set_snaplen(pcap_t *p, int snaplen)
+{
+ struct my_ring *me = p;
+
+ D("len %d", snaplen);
+ me->snaplen = snaplen;
+ return 0;
+}
+
+int
+pcap_snapshot(pcap_t *p)
+{
+ struct my_ring *me = p;
+
+ D("len %d", me->snaplen);
+ return me->snaplen;
+}
+
+int
+pcap_lookupnet(const char *device, uint32_t *netp,
+ uint32_t *maskp, __unused char *errbuf)
+{
+
+ D("device %s", device);
+ inet_aton("10.0.0.255", (struct in_addr *)netp);
+ inet_aton("255.255.255.0",(struct in_addr *) maskp);
+ return 0;
+}
+
+int
+pcap_set_promisc(pcap_t *p, int promisc)
+{
+ struct my_ring *me = p;
+
+ D("promisc %d", promisc);
+ if (do_ioctl(me, SIOCGIFFLAGS))
+ D("SIOCGIFFLAGS failed");
+ if (promisc) {
+ me->if_flags |= IFF_PPROMISC;
+ } else {
+ me->if_flags &= ~IFF_PPROMISC;
+ }
+ if (do_ioctl(me, SIOCSIFFLAGS))
+ D("SIOCSIFFLAGS failed");
+ return 0;
+}
+
+int
+pcap_set_timeout(pcap_t *p, int to_ms)
+{
+ struct my_ring *me = p;
+
+ D("%d ms", to_ms);
+ me->to_ms = to_ms;
+ return 0;
+}
+
+struct bpf_program;
+
+int
+pcap_compile(__unused pcap_t *p, __unused struct bpf_program *fp,
+ const char *str, __unused int optimize, __unused uint32_t netmask)
+{
+ D("%s", str);
+ return 0;
+}
+
+int
+pcap_setfilter(__unused pcap_t *p, __unused struct bpf_program *fp)
+{
+ D("");
+ return 0;
+}
+
+int
+pcap_datalink(__unused pcap_t *p)
+{
+ D("");
+ return 1; // ethernet
+}
+
+const char *
+pcap_datalink_val_to_name(int dlt)
+{
+ D("%d", dlt);
+ return "DLT_EN10MB";
+}
+
+const char *
+pcap_datalink_val_to_description(int dlt)
+{
+ D("%d", dlt);
+ return "Ethernet link";
+}
+
+struct pcap_stat;
+int
+pcap_stats(pcap_t *p, struct pcap_stat *ps)
+{
+ struct my_ring *me = p;
+ ND("");
+
+ me->st.ps_recv += 10;
+ *ps = me->st;
+ sprintf(me->msg, "stats not supported");
+ return -1;
+}
+
+char *
+pcap_geterr(pcap_t *p)
+{
+ struct my_ring *me = p;
+
+ D("");
+ return me->msg;
+}
+
+pcap_t *
+pcap_open_live(const char *device, __unused int snaplen,
+ int promisc, int to_ms, __unused char *errbuf)
+{
+ struct my_ring *me;
+
+ D("request to open %s", device);
+ me = calloc(1, sizeof(*me));
+ if (me == NULL) {
+ D("failed to allocate struct for %s", device);
+ return NULL;
+ }
+ strncpy(me->nmr.nr_name, device, sizeof(me->nmr.nr_name));
+ if (netmap_open(me, 0)) {
+ D("error opening %s", device);
+ free(me);
+ return NULL;
+ }
+ me->to_ms = to_ms;
+ if (do_ioctl(me, SIOCGIFFLAGS))
+ D("SIOCGIFFLAGS failed");
+ if (promisc) {
+ me->if_flags |= IFF_PPROMISC;
+ if (do_ioctl(me, SIOCSIFFLAGS))
+ D("SIOCSIFFLAGS failed");
+ }
+ if (do_ioctl(me, SIOCGIFCAP))
+ D("SIOCGIFCAP failed");
+ me->if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE);
+ if (do_ioctl(me, SIOCSIFCAP))
+ D("SIOCSIFCAP failed");
+
+ return (pcap_t *)me;
+}
+
+void
+pcap_close(pcap_t *p)
+{
+ struct my_ring *me = p;
+
+ D("");
+ if (!me)
+ return;
+ if (me->mem)
+ munmap(me->mem, me->memsize);
+ /* restore original flags ? */
+ ioctl(me->fd, NIOCUNREGIF, NULL);
+ close(me->fd);
+ bzero(me, sizeof(*me));
+ free(me);
+}
+
+int
+pcap_fileno(pcap_t *p)
+{
+ struct my_ring *me = p;
+ D("returns %d", me->fd);
+ return me->fd;
+}
+
+int
+pcap_get_selectable_fd(pcap_t *p)
+{
+ struct my_ring *me = p;
+
+ ND("");
+ return me->fd;
+}
+
+int
+pcap_setnonblock(__unused pcap_t *p, int nonblock, __unused char *errbuf)
+{
+ D("mode is %d", nonblock);
+ return 0; /* ignore */
+}
+
+int
+pcap_setdirection(__unused pcap_t *p, __unused pcap_direction_t d)
+{
+ D("");
+ return 0; /* ignore */
+}
+
+int
+pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
+{
+ struct my_ring *me = p;
+ int got = 0;
+ u_int si;
+
+ ND("cnt %d", cnt);
+ /* scan all rings */
+ for (si = me->begin; si < me->end; si++) {
+ struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si);
+ ND("ring has %d pkts", ring->avail);
+ if (ring->avail == 0)
+ continue;
+ me->hdr.ts = ring->ts;
+ while ((cnt == -1 || cnt != got) && ring->avail > 0) {
+ u_int i = ring->cur;
+ u_int idx = ring->slot[i].buf_idx;
+ if (idx < 2) {
+ D("%s bogus RX index %d at offset %d",
+ me->nifp->ni_name, idx, i);
+ sleep(2);
+ }
+ u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
+ me->hdr.len = me->hdr.caplen = ring->slot[i].len;
+ // D("call %p len %d", p, me->hdr.len);
+ callback(user, &me->hdr, buf);
+ ring->cur = NETMAP_RING_NEXT(ring, i);
+ ring->avail--;
+ got++;
+ }
+ }
+ return got;
+}
+
+int
+pcap_inject(pcap_t *p, const void *buf, size_t size)
+{
+ struct my_ring *me = p;
+ u_int si;
+
+ ND("cnt %d", cnt);
+ /* scan all rings */
+ for (si = me->begin; si < me->end; si++) {
+ struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si);
+
+ ND("ring has %d pkts", ring->avail);
+ if (ring->avail == 0)
+ continue;
+ u_int i = ring->cur;
+ u_int idx = ring->slot[i].buf_idx;
+ if (idx < 2) {
+ D("%s bogus TX index %d at offset %d",
+ me->nifp->ni_name, idx, i);
+ sleep(2);
+ }
+ u_char *dst = (u_char *)NETMAP_BUF(ring, idx);
+ ring->slot[i].len = size;
+ bcopy(buf, dst, size);
+ ring->cur = NETMAP_RING_NEXT(ring, i);
+ ring->avail--;
+ // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL);
+ return size;
+ }
+ errno = ENOBUFS;
+ return -1;
+}
+
+int
+pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
+{
+ struct my_ring *me = p;
+ struct pollfd fds[1];
+ int i;
+
+ ND("cnt %d", cnt);
+ memset(fds, 0, sizeof(fds));
+ fds[0].fd = me->fd;
+ fds[0].events = (POLLIN);
+
+ while (cnt == -1 || cnt > 0) {
+ if (poll(fds, 1, me->to_ms) <= 0) {
+ D("poll error/timeout");
+ continue;
+ }
+ i = pcap_dispatch(p, cnt, callback, user);
+ if (cnt > 0)
+ cnt -= i;
+ }
+ return 0;
+}
+
+#endif /* __PIC__ */
+
+#ifndef __PIC__
+void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf)
+{
+ pcap_inject((pcap_t *)user, buf, h->caplen);
+}
+
+/*
+ * a simple pcap test program, bridge between two interfaces.
+ */
+int
+main(int argc, char **argv)
+{
+ pcap_t *p0, *p1;
+ int burst = 1024;
+ struct pollfd pollfd[2];
+
+ fprintf(stderr, "%s %s built %s %s\n",
+ argv[0], version, __DATE__, __TIME__);
+
+ while (argc > 1 && !strcmp(argv[1], "-v")) {
+ verbose++;
+ argv++;
+ argc--;
+ }
+
+ if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) {
+ D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]);
+ return (1);
+ }
+ if (argc > 3)
+ burst = atoi(argv[3]);
+
+ p0 = pcap_open_live(argv[1], 0, 1, 100, NULL);
+ p1 = pcap_open_live(argv[2], 0, 1, 100, NULL);
+ D("%s", version);
+ D("open returns %p %p", p0, p1);
+ if (!p0 || !p1)
+ return(1);
+ bzero(pollfd, sizeof(pollfd));
+ pollfd[0].fd = pcap_fileno(p0);
+ pollfd[1].fd = pcap_fileno(p1);
+ pollfd[0].events = pollfd[1].events = POLLIN;
+ for (;;) {
+ /* do i need to reset ? */
+ pollfd[0].revents = pollfd[1].revents = 0;
+ int ret = poll(pollfd, 2, 1000);
+ if (ret <= 0 || verbose)
+ D("poll %s [0] ev %x %x [1] ev %x %x",
+ ret <= 0 ? "timeout" : "ok",
+ pollfd[0].events,
+ pollfd[0].revents,
+ pollfd[1].events,
+ pollfd[1].revents);
+ if (ret < 0)
+ continue;
+ if (pollfd[0].revents & POLLIN)
+ pcap_dispatch(p0, burst, do_send, p1);
+ if (pollfd[1].revents & POLLIN)
+ pcap_dispatch(p1, burst, do_send, p0);
+ }
+
+ return (0);
+}
+#endif /* !__PIC__ */
diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c
new file mode 100644
index 000000000000..747bd9dde00b
--- /dev/null
+++ b/tools/tools/netmap/pkt-gen.c
@@ -0,0 +1,1021 @@
+/*
+ * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: pkt-gen.c 9638 2011-11-07 18:07:43Z luigi $
+ *
+ * Example program to show how to build a multithreaded packet
+ * source/sink using the netmap device.
+ *
+ * In this example we create a programmable number of threads
+ * to take care of all the queues of the interface used to
+ * send or receive traffic.
+ *
+ */
+
+const char *default_payload="netmap pkt-gen Luigi Rizzo and Matteo Landi\n"
+ "http://info.iet.unipi.it/~luigi/netmap/ ";
+
+#include <errno.h>
+#include <pthread.h> /* pthread_* */
+#include <pthread_np.h> /* pthread w/ affinity */
+#include <signal.h> /* signal */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h> /* strcmp */
+#include <fcntl.h> /* open */
+#include <unistd.h> /* close */
+#include <ifaddrs.h> /* getifaddrs */
+
+#include <sys/mman.h> /* PROT_* */
+#include <sys/ioctl.h> /* ioctl */
+#include <sys/poll.h>
+#include <sys/socket.h> /* sockaddr.. */
+#include <arpa/inet.h> /* ntohs */
+#include <sys/param.h>
+#include <sys/cpuset.h> /* cpu_set */
+#include <sys/sysctl.h> /* sysctl */
+#include <sys/time.h> /* timersub */
+
+#include <net/ethernet.h>
+#include <net/if.h> /* ifreq */
+#include <net/if_dl.h> /* LLADDR */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+#include <pcap/pcap.h>
+
+
+static inline int min(int a, int b) { return a < b ? a : b; }
+
+/* debug support */
+#define D(format, ...) \
+ fprintf(stderr, "%s [%d] " format "\n", \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__)
+
+#ifndef EXPERIMENTAL
+#define EXPERIMENTAL 0
+#endif
+
+int verbose = 0;
+#define MAX_QUEUES 64 /* no need to limit */
+
+#define SKIP_PAYLOAD 1 /* do not check payload. */
+
+#if EXPERIMENTAL
+/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */
+#define netmap_rdtsc(t) \
+ do { \
+ u_int __regs[4]; \
+ \
+ do_cpuid(0, __regs); \
+ (t) = rdtsc(); \
+ } while (0)
+
+static __inline void
+do_cpuid(u_int ax, u_int *p)
+{
+ __asm __volatile("cpuid"
+ : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+ : "0" (ax));
+}
+
+static __inline uint64_t
+rdtsc(void)
+{
+ uint64_t rv;
+
+ __asm __volatile("rdtsc" : "=A" (rv));
+ return (rv);
+}
+#define MAX_SAMPLES 100000
+#endif /* EXPERIMENTAL */
+
+
+struct pkt {
+ struct ether_header eh;
+ struct ip ip;
+ struct udphdr udp;
+ uint8_t body[NETMAP_BUF_SIZE];
+} __attribute__((__packed__));
+
+/*
+ * global arguments for all threads
+ */
+struct glob_arg {
+ const char *src_ip;
+ const char *dst_ip;
+ const char *src_mac;
+ const char *dst_mac;
+ int pkt_size;
+ int burst;
+ int npackets; /* total packets to send */
+ int nthreads;
+ int cpus;
+ int use_pcap;
+ pcap_t *p;
+};
+
+struct mystat {
+ uint64_t containers[8];
+};
+
+/*
+ * Arguments for a new thread. The same structure is used by
+ * the source and the sink
+ */
+struct targ {
+ struct glob_arg *g;
+ int used;
+ int completed;
+ int fd;
+ struct nmreq nmr;
+ struct netmap_if *nifp;
+ uint16_t qfirst, qlast; /* range of queues to scan */
+ uint64_t count;
+ struct timeval tic, toc;
+ int me;
+ pthread_t thread;
+ int affinity;
+
+ uint8_t dst_mac[6];
+ uint8_t src_mac[6];
+ u_int dst_mac_range;
+ u_int src_mac_range;
+ uint32_t dst_ip;
+ uint32_t src_ip;
+ u_int dst_ip_range;
+ u_int src_ip_range;
+
+ struct pkt pkt;
+};
+
+
+static struct targ *targs;
+static int global_nthreads;
+
+/* control-C handler */
+static void
+sigint_h(__unused int sig)
+{
+ for (int i = 0; i < global_nthreads; i++) {
+ /* cancel active threads. */
+ if (targs[i].used == 0)
+ continue;
+
+ D("Cancelling thread #%d\n", i);
+ pthread_cancel(targs[i].thread);
+ targs[i].used = 0;
+ }
+
+ signal(SIGINT, SIG_DFL);
+}
+
+
+/* sysctl wrapper to return the number of active CPUs */
+static int
+system_ncpus(void)
+{
+ int mib[2], ncpus;
+ size_t len;
+
+ mib[0] = CTL_HW;
+ mib[1] = HW_NCPU;
+ len = sizeof(ncpus);
+ sysctl(mib, 2, &ncpus, &len, NULL, 0);
+
+ return (ncpus);
+}
+
+/*
+ * locate the src mac address for our interface, put it
+ * into the user-supplied buffer. return 0 if ok, -1 on error.
+ */
+static int
+source_hwaddr(const char *ifname, char *buf)
+{
+ struct ifaddrs *ifaphead, *ifap;
+ int l = strlen(ifname) + 1; /* compare including the terminating NUL */
+
+ if (getifaddrs(&ifaphead) != 0) {
+ D("getifaddrs %s failed", ifname);
+ return (-1);
+ }
+
+ for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
+ struct sockaddr_dl *sdl =
+ (struct sockaddr_dl *)ifap->ifa_addr;
+ uint8_t *mac;
+
+ if (!sdl || sdl->sdl_family != AF_LINK)
+ continue;
+ if (strncmp(ifap->ifa_name, ifname, l) != 0)
+ continue;
+ mac = (uint8_t *)LLADDR(sdl);
+ sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
+ mac[0], mac[1], mac[2],
+ mac[3], mac[4], mac[5]);
+ if (verbose)
+ D("source hwaddr %s", buf);
+ break;
+ }
+ freeifaddrs(ifaphead);
+ return ifap ? 0 : -1;
+}
+
+
+/* set the thread affinity. */
+static int
+setaffinity(pthread_t me, int i)
+{
+ cpuset_t cpumask;
+
+ if (i == -1)
+ return 0;
+
+ /* Set thread affinity. */
+ CPU_ZERO(&cpumask);
+ CPU_SET(i, &cpumask);
+
+ if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
+ D("Unable to set affinity");
+ return 1;
+ }
+ return 0;
+}
+
+/* Compute the checksum of the given ip header. */
+static uint16_t
+checksum(const void *data, uint16_t len)
+{
+ const uint8_t *addr = data;
+ uint32_t sum = 0;
+
+ while (len > 1) {
+ sum += addr[0] * 256 + addr[1];
+ addr += 2;
+ len -= 2;
+ }
+
+ if (len == 1)
+ sum += *addr * 256;
+
+ sum = (sum >> 16) + (sum & 0xffff);
+ sum += (sum >> 16);
+
+ sum = htons(sum);
+
+ return ~sum;
+}
+
+/*
+ * Fill a packet with some payload.
+ */
+static void
+initialize_packet(struct targ *targ)
+{
+ struct pkt *pkt = &targ->pkt;
+ struct ether_header *eh;
+ struct ip *ip;
+ struct udphdr *udp;
+ uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(*ip);
+ int i, l, l0 = strlen(default_payload);
+ char *p;
+
+ for (i = 0; i < paylen;) {
+ l = min(l0, paylen - i);
+ bcopy(default_payload, pkt->body + i, l);
+ i += l;
+ }
+ pkt->body[i-1] = '\0';
+
+ udp = &pkt->udp;
+ udp->uh_sport = htons(1234);
+ udp->uh_dport = htons(4321);
+ udp->uh_ulen = htons(paylen);
+ udp->uh_sum = 0; // checksum(udp, sizeof(*udp));
+
+ ip = &pkt->ip;
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = 5;
+ ip->ip_id = 0;
+ ip->ip_tos = IPTOS_LOWDELAY;
+ ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh));
+ ip->ip_off = htons(IP_DF); /* Don't fragment */
+ ip->ip_ttl = IPDEFTTL;
+ ip->ip_p = IPPROTO_UDP;
+ inet_aton(targ->g->src_ip, (struct in_addr *)&ip->ip_src);
+ inet_aton(targ->g->dst_ip, (struct in_addr *)&ip->ip_dst);
+ targ->dst_ip = ip->ip_dst.s_addr;
+ targ->src_ip = ip->ip_src.s_addr;
+ p = index(targ->g->src_ip, '-');
+ if (p) {
+ targ->src_ip_range = atoi(p+1);
+ D("src-ip sweep %d addresses", targ->src_ip_range);
+ }
+ ip->ip_sum = checksum(ip, sizeof(*ip));
+
+ eh = &pkt->eh;
+ bcopy(ether_aton(targ->g->src_mac), targ->src_mac, 6);
+ bcopy(targ->src_mac, eh->ether_shost, 6);
+ p = index(targ->g->src_mac, '-');
+ if (p)
+ targ->src_mac_range = atoi(p+1);
+
+ bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
+ bcopy(targ->dst_mac, eh->ether_dhost, 6);
+ p = index(targ->g->dst_mac, '-');
+ if (p)
+ targ->dst_mac_range = atoi(p+1);
+ eh->ether_type = htons(ETHERTYPE_IP);
+}
+
+/* Check the payload of the packet for errors (use it for debug).
+ * Look for consecutive ascii representations of the size of the packet.
+ */
+static void
+check_payload(char *p, int psize)
+{
+ char temp[64];
+ int n_read, size, sizelen;
+
+ /* get the length in ASCII of the length of the packet. */
+ sizelen = sprintf(temp, "%d", psize) + 1; // include a whitespace
+
+ /* dummy payload. */
+ p += 14; /* skip packet header. */
+ n_read = 14;
+ while (psize - n_read >= sizelen) {
+ sscanf(p, "%d", &size);
+ if (size != psize) {
+ D("Read %d instead of %d", size, psize);
+ break;
+ }
+
+ p += sizelen;
+ n_read += sizelen;
+ }
+}
+
+
+/*
+ * create and enqueue a batch of packets on a ring.
+ * On the last one set NS_REPORT to tell the driver to generate
+ * an interrupt when done.
+ */
+static int
+send_packets(struct netmap_ring *ring, struct pkt *pkt,
+ int size, u_int count, int fill_all)
+{
+ u_int sent, cur = ring->cur;
+
+ if (ring->avail < count)
+ count = ring->avail;
+
+ for (sent = 0; sent < count; sent++) {
+ struct netmap_slot *slot = &ring->slot[cur];
+ char *p = NETMAP_BUF(ring, slot->buf_idx);
+
+ if (fill_all)
+ memcpy(p, pkt, size);
+
+ slot->len = size;
+ if (sent == count - 1)
+ slot->flags |= NS_REPORT;
+ cur = NETMAP_RING_NEXT(ring, cur);
+ }
+ ring->avail -= sent;
+ ring->cur = cur;
+
+ return (sent);
+}
+
+static void *
+sender_body(void *data)
+{
+ struct targ *targ = (struct targ *) data;
+
+ struct pollfd fds[1];
+ struct netmap_if *nifp = targ->nifp;
+ struct netmap_ring *txring;
+ int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
+ int fill_all = 1;
+
+ if (setaffinity(targ->thread, targ->affinity))
+ goto quit;
+ /* setup poll(2) mechanism. */
+ memset(fds, 0, sizeof(fds));
+ fds[0].fd = targ->fd;
+ fds[0].events = (POLLOUT);
+
+ /* main loop.*/
+ gettimeofday(&targ->tic, NULL);
+ if (targ->g->use_pcap) {
+ int size = targ->g->pkt_size;
+ void *pkt = &targ->pkt;
+ pcap_t *p = targ->g->p;
+
+ for (; sent < n; sent++) {
+ if (pcap_inject(p, pkt, size) == -1)
+ break;
+ }
+ } else {
+ while (sent < n) {
+
+ /*
+ * wait for available room in the send queue(s)
+ */
+ if (poll(fds, 1, 2000) <= 0) {
+ D("poll error/timeout on queue %d\n", targ->me);
+ goto quit;
+ }
+ /*
+ * scan our queues and send on those with room
+ */
+ if (sent > 100000)
+ fill_all = 0;
+ for (i = targ->qfirst; i < targ->qlast; i++) {
+ int m, limit = MIN(n - sent, targ->g->burst);
+
+ txring = NETMAP_TXRING(nifp, i);
+ if (txring->avail == 0)
+ continue;
+ m = send_packets(txring, &targ->pkt, targ->g->pkt_size,
+ limit, fill_all);
+ sent += m;
+ targ->count = sent;
+ }
+ }
+ /* Tell the interface that we have new packets. */
+ ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+
+ /* final part: wait for all the TX queues to be empty. */
+ for (i = targ->qfirst; i < targ->qlast; i++) {
+ txring = NETMAP_TXRING(nifp, i);
+ while (!NETMAP_TX_RING_EMPTY(txring)) {
+ ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+ usleep(1); /* wait 1 tick */
+ }
+ }
+ }
+
+ gettimeofday(&targ->toc, NULL);
+ targ->completed = 1;
+ targ->count = sent;
+
+quit:
+ /* reset the ``used`` flag. */
+ targ->used = 0;
+
+ return (NULL);
+}
+
+
+static void
+receive_pcap(u_char *user, __unused const struct pcap_pkthdr * h,
+ __unused const u_char * bytes)
+{
+ int *count = (int *)user;
+ (*count)++;
+}
+
+static int
+receive_packets(struct netmap_ring *ring, u_int limit, int skip_payload)
+{
+ u_int cur, rx;
+
+ cur = ring->cur;
+ if (ring->avail < limit)
+ limit = ring->avail;
+ for (rx = 0; rx < limit; rx++) {
+ struct netmap_slot *slot = &ring->slot[cur];
+ char *p = NETMAP_BUF(ring, slot->buf_idx);
+
+ if (!skip_payload)
+ check_payload(p, slot->len);
+
+ cur = NETMAP_RING_NEXT(ring, cur);
+ }
+ ring->avail -= rx;
+ ring->cur = cur;
+
+ return (rx);
+}
+
+static void *
+receiver_body(void *data)
+{
+ struct targ *targ = (struct targ *) data;
+ struct pollfd fds[1];
+ struct netmap_if *nifp = targ->nifp;
+ struct netmap_ring *rxring;
+ int i, received = 0;
+
+ if (setaffinity(targ->thread, targ->affinity))
+ goto quit;
+
+ /* setup poll(2) mechanism. */
+ memset(fds, 0, sizeof(fds));
+ fds[0].fd = targ->fd;
+ fds[0].events = (POLLIN);
+
+ /* unbounded wait for the first packet. */
+ for (;;) {
+ i = poll(fds, 1, 1000);
+ if (i > 0 && !(fds[0].revents & POLLERR))
+ break;
+ D("waiting for initial packets, poll returns %d %d", i, fds[0].revents);
+ }
+
+ /* main loop, exit after 1s silence */
+ gettimeofday(&targ->tic, NULL);
+ if (targ->g->use_pcap) {
+ for (;;) {
+ pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL);
+ }
+ } else {
+ while (1) {
+ /* Once we start to receive packets, wait at most 1 second
+ before quitting. */
+ if (poll(fds, 1, 1 * 1000) <= 0) {
+ gettimeofday(&targ->toc, NULL);
+ targ->toc.tv_sec -= 1; /* Subtract the timeout time. */
+ break;
+ }
+
+ for (i = targ->qfirst; i < targ->qlast; i++) {
+ int m;
+
+ rxring = NETMAP_RXRING(nifp, i);
+ if (rxring->avail == 0)
+ continue;
+
+ m = receive_packets(rxring, targ->g->burst,
+ SKIP_PAYLOAD);
+ received += m;
+ targ->count = received;
+ }
+
+ // tell the card we have read the data
+ //ioctl(fds[0].fd, NIOCRXSYNC, NULL);
+ }
+ }
+
+ targ->completed = 1;
+ targ->count = received;
+
+quit:
+ /* reset the ``used`` flag. */
+ targ->used = 0;
+
+ return (NULL);
+}
+
+static void
+tx_output(uint64_t sent, int size, double delta)
+{
+ double amount = 8.0 * (1.0 * size * sent) / delta;
+ double pps = sent / delta;
+ char units[4] = { '\0', 'K', 'M', 'G' };
+ int aunit = 0, punit = 0;
+
+ while (amount >= 1000) {
+ amount /= 1000;
+ aunit += 1;
+ }
+ while (pps >= 1000) {
+ pps /= 1000;
+ punit += 1;
+ }
+
+ printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
+ sent, size, delta);
+ printf("Speed: %.2f%cpps. Bandwidth: %.2f%cbps.\n",
+ pps, units[punit], amount, units[aunit]);
+}
+
+
+static void
+rx_output(uint64_t received, double delta)
+{
+
+ double pps = received / delta;
+ char units[4] = { '\0', 'K', 'M', 'G' };
+ int punit = 0;
+
+ while (pps >= 1000) {
+ pps /= 1000;
+ punit += 1;
+ }
+
+ printf("Received %llu packets, in %.2f seconds.\n", received, delta);
+ printf("Speed: %.2f%cpps.\n", pps, units[punit]);
+}
+
+static void
+usage(void)
+{
+ const char *cmd = "pkt-gen";
+ fprintf(stderr,
+ "Usage:\n"
+ "%s arguments\n"
+ "\t-i interface interface name\n"
+ "\t-t pkts_to_send also forces send mode\n"
+ "\t-r pkts_to_receive also forces receive mode\n"
+ "\t-l pkts_size in bytes excluding CRC\n"
+ "\t-d dst-ip end with %%n to sweep n addresses\n"
+ "\t-s src-ip end with %%n to sweep n addresses\n"
+ "\t-D dst-mac end with %%n to sweep n addresses\n"
+ "\t-S src-mac end with %%n to sweep n addresses\n"
+ "\t-b burst size testing, mostly\n"
+ "\t-c cores cores to use\n"
+ "\t-p threads processes/threads to use\n"
+ "\t-T report_ms milliseconds between reports\n"
+ "\t-w wait_for_link_time in seconds\n"
+ "",
+ cmd);
+
+ exit(0);
+}
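+
+/*
+ * Illustrative invocations (interface names and counts are just examples):
+ *
+ * pkt-gen -i ix0 -t 10000000 -l 60 -p 2 (send 10M 60-byte frames with 2 threads)
+ * pkt-gen -i ix0 -r 10000000 (count received packets)
+ *
+ * The sender stops after the requested number of packets; the receiver
+ * stops after about one second of silence on the wire.
+ */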
+
+
+int
+main(int argc, char **argv)
+{
+ int i, fd;
+
+ struct glob_arg g;
+
+ struct nmreq nmr;
+ void *mmap_addr; /* the mmap address */
+ void *(*td_body)(void *) = receiver_body;
+ int ch;
+ int report_interval = 1000; /* report interval */
+ char *ifname = NULL;
+ int wait_link = 2;
+ int devqueues = 1; /* how many device queues */
+
+ bzero(&g, sizeof(g));
+
+ g.src_ip = "10.0.0.1";
+ g.dst_ip = "10.1.0.1";
+ g.dst_mac = "ff:ff:ff:ff:ff:ff";
+ g.src_mac = NULL;
+ g.pkt_size = 60;
+ g.burst = 512; // default
+ g.nthreads = 1;
+ g.cpus = 1;
+
+ while ( (ch = getopt(argc, argv,
+ "i:t:r:l:d:s:D:S:b:c:p:T:w:vP")) != -1) {
+ switch(ch) {
+ default:
+ D("bad option %c %s", ch, optarg);
+ usage();
+ break;
+ case 'i': /* interface */
+ ifname = optarg;
+ break;
+ case 't': /* send */
+ td_body = sender_body;
+ g.npackets = atoi(optarg);
+ break;
+ case 'r': /* receive */
+ td_body = receiver_body;
+ g.npackets = atoi(optarg);
+ break;
+ case 'l': /* pkt_size */
+ g.pkt_size = atoi(optarg);
+ break;
+ case 'd':
+ g.dst_ip = optarg;
+ break;
+ case 's':
+ g.src_ip = optarg;
+ break;
+ case 'T': /* report interval */
+ report_interval = atoi(optarg);
+ break;
+ case 'w':
+ wait_link = atoi(optarg);
+ break;
+ case 'b': /* burst */
+ g.burst = atoi(optarg);
+ break;
+ case 'c':
+ g.cpus = atoi(optarg);
+ break;
+ case 'p':
+ g.nthreads = atoi(optarg);
+ break;
+
+ case 'P':
+ g.use_pcap = 1;
+ break;
+
+ case 'D': /* destination mac */
+ g.dst_mac = optarg;
+ {
+ struct ether_addr *mac = ether_aton(g.dst_mac);
+ D("ether_aton(%s) gives %p", g.dst_mac, mac);
+ }
+ break;
+ case 'S': /* source mac */
+ g.src_mac = optarg;
+ break;
+ case 'v':
+ verbose++;
+ }
+ }
+
+ if (ifname == NULL) {
+ D("missing ifname");
+ usage();
+ }
+ {
+ int n = system_ncpus();
+ if (g.cpus < 0 || g.cpus > n) {
+ D("%d cpus is too high, have only %d cpus", g.cpus, n);
+ usage();
+ }
+ if (g.cpus == 0)
+ g.cpus = n;
+ }
+ if (g.pkt_size < 16 || g.pkt_size > 1536) {
+ D("bad pktsize %d\n", g.pkt_size);
+ usage();
+ }
+
+ bzero(&nmr, sizeof(nmr));
+ /*
+ * Open the netmap device to fetch the number of queues of our
+ * interface.
+ *
+ * The first NIOCREGIF also detaches the card from the
+ * protocol stack and may cause a reset of the card,
+ * which in turn may take some time for the PHY to
+ * reconfigure.
+ */
+ fd = open("/dev/netmap", O_RDWR);
+ if (fd == -1) {
+ D("Unable to open /dev/netmap");
+ // fail later
+ } else {
+ if ((ioctl(fd, NIOCGINFO, &nmr)) == -1) {
+ D("Unable to get if info without name");
+ } else {
+ D("map size is %d Kb", nmr.nr_memsize >> 10);
+ }
+ bzero(&nmr, sizeof(nmr));
+ strncpy(nmr.nr_name, ifname, sizeof(nmr.nr_name));
+ if ((ioctl(fd, NIOCGINFO, &nmr)) == -1) {
+ D("Unable to get if info for %s", ifname);
+ }
+ devqueues = nmr.nr_numrings;
+ }
+
+ /* validate provided nthreads. */
+ if (g.nthreads < 1 || g.nthreads > devqueues) {
+ D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
+ // continue, fail later
+ }
+
+ if (td_body == sender_body && g.src_mac == NULL) {
+ static char mybuf[20] = "ff:ff:ff:ff:ff:ff";
+ /* retrieve source mac address. */
+ if (source_hwaddr(ifname, mybuf) == -1) {
+ D("Unable to retrieve source mac");
+ // continue, fail later
+ }
+ g.src_mac = mybuf;
+ }
+
+ /*
+ * Map the netmap shared memory: instead of issuing mmap()
+ * inside the body of the threads, we prefer to keep this
+ * operation here to simplify the thread logic.
+ */
+ D("mmapping %d Kbytes", nmr.nr_memsize>>10);
+ mmap_addr = mmap(0, nmr.nr_memsize,
+ PROT_WRITE | PROT_READ,
+ MAP_SHARED, fd, 0);
+ if (mmap_addr == MAP_FAILED) {
+ D("Unable to mmap %d KB", nmr.nr_memsize >> 10);
+ // continue, fail later
+ }
+
+ /*
+ * Register the interface on the netmap device: from now on,
+ * we can operate on the network interface without any
+ * interference from the legacy network stack.
+ *
+ * We decide to put the first interface registration here to
+ * give time to cards that take a long time to reset the PHY.
+ */
+ if (ioctl(fd, NIOCREGIF, &nmr) == -1) {
+ D("Unable to register interface %s", ifname);
+ //continue, fail later
+ }
+
+
+ /* Print some debug information. */
+ fprintf(stdout,
+ "%s %s: %d queues, %d threads and %d cpus.\n",
+ (td_body == sender_body) ? "Sending on" : "Receiving from",
+ ifname,
+ devqueues,
+ g.nthreads,
+ g.cpus);
+ if (td_body == sender_body) {
+ fprintf(stdout, "%s -> %s (%s -> %s)\n",
+ g.src_ip, g.dst_ip,
+ g.src_mac, g.dst_mac);
+ }
+
+ /* Exit if something went wrong. */
+ if (fd < 0) {
+ D("aborting");
+ usage();
+ }
+
+
+ /* Wait for PHY reset. */
+ D("Wait %d secs for phy reset", wait_link);
+ sleep(wait_link);
+ D("Ready...");
+
+ /* Install ^C handler. */
+ global_nthreads = g.nthreads;
+ signal(SIGINT, sigint_h);
+
+ if (g.use_pcap) {
+ // XXX g.p = pcap_open_live(..);
+ }
+
+ targs = calloc(g.nthreads, sizeof(*targs));
+ /*
+ * Now create the desired number of threads, each one
+ * using a single descriptor.
+ */
+ for (i = 0; i < g.nthreads; i++) {
+ struct netmap_if *tnifp;
+ struct nmreq tifreq;
+ int tfd;
+
+ if (g.use_pcap) {
+ tfd = -1;
+ tnifp = NULL;
+ } else {
+ /* register interface. */
+ tfd = open("/dev/netmap", O_RDWR);
+ if (tfd == -1) {
+ D("Unable to open /dev/netmap");
+ continue;
+ }
+
+ bzero(&tifreq, sizeof(tifreq));
+ strncpy(tifreq.nr_name, ifname, sizeof(tifreq.nr_name));
+ tifreq.nr_ringid = (g.nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
+
+ /*
+ * if we are acting as a receiver only, do not touch the transmit ring.
+ * This is not the default because many apps may use the interface
+ * in both directions, but a pure receiver does not.
+ */
+ if (td_body == receiver_body) {
+ tifreq.nr_ringid |= NETMAP_NO_TX_POLL;
+ }
+
+ if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
+ D("Unable to register %s", ifname);
+ continue;
+ }
+ tnifp = NETMAP_IF(mmap_addr, tifreq.nr_offset);
+ }
+ /* start threads. */
+ bzero(&targs[i], sizeof(targs[i]));
+ targs[i].g = &g;
+ targs[i].used = 1;
+ targs[i].completed = 0;
+ targs[i].fd = tfd;
+ targs[i].nmr = tifreq;
+ targs[i].nifp = tnifp;
+ targs[i].qfirst = (g.nthreads > 1) ? i : 0;
+ targs[i].qlast = (g.nthreads > 1) ? i+1 : tifreq.nr_numrings;
+ targs[i].me = i;
+ targs[i].affinity = g.cpus ? i % g.cpus : -1;
+ if (td_body == sender_body) {
+ /* initialize the packet to send. */
+ initialize_packet(&targs[i]);
+ }
+
+ if (pthread_create(&targs[i].thread, NULL, td_body,
+ &targs[i]) == -1) {
+ D("Unable to create thread %d", i);
+ targs[i].used = 0;
+ }
+ }
+
+ {
+ uint64_t my_count = 0, prev = 0;
+ uint64_t count = 0;
+ double delta_t;
+ struct timeval tic, toc;
+
+ gettimeofday(&toc, NULL);
+ for (;;) {
+ struct timeval now, delta;
+ uint64_t pps;
+ int done = 0;
+
+ delta.tv_sec = report_interval/1000;
+ delta.tv_usec = (report_interval%1000)*1000;
+ select(0, NULL, NULL, NULL, &delta);
+ gettimeofday(&now, NULL);
+ timersub(&now, &toc, &toc);
+ my_count = 0;
+ for (i = 0; i < g.nthreads; i++) {
+ my_count += targs[i].count;
+ if (targs[i].used == 0)
+ done++;
+ }
+ pps = toc.tv_sec* 1000000 + toc.tv_usec;
+ if (pps < 10000)
+ continue;
+ pps = (my_count - prev)*1000000 / pps;
+ D("%llu pps", pps);
+ prev = my_count;
+ toc = now;
+ if (done == g.nthreads)
+ break;
+ }
+
+ timerclear(&tic);
+ timerclear(&toc);
+ for (i = 0; i < g.nthreads; i++) {
+ /*
+ * Join active threads, unregister interfaces and close
+ * file descriptors.
+ */
+ pthread_join(targs[i].thread, NULL);
+ ioctl(targs[i].fd, NIOCUNREGIF, &targs[i].nmr);
+ close(targs[i].fd);
+
+ if (targs[i].completed == 0)
+ continue;
+
+ /*
+ * Collect the threads' output and extract information about
+ * how long it took to send all the packets.
+ */
+ count += targs[i].count;
+ if (!timerisset(&tic) || timercmp(&targs[i].tic, &tic, <))
+ tic = targs[i].tic;
+ if (!timerisset(&toc) || timercmp(&targs[i].toc, &toc, >))
+ toc = targs[i].toc;
+ }
+
+ /* print output. */
+ timersub(&toc, &tic, &toc);
+ delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
+ if (td_body == sender_body)
+ tx_output(count, g.pkt_size, delta_t);
+ else
+ rx_output(count, delta_t);
+ }
+
+ ioctl(fd, NIOCUNREGIF, &nmr);
+ munmap(mmap_addr, nmr.nr_memsize);
+ close(fd);
+
+ return (0);
+}
+/* end of file */