aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuigi Rizzo <luigi@FreeBSD.org>2014-02-15 04:53:04 +0000
committerLuigi Rizzo <luigi@FreeBSD.org>2014-02-15 04:53:04 +0000
commitf0ea3689a9c1c27067145ed902811149e78cc4fa (patch)
tree5f40d56905d46741e85cd83a0278b12363e3e2a7
parent53bf5ef829d5fd312db3851ce6cb589173b744e1 (diff)
downloadsrc-f0ea3689a9c1c27067145ed902811149e78cc4fa.tar.gz
src-f0ea3689a9c1c27067145ed902811149e78cc4fa.zip
This new version of netmap brings you the following:
- netmap pipes, providing bidirectional blocking I/O while moving 100+ Mpps between processes using shared memory channels (no mistake: over one hundred million. But mind you, I said *moving* not *processing*); - kqueue support (BHyVe needs it); - improved user library. Just the interface name lets you select a NIC, host port, VALE switch port, netmap pipe, and individual queues. The upcoming netmap-enabled libpcap will use this feature. - optional extra buffers associated with netmap ports, for applications that need to buffer data yet don't want to make copies. - segmentation offloading for the VALE switch, useful between VMs; and a number of bug fixes and performance improvements. My colleagues Giuseppe Lettieri and Vincenzo Maffione did a substantial amount of work on these features so we owe them a big thanks. There are some external repositories that can be of interest: https://code.google.com/p/netmap our public repository for netmap/VALE code, including Linux versions and other stuff that does not belong here, such as Python bindings. https://code.google.com/p/netmap-libpcap a clone of the libpcap repository with netmap support. With this any libpcap client has access to most netmap features with no recompilation. E.g. tcpdump can filter packets at 10-15 Mpps. https://code.google.com/p/netmap-ipfw a userspace version of ipfw+dummynet which uses netmap to send/receive packets. Speed is up in the 7-10 Mpps range per core for simple rulesets. Both netmap-libpcap and netmap-ipfw will be merged upstream at some point, but until this happens it is useful to have access to them. And yes, this code will be merged soon. It is infinitely better than the version currently in 10 and 9. MFC after: 3 days
Notes
Notes: svn path=/head/; revision=261909
-rw-r--r--sys/conf/files4
-rw-r--r--sys/dev/netmap/netmap.c501
-rw-r--r--sys/dev/netmap/netmap_freebsd.c265
-rw-r--r--sys/dev/netmap/netmap_generic.c41
-rw-r--r--sys/dev/netmap/netmap_kern.h227
-rw-r--r--sys/dev/netmap/netmap_mem2.c382
-rw-r--r--sys/dev/netmap/netmap_mem2.h14
-rw-r--r--sys/dev/netmap/netmap_offloadings.c401
-rw-r--r--sys/dev/netmap/netmap_pipe.c711
-rw-r--r--sys/dev/netmap/netmap_vale.c287
-rw-r--r--sys/modules/netmap/Makefile2
-rw-r--r--sys/net/netmap.h163
-rw-r--r--sys/net/netmap_user.h354
-rw-r--r--tools/tools/netmap/Makefile21
-rw-r--r--tools/tools/netmap/README17
-rw-r--r--tools/tools/netmap/bridge.c114
-rw-r--r--tools/tools/netmap/click-test.cfg19
-rw-r--r--tools/tools/netmap/nm_util.c278
-rw-r--r--tools/tools/netmap/nm_util.h127
-rw-r--r--tools/tools/netmap/pcap.c528
-rw-r--r--tools/tools/netmap/pkt-gen.c450
-rw-r--r--tools/tools/netmap/vale-ctl.c1
22 files changed, 3082 insertions, 1825 deletions
diff --git a/sys/conf/files b/sys/conf/files
index 1f20111572fe..c61030225e84 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1875,8 +1875,10 @@ dev/ncv/ncr53c500_pccard.c optional ncv pccard
dev/netmap/netmap.c optional netmap
dev/netmap/netmap_freebsd.c optional netmap
dev/netmap/netmap_generic.c optional netmap
-dev/netmap/netmap_mbq.c optional netmap
+dev/netmap/netmap_mbq.c optional netmap
dev/netmap/netmap_mem2.c optional netmap
+dev/netmap/netmap_offloadings.c optional netmap
+dev/netmap/netmap_pipe.c optional netmap
dev/netmap/netmap_vale.c optional netmap
# compile-with "${NORMAL_C} -Wconversion -Wextra"
dev/nge/if_nge.c optional nge
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index fdd368a346fe..de88fb58fc8c 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -156,9 +156,11 @@ ports attached to the switch)
/* reduce conditional code */
-#define init_waitqueue_head(x) // only needed in linux
-
+// linux API, use for the knlist in FreeBSD
+#define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL)
+void freebsd_selwakeup(struct selinfo *si, int pri);
+#define OS_selwakeup(a, b) freebsd_selwakeup(a, b)
#elif defined(linux)
@@ -231,6 +233,7 @@ static int netmap_admode = NETMAP_ADMODE_BEST;
int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */
int netmap_generic_ringsize = 1024; /* Generic ringsize. */
+int netmap_generic_rings = 1; /* number of queues in generic. */
SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
@@ -238,6 +241,7 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0,
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
NMG_LOCK_T netmap_global_lock;
@@ -270,28 +274,30 @@ netmap_set_all_rings(struct ifnet *ifp, int stopped)
{
struct netmap_adapter *na;
int i;
+ u_int ntx, nrx;
if (!(ifp->if_capenable & IFCAP_NETMAP))
return;
na = NA(ifp);
- for (i = 0; i <= na->num_tx_rings; i++) {
+ ntx = netmap_real_tx_rings(na);
+ nrx = netmap_real_rx_rings(na);
+
+ for (i = 0; i < ntx; i++) {
if (stopped)
netmap_disable_ring(na->tx_rings + i);
else
na->tx_rings[i].nkr_stopped = 0;
- na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
- (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
+ na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
}
- for (i = 0; i <= na->num_rx_rings; i++) {
+ for (i = 0; i < nrx; i++) {
if (stopped)
netmap_disable_ring(na->rx_rings + i);
else
na->rx_rings[i].nkr_stopped = 0;
- na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
- (i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
+ na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
}
}
@@ -426,14 +432,73 @@ netmap_update_config(struct netmap_adapter *na)
return 1;
}
+static int
+netmap_txsync_compat(struct netmap_kring *kring, int flags)
+{
+ struct netmap_adapter *na = kring->na;
+ return na->nm_txsync(na, kring->ring_id, flags);
+}
+
+static int
+netmap_rxsync_compat(struct netmap_kring *kring, int flags)
+{
+ struct netmap_adapter *na = kring->na;
+ return na->nm_rxsync(na, kring->ring_id, flags);
+}
+
+static int
+netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
+{
+ (void)flags;
+ netmap_txsync_to_host(kring->na);
+ return 0;
+}
+
+static int
+netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
+{
+ (void)flags;
+ netmap_rxsync_from_host(kring->na, NULL, NULL);
+ return 0;
+}
+
+
+/* create the krings array and initialize the fields common to all adapters.
+ * The array layout is this:
+ *
+ * +----------+
+ * na->tx_rings ----->| | \
+ * | | } na->num_tx_rings
+ * | | /
+ * +----------+
+ * | | host tx kring
+ * na->rx_rings ----> +----------+
+ * | | \
+ * | | } na->num_rx_rings
+ * | | /
+ * +----------+
+ * | | host rx kring
+ * +----------+
+ * na->tailroom ----->| | \
+ * | | } tailroom bytes
+ * | | /
+ * +----------+
+ *
+ * Note: for compatibility, host krings are created even when not needed.
+ * The tailroom space is currently used by vale ports for allocating leases.
+ */
int
-netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
+netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
{
u_int i, len, ndesc;
struct netmap_kring *kring;
+ u_int ntx, nrx;
+
+ /* account for the (possibly fake) host rings */
+ ntx = na->num_tx_rings + 1;
+ nrx = na->num_rx_rings + 1;
- // XXX additional space for extra rings ?
len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
@@ -454,12 +519,19 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
kring->na = na;
kring->ring_id = i;
kring->nkr_num_slots = ndesc;
+ if (i < na->num_tx_rings) {
+ kring->nm_sync = netmap_txsync_compat; // XXX
+ } else if (i == na->num_tx_rings) {
+ kring->nm_sync = netmap_txsync_to_host_compat;
+ }
/*
* IMPORTANT: Always keep one slot empty.
*/
kring->rhead = kring->rcur = kring->nr_hwcur = 0;
kring->rtail = kring->nr_hwtail = ndesc - 1;
snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
+ ND("ktx %s h %d c %d t %d",
+ kring->name, kring->rhead, kring->rcur, kring->rtail);
mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
init_waitqueue_head(&kring->si);
}
@@ -471,9 +543,16 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
kring->na = na;
kring->ring_id = i;
kring->nkr_num_slots = ndesc;
+ if (i < na->num_rx_rings) {
+ kring->nm_sync = netmap_rxsync_compat; // XXX
+ } else if (i == na->num_rx_rings) {
+ kring->nm_sync = netmap_rxsync_from_host_compat;
+ }
kring->rhead = kring->rcur = kring->nr_hwcur = 0;
kring->rtail = kring->nr_hwtail = 0;
snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
+ ND("krx %s h %d c %d t %d",
+ kring->name, kring->rhead, kring->rcur, kring->rtail);
mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
init_waitqueue_head(&kring->si);
}
@@ -486,17 +565,15 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
}
-/* XXX check boundaries */
+/* undo the actions performed by netmap_krings_create */
void
netmap_krings_delete(struct netmap_adapter *na)
{
- int i;
+ struct netmap_kring *kring = na->tx_rings;
- for (i = 0; i < na->num_tx_rings + 1; i++) {
- mtx_destroy(&na->tx_rings[i].q_lock);
- }
- for (i = 0; i < na->num_rx_rings + 1; i++) {
- mtx_destroy(&na->rx_rings[i].q_lock);
+ /* we rely on the krings layout described above */
+ for ( ; kring != na->tailroom; kring++) {
+ mtx_destroy(&kring->q_lock);
}
free(na->tx_rings, M_DEVBUF);
na->tx_rings = na->rx_rings = na->tailroom = NULL;
@@ -677,6 +754,20 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
netmap_mem_if_delete(na, nifp);
}
+static __inline int
+nm_tx_si_user(struct netmap_priv_d *priv)
+{
+ return (priv->np_na != NULL &&
+ (priv->np_txqlast - priv->np_txqfirst > 1));
+}
+
+static __inline int
+nm_rx_si_user(struct netmap_priv_d *priv)
+{
+ return (priv->np_na != NULL &&
+ (priv->np_rxqlast - priv->np_rxqfirst > 1));
+}
+
/*
* returns 1 if this is the last instance and we can free priv
@@ -702,6 +793,10 @@ netmap_dtor_locked(struct netmap_priv_d *priv)
priv->np_nifp = NULL;
netmap_drop_memory_locked(priv);
if (priv->np_na) {
+ if (nm_tx_si_user(priv))
+ na->tx_si_users--;
+ if (nm_rx_si_user(priv))
+ na->rx_si_users--;
netmap_adapter_put(na);
priv->np_na = NULL;
}
@@ -864,22 +959,8 @@ netmap_txsync_to_host(struct netmap_adapter *na)
struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
struct netmap_ring *ring = kring->ring;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const head = nm_txsync_prologue(kring);
+ u_int const head = kring->rhead;
struct mbq q;
- int error;
-
- error = nm_kr_tryget(kring);
- if (error) {
- if (error == NM_KR_BUSY)
- D("ring %p busy (user error)", kring);
- return;
- }
- if (head > lim) {
- D("invalid ring index in stack TX kring %p", kring);
- netmap_ring_reinit(kring);
- nm_kr_put(kring);
- return;
- }
/* Take packets from hwcur to head and pass them up.
* force head = cur since netmap_grab_packets() stops at head
@@ -896,7 +977,6 @@ netmap_txsync_to_host(struct netmap_adapter *na)
kring->nr_hwtail -= lim + 1;
nm_txsync_finalize(kring);
- nm_kr_put(kring);
netmap_send_up(na->ifp, &q);
}
@@ -921,27 +1001,15 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
struct netmap_ring *ring = kring->ring;
u_int nm_i, n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const head = nm_rxsync_prologue(kring);
+ u_int const head = kring->rhead;
int ret = 0;
struct mbq *q = &kring->rx_queue;
(void)pwait; /* disable unused warnings */
-
- if (head > lim) {
- netmap_ring_reinit(kring);
- return EINVAL;
- }
-
- if (kring->nkr_stopped) /* check a first time without lock */
- return EBUSY;
+ (void)td;
mtx_lock(&q->lock);
- if (kring->nkr_stopped) { /* check again with lock held */
- ret = EBUSY;
- goto unlock_out;
- }
-
/* First part: import newly received packets */
n = mbq_len(q);
if (n) { /* grab packets from the queue */
@@ -982,8 +1050,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
if (kring->rcur == kring->rtail && td) /* no bufs available */
selrecord(td, &kring->si);
-unlock_out:
-
mtx_unlock(&q->lock);
return ret;
}
@@ -1107,19 +1173,26 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
int
netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
- struct ifnet *ifp;
+ struct ifnet *ifp = NULL;
int error = 0;
- struct netmap_adapter *ret;
+ struct netmap_adapter *ret = NULL;
*na = NULL; /* default return value */
/* first try to see if this is a bridge port. */
NMG_LOCK_ASSERT();
+ error = netmap_get_pipe_na(nmr, na, create);
+ if (error || *na != NULL)
+ return error;
+
error = netmap_get_bdg_na(nmr, na, create);
- if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
+ if (error)
return error;
+ if (*na != NULL) /* valid match in netmap_get_bdg_na() */
+ goto pipes;
+
ifp = ifunit_ref(nmr->nr_name);
if (ifp == NULL) {
return ENXIO;
@@ -1129,18 +1202,23 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
if (error)
goto out;
- if (ret != NULL) {
- /* Users cannot use the NIC attached to a bridge directly */
- if (NETMAP_OWNED_BY_KERN(ret)) {
- error = EBUSY;
- goto out;
- }
- error = 0;
- *na = ret;
- netmap_adapter_get(ret);
+ /* Users cannot use the NIC attached to a bridge directly */
+ if (NETMAP_OWNED_BY_KERN(ret)) {
+ error = EBUSY;
+ goto out;
}
+ *na = ret;
+ netmap_adapter_get(ret);
+
+pipes:
+ error = netmap_pipe_alloc(*na, nmr);
+
out:
- if_rele(ifp);
+ if (error && ret != NULL)
+ netmap_adapter_put(ret);
+
+ if (ifp)
+ if_rele(ifp);
return error;
}
@@ -1365,45 +1443,88 @@ netmap_ring_reinit(struct netmap_kring *kring)
* for all rings is the same as a single ring.
*/
static int
-netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
+netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
{
struct netmap_adapter *na = priv->np_na;
- struct ifnet *ifp = na->ifp;
- u_int i = ringid & NETMAP_RING_MASK;
- /* initially (np_qfirst == np_qlast) we don't want to lock */
- u_int lim = na->num_rx_rings;
-
- if (na->num_tx_rings > lim)
- lim = na->num_tx_rings;
- if ( (ringid & NETMAP_HW_RING) && i >= lim) {
- D("invalid ring id %d", i);
- return (EINVAL);
- }
- priv->np_ringid = ringid;
- if (ringid & NETMAP_SW_RING) {
- priv->np_qfirst = NETMAP_SW_RING;
- priv->np_qlast = 0;
- } else if (ringid & NETMAP_HW_RING) {
- priv->np_qfirst = i;
- priv->np_qlast = i + 1;
- } else {
- priv->np_qfirst = 0;
- priv->np_qlast = NETMAP_HW_RING ;
+ u_int j, i = ringid & NETMAP_RING_MASK;
+ u_int reg = flags & NR_REG_MASK;
+
+ if (reg == NR_REG_DEFAULT) {
+ /* convert from old ringid to flags */
+ if (ringid & NETMAP_SW_RING) {
+ reg = NR_REG_SW;
+ } else if (ringid & NETMAP_HW_RING) {
+ reg = NR_REG_ONE_NIC;
+ } else {
+ reg = NR_REG_ALL_NIC;
+ }
+ D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
+ }
+ switch (reg) {
+ case NR_REG_ALL_NIC:
+ case NR_REG_PIPE_MASTER:
+ case NR_REG_PIPE_SLAVE:
+ priv->np_txqfirst = 0;
+ priv->np_txqlast = na->num_tx_rings;
+ priv->np_rxqfirst = 0;
+ priv->np_rxqlast = na->num_rx_rings;
+ ND("%s %d %d", "ALL/PIPE",
+ priv->np_rxqfirst, priv->np_rxqlast);
+ break;
+ case NR_REG_SW:
+ case NR_REG_NIC_SW:
+ if (!(na->na_flags & NAF_HOST_RINGS)) {
+ D("host rings not supported");
+ return EINVAL;
+ }
+ priv->np_txqfirst = (reg == NR_REG_SW ?
+ na->num_tx_rings : 0);
+ priv->np_txqlast = na->num_tx_rings + 1;
+ priv->np_rxqfirst = (reg == NR_REG_SW ?
+ na->num_rx_rings : 0);
+ priv->np_rxqlast = na->num_rx_rings + 1;
+ ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
+ priv->np_rxqfirst, priv->np_rxqlast);
+ break;
+ case NR_REG_ONE_NIC:
+ if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
+ D("invalid ring id %d", i);
+ return EINVAL;
+ }
+ /* if not enough rings, use the first one */
+ j = i;
+ if (j >= na->num_tx_rings)
+ j = 0;
+ priv->np_txqfirst = j;
+ priv->np_txqlast = j + 1;
+ j = i;
+ if (j >= na->num_rx_rings)
+ j = 0;
+ priv->np_rxqfirst = j;
+ priv->np_rxqlast = j + 1;
+ break;
+ default:
+ D("invalid regif type %d", reg);
+ return EINVAL;
}
priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
- if (netmap_verbose) {
- if (ringid & NETMAP_SW_RING)
- D("ringid %s set to SW RING", NM_IFPNAME(ifp));
- else if (ringid & NETMAP_HW_RING)
- D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
- priv->np_qfirst);
- else
- D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
- }
+ priv->np_flags = (flags & ~NR_REG_MASK) | reg;
+ if (nm_tx_si_user(priv))
+ na->tx_si_users++;
+ if (nm_rx_si_user(priv))
+ na->rx_si_users++;
+ if (netmap_verbose) {
+ D("%s: tx [%d,%d) rx [%d,%d) id %d",
+ NM_IFPNAME(na->ifp),
+ priv->np_txqfirst,
+ priv->np_txqlast,
+ priv->np_rxqfirst,
+ priv->np_rxqlast,
+ i);
+ }
return 0;
}
-
/*
* possibly move the interface to netmap-mode.
* If success it returns a pointer to netmap_if, otherwise NULL.
@@ -1411,7 +1532,7 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
*/
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
- uint16_t ringid, int *err)
+ uint16_t ringid, uint32_t flags, int *err)
{
struct ifnet *ifp = na->ifp;
struct netmap_if *nifp = NULL;
@@ -1421,7 +1542,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
/* ring configuration may have changed, fetch from the card */
netmap_update_config(na);
priv->np_na = na; /* store the reference */
- error = netmap_set_ringid(priv, ringid);
+ error = netmap_set_ringid(priv, ringid, flags);
if (error)
goto out;
/* ensure allocators are ready */
@@ -1501,26 +1622,12 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
struct nmreq *nmr = (struct nmreq *) data;
struct netmap_adapter *na = NULL;
int error;
- u_int i, lim;
+ u_int i, qfirst, qlast;
struct netmap_if *nifp;
struct netmap_kring *krings;
(void)dev; /* UNUSED */
(void)fflag; /* UNUSED */
-#ifdef linux
-#define devfs_get_cdevpriv(pp) \
- ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \
- (*pp ? 0 : ENOENT); })
-
-/* devfs_set_cdevpriv cannot fail on linux */
-#define devfs_set_cdevpriv(p, fn) \
- ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
-
-
-#define devfs_clear_cdevpriv() do { \
- netmap_dtor(priv); ((struct file *)td)->private_data = 0; \
- } while (0)
-#endif /* linux */
if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
/* truncate name */
@@ -1530,6 +1637,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
nmr->nr_name,
nmr->nr_version, NETMAP_API);
nmr->nr_version = NETMAP_API;
+ }
+ if (nmr->nr_version < NETMAP_MIN_API ||
+ nmr->nr_version > NETMAP_MAX_API) {
return EINVAL;
}
}
@@ -1564,7 +1674,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
nmd = na->nm_mem; /* get memory allocator */
}
- error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
+ error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
+ &nmr->nr_arg2);
if (error)
break;
if (na == NULL) /* only memory info */
@@ -1576,8 +1687,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
nmr->nr_tx_rings = na->num_tx_rings;
nmr->nr_rx_slots = na->num_rx_desc;
nmr->nr_tx_slots = na->num_tx_desc;
- if (memflags & NETMAP_MEM_PRIVATE)
- nmr->nr_ringid |= NETMAP_PRIV_MEM;
netmap_adapter_put(na);
} while (0);
NMG_UNLOCK();
@@ -1587,7 +1696,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
/* possibly attach/detach NIC and VALE switch */
i = nmr->nr_cmd;
if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
- || i == NETMAP_BDG_OFFSET) {
+ || i == NETMAP_BDG_VNET_HDR) {
error = netmap_bdg_ctl(nmr, NULL);
break;
} else if (i != 0) {
@@ -1602,7 +1711,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
u_int memflags;
if (priv->np_na != NULL) { /* thread already registered */
- error = netmap_set_ringid(priv, nmr->nr_ringid);
+ error = EBUSY;
break;
}
/* find the interface and a reference */
@@ -1615,27 +1724,39 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
error = EBUSY;
break;
}
- nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
+ nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
if (!nifp) { /* reg. failed, release priv and ref */
netmap_adapter_put(na);
priv->np_nifp = NULL;
break;
}
+ priv->np_td = td; // XXX kqueue, debugging only
/* return the offset of the netmap_if object */
nmr->nr_rx_rings = na->num_rx_rings;
nmr->nr_tx_rings = na->num_tx_rings;
nmr->nr_rx_slots = na->num_rx_desc;
nmr->nr_tx_slots = na->num_tx_desc;
- error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
+ error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
+ &nmr->nr_arg2);
if (error) {
netmap_adapter_put(na);
break;
}
if (memflags & NETMAP_MEM_PRIVATE) {
- nmr->nr_ringid |= NETMAP_PRIV_MEM;
*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
}
+ priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
+ &na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
+ priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
+ &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
+
+ if (nmr->nr_arg3) {
+ D("requested %d extra buffers", nmr->nr_arg3);
+ nmr->nr_arg3 = netmap_extra_alloc(na,
+ &nifp->ni_bufs_head, nmr->nr_arg3);
+ D("got %d extra buffers", nmr->nr_arg3);
+ }
nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
} while (0);
NMG_UNLOCK();
@@ -1666,21 +1787,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
break;
}
- if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
- if (cmd == NIOCTXSYNC)
- netmap_txsync_to_host(na);
- else
- netmap_rxsync_from_host(na, NULL, NULL);
- break;
+ if (cmd == NIOCTXSYNC) {
+ krings = na->tx_rings;
+ qfirst = priv->np_txqfirst;
+ qlast = priv->np_txqlast;
+ } else {
+ krings = na->rx_rings;
+ qfirst = priv->np_rxqfirst;
+ qlast = priv->np_rxqlast;
}
- /* find the last ring to scan */
- lim = priv->np_qlast;
- if (lim == NETMAP_HW_RING)
- lim = (cmd == NIOCTXSYNC) ?
- na->num_tx_rings : na->num_rx_rings;
-
- krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
- for (i = priv->np_qfirst; i < lim; i++) {
+
+ for (i = qfirst; i < qlast; i++) {
struct netmap_kring *kring = krings + i;
if (nm_kr_tryget(kring)) {
error = EBUSY;
@@ -1694,14 +1811,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
netmap_ring_reinit(kring);
} else {
- na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
+ kring->nm_sync(kring, NAF_FORCE_RECLAIM);
}
if (netmap_verbose & NM_VERB_TXSYNC)
D("post txsync ring %d cur %d hwcur %d",
i, kring->ring->cur,
kring->nr_hwcur);
} else {
- na->nm_rxsync(na, i, NAF_FORCE_READ);
+ kring->nm_sync(kring, NAF_FORCE_READ);
microtime(&na->rx_rings[i].ring->ts);
}
nm_kr_put(kring);
@@ -1772,9 +1889,9 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
struct ifnet *ifp;
struct netmap_kring *kring;
u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
- u_int lim_tx, lim_rx;
struct mbq q; /* packets from hw queues to host stack */
void *pwait = dev; /* linux compatibility */
+ int is_kevent = 0;
/*
* In order to avoid nested locks, we need to "double check"
@@ -1786,7 +1903,19 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
(void)pwait;
mbq_init(&q);
- if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
+ /*
+ * XXX kevent has curthread->tp_fop == NULL,
+ * so devfs_get_cdevpriv() fails. We circumvent this by passing
+ * priv as the first argument, which is also useful to avoid
+ * the selrecord() which are not necessary in that case.
+ */
+ if (devfs_get_cdevpriv((void **)&priv) != 0) {
+ is_kevent = 1;
+ if (netmap_verbose)
+ D("called from kevent");
+ priv = (struct netmap_priv_d *)dev;
+ }
+ if (priv == NULL)
return POLLERR;
if (priv->np_nifp == NULL) {
@@ -1811,28 +1940,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
want_tx = events & (POLLOUT | POLLWRNORM);
want_rx = events & (POLLIN | POLLRDNORM);
- lim_tx = na->num_tx_rings;
- lim_rx = na->num_rx_rings;
-
- if (priv->np_qfirst == NETMAP_SW_RING) {
- // XXX locking ?
- /* handle the host stack ring */
- if (priv->np_txpoll || want_tx) {
- /* push any packets up, then we are always ready */
- netmap_txsync_to_host(na);
- revents |= want_tx;
- }
- if (want_rx) {
- kring = &na->rx_rings[lim_rx];
- /* XXX replace with rxprologue etc. */
- if (nm_ring_empty(kring->ring))
- netmap_rxsync_from_host(na, td, dev);
- if (!nm_ring_empty(kring->ring))
- revents |= want_rx;
- }
- return (revents);
- }
-
/*
* check_all_{tx|rx} are set if the card has more than one queue AND
@@ -1847,19 +1954,15 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
* there are pending packets to send. The latter can be disabled
* passing NETMAP_NO_TX_POLL in the NIOCREG call.
*/
- check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
- check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
-
- if (priv->np_qlast != NETMAP_HW_RING) {
- lim_tx = lim_rx = priv->np_qlast;
- }
+ check_all_tx = nm_tx_si_user(priv);
+ check_all_rx = nm_rx_si_user(priv);
/*
* We start with a lock free round which is cheap if we have
* slots available. If this fails, then lock and call the sync
* routines.
*/
- for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
+ for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
kring = &na->rx_rings[i];
/* XXX compare ring->cur and kring->tail */
if (!nm_ring_empty(kring->ring)) {
@@ -1867,7 +1970,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
want_rx = 0; /* also breaks the loop */
}
}
- for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
+ for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
kring = &na->tx_rings[i];
/* XXX compare ring->cur and kring->tail */
if (!nm_ring_empty(kring->ring)) {
@@ -1891,7 +1994,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
* used to skip rings with no pending transmissions.
*/
flush_tx:
- for (i = priv->np_qfirst; i < lim_tx; i++) {
+ for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
int found = 0;
kring = &na->tx_rings[i];
@@ -1906,7 +2009,7 @@ flush_tx:
netmap_ring_reinit(kring);
revents |= POLLERR;
} else {
- if (na->nm_txsync(na, i, 0))
+ if (kring->nm_sync(kring, 0))
revents |= POLLERR;
}
@@ -1921,12 +2024,12 @@ flush_tx:
if (found) { /* notify other listeners */
revents |= want_tx;
want_tx = 0;
- na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY);
+ na->nm_notify(na, i, NR_TX, 0);
}
}
- if (want_tx && retry_tx) {
+ if (want_tx && retry_tx && !is_kevent) {
selrecord(td, check_all_tx ?
- &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
+ &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
retry_tx = 0;
goto flush_tx;
}
@@ -1940,7 +2043,7 @@ flush_tx:
int send_down = 0; /* transparent mode */
/* two rounds here to for race avoidance */
do_retry_rx:
- for (i = priv->np_qfirst; i < lim_rx; i++) {
+ for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
int found = 0;
kring = &na->rx_rings[i];
@@ -1962,7 +2065,7 @@ do_retry_rx:
netmap_grab_packets(kring, &q, netmap_fwd);
}
- if (na->nm_rxsync(na, i, 0))
+ if (kring->nm_sync(kring, 0))
revents |= POLLERR;
if (netmap_no_timestamp == 0 ||
kring->ring->flags & NR_TIMESTAMP) {
@@ -1974,24 +2077,26 @@ do_retry_rx:
if (found) {
revents |= want_rx;
retry_rx = 0;
- na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
+ na->nm_notify(na, i, NR_RX, 0);
}
}
/* transparent mode XXX only during first pass ? */
- kring = &na->rx_rings[lim_rx];
- if (check_all_rx
- && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
- /* XXX fix to use kring fields */
- if (nm_ring_empty(kring->ring))
- send_down = netmap_rxsync_from_host(na, td, dev);
- if (!nm_ring_empty(kring->ring))
- revents |= want_rx;
+ if (na->na_flags & NAF_HOST_RINGS) {
+ kring = &na->rx_rings[na->num_rx_rings];
+ if (check_all_rx
+ && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
+ /* XXX fix to use kring fields */
+ if (nm_ring_empty(kring->ring))
+ send_down = netmap_rxsync_from_host(na, td, dev);
+ if (!nm_ring_empty(kring->ring))
+ revents |= want_rx;
+ }
}
- if (retry_rx)
+ if (retry_rx && !is_kevent)
selrecord(td, check_all_rx ?
- &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
+ &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
if (send_down > 0 || retry_rx) {
retry_rx = 0;
if (send_down)
@@ -2032,14 +2137,14 @@ netmap_notify(struct netmap_adapter *na, u_int n_ring,
if (tx == NR_TX) {
kring = na->tx_rings + n_ring;
- selwakeuppri(&kring->si, PI_NET);
- if (flags & NAF_GLOBAL_NOTIFY)
- selwakeuppri(&na->tx_si, PI_NET);
+ OS_selwakeup(&kring->si, PI_NET);
+ if (na->tx_si_users > 0)
+ OS_selwakeup(&na->tx_si, PI_NET);
} else {
kring = na->rx_rings + n_ring;
- selwakeuppri(&kring->si, PI_NET);
- if (flags & NAF_GLOBAL_NOTIFY)
- selwakeuppri(&na->rx_si, PI_NET);
+ OS_selwakeup(&kring->si, PI_NET);
+ if (na->rx_si_users > 0)
+ OS_selwakeup(&na->rx_si, PI_NET);
}
return 0;
}
@@ -2090,6 +2195,7 @@ netmap_detach_common(struct netmap_adapter *na)
D("freeing leftover tx_rings");
na->nm_krings_delete(na);
}
+ netmap_pipe_dealloc(na);
if (na->na_flags & NAF_MEM_OWNER)
netmap_mem_private_delete(na->nm_mem);
bzero(na, sizeof(*na));
@@ -2120,6 +2226,7 @@ netmap_attach(struct netmap_adapter *arg)
if (hwna == NULL)
goto fail;
hwna->up = *arg;
+ hwna->up.na_flags |= NAF_HOST_RINGS;
if (netmap_attach_common(&hwna->up)) {
free(hwna, M_DEVBUF);
goto fail;
@@ -2177,12 +2284,10 @@ NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
return 1;
}
-
int
netmap_hw_krings_create(struct netmap_adapter *na)
{
- int ret = netmap_krings_create(na,
- na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
+ int ret = netmap_krings_create(na, 0);
if (ret == 0) {
/* initialize the mbq for the sw rx ring */
mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
@@ -2370,7 +2475,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
* We do the wakeup here, but the ring is not yet reconfigured.
* However, we are under lock so there are no races.
*/
- na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
+ na->nm_notify(na, n, tx, 0);
return kring->ring->slot;
}
@@ -2405,15 +2510,13 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
return; // not a physical queue
kring = na->rx_rings + q;
kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ?
- na->nm_notify(na, q, NR_RX,
- (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
+ na->nm_notify(na, q, NR_RX, 0);
*work_done = 1; /* do not fire napi again */
} else { /* TX path */
if (q >= na->num_tx_rings)
return; // not a physical queue
kring = na->tx_rings + q;
- na->nm_notify(na, q, NR_TX,
- (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
+ na->nm_notify(na, q, NR_TX, 0);
}
}
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index 6716168526dc..a8e287c6ddd8 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -29,8 +29,10 @@
#include <sys/module.h>
#include <sys/errno.h>
#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/poll.h> /* POLLIN, POLLOUT */
#include <sys/kernel.h> /* types used in module initialization */
#include <sys/conf.h> /* DEV_MODULE */
+#include <sys/endian.h>
#include <sys/rwlock.h>
@@ -49,6 +51,8 @@
#include <net/if.h>
#include <net/if_var.h>
#include <machine/bus.h> /* bus_dmamap_* */
+#include <netinet/in.h> /* in6_cksum_pseudo() */
+#include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
@@ -57,6 +61,73 @@
/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
+/*
+ * Add the 16-bit big-endian words found in 'data' to 'cur_sum'.
+ * 'cur_sum' and the return value are unfolded sums in host byte order.
+ * When 'len' is odd, the last byte is used as the high byte of a final
+ * word whose low byte is zero.
+ */
+rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
+{
+	/* TODO XXX please use the FreeBSD implementation for this. */
+	size_t nw = len / 2;
+	size_t i;
+
+	/*
+	 * Assemble each word from its two bytes instead of reading through
+	 * a uint16_t pointer: a byte pointer carries no alignment guarantee,
+	 * and the value obtained is the same as be16toh() of the word.
+	 */
+	for (i = 0; i < nw; i++)
+		cur_sum += ((rawsum_t)data[2 * i] << 8) | data[2 * i + 1];
+
+	if (len & 1)
+		cur_sum += ((rawsum_t)data[len - 1] << 8);
+
+	return cur_sum;
+}
+
+/* Reduce a raw (unfolded) sum to 16 bits and complement it.
+ * 'cur_sum' is expected in host byte order; the value returned
+ * is in network byte order.
+ */
+uint16_t nm_csum_fold(rawsum_t cur_sum)
+{
+	/* TODO XXX please use the FreeBSD implementation for this. */
+	rawsum_t carry;
+
+	/* add whatever sits above the low 16 bits back in, until nothing is left */
+	for (carry = cur_sum >> 16; carry != 0; carry = cur_sum >> 16)
+		cur_sum = (cur_sum & 0xFFFF) + carry;
+
+	return htobe16((~cur_sum) & 0xFFFF);
+}
+
+/* Folded checksum over the sizeof(struct nm_iphdr) bytes starting at 'iph'. */
+uint16_t nm_csum_ipv4(struct nm_iphdr *iph)
+{
+#if 0
+	return in_cksum_hdr((void *)iph);
+#else
+	rawsum_t sum;
+
+	sum = nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0);
+	return nm_csum_fold(sum);
+#endif
+}
+
+/*
+ * Compute the TCP/UDP checksum for an IPv4 packet and store it in '*check'.
+ * 'data' and 'datalen' describe the TCP/UDP header plus payload.
+ * NOTE(review): the second store only accounts for the pseudo-header if
+ * 'check' points at the checksum field located inside 'data' -- confirm
+ * against the callers.
+ */
+void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
+		size_t datalen, uint16_t *check)
+{
+	uint16_t pseudolen = datalen + iph->protocol;
+
+	/* Compute and insert the pseudo-header checksum. */
+	*check = in_pseudo(iph->saddr, iph->daddr,
+			 htobe16(pseudolen));
+	/* Compute the checksum on TCP/UDP header + payload
+	 * (includes the pseudo-header).
+	 */
+	*check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+}
+
+/*
+ * IPv6 variant: store the value returned by in6_cksum_pseudo() in '*check',
+ * then overwrite '*check' with the folded sum of the 'datalen' bytes at
+ * 'data'.
+ * NOTE(review): as in the IPv4 variant, this presumably relies on 'check'
+ * pointing inside 'data' -- confirm against the callers.
+ * Without INET6 '*check' is left untouched and a message is logged on the
+ * first call only.
+ */
+void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
+		size_t datalen, uint16_t *check)
+{
+#ifdef INET6
+	*check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0);
+	*check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+#else
+	/* one-shot flag so the message below is printed a single time */
+	static int notsupported = 0;
+	if (!notsupported) {
+		notsupported = 1;
+		D("inet6 segmentation not supported");
+	}
+#endif
+}
+
+
/*
* Intercept the rx routine in the standard device driver.
* Second argument is non-zero to intercept, 0 to restore
@@ -91,10 +162,7 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept)
* Intercept the packet steering routine in the tx path,
* so that we can decide which queue is used for an mbuf.
* Second argument is non-zero to intercept, 0 to restore.
- *
- * actually we also need to redirect the if_transmit ?
- *
- * XXX see if FreeBSD has such a mechanism
+ * On freebsd we just intercept if_transmit.
*/
void
netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
@@ -111,7 +179,8 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
}
-/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success
+/*
+ * Transmit routine used by generic_netmap_txsync(). Returns 0 on success
* and non-zero on error (which may be packet drops or other errors).
* addr and len identify the netmap buffer, m is the (preallocated)
* mbuf to use for transmissions.
@@ -162,38 +231,39 @@ void
generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
{
D("called");
- *txq = 1;
- *rxq = 1;
+ *txq = netmap_generic_rings;
+ *rxq = netmap_generic_rings;
}
-void netmap_mitigation_init(struct netmap_generic_adapter *na)
+void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na)
{
ND("called");
- na->mit_pending = 0;
+ mit->mit_pending = 0;
+ mit->mit_na = na;
}
-void netmap_mitigation_start(struct netmap_generic_adapter *na)
+void netmap_mitigation_start(struct nm_generic_mit *mit)
{
ND("called");
}
-void netmap_mitigation_restart(struct netmap_generic_adapter *na)
+void netmap_mitigation_restart(struct nm_generic_mit *mit)
{
ND("called");
}
-int netmap_mitigation_active(struct netmap_generic_adapter *na)
+int netmap_mitigation_active(struct nm_generic_mit *mit)
{
ND("called");
return 0;
}
-void netmap_mitigation_cleanup(struct netmap_generic_adapter *na)
+void netmap_mitigation_cleanup(struct nm_generic_mit *mit)
{
ND("called");
}
@@ -216,8 +286,10 @@ netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
{
struct netmap_vm_handle_t *vmh = handle;
- D("handle %p size %jd prot %d foff %jd",
- handle, (intmax_t)size, prot, (intmax_t)foff);
+
+ if (netmap_verbose)
+ D("handle %p size %jd prot %d foff %jd",
+ handle, (intmax_t)size, prot, (intmax_t)foff);
dev_ref(vmh->dev);
return 0;
}
@@ -229,7 +301,9 @@ netmap_dev_pager_dtor(void *handle)
struct netmap_vm_handle_t *vmh = handle;
struct cdev *dev = vmh->dev;
struct netmap_priv_d *priv = vmh->priv;
- D("handle %p", handle);
+
+ if (netmap_verbose)
+ D("handle %p", handle);
netmap_dtor(priv);
free(vmh, M_DEVBUF);
dev_rel(dev);
@@ -302,8 +376,9 @@ netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
struct netmap_priv_d *priv;
vm_object_t obj;
- D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
- (intmax_t )*foff, (intmax_t )objsize, objp, prot);
+ if (netmap_verbose)
+ D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
+ (intmax_t )*foff, (intmax_t )objsize, objp, prot);
vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
M_NOWAIT | M_ZERO);
@@ -383,6 +458,157 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
return 0;
}
+/******************** kqueue support ****************/
+
+/*
+ * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED.
+ * We use a non-zero argument to distinguish the call from the one
+ * in kevent_scan() which instead also needs to run netmap_poll().
+ * The knote uses a global mutex for the time being. We might
+ * try to reuse the one in the si, but it is not allocated
+ * permanently so it might be a bit tricky.
+ *
+ * The *kqfilter function registers one or another f_event
+ * depending on read or write mode.
+ * In the call to f_event() td_fpop is NULL so any child function
+ * calling devfs_get_cdevpriv() would fail - and we need it in
+ * netmap_poll(). As a workaround we store priv into kn->kn_hook
+ * and pass it as first argument to netmap_poll(), which then
+ * uses the failure to tell that we are called from f_event()
+ * and do not need the selrecord().
+ */
+
+void freebsd_selwakeup(struct selinfo *si, int pri);
+
+/* Wake up the select/poll waiters on 'si', then post to its knote list. */
+void
+freebsd_selwakeup(struct selinfo *si, int pri)
+{
+	struct knlist *notes = &si->si_note;
+
+	if (netmap_verbose)
+		D("on knote %p", notes);
+	selwakeuppri(si, pri);
+	/* kqueue_scan() uses hint 0, so a non-zero hint identifies
+	 * this call as a notification
+	 */
+	KNOTE_UNLOCKED(notes, 0x100 /* notification */);
+}
+
+/* f_detach of the read filter: take 'kn' off the knlist of the rx selinfo. */
+static void
+netmap_knrdetach(struct knote *kn)
+{
+	struct netmap_priv_d *priv;
+	struct selinfo *rx_si;
+
+	priv = (struct netmap_priv_d *)kn->kn_hook;
+	rx_si = priv->np_rxsi;
+	D("remove selinfo %p", rx_si);
+	knlist_remove(&rx_si->si_note, kn, 0);
+}
+
+/* f_detach of the write filter: take 'kn' off the knlist of the tx selinfo. */
+static void
+netmap_knwdetach(struct knote *kn)
+{
+	struct netmap_priv_d *priv;
+	struct selinfo *tx_si;
+
+	priv = (struct netmap_priv_d *)kn->kn_hook;
+	tx_si = priv->np_txsi;
+	D("remove selinfo %p", tx_si);
+	knlist_remove(&tx_si->si_note, kn, 0);
+}
+
+/*
+ * Shared f_event body for the read and write filters.
+ * It is reached both from notifications generated externally (non-zero
+ * hint) and from our own calls to kevent() (hint 0). For a notification
+ * we report "ready" (1) because nothing better is known; otherwise
+ * netmap_poll() is run for 'events' and 0/1 is returned accordingly.
+ */
+static int
+netmap_knrw(struct knote *kn, long hint, int events)
+{
+	struct netmap_priv_d *priv;
+	int revents;
+
+	if (hint != 0) {
+		ND(5, "call from notify");
+		return 1; /* assume we are ready */
+	}
+	priv = kn->kn_hook;
+	/* a notification may arrive from an external thread, and then
+	 * netmap_poll() must not be run. The hint test above should
+	 * already cover that case; this is a second line of defense.
+	 */
+	if (curthread != priv->np_td) { /* should not happen */
+		RD(5, "curthread changed %p %p", curthread, priv->np_td);
+		return 1;
+	}
+	revents = netmap_poll((void *)priv, events, curthread);
+	return (events & revents) != 0;
+}
+
+/* f_event of the read filter: netmap_knrw() restricted to POLLIN. */
+static int
+netmap_knread(struct knote *kn, long hint)
+{
+	return netmap_knrw(kn, hint, POLLIN);
+}
+
+/* f_event of the write filter: netmap_knrw() restricted to POLLOUT. */
+static int
+netmap_knwrite(struct knote *kn, long hint)
+{
+	return netmap_knrw(kn, hint, POLLOUT);
+}
+
+/* filter operations installed by netmap_kqfilter() for EVFILT_READ */
+static struct filterops netmap_rfiltops = {
+	.f_isfd = 1,
+	.f_detach = netmap_knrdetach,
+	.f_event = netmap_knread,
+};
+
+/* filter operations installed by netmap_kqfilter() for EVFILT_WRITE */
+static struct filterops netmap_wfiltops = {
+	.f_isfd = 1,
+	.f_detach = netmap_knwdetach,
+	.f_event = netmap_knwrite,
+};
+
+
+/*
+ * This is called when a thread invokes kevent() to record
+ * a change in the configuration of the kqueue().
+ * The 'priv' should be the same as in the netmap device.
+ *
+ * Only EVFILT_READ and EVFILT_WRITE are accepted. On success the knote
+ * gets the matching filterops, 'priv' is stored in kn->kn_hook and the
+ * knote is added to the knlist of the rx or tx selinfo found in 'priv'.
+ * Returns 0 on success, 1 on any failure.
+ */
+static int
+netmap_kqfilter(struct cdev *dev, struct knote *kn)
+{
+	struct netmap_priv_d *priv;
+	int error;
+	struct netmap_adapter *na;
+	struct selinfo *si;
+	int ev = kn->kn_filter;
+
+	if (ev != EVFILT_READ && ev != EVFILT_WRITE) {
+		D("bad filter request %d", ev);
+		return 1;
+	}
+	error = devfs_get_cdevpriv((void**)&priv);
+	if (error) {
+		D("device not yet setup");
+		return 1;
+	}
+	na = priv->np_na;
+	if (na == NULL) {
+		D("no netmap adapter for this file descriptor");
+		return 1;
+	}
+	/* the si is indicated in the priv */
+	si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi;
+	// XXX lock(priv) ?
+	kn->kn_fop = (ev == EVFILT_WRITE) ?
+		&netmap_wfiltops : &netmap_rfiltops;
+	kn->kn_hook = priv;
+	/* NOTE(review): the last argument (1) tells knlist_add() that the
+	 * list is already locked, but no lock is taken in this function --
+	 * confirm which lock, if any, is held at this point.
+	 */
+	knlist_add(&si->si_note, kn, 1);
+	// XXX unlock(priv)
+	ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s",
+		na, na->ifp->if_xname, curthread, priv, kn,
+		priv->np_nifp,
+		kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH");
+	return 0;
+}
struct cdevsw netmap_cdevsw = {
.d_version = D_VERSION,
@@ -391,9 +617,10 @@ struct cdevsw netmap_cdevsw = {
.d_mmap_single = netmap_mmap_single,
.d_ioctl = netmap_ioctl,
.d_poll = netmap_poll,
+ .d_kqfilter = netmap_kqfilter,
.d_close = netmap_close,
};
-
+/*--- end of kqueue support ----*/
/*
* Kernel entry point.
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index e695fcbd29f8..63253b6b0693 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#define rtnl_lock() D("rtnl_lock called");
#define rtnl_unlock() D("rtnl_unlock called");
#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid)
+#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid)
#define smp_mb()
/*
@@ -222,6 +223,17 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
#endif /* REG_RESET */
if (enable) { /* Enable netmap mode. */
+ /* Init the mitigation support. */
+ gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!gna->mit) {
+ D("mitigation allocation failed");
+ error = ENOMEM;
+ goto out;
+ }
+ for (r=0; r<na->num_rx_rings; r++)
+ netmap_mitigation_init(&gna->mit[r], na);
+
/* Initialize the rx queue, as generic_rx_handler() can
* be called as soon as netmap_catch_rx() returns.
*/
@@ -229,9 +241,6 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
mbq_safe_init(&na->rx_rings[r].rx_queue);
}
- /* Init the mitigation timer. */
- netmap_mitigation_init(gna);
-
/*
* Preallocate packet buffers for the tx rings.
*/
@@ -306,7 +315,9 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
mbq_safe_destroy(&na->rx_rings[r].rx_queue);
}
- netmap_mitigation_cleanup(gna);
+ for (r=0; r<na->num_rx_rings; r++)
+ netmap_mitigation_cleanup(&gna->mit[r]);
+ free(gna->mit, M_DEVBUF);
for (r=0; r<na->num_tx_rings; r++) {
for (i=0; i<na->num_tx_desc; i++) {
@@ -344,10 +355,12 @@ free_tx_pools:
free(na->tx_rings[r].tx_pool, M_DEVBUF);
na->tx_rings[r].tx_pool = NULL;
}
- netmap_mitigation_cleanup(gna);
for (r=0; r<na->num_rx_rings; r++) {
+ netmap_mitigation_cleanup(&gna->mit[r]);
mbq_safe_destroy(&na->rx_rings[r].rx_queue);
}
+ free(gna->mit, M_DEVBUF);
+out:
return error;
}
@@ -557,12 +570,11 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
nm_i = nm_next(nm_i, lim);
+ IFRATE(rate_ctx.new.txpkt ++);
}
/* Update hwcur to the next slot to transmit. */
kring->nr_hwcur = nm_i; /* not head, we could break early */
-
- IFRATE(rate_ctx.new.txpkt += ntx);
}
/*
@@ -600,7 +612,11 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
struct netmap_adapter *na = NA(ifp);
struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
u_int work_done;
- u_int rr = 0; // receive ring number
+ u_int rr = MBUF_RXQ(m); // receive ring number
+
+ if (rr >= na->num_rx_rings) {
+ rr = rr % na->num_rx_rings; // XXX expensive...
+ }
/* limit the size of the queue */
if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
@@ -617,13 +633,13 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
/* same as send combining, filter notification if there is a
* pending timer, otherwise pass it up and start a timer.
*/
- if (likely(netmap_mitigation_active(gna))) {
+ if (likely(netmap_mitigation_active(&gna->mit[rr]))) {
/* Record that there is some pending work. */
- gna->mit_pending = 1;
+ gna->mit[rr].mit_pending = 1;
} else {
netmap_generic_irq(na->ifp, rr, &work_done);
IFRATE(rate_ctx.new.rxirq++);
- netmap_mitigation_start(gna);
+ netmap_mitigation_start(&gna->mit[rr]);
}
}
}
@@ -682,7 +698,6 @@ generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
ring->slot[nm_i].flags = slot_flags;
m_freem(m);
nm_i = nm_next(nm_i, lim);
- n++;
}
if (n) {
kring->nr_hwtail = nm_i;
@@ -772,7 +787,7 @@ generic_netmap_attach(struct ifnet *ifp)
/* when using generic, IFCAP_NETMAP is set so we force
* NAF_SKIP_INTR to use the regular interrupt handler
*/
- na->na_flags = NAF_SKIP_INTR;
+ na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS;
ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)",
ifp->num_tx_queues, ifp->real_num_tx_queues,
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 668e083e0b96..ddcb0e3185a6 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -35,6 +35,7 @@
#define _NET_NETMAP_KERN_H_
#define WITH_VALE // comment out to disable VALE support
+#define WITH_PIPES
#if defined(__FreeBSD__)
@@ -267,11 +268,11 @@ struct netmap_kring {
volatile int nkr_stopped; // XXX what for ?
- /* support for adapters without native netmap support.
+ /* Support for adapters without native netmap support.
* On tx rings we preallocate an array of tx buffers
* (same size as the netmap ring), on rx rings we
- * store incoming packets in a queue.
- * XXX who writes to the rx queue ?
+ * store incoming mbufs in a queue that is drained by
+ * a rxsync.
*/
struct mbuf **tx_pool;
// u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */
@@ -280,6 +281,13 @@ struct netmap_kring {
uint32_t ring_id; /* debugging */
char name[64]; /* diagnostic */
+ int (*nm_sync)(struct netmap_kring *kring, int flags);
+
+#ifdef WITH_PIPES
+ struct netmap_kring *pipe;
+ struct netmap_ring *save_ring;
+#endif /* WITH_PIPES */
+
} __attribute__((__aligned__(64)));
@@ -388,6 +396,7 @@ struct netmap_adapter {
* emulated. Where possible (e.g. FreeBSD)
* IFCAP_NETMAP also mirrors this flag.
*/
+#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */
int active_fds; /* number of user-space descriptors using this
interface, which is equal to the number of
struct netmap_if objs in the mapped region. */
@@ -411,6 +420,9 @@ struct netmap_adapter {
NM_SELINFO_T tx_si, rx_si; /* global wait queues */
+ /* count users of the global wait queues */
+ int tx_si_users, rx_si_users;
+
/* copy of if_qflush and if_transmit pointers, to intercept
* packets from the network stack when netmap is active.
*/
@@ -438,9 +450,11 @@ struct netmap_adapter {
*
* nm_config() returns configuration information from the OS
*
- * nm_krings_create() XXX
+ * nm_krings_create() create and init the krings array
+ * (the array layout must conform to the description
+ * found above the definition of netmap_krings_create)
*
- * nm_krings_delete() XXX
+ * nm_krings_delete() cleanup and delete the kring array
*
* nm_notify() is used to act after data have become available.
* For hw devices this is typically a selwakeup(),
@@ -464,7 +478,6 @@ struct netmap_adapter {
void (*nm_krings_delete)(struct netmap_adapter *);
int (*nm_notify)(struct netmap_adapter *,
u_int ring, enum txrx, int flags);
-#define NAF_GLOBAL_NOTIFY 4
#define NAF_DISABLE_NOTIFY 8
/* standard refcount to control the lifetime of the adapter
@@ -484,6 +497,12 @@ struct netmap_adapter {
* from userspace
*/
void *na_private;
+
+#ifdef WITH_PIPES
+ struct netmap_pipe_adapter **na_pipes;
+ int na_next_pipe;
+ int na_max_pipes;
+#endif /* WITH_PIPES */
};
@@ -514,7 +533,10 @@ struct netmap_vp_adapter { /* VALE software port */
struct nm_bridge *na_bdg;
int retry;
- u_int offset; /* Offset of ethernet header for each packet. */
+ /* Offset of ethernet header for each packet. */
+ u_int virt_hdr_len;
+ /* Maximum Frame Size, used in bdg_mismatch_datapath() */
+ u_int mfs;
};
@@ -524,6 +546,12 @@ struct netmap_hw_adapter { /* physical device */
struct net_device_ops nm_ndo; // XXX linux only
};
+/* Mitigation support.
+ * Rx interrupt mitigation state; generic adapters keep an array with
+ * one entry per rx ring (see 'mit' in struct netmap_generic_adapter).
+ */
+struct nm_generic_mit {
+	struct hrtimer mit_timer;
+	int mit_pending;	/* set to 1 when work arrived while mitigation was active */
+	struct netmap_adapter *mit_na;  /* backpointer */
+};
struct netmap_generic_adapter { /* emulated device */
struct netmap_hw_adapter up;
@@ -534,18 +562,29 @@ struct netmap_generic_adapter { /* emulated device */
/* generic netmap adapters support:
* a net_device_ops struct overrides ndo_select_queue(),
* save_if_input saves the if_input hook (FreeBSD),
- * mit_timer and mit_pending implement rx interrupt mitigation,
+ * mit implements rx interrupt mitigation,
*/
struct net_device_ops generic_ndo;
void (*save_if_input)(struct ifnet *, struct mbuf *);
- struct hrtimer mit_timer;
- int mit_pending;
+ struct nm_generic_mit *mit;
#ifdef linux
netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
#endif
};
+/* tx rings of 'na', plus one when the adapter has NAF_HOST_RINGS set */
+static __inline int
+netmap_real_tx_rings(struct netmap_adapter *na)
+{
+	int host_ring = (na->na_flags & NAF_HOST_RINGS) ? 1 : 0;
+
+	return na->num_tx_rings + host_ring;
+}
+
+/* rx rings of 'na', plus one when the adapter has NAF_HOST_RINGS set */
+static __inline int
+netmap_real_rx_rings(struct netmap_adapter *na)
+{
+	int host_ring = (na->na_flags & NAF_HOST_RINGS) ? 1 : 0;
+
+	return na->num_rx_rings + host_ring;
+}
+
#ifdef WITH_VALE
/*
@@ -614,6 +653,25 @@ struct netmap_bwrap_adapter {
#endif /* WITH_VALE */
+#ifdef WITH_PIPES
+
+/* NOTE(review): NM_MAXPIPES is defined again, with the same value, next to
+ * the netmap_pipe_alloc() prototype further down -- one definition is enough.
+ */
+#define NM_MAXPIPES 	64	/* max number of pipes per adapter */
+
+/* One end of a netmap pipe; the two ends reference each other via 'peer'. */
+struct netmap_pipe_adapter {
+	struct netmap_adapter up;
+
+	u_int id; 	/* pipe identifier */
+	int role;	/* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */
+
+	struct netmap_adapter *parent; /* adapter that owns the memory */
+	struct netmap_pipe_adapter *peer; /* the other end of the pipe */
+	int peer_ref;		/* 1 iff we are holding a ref to the peer */
+
+	u_int parent_slot; /* index in the parent pipe array */
+};
+
+#endif /* WITH_PIPES */
+
/* return slots reserved to rx clients; used in drivers */
static inline uint32_t
@@ -767,9 +825,8 @@ uint32_t nm_rxsync_prologue(struct netmap_kring *);
static inline void
nm_txsync_finalize(struct netmap_kring *kring)
{
- /* update ring head/tail to what the kernel knows */
+ /* update ring tail to what the kernel knows */
kring->ring->tail = kring->rtail = kring->nr_hwtail;
- kring->ring->head = kring->rhead = kring->nr_hwcur;
/* note, head/rhead/hwcur might be behind cur/rcur
* if no carrier
@@ -819,14 +876,14 @@ nm_rxsync_finalize(struct netmap_kring *kring)
* Support routines to be used with the VALE switch
*/
int netmap_update_config(struct netmap_adapter *na);
-int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom);
+int netmap_krings_create(struct netmap_adapter *na, u_int tailroom);
void netmap_krings_delete(struct netmap_adapter *na);
int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
- uint16_t ringid, int *err);
+ uint16_t ringid, uint32_t flags, int *err);
@@ -868,6 +925,20 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
#define netmap_bdg_ctl(_1, _2) EINVAL
#endif /* !WITH_VALE */
+#ifdef WITH_PIPES
+/* max number of pipes per device */
+#define NM_MAXPIPES 64 /* XXX how many? */
+/* in case of no error, returns the actual number of pipes in nmr->nr_arg1 */
+int netmap_pipe_alloc(struct netmap_adapter *, struct nmreq *nmr);
+void netmap_pipe_dealloc(struct netmap_adapter *);
+int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+#else /* !WITH_PIPES */
+#define NM_MAXPIPES 0
+#define netmap_pipe_alloc(_1, _2) EOPNOTSUPP
+#define netmap_pipe_dealloc(_1)
+#define netmap_get_pipe_na(_1, _2, _3) 0
+#endif
+
/* Various prototypes */
int netmap_poll(struct cdev *dev, int events, struct thread *td);
int netmap_init(void);
@@ -938,6 +1009,7 @@ enum { /* verbose flags */
extern int netmap_txsync_retry;
extern int netmap_generic_mit;
extern int netmap_generic_ringsize;
+extern int netmap_generic_rings;
/*
* NA returns a pointer to the struct netmap adapter from the ifp,
@@ -1160,13 +1232,21 @@ struct netmap_priv_d {
struct netmap_if * volatile np_nifp; /* netmap if descriptor. */
struct netmap_adapter *np_na;
- int np_ringid; /* from the ioctl */
- u_int np_qfirst, np_qlast; /* range of rings to scan */
- uint16_t np_txpoll;
+ uint32_t np_flags; /* from the ioctl */
+ u_int np_txqfirst, np_txqlast; /* range of tx rings to scan */
+ u_int np_rxqfirst, np_rxqlast; /* range of rx rings to scan */
+ uint16_t np_txpoll; /* XXX and also np_rxpoll ? */
struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */
/* np_refcount is only used on FreeBSD */
int np_refcount; /* use with NMG_LOCK held */
+
+ /* pointers to the selinfo to be used for selrecord.
+ * Either the local or the global one depending on the
+ * number of rings.
+ */
+ NM_SELINFO_T *np_rxsi, *np_txsi;
+ struct thread *np_td; /* kqueue, just debugging */
};
@@ -1188,10 +1268,113 @@ void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
* to reduce the number of interrupt requests/selwakeup
* to clients on incoming packets.
*/
-void netmap_mitigation_init(struct netmap_generic_adapter *na);
-void netmap_mitigation_start(struct netmap_generic_adapter *na);
-void netmap_mitigation_restart(struct netmap_generic_adapter *na);
-int netmap_mitigation_active(struct netmap_generic_adapter *na);
-void netmap_mitigation_cleanup(struct netmap_generic_adapter *na);
+void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na);
+void netmap_mitigation_start(struct nm_generic_mit *mit);
+void netmap_mitigation_restart(struct nm_generic_mit *mit);
+int netmap_mitigation_active(struct nm_generic_mit *mit);
+void netmap_mitigation_cleanup(struct nm_generic_mit *mit);
+
+
+
+/* Shared declarations for the VALE switch. */
+
+/*
+ * Each transmit queue accumulates a batch of packets into
+ * a structure before forwarding. Packets to the same
+ * destination are put in a list using ft_next as a link field.
+ * ft_frags and ft_next are valid only on the first fragment.
+ * Each entry of the batch describes one buffer (fragment).
+ */
+struct nm_bdg_fwd {	/* forwarding entry for a bridge */
+	void *ft_buf;		/* netmap or indirect buffer */
+	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
+	uint8_t _ft_port;	/* dst port (unused) */
+	uint16_t ft_flags;	/* flags, e.g. indirect */
+	uint16_t ft_len;	/* src fragment len */
+	uint16_t ft_next;	/* next packet to same destination */
+};
+
+/* struct 'virtio_net_hdr' from linux.
+ * The macros interleaved with the fields are the values accepted by the
+ * field that follows them ('flags' and 'gso_type' respectively).
+ */
+struct nm_vnet_hdr {
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM     1	/* Use csum_start, csum_offset */
+#define VIRTIO_NET_HDR_F_DATA_VALID    2	/* Csum is valid */
+	uint8_t flags;
+#define VIRTIO_NET_HDR_GSO_NONE         0       /* Not a GSO frame */
+#define VIRTIO_NET_HDR_GSO_TCPV4        1       /* GSO frame, IPv4 TCP (TSO) */
+#define VIRTIO_NET_HDR_GSO_UDP          3       /* GSO frame, IPv4 UDP (UFO) */
+#define VIRTIO_NET_HDR_GSO_TCPV6        4       /* GSO frame, IPv6 TCP */
+#define VIRTIO_NET_HDR_GSO_ECN          0x80    /* TCP has ECN set */
+	uint8_t gso_type;
+	uint16_t hdr_len;
+	uint16_t gso_size;
+	uint16_t csum_start;
+	uint16_t csum_offset;
+};
+
+#define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */
+
+/* Private definitions for IPv4, IPv6, UDP and TCP headers. */
+
+/* IPv4 header without options; nm_csum_ipv4() checksums exactly
+ * sizeof(struct nm_iphdr) bytes.
+ */
+struct nm_iphdr {
+	uint8_t		version_ihl;
+	uint8_t		tos;
+	uint16_t	tot_len;
+	uint16_t	id;
+	uint16_t	frag_off;
+	uint8_t		ttl;
+	uint8_t		protocol;
+	uint16_t	check;
+	uint32_t	saddr;
+	uint32_t	daddr;
+	/* The options start here. */
+};
+
+/* TCP header layout; no room for TCP options is declared here. */
+struct nm_tcphdr {
+	uint16_t	source;
+	uint16_t	dest;
+	uint32_t	seq;
+	uint32_t	ack_seq;
+	uint8_t		doff;  /* Data offset + Reserved */
+	uint8_t		flags;
+	uint16_t	window;
+	uint16_t	check;
+	uint16_t	urg_ptr;
+};
+
+/* UDP header layout. */
+struct nm_udphdr {
+	uint16_t	source;
+	uint16_t	dest;
+	uint16_t	len;
+	uint16_t	check;
+};
+
+/* IPv6 header layout; nm_csum_tcpudp_ipv6() reads 'nexthdr' from it. */
+struct nm_ipv6hdr {
+	uint8_t		priority_version;
+	uint8_t		flow_lbl[3];
+
+	uint16_t	payload_len;
+	uint8_t		nexthdr;
+	uint8_t		hop_limit;
+
+	uint8_t		saddr[16];
+	uint8_t		daddr[16];
+};
+
+/* Type used to store a checksum (in host byte order) that hasn't been
+ * folded yet.
+ */
+#define rawsum_t uint32_t
+
+rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum);
+uint16_t nm_csum_ipv4(struct nm_iphdr *iph);
+void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
+ size_t datalen, uint16_t *check);
+void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
+ size_t datalen, uint16_t *check);
+uint16_t nm_csum_fold(rawsum_t cur_sum);
+
+void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
+ struct netmap_vp_adapter *dst_na,
+ struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
+ u_int *j, u_int lim, u_int *howmany);
#endif /* _NET_NETMAP_KERN_H_ */
diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c
index 55f598518434..5491845090e7 100644
--- a/sys/dev/netmap/netmap_mem2.c
+++ b/sys/dev/netmap/netmap_mem2.c
@@ -82,6 +82,21 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
},
};
+/* Per-pool size and number of objects for private allocators; exported
+ * through the dev.netmap.priv_* sysctls as the "default" size and number
+ * of private netmap objects.
+ */
+struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = {
+	[NETMAP_IF_POOL] = {
+		.size = 1024,
+		.num  = 1,
+	},
+	[NETMAP_RING_POOL] = {
+		.size = 5*PAGE_SIZE,
+		.num  = 4,
+	},
+	[NETMAP_BUF_POOL] = {
+		.size = 2048,
+		.num  = 4098,
+	},
+};
+
/*
* nm_mem is the memory allocator used for all physical interfaces
@@ -118,9 +133,16 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. */
.config = netmap_mem_global_config,
.finalize = netmap_mem_global_finalize,
.deref = netmap_mem_global_deref,
+
+ .nm_id = 1,
+
+ .prev = &nm_mem,
+ .next = &nm_mem,
};
+struct netmap_mem_d *netmap_last_mem_d = &nm_mem;
+
// XXX logically belongs to nm_mem
struct lut_entry *netmap_buffer_lut; /* exported */
@@ -135,7 +157,7 @@ const struct netmap_mem_d nm_blueprint = {
.objminsize = sizeof(struct netmap_if),
.objmaxsize = 4096,
.nummin = 1,
- .nummax = 10,
+ .nummax = 100,
},
[NETMAP_RING_POOL] = {
.name = "%s_ring",
@@ -172,13 +194,67 @@ const struct netmap_mem_d nm_blueprint = {
SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \
CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \
SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \
- CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s")
+ CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \
+ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \
+ CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \
+ "Default size of private netmap " STRINGIFY(name) "s"); \
+ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \
+ CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \
+ "Default number of private netmap " STRINGIFY(name) "s")
SYSCTL_DECL(_dev_netmap);
DECLARE_SYSCTLS(NETMAP_IF_POOL, if);
DECLARE_SYSCTLS(NETMAP_RING_POOL, ring);
DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf);
+/*
+ * Give 'nmd' an id that is not in use and link it into the circular list
+ * of allocators. The list is walked starting from netmap_last_mem_d,
+ * looking for the first place where the id following one element differs
+ * from the id of the next element; 'nmd' is inserted there and becomes
+ * the new netmap_last_mem_d.
+ * Returns 0 on success, ENOMEM if the whole list was walked without
+ * finding a free id.
+ */
+static int
+nm_mem_assign_id(struct netmap_mem_d *nmd)
+{
+	nm_memid_t id;
+	struct netmap_mem_d *scan;
+	int error = ENOMEM;
+
+	NMA_LOCK(&nm_mem);
+
+	/* netmap_last_mem_d is modified under this lock by the insert and
+	 * release paths, so it must not be sampled before the lock is held
+	 */
+	scan = netmap_last_mem_d;
+	do {
+		/* we rely on unsigned wrap around */
+		id = scan->nm_id + 1;
+		if (id == 0) /* reserve 0 as error value */
+			id = 1;
+		scan = scan->next;
+		if (id != scan->nm_id) {
+			/* free id found: insert nmd right before scan */
+			nmd->nm_id = id;
+			nmd->prev = scan->prev;
+			nmd->next = scan;
+			scan->prev->next = nmd;
+			scan->prev = nmd;
+			netmap_last_mem_d = nmd;
+			error = 0;
+			break;
+		}
+	} while (scan != netmap_last_mem_d);
+
+	NMA_UNLOCK(&nm_mem);
+	return error;
+}
+
+/* Unlink 'nmd' from the circular list of allocators, under the nm_mem lock. */
+static void
+nm_mem_release_id(struct netmap_mem_d *nmd)
+{
+	struct netmap_mem_d *before, *after;
+
+	NMA_LOCK(&nm_mem);
+
+	before = nmd->prev;
+	after = nmd->next;
+	before->next = after;
+	after->prev = before;
+
+	/* do not leave the scan cursor on the element being removed */
+	if (netmap_last_mem_d == nmd)
+		netmap_last_mem_d = before;
+
+	nmd->next = NULL;
+	nmd->prev = NULL;
+
+	NMA_UNLOCK(&nm_mem);
+}
+
+
/*
* First, find the allocator that contains the requested offset,
* then locate the cluster through a lookup table.
@@ -216,7 +292,8 @@ netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset)
}
int
-netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags)
+netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags,
+ nm_memid_t *id)
{
int error = 0;
NMA_LOCK(nmd);
@@ -234,6 +311,7 @@ netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags)
}
}
*memflags = nmd->flags;
+ *id = nmd->nm_id;
out:
NMA_UNLOCK(nmd);
return error;
@@ -343,21 +421,34 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_
/*
- * free by index, not by address. This is slow, but is only used
- * for a small number of objects (rings, nifp)
+ * free by index, not by address.
+ * XXX should we also cleanup the content ?
*/
-static void
+static int
 netmap_obj_free(struct netmap_obj_pool *p, uint32_t j)
 {
+	/* Returns 0 when object j has been marked free, 1 when j is out of
+	 * range or already marked free.
+	 */
+	uint32_t *ptr, mask;
+
 	if (j >= p->objtotal) {
 		D("invalid index %u, max %u", j, p->objtotal);
-		return;
+		return 1;
+	}
+	/* bitmap word and bit of object j; a bit that is set means "free" */
+	ptr = &p->bitmap[j / 32];
+	/* unsigned constant: shifting a signed 1 into bit 31 is undefined */
+	mask = (1U << (j % 32));
+	if (*ptr & mask) {
+		D("ouch, double free on buffer %u", j);
+		return 1;
+	} else {
+		*ptr |= mask;
+		p->objfree++;
+		return 0;
 	}
-	p->bitmap[j / 32] |= (1 << (j % 32));
-	p->objfree++;
-	return;
 }
+/*
+ * free by address. This is slow but is only used for a few
+ * objects (rings, nifp)
+ */
static void
netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr)
{
@@ -388,9 +479,63 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr)
netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index)
+#if 0 // XXX unused
/* Return the index associated to the given packet buffer */
#define netmap_buf_index(n, v) \
(netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n))
+#endif
+
+/*
+ * Allocate up to 'n' extra buffers and chain them in a list: the index
+ * of the first buffer is stored in '*head' (0 for an empty list) and the
+ * first word of each buffer holds the index of the next one.
+ * Returns how many buffers were actually obtained.
+ */
+uint32_t
+netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n)
+{
+	struct netmap_mem_d *nmd = na->nm_mem;
+	uint32_t got = 0;
+	uint32_t scan_pos = 0; /* opaque, scan position in the bitmap */
+
+	NMA_LOCK(nmd);
+
+	*head = 0;	/* default, 'null' index ie empty list */
+	while (got < n) {
+		uint32_t prev_head = *head; /* remember the current head */
+		uint32_t *link = netmap_buf_malloc(nmd, &scan_pos, head);
+
+		if (link == NULL) {
+			D("no more buffers after %d of %d", got, n);
+			*head = prev_head; /* undo */
+			break;
+		}
+		RD(5, "allocate buffer %d -> %d", *head, prev_head);
+		*link = prev_head; /* chain to the previous head */
+		got++;
+	}
+
+	NMA_UNLOCK(nmd);
+
+	return got;
+}
+
+/*
+ * Release the list of extra buffers starting at index 'head'. Each
+ * buffer stores the index of the next one in its first word; the walk
+ * stops at an index below 2, at an index outside the buffer pool, or
+ * when netmap_obj_free() reports an error.
+ * Progress messages are printed only when netmap_verbose is set, like
+ * the other informational messages in this file; the message about an
+ * unexpected end of the list is always printed.
+ */
+static void
+netmap_extra_free(struct netmap_adapter *na, uint32_t head)
+{
+	struct lut_entry *lut = na->na_lut;
+	struct netmap_mem_d *nmd = na->nm_mem;
+	struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL];
+	uint32_t i, cur, *buf;
+
+	if (netmap_verbose)
+		D("freeing the extra list");
+	for (i = 0; head >=2 && head < p->objtotal; i++) {
+		cur = head;
+		buf = lut[head].vaddr;
+		head = *buf;	/* fetch the link before clearing it */
+		*buf = 0;
+		if (netmap_obj_free(p, cur))
+			break;
+	}
+	if (head != 0)
+		D("breaking with head %d", head);
+	if (netmap_verbose)
+		D("freed %d buffers", i);
+}
/* Return nonzero on error */
@@ -425,6 +570,19 @@ cleanup:
return (ENOMEM);
}
+/* Point the first 'n' entries of 'slot' at buffer 'index', with the length
+ * set to the object size of the buffer pool and the flags cleared.
+ */
+static void
+netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index)
+{
+	struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL];
+	struct netmap_slot *s;
+	struct netmap_slot *end = slot + n;
+
+	for (s = slot; s != end; s++) {
+		s->buf_idx = index;
+		s->len = p->_objsize;
+		s->flags = 0;
+	}
+}
+
static void
netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i)
@@ -438,6 +596,18 @@ netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i)
netmap_obj_free(p, i);
}
+
+/*
+ * Release the buffers referenced by the 'n' slots in 'slot'.
+ * Indices 0 and 1 are the reserved (fake tx/rx) buffers and are never
+ * released; every index from 2 upwards is an allocated buffer and is
+ * handed to netmap_free_buf().
+ */
+static void
+netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n)
+{
+ u_int i;
+
+ for (i = 0; i < n; i++) {
+ /* index 2 is the first real buffer: it must be freed too */
+ if (slot[i].buf_idx >= 2)
+ netmap_free_buf(nmd, slot[i].buf_idx);
+ }
+}
+
+
static void
netmap_reset_obj_allocator(struct netmap_obj_pool *p)
{
@@ -677,7 +847,9 @@ static void
netmap_mem_reset_all(struct netmap_mem_d *nmd)
{
int i;
- D("resetting %p", nmd);
+
+ if (netmap_verbose)
+ D("resetting %p", nmd);
for (i = 0; i < NETMAP_POOLS_NR; i++) {
netmap_reset_obj_allocator(&nmd->pools[i]);
}
@@ -703,12 +875,14 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd)
nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3;
nmd->flags |= NETMAP_MEM_FINALIZED;
- D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers",
- nmd->pools[NETMAP_IF_POOL].memtotal >> 10,
- nmd->pools[NETMAP_RING_POOL].memtotal >> 10,
- nmd->pools[NETMAP_BUF_POOL].memtotal >> 20);
+ if (netmap_verbose)
+ D("interfaces %d KB, rings %d KB, buffers %d MB",
+ nmd->pools[NETMAP_IF_POOL].memtotal >> 10,
+ nmd->pools[NETMAP_RING_POOL].memtotal >> 10,
+ nmd->pools[NETMAP_BUF_POOL].memtotal >> 20);
- D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree);
+ if (netmap_verbose)
+ D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree);
return 0;
@@ -724,10 +898,13 @@ netmap_mem_private_delete(struct netmap_mem_d *nmd)
{
if (nmd == NULL)
return;
- D("deleting %p", nmd);
+ if (netmap_verbose)
+ D("deleting %p", nmd);
if (nmd->refcount > 0)
D("bug: deleting mem allocator with refcount=%d!", nmd->refcount);
- D("done deleting %p", nmd);
+ nm_mem_release_id(nmd);
+ if (netmap_verbose)
+ D("done deleting %p", nmd);
NMA_LOCK_DESTROY(nmd);
free(nmd, M_DEVBUF);
}
@@ -762,35 +939,70 @@ netmap_mem_private_deref(struct netmap_mem_d *nmd)
NMA_UNLOCK(nmd);
}
+
+/*
+ * allocator for private memory
+ */
struct netmap_mem_d *
-netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd)
+netmap_mem_private_new(const char *name, u_int txr, u_int txd,
+ u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int *perr)
{
struct netmap_mem_d *d = NULL;
struct netmap_obj_params p[NETMAP_POOLS_NR];
- int i;
- u_int maxd;
+ int i, err;
+ u_int v, maxd;
d = malloc(sizeof(struct netmap_mem_d),
M_DEVBUF, M_NOWAIT | M_ZERO);
- if (d == NULL)
- return NULL;
+ if (d == NULL) {
+ err = ENOMEM;
+ goto error;
+ }
*d = nm_blueprint;
- /* XXX the rest of the code assumes the stack rings are alwasy present */
+ err = nm_mem_assign_id(d);
+ if (err)
+ goto error;
+
+ /* account for the fake host rings */
txr++;
rxr++;
- p[NETMAP_IF_POOL].size = sizeof(struct netmap_if) +
- sizeof(ssize_t) * (txr + rxr);
- p[NETMAP_IF_POOL].num = 2;
+
+ /* copy the min values */
+ for (i = 0; i < NETMAP_POOLS_NR; i++) {
+ p[i] = netmap_min_priv_params[i];
+ }
+
+ /* possibly increase them to fit user request */
+ v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr);
+ if (p[NETMAP_IF_POOL].size < v)
+ p[NETMAP_IF_POOL].size = v;
+ v = 2 + 4 * npipes;
+ if (p[NETMAP_IF_POOL].num < v)
+ p[NETMAP_IF_POOL].num = v;
maxd = (txd > rxd) ? txd : rxd;
- p[NETMAP_RING_POOL].size = sizeof(struct netmap_ring) +
- sizeof(struct netmap_slot) * maxd;
- p[NETMAP_RING_POOL].num = txr + rxr;
- p[NETMAP_BUF_POOL].size = 2048; /* XXX find a way to let the user choose this */
- p[NETMAP_BUF_POOL].num = rxr * (rxd + 2) + txr * (txd + 2);
+ v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd;
+ if (p[NETMAP_RING_POOL].size < v)
+ p[NETMAP_RING_POOL].size = v;
+ /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake)
+ * and two rx rings (again, 1 normal and 1 fake host)
+ */
+ v = txr + rxr + 8 * npipes;
+ if (p[NETMAP_RING_POOL].num < v)
+ p[NETMAP_RING_POOL].num = v;
+ /* for each pipe we only need the buffers for the 4 "real" rings.
+ * On the other hand, the pipe ring dimension may be different from
+ * the parent port ring dimension. As a compromise, we allocate twice the
+ * space actually needed if the pipe rings were the same size as the parent rings
+ */
+ v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs;
+ /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */
+ if (p[NETMAP_BUF_POOL].num < v)
+ p[NETMAP_BUF_POOL].num = v;
- D("req if %d*%d ring %d*%d buf %d*%d",
+ if (netmap_verbose)
+ D("req if %d*%d ring %d*%d buf %d*%d",
p[NETMAP_IF_POOL].num,
p[NETMAP_IF_POOL].size,
p[NETMAP_RING_POOL].num,
@@ -802,8 +1014,9 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int
snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ,
nm_blueprint.pools[i].name,
name);
- if (netmap_config_obj_allocator(&d->pools[i],
- p[i].num, p[i].size))
+ err = netmap_config_obj_allocator(&d->pools[i],
+ p[i].num, p[i].size);
+ if (err)
goto error;
}
@@ -814,6 +1027,8 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int
return d;
error:
netmap_mem_private_delete(d);
+ if (perr)
+ *perr = err;
return NULL;
}
@@ -917,20 +1132,25 @@ netmap_mem_fini(void)
static void
netmap_free_rings(struct netmap_adapter *na)
{
- u_int i;
+ struct netmap_kring *kring;
+ struct netmap_ring *ring;
if (!na->tx_rings)
return;
- for (i = 0; i < na->num_tx_rings + 1; i++) {
- if (na->tx_rings[i].ring) {
- netmap_ring_free(na->nm_mem, na->tx_rings[i].ring);
- na->tx_rings[i].ring = NULL;
- }
+ for (kring = na->tx_rings; kring != na->rx_rings; kring++) {
+ ring = kring->ring;
+ if (ring == NULL)
+ continue;
+ netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots);
+ netmap_ring_free(na->nm_mem, ring);
+ kring->ring = NULL;
}
- for (i = 0; i < na->num_rx_rings + 1; i++) {
- if (na->rx_rings[i].ring) {
- netmap_ring_free(na->nm_mem, na->rx_rings[i].ring);
- na->rx_rings[i].ring = NULL;
- }
+ for (/* cont'd from above */; kring != na->tailroom; kring++) {
+ ring = kring->ring;
+ if (ring == NULL)
+ continue;
+ netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots);
+ netmap_ring_free(na->nm_mem, ring);
+ kring->ring = NULL;
}
}
@@ -938,6 +1158,8 @@ netmap_free_rings(struct netmap_adapter *na)
*
* Allocate netmap rings and buffers for this card
* The rings are contiguous, but have variable size.
+ * The kring array must follow the layout described
+ * in netmap_krings_create().
*/
int
netmap_mem_rings_create(struct netmap_adapter *na)
@@ -945,10 +1167,16 @@ netmap_mem_rings_create(struct netmap_adapter *na)
struct netmap_ring *ring;
u_int len, ndesc;
struct netmap_kring *kring;
+ u_int i;
NMA_LOCK(na->nm_mem);
- for (kring = na->tx_rings; kring != na->rx_rings; kring++) { /* Transmit rings */
+ /* transmit rings */
+ for (i =0, kring = na->tx_rings; kring != na->rx_rings; kring++, i++) {
+ if (kring->ring) {
+ ND("%s %ld already created", kring->name, kring - na->tx_rings);
+ continue; /* already created by somebody else */
+ }
ndesc = kring->nkr_num_slots;
len = sizeof(struct netmap_ring) +
ndesc * sizeof(struct netmap_slot);
@@ -971,14 +1199,27 @@ netmap_mem_rings_create(struct netmap_adapter *na)
ring->tail = kring->rtail;
*(uint16_t *)(uintptr_t)&ring->nr_buf_size =
NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ ND("%s h %d c %d t %d", kring->name,
+ ring->head, ring->cur, ring->tail);
ND("initializing slots for txring");
- if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) {
- D("Cannot allocate buffers for tx_ring");
- goto cleanup;
+ if (i != na->num_tx_rings || (na->na_flags & NAF_HOST_RINGS)) {
+ /* this is a real ring */
+ if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) {
+ D("Cannot allocate buffers for tx_ring");
+ goto cleanup;
+ }
+ } else {
+ /* this is a fake tx ring, set all indices to 0 */
+ netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0);
}
}
- for ( ; kring != na->tailroom; kring++) { /* Receive rings */
+ /* receive rings */
+ for ( i = 0 /* kring cont'd from above */ ; kring != na->tailroom; kring++, i++) {
+ if (kring->ring) {
+ ND("%s %ld already created", kring->name, kring - na->rx_rings);
+ continue; /* already created by somebody else */
+ }
ndesc = kring->nkr_num_slots;
len = sizeof(struct netmap_ring) +
ndesc * sizeof(struct netmap_slot);
@@ -1001,10 +1242,18 @@ netmap_mem_rings_create(struct netmap_adapter *na)
ring->tail = kring->rtail;
*(int *)(uintptr_t)&ring->nr_buf_size =
NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ ND("%s h %d c %d t %d", kring->name,
+ ring->head, ring->cur, ring->tail);
ND("initializing slots for rxring %p", ring);
- if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) {
- D("Cannot allocate buffers for rx_ring");
- goto cleanup;
+ if (i != na->num_rx_rings || (na->na_flags & NAF_HOST_RINGS)) {
+ /* this is a real ring */
+ if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) {
+ D("Cannot allocate buffers for rx_ring");
+ goto cleanup;
+ }
+ } else {
+ /* this is a fake rx ring, set all indices to 1 */
+ netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 1);
}
}
@@ -1024,20 +1273,8 @@ void
netmap_mem_rings_delete(struct netmap_adapter *na)
{
/* last instance, release bufs and rings */
- u_int i, lim;
- struct netmap_kring *kring;
- struct netmap_ring *ring;
-
NMA_LOCK(na->nm_mem);
- for (kring = na->tx_rings; kring != na->tailroom; kring++) {
- ring = kring->ring;
- if (ring == NULL)
- continue;
- lim = kring->nkr_num_slots;
- for (i = 0; i < lim; i++)
- netmap_free_buf(na->nm_mem, ring->slot[i].buf_idx);
- }
netmap_free_rings(na);
NMA_UNLOCK(na->nm_mem);
@@ -1059,16 +1296,12 @@ netmap_mem_if_new(const char *ifname, struct netmap_adapter *na)
ssize_t base; /* handy for relative offsets between rings and nifp */
u_int i, len, ntx, nrx;
- /*
- * verify whether virtual port need the stack ring
- */
- ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */
- nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */
+ /* account for the (possibly fake) host rings */
+ ntx = na->num_tx_rings + 1;
+ nrx = na->num_rx_rings + 1;
/*
* the descriptor is followed inline by an array of offsets
* to the tx and rx rings in the shared memory region.
- * For virtual rx rings we also allocate an array of
- * pointers to assign to nkr_leases.
*/
NMA_LOCK(na->nm_mem);
@@ -1112,7 +1345,8 @@ netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp)
/* nothing to do */
return;
NMA_LOCK(na->nm_mem);
-
+ if (nifp->ni_bufs_head)
+ netmap_extra_free(na, nifp->ni_bufs_head);
netmap_if_free(na->nm_mem, nifp);
NMA_UNLOCK(na->nm_mem);
diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h
index 8e6c58cbc4ee..e83616a5195f 100644
--- a/sys/dev/netmap/netmap_mem2.h
+++ b/sys/dev/netmap/netmap_mem2.h
@@ -160,6 +160,7 @@ typedef int (*netmap_mem_config_t)(struct netmap_mem_d*);
typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*);
typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*);
+typedef uint16_t nm_memid_t;
/* We implement two kinds of netmap_mem_d structures:
*
@@ -192,6 +193,11 @@ struct netmap_mem_d {
netmap_mem_config_t config;
netmap_mem_finalize_t finalize;
netmap_mem_deref_t deref;
+
+ nm_memid_t nm_id; /* allocator identifier */
+
+ /* list of all existing allocators, sorted by nm_id */
+ struct netmap_mem_d *prev, *next;
};
extern struct netmap_mem_d nm_mem;
@@ -206,14 +212,16 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *);
int netmap_mem_rings_create(struct netmap_adapter *);
void netmap_mem_rings_delete(struct netmap_adapter *);
void netmap_mem_deref(struct netmap_mem_d *);
-int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags);
+int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id);
ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr);
-struct netmap_mem_d*
- netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd);
+struct netmap_mem_d* netmap_mem_private_new(const char *name,
+ u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes,
+ int* error);
void netmap_mem_private_delete(struct netmap_mem_d *);
#define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize)
+uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n);
#endif
diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c
new file mode 100644
index 000000000000..a776a2424577
--- /dev/null
+++ b/sys/dev/netmap/netmap_offloadings.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (C) 2014 Vincenzo Maffione. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h> /* prerequisite */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/sockio.h>
+#include <sys/socketvar.h> /* struct socket */
+#include <sys/socket.h> /* sockaddrs */
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <sys/endian.h>
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+
+
+
+/* This routine is called by bdg_mismatch_datapath() when it finishes
+ * accumulating bytes for a segment, in order to fix some fields in the
+ * segment headers (which still contain the same content as the header
+ * of the original GSO packet). 'buf' points to the beginning (e.g.
+ * the ethernet header) of the segment, and 'len' is its length.
+ *
+ * 'idx' is the index of the segment within the GSO packet (added to the
+ * IPv4 identification), 'segmented_bytes' is the payload already emitted
+ * by the previous segments (added to the TCP sequence number),
+ * 'last_segment' is nonzero for the final segment, 'tcp' selects TCP
+ * (nonzero) or UDP (zero), and 'iphlen' is the IP header length: 20
+ * selects the IPv4 path, any other value is handled as IPv6.
+ * The IP header is taken to start 14 bytes into 'buf'.
+ */
+static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
+ u_int segmented_bytes, u_int last_segment,
+ u_int tcp, u_int iphlen)
+{
+ struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14);
+ struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14);
+ uint16_t *check = NULL;
+ uint8_t *check_data = NULL;
+
+ if (iphlen == 20) {
+ /* Set the IPv4 "Total Length" field. */
+ iph->tot_len = htobe16(len-14);
+ ND("ip total length %u", be16toh(iph->tot_len));
+
+ /* Set the IPv4 "Identification" field. */
+ iph->id = htobe16(be16toh(iph->id) + idx);
+ ND("ip identification %u", be16toh(iph->id));
+
+ /* Compute and insert the IPv4 header checksum. */
+ iph->check = 0;
+ iph->check = nm_csum_ipv4(iph);
+ ND("IP csum %x", be16toh(iph->check));
+ } else {/* if (iphlen == 40) */
+ /* Set the IPv6 "Payload Len" field. */
+ ip6h->payload_len = htobe16(len-14-iphlen);
+ }
+
+ if (tcp) {
+ struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen);
+
+ /* Set the TCP sequence number. */
+ tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes);
+ ND("tcp seq %u", be32toh(tcph->seq));
+
+ /* Zero the PSH and FIN TCP flags if this is not the last
+ segment. */
+ if (!last_segment)
+ tcph->flags &= ~(0x8 | 0x1);
+ ND("last_segment %u", last_segment);
+
+ check = &tcph->check;
+ check_data = (uint8_t *)tcph;
+ } else { /* UDP */
+ struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen);
+
+ /* Set the UDP 'Length' field. */
+ udph->len = htobe16(len-14-iphlen);
+
+ check = &udph->check;
+ check_data = (uint8_t *)udph;
+ }
+
+ /* Compute and insert TCP/UDP checksum. The field is zeroed first
+ * because it lies inside the range being summed. */
+ *check = 0;
+ if (iphlen == 20)
+ nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check);
+ else
+ nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check);
+
+ ND("TCP/UDP csum %x", be16toh(*check));
+}
+
+
+/* The VALE mismatch datapath implementation.
+ *
+ * Copies one packet, described by the 'ft_p->ft_frags' source fragments
+ * starting at 'ft_p', into the slots of 'ring' starting at index '*j',
+ * adapting the virtio-net header between the source port 'na' and the
+ * destination port 'dst_na'. When the source uses a virtio-net header
+ * and the destination does not, the offloadings requested in the header
+ * (GSO segmentation, checksum) are performed here.
+ *
+ * On return '*j' has been advanced (modulo 'lim' + 1, through nm_next())
+ * past the destination slots used, and '*howmany' has been decreased by
+ * the number of destination slots used.
+ */
+void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
+ struct netmap_vp_adapter *dst_na,
+ struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
+ u_int *j, u_int lim, u_int *howmany)
+{
+ struct netmap_slot *slot = NULL;
+ struct nm_vnet_hdr *vh = NULL;
+ /* Number of source slots to process. */
+ u_int frags = ft_p->ft_frags;
+ struct nm_bdg_fwd *ft_end = ft_p + frags;
+
+ /* Source and destination pointers. */
+ uint8_t *dst, *src;
+ size_t src_len, dst_len;
+
+ u_int j_start = *j;
+ u_int dst_slots = 0;
+
+ /* If the source port uses the offloadings, while destination doesn't,
+ * we grab the source virtio-net header and do the offloadings here.
+ */
+ if (na->virt_hdr_len && !dst_na->virt_hdr_len) {
+ vh = (struct nm_vnet_hdr *)ft_p->ft_buf;
+ }
+
+ /* Init source and dest pointers. */
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ slot = &ring->slot[*j];
+ dst = BDG_NMB(&dst_na->up, slot);
+ dst_len = src_len;
+
+ /* We are processing the first input slot and there is a mismatch
+ * between source and destination virt_hdr_len (SHL and DHL).
+ * When a client is using virtio-net headers, the header length
+ * can be:
+ * - 10: the header corresponds to the struct nm_vnet_hdr
+ * - 12: the first 10 bytes correspond to the struct
+ * virtio_net_hdr, and the last 2 bytes store the
+ * "mergeable buffers" info, which is an optional
+ * hint that can be zeroed for compatibility
+ *
+ * The destination header is therefore built according to the
+ * following table:
+ *
+ * SHL | DHL | destination header
+ * -----------------------------
+ * 0 | 10 | zero
+ * 0 | 12 | zero
+ * 10 | 0 | doesn't exist
+ * 10 | 12 | first 10 bytes are copied from source header, last 2 are zero
+ * 12 | 0 | doesn't exist
+ * 12 | 10 | copied from the first 10 bytes of source header
+ */
+ bzero(dst, dst_na->virt_hdr_len);
+ if (na->virt_hdr_len && dst_na->virt_hdr_len)
+ memcpy(dst, src, sizeof(struct nm_vnet_hdr));
+ /* Skip the virtio-net headers. */
+ src += na->virt_hdr_len;
+ src_len -= na->virt_hdr_len;
+ dst += dst_na->virt_hdr_len;
+ dst_len = dst_na->virt_hdr_len + src_len;
+
+ /* Here it could be dst_len == 0 (which implies src_len == 0),
+ * so we avoid passing a zero length fragment.
+ * NOTE(review): ft_p is advanced here without a check against
+ * ft_end; presumably a header-only packet made of a single
+ * fragment never reaches this point -- confirm against callers.
+ */
+ if (dst_len == 0) {
+ ft_p++;
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ dst_len = src_len;
+ }
+
+ if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ /* Bytes accumulated so far in the current output segment. */
+ u_int gso_bytes = 0;
+ /* Length of the GSO packet header. */
+ u_int gso_hdr_len = 0;
+ /* Pointer to the GSO packet header. Assume it is in a single fragment. */
+ uint8_t *gso_hdr = NULL;
+ /* Index of the current segment. */
+ u_int gso_idx = 0;
+ /* Payload data bytes segmented so far (e.g. TCP data bytes). */
+ u_int segmented_bytes = 0;
+ /* Length of the IP header (20 if IPv4, 40 if IPv6). */
+ u_int iphlen = 0;
+ /* Is this a TCP or an UDP GSO packet? */
+ u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN)
+ == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1;
+
+ /* Segment the GSO packet contained into the input slots (frags). */
+ while (ft_p != ft_end) {
+ size_t copy;
+
+ /* Grab the GSO header if we don't have it. */
+ if (!gso_hdr) {
+ uint16_t ethertype;
+
+ gso_hdr = src;
+
+ /* Look at the 'Ethertype' field to see if this packet
+ * is IPv4 or IPv6. Anything that is not IPv4 is
+ * treated as IPv6.
+ */
+ ethertype = be16toh(*((uint16_t *)(gso_hdr + 12)));
+ if (ethertype == 0x0800)
+ iphlen = 20;
+ else /* if (ethertype == 0x86DD) */
+ iphlen = 40;
+ ND(3, "type=%04x", ethertype);
+
+ /* Compute gso_hdr_len. For TCP we need to read the
+ * content of the 'Data Offset' field (the high nibble
+ * of 'doff', counted in 4-byte units).
+ */
+ if (tcp) {
+ struct nm_tcphdr *tcph =
+ (struct nm_tcphdr *)&gso_hdr[14+iphlen];
+
+ gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4);
+ } else
+ gso_hdr_len = 14 + iphlen + 8; /* UDP */
+
+ ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len,
+ dst_na->mfs);
+
+ /* Advance source pointers. */
+ src += gso_hdr_len;
+ src_len -= gso_hdr_len;
+ if (src_len == 0) {
+ ft_p++;
+ if (ft_p == ft_end)
+ break;
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ continue;
+ }
+ }
+
+ /* Fill in the header of the current segment. */
+ if (gso_bytes == 0) {
+ memcpy(dst, gso_hdr, gso_hdr_len);
+ gso_bytes = gso_hdr_len;
+ }
+
+ /* Fill in data and update source and dest pointers.
+ * A segment never grows beyond dst_na->mfs bytes. */
+ copy = src_len;
+ if (gso_bytes + copy > dst_na->mfs)
+ copy = dst_na->mfs - gso_bytes;
+ memcpy(dst + gso_bytes, src, copy);
+ gso_bytes += copy;
+ src += copy;
+ src_len -= copy;
+
+ /* A segment is complete or we have processed all
+ the GSO payload bytes. */
+ if (gso_bytes >= dst_na->mfs ||
+ (src_len == 0 && ft_p + 1 == ft_end)) {
+ /* After raw segmentation, we must fix some header
+ * fields and compute checksums, in a protocol dependent
+ * way. */
+ gso_fix_segment(dst, gso_bytes, gso_idx,
+ segmented_bytes,
+ src_len == 0 && ft_p + 1 == ft_end,
+ tcp, iphlen);
+
+ ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes);
+ slot->len = gso_bytes;
+ slot->flags = 0;
+ segmented_bytes += gso_bytes - gso_hdr_len;
+
+ dst_slots++;
+
+ /* Next destination slot. */
+ *j = nm_next(*j, lim);
+ slot = &ring->slot[*j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ gso_bytes = 0;
+ gso_idx++;
+ }
+
+ /* Next input slot. */
+ if (src_len == 0) {
+ ft_p++;
+ if (ft_p == ft_end)
+ break;
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ }
+ }
+ ND(3, "%d bytes segmented", segmented_bytes);
+
+ } else {
+ /* Address of a checksum field into a destination slot. */
+ uint16_t *check = NULL;
+ /* Accumulator for an unfolded checksum. */
+ rawsum_t csum = 0;
+
+ /* Process a non-GSO packet. */
+
+ /* Init 'check' if necessary. If the request is rejected,
+ * 'check' stays NULL and no checksum is written below. */
+ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ if (unlikely(vh->csum_offset + vh->csum_start > src_len))
+ D("invalid checksum request");
+ else
+ check = (uint16_t *)(dst + vh->csum_start +
+ vh->csum_offset);
+ }
+
+ while (ft_p != ft_end) {
+ /* Init/update the packet checksum if needed. */
+ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ if (!dst_slots)
+ csum = nm_csum_raw(src + vh->csum_start,
+ src_len - vh->csum_start, 0);
+ else
+ csum = nm_csum_raw(src, src_len, csum);
+ }
+
+ /* Round to a multiple of 64. Only the copy length is
+ * rounded: slot->len below is set from dst_len. */
+ src_len = (src_len + 63) & ~63;
+
+ if (ft_p->ft_flags & NS_INDIRECT) {
+ if (copyin(src, dst, src_len)) {
+ /* Invalid user pointer, pretend len is 0. */
+ dst_len = 0;
+ }
+ } else {
+ memcpy(dst, src, (int)src_len);
+ }
+ slot->len = dst_len;
+
+ dst_slots++;
+
+ /* Next destination slot. */
+ *j = nm_next(*j, lim);
+ slot = &ring->slot[*j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ /* Next source slot.
+ * NOTE(review): after the last fragment this reads the
+ * entry at ft_end; presumably the ft array has room
+ * for it -- confirm.
+ */
+ ft_p++;
+ src = ft_p->ft_buf;
+ dst_len = src_len = ft_p->ft_len;
+
+ }
+
+ /* Finalize (fold) the checksum if needed. */
+ if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ *check = nm_csum_fold(csum);
+ }
+ ND(3, "using %u dst_slots", dst_slots);
+
+ /* A second pass on the destination slots to set the slot flags,
+ * using the right number of destination slots.
+ */
+ while (j_start != *j) {
+ slot = &ring->slot[j_start];
+ slot->flags = (dst_slots << 8)| NS_MOREFRAG;
+ j_start = nm_next(j_start, lim);
+ }
+ /* Clear NS_MOREFRAG flag on last entry. */
+ slot->flags = (dst_slots << 8);
+ }
+
+ /* Update howmany. */
+ if (unlikely(dst_slots > *howmany)) {
+ dst_slots = *howmany;
+ D("Slot allocation error: Should never happen");
+ }
+ *howmany -= dst_slots;
+}
diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c
new file mode 100644
index 000000000000..f8f29fa1770a
--- /dev/null
+++ b/sys/dev/netmap/netmap_pipe.c
@@ -0,0 +1,711 @@
+/*
+ * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h> /* prerequisite */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h> /* sockaddrs */
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <sys/refcount.h>
+
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+/*
+ * common headers
+ */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+#ifdef WITH_PIPES
+
+#define NM_PIPE_MAXSLOTS 4096
+
+int netmap_default_pipes = 0; /* default number of pipes for each nic */
+SYSCTL_DECL(_dev_netmap);
+SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , "");
+
+/* Allocate the array of pipe pointers in the parent adapter.
+ *
+ * The requested number of pipes is read from nmr->nr_arg1 (0 selects
+ * netmap_default_pipes) and bounded through nm_bound_var(); the number
+ * actually in effect is written back to nmr->nr_arg1.
+ * Requests for a pipe endpoint (master or slave) are ignored here, and
+ * an already existing array is left untouched.
+ * Returns 0 on success, ENOMEM if the array cannot be allocated.
+ */
+int
+netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr)
+{
+ u_int npipes;
+
+ switch (nmr->nr_flags & NR_REG_MASK) {
+ case NR_REG_PIPE_MASTER:
+ case NR_REG_PIPE_SLAVE:
+ /* this is for our parent, not for us */
+ return 0;
+ default:
+ break;
+ }
+
+ /* TODO: we can resize the array if the new
+ * request can accommodate the already existing pipes
+ */
+ if (na->na_pipes) {
+ nmr->nr_arg1 = na->na_max_pipes;
+ return 0;
+ }
+
+ npipes = nmr->nr_arg1;
+ if (npipes == 0)
+ npipes = netmap_default_pipes;
+ nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL);
+
+ /* when npipes is really zero there is nothing to alloc */
+ if (npipes != 0) {
+ size_t len = sizeof(struct netmap_pipe_adapter *) * npipes;
+
+ na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (na->na_pipes == NULL)
+ return ENOMEM;
+
+ na->na_max_pipes = npipes;
+ na->na_next_pipe = 0;
+ }
+
+ nmr->nr_arg1 = npipes;
+
+ return 0;
+}
+
+/* Deallocate the pipe array in the parent adapter, if there is one,
+ * and reset the related bookkeeping fields.
+ */
+void
+netmap_pipe_dealloc(struct netmap_adapter *na)
+{
+ if (na->na_pipes == NULL)
+ return;
+
+ ND("freeing pipes for %s", NM_IFPNAME(na->ifp));
+ free(na->na_pipes, M_DEVBUF);
+ na->na_pipes = NULL;
+ na->na_max_pipes = 0;
+ na->na_next_pipe = 0;
+}
+
+/* Look up, among the pipes registered in 'parent', the endpoint whose
+ * id is 'pipe_id'. Returns NULL when there is no such endpoint.
+ */
+static struct netmap_pipe_adapter *
+netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id)
+{
+ int slot;
+
+ /* only the first na_next_pipe entries of the array are in use */
+ for (slot = 0; slot < parent->na_next_pipe; slot++) {
+ struct netmap_pipe_adapter *cand = parent->na_pipes[slot];
+
+ if (cand->id != pipe_id)
+ continue;
+ return cand;
+ }
+ return NULL;
+}
+
+/* Append the pipe endpoint 'na' to the parent array and record in
+ * na->parent_slot the position it was given.
+ * Returns 0 on success, ENOMEM if the array is full.
+ */
+static int
+netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
+{
+ if (parent->na_next_pipe < parent->na_max_pipes) {
+ na->parent_slot = parent->na_next_pipe;
+ parent->na_pipes[parent->na_next_pipe++] = na;
+ return 0;
+ }
+
+ D("%s: no space left for pipes", NM_IFPNAME(parent->ifp));
+ return ENOMEM;
+}
+
+/* Remove the given pipe endpoint from the parent array.
+ *
+ * The array is kept dense: the last endpoint in use is moved into the
+ * slot vacated by 'na', and its parent_slot is updated so that it
+ * keeps describing the position the endpoint really occupies.
+ */
+static void
+netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
+{
+ u_int n;
+ n = --parent->na_next_pipe;
+ if (n != na->parent_slot) {
+ struct netmap_pipe_adapter *moved = parent->na_pipes[n];
+
+ parent->na_pipes[na->parent_slot] = moved;
+ /* without this, a later removal of 'moved' would act on slot n */
+ moved->parent_slot = na->parent_slot;
+ }
+ parent->na_pipes[n] = NULL;
+}
+
+/* txsync callback for a pipe tx ring.
+ *
+ * Moves the slots newly made available by the user on the tx ring
+ * (from nr_hwcur up to rhead) to the peer rx ring linked through
+ * txkring->pipe, by swapping slot contents starting at the rx nr_hwtail.
+ * The transfer is limited by the free space on the rx ring. If anything
+ * was moved, the rx side is notified. Always returns 0.
+ */
+static int
+netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *txkring = na->tx_rings + ring_nr,
+ *rxkring = txkring->pipe;
+ u_int limit; /* slots to transfer */
+ u_int j, k, lim_tx = txkring->nkr_num_slots - 1,
+ lim_rx = rxkring->nkr_num_slots - 1;
+ int m, busy;
+
+ ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name);
+ ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail,
+ txkring->rcur, txkring->rhead, txkring->rtail);
+
+ j = rxkring->nr_hwtail; /* RX */
+ k = txkring->nr_hwcur; /* TX */
+ m = txkring->rhead - txkring->nr_hwcur; /* new slots */
+ if (m < 0)
+ m += txkring->nkr_num_slots;
+ limit = m;
+ m = rxkring->nkr_num_slots - 1; /* max avail space on destination */
+ busy = j - rxkring->nr_hwcur; /* busy slots */
+ if (busy < 0)
+ busy += rxkring->nkr_num_slots; /* both indices live on the rx ring */
+ m -= busy; /* subtract busy slots */
+ ND(2, "m %d limit %d", m, limit);
+ if (m < limit)
+ limit = m;
+
+ if (limit == 0) {
+ /* either the rxring is full, or nothing to send */
+ nm_txsync_finalize(txkring); /* actually useless */
+ return 0;
+ }
+
+ while (limit-- > 0) {
+ struct netmap_slot *rs = &rxkring->save_ring->slot[j];
+ struct netmap_slot *ts = &txkring->ring->slot[k];
+ struct netmap_slot tmp;
+
+ /* swap the slots */
+ tmp = *rs;
+ *rs = *ts;
+ *ts = tmp;
+
+ /* no need to report the buffer change */
+
+ j = nm_next(j, lim_rx);
+ k = nm_next(k, lim_tx);
+ }
+
+ wmb(); /* make sure the slots are updated before publishing them */
+ rxkring->nr_hwtail = j;
+ txkring->nr_hwcur = k;
+ txkring->nr_hwtail = nm_prev(k, lim_tx);
+
+ nm_txsync_finalize(txkring);
+ ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail,
+ txkring->rcur, txkring->rhead, txkring->rtail, j);
+
+ wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */
+ rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0);
+
+ return 0;
+}
+
+/* rxsync callback for a pipe rx ring.
+ *
+ * Advances nr_hwcur to the user's rhead, i.e. takes back the slots the
+ * user has released, and, if nr_hwcur changed, notifies the tx kring
+ * linked through rxkring->pipe. Always returns 0.
+ */
+static int
+netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *rxkring = na->rx_rings + ring_nr,
+ *txkring = rxkring->pipe;
+ uint32_t oldhwcur = rxkring->nr_hwcur;
+
+ ND("%s %x <- %s", rxkring->name, flags, txkring->name);
+ rxkring->nr_hwcur = rxkring->rhead; /* recover user-released slots */
+ ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail,
+ rxkring->rcur, rxkring->rhead, rxkring->rtail);
+ rmb(); /* paired with the first wmb() in txsync */
+ nm_rxsync_finalize(rxkring);
+
+ if (oldhwcur != rxkring->nr_hwcur) {
+ /* we have released some slots, notify the other end */
+ wmb(); /* make sure nr_hwcur is updated before notifying */
+ txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0);
+ }
+ return 0;
+}
+
+/* Pipe endpoints are created and destroyed together, so that endpoints do not
+ * have to check for the existence of their peer at each ?xsync.
+ *
+ * To play well with the existing netmap infrastructure (refcounts etc.), we
+ * adopt the following strategy:
+ *
+ * 1) The first endpoint that is created also creates the other endpoint and
+ * grabs a reference to it.
+ *
+ * state A) user1 --> endpoint1 --> endpoint2
+ *
+ * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives
+ * its reference to the user:
+ *
+ * state B) user1 --> endpoint1 endpoint2 <--- user2
+ *
+ * 3) Assume that, starting from state B endpoint2 is closed. In the unregister
+ * callback endpoint2 notes that endpoint1 is still active and adds a reference
+ * from endpoint1 to itself. When user2 then releases her own reference,
+ * endpoint2 is not destroyed and we are back to state A. A symmetrical state
+ * would be reached if endpoint1 were released instead.
+ *
+ * 4) If, starting from state A, endpoint1 is closed, the destructor notes that
+ * it owns a reference to endpoint2 and releases it.
+ *
+ * Something similar goes on for the creation and destruction of the krings.
+ */
+
+
+/* netmap_pipe_krings_create.
+ *
+ * There are two cases:
+ *
+ * 1) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1. We have to create both sets
+ * of krings.
+ *
+ * 2) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e2. e1 is certainly registered and our
+ * krings already exist, but they may be hidden.
+ */
+/*
+ * Create (or un-hide) the krings of pipe endpoint 'na'.
+ *
+ * If this endpoint does not hold the reference to its peer, the krings
+ * already exist and only the ring pointers saved in save_ring have to
+ * be put back in place.  Otherwise krings and rings are created for
+ * both endpoints, the save_ring pointers are initialized and the
+ * tx/rx krings of the two ends are linked to each other.
+ *
+ * Returns 0 on success, or the error reported by the failing
+ * netmap_krings_create()/netmap_mem_rings_create() call after undoing
+ * the steps already completed.
+ */
+static int
+netmap_pipe_krings_create(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *self = (struct netmap_pipe_adapter *)na;
+	struct netmap_adapter *ona = &self->peer->up; /* na of the other end */
+	int rc = 0;
+	int r;
+
+	if (!self->peer_ref) {
+		/* we are the endpoint referenced by the peer:
+		 * recover the hidden rings
+		 */
+		ND("%p: case 2, hidden rings", na);
+		for (r = 0; r < na->num_tx_rings + 1; r++)
+			na->tx_rings[r].ring = na->tx_rings[r].save_ring;
+		for (r = 0; r < na->num_rx_rings + 1; r++)
+			na->rx_rings[r].ring = na->rx_rings[r].save_ring;
+		return 0;
+	}
+
+	/* we hold the reference to the peer: build both sets of krings */
+	D("%p: case 1, create everything", na);
+	rc = netmap_krings_create(na, 0);
+	if (rc)
+		goto out;
+
+	/* The rings are created here as well, because the save_ring
+	 * pointers must be set; netmap_mem_rings_create() (called again
+	 * by our caller) will not create them twice.
+	 */
+	rc = netmap_mem_rings_create(na);
+	if (rc)
+		goto undo_own_krings;
+
+	/* remember our own rings */
+	for (r = 0; r < na->num_tx_rings + 1; r++)
+		na->tx_rings[r].save_ring = na->tx_rings[r].ring;
+	for (r = 0; r < na->num_rx_rings + 1; r++)
+		na->rx_rings[r].save_ring = na->rx_rings[r].ring;
+
+	/* same sequence for the other end */
+	rc = netmap_krings_create(ona, 0);
+	if (rc)
+		goto undo_own_rings;
+
+	rc = netmap_mem_rings_create(ona);
+	if (rc)
+		goto undo_peer_krings;
+
+	for (r = 0; r < ona->num_tx_rings + 1; r++)
+		ona->tx_rings[r].save_ring = ona->tx_rings[r].ring;
+	for (r = 0; r < ona->num_rx_rings + 1; r++)
+		ona->rx_rings[r].save_ring = ona->rx_rings[r].ring;
+
+	/* cross link the krings: each tx kring points to the rx kring
+	 * with the same index on the other end, and vice versa
+	 */
+	for (r = 0; r < na->num_tx_rings; r++) {
+		na->tx_rings[r].pipe = ona->rx_rings + r;
+		na->rx_rings[r].pipe = ona->tx_rings + r;
+		ona->tx_rings[r].pipe = na->rx_rings + r;
+		ona->rx_rings[r].pipe = na->tx_rings + r;
+	}
+	return 0;
+
+undo_peer_krings:
+	netmap_krings_delete(ona);
+undo_own_rings:
+	netmap_mem_rings_delete(na);
+undo_own_krings:
+	netmap_krings_delete(na);
+out:
+	return rc;
+}
+
+/* netmap_pipe_reg.
+ *
+ * There are two cases on registration (onoff==1)
+ *
+ * 1.a) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1. Nothing special to do.
+ *
+ * 1.b) state is
+ *
+ * usr1 --> e1 --> e2 <-- usr2
+ *
+ * and we are e2. Drop the ref e1 is holding.
+ *
+ * There are two additional cases on unregister (onoff==0)
+ *
+ * 2.a) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1. Nothing special to do, e2 will
+ * be cleaned up by the destructor of e1.
+ *
+ * 2.b) state is
+ *
+ * usr1 --> e1 e2 <-- usr2
+ *
+ * and we are either e1 or e2. Add a ref from the
+ * other end and hide our rings.
+ */
+/*
+ * Register (onoff != 0) or unregister (onoff == 0) pipe endpoint 'na'.
+ *
+ * IFCAP_NETMAP is set or cleared on the endpoint's ifnet accordingly.
+ * An endpoint that holds the reference to its peer has nothing else
+ * to do.  Otherwise, on register the reference held by the peer is
+ * dropped; on unregister a reference is taken on behalf of the peer
+ * and our ring pointers are cleared so that they are hidden from
+ * netmap_mem_rings_delete().
+ *
+ * Always returns 0.
+ */
+static int
+netmap_pipe_reg(struct netmap_adapter *na, int onoff)
+{
+	struct netmap_pipe_adapter *self = (struct netmap_pipe_adapter *)na;
+	struct ifnet *ifp = na->ifp;
+	int r;
+
+	ND("%p: onoff %d", na, onoff);
+	if (onoff)
+		ifp->if_capenable |= IFCAP_NETMAP;
+	else
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+
+	if (self->peer_ref) {
+		ND("%p: case 1.a or 2.a, nothing to do", na);
+		return 0;
+	}
+
+	if (!onoff) {
+		ND("%p: case 2.b, grab peer", na);
+		netmap_adapter_get(na);
+		self->peer->peer_ref = 1;
+		/* hide our rings from netmap_mem_rings_delete */
+		for (r = 0; r < na->num_tx_rings + 1; r++)
+			na->tx_rings[r].ring = NULL;
+		for (r = 0; r < na->num_rx_rings + 1; r++)
+			na->rx_rings[r].ring = NULL;
+		return 0;
+	}
+
+	ND("%p: case 1.b, drop peer", na);
+	self->peer->peer_ref = 0;
+	netmap_adapter_put(na);
+	return 0;
+}
+
+/* netmap_pipe_krings_delete.
+ *
+ * There are two cases:
+ *
+ * 1) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1 (e2 is not registered, so krings_delete cannot be
+ * called on it);
+ *
+ * 2) state is
+ *
+ * usr1 --> e1 e2 <-- usr2
+ *
+ * and we are either e1 or e2.
+ *
+ * In the former case we have to also delete the krings of e2;
+ * in the latter case we do nothing (note that our krings
+ * have already been hidden in the unregister callback).
+ */
+/*
+ * Delete the krings of pipe endpoint 'na'.
+ *
+ * Nothing is done when this endpoint does not hold the reference to
+ * its peer.  Otherwise our krings are deleted and, unless the peer's
+ * krings are already gone, the peer's ring pointers are restored from
+ * save_ring and its rings and krings are deleted as well.
+ */
+static void
+netmap_pipe_krings_delete(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *self = (struct netmap_pipe_adapter *)na;
+	struct netmap_adapter *peer_na; /* na of the other end */
+	int r;
+
+	if (!self->peer_ref) {
+		ND("%p: case 2, kept alive by peer", na);
+		return;
+	}
+
+	/* we own the peer: tear down both ends */
+	ND("%p: case 1, deleting everyhing", na);
+	netmap_krings_delete(na); /* also zeroes tx_rings etc. */
+
+	peer_na = &self->peer->up;
+	if (peer_na->tx_rings != NULL) {
+		/* un-hide the peer's rings so that they can be deleted */
+		for (r = 0; r < peer_na->num_tx_rings + 1; r++)
+			peer_na->tx_rings[r].ring = peer_na->tx_rings[r].save_ring;
+		for (r = 0; r < peer_na->num_rx_rings + 1; r++)
+			peer_na->rx_rings[r].ring = peer_na->rx_rings[r].save_ring;
+		netmap_mem_rings_delete(peer_na);
+		netmap_krings_delete(peer_na);
+	}
+	/* else: already deleted, we must be on a cleanup-after-error path */
+}
+
+
+/*
+ * Destructor for pipe endpoint 'na'.
+ *
+ * Drops the reference to the peer (if this endpoint holds one),
+ * removes a master endpoint from its parent, releases the reference
+ * to the parent and frees the ifnet allocated for the endpoint.
+ */
+static void
+netmap_pipe_dtor(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *self = (struct netmap_pipe_adapter *)na;
+
+	ND("%p", na);
+
+	/* release the other end, if we are the one keeping it alive */
+	if (self->peer_ref) {
+		ND("%p: clean up peer", na);
+		self->peer_ref = 0;
+		netmap_adapter_put(&self->peer->up);
+	}
+
+	/* only the master is removed from the parent here */
+	if (self->role == NR_REG_PIPE_MASTER)
+		netmap_pipe_remove(self->parent, self);
+
+	netmap_adapter_put(self->parent);
+	self->parent = NULL;
+
+	free(na->ifp, M_DEVBUF);
+	na->ifp = NULL;
+}
+
+/*
+ * netmap_get_pipe_na.
+ *
+ * Look up, and optionally create, the pipe endpoint described by 'nmr'.
+ *
+ * nmr     the request: nr_flags selects the role (NR_REG_PIPE_MASTER
+ *         or NR_REG_PIPE_SLAVE), nr_name names the parent port, the
+ *         bits of nr_ringid in NETMAP_RING_MASK are the pipe id,
+ *         nr_arg1 is forwarded to the parent lookup and
+ *         nr_tx_slots/nr_rx_slots are the requested numbers of slots.
+ *         On success the numbers of rings and slots of the endpoint
+ *         are written back.
+ * na      on success, set to the requested endpoint; a reference is
+ *         taken on it for the caller.
+ * create  if zero, a missing pipe yields ENODEV instead of being
+ *         created.
+ *
+ * Returns 0 without touching *na if the request is not for a pipe,
+ * 0 on success, or an errno value: the error from the parent lookup,
+ * EBUSY if the parent is owned by the kernel, ENODEV, ENOMEM, or the
+ * error returned by netmap_attach_common()/netmap_pipe_add().
+ */
+int
+netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+	struct nmreq pnmr;
+	struct netmap_adapter *pna; /* parent adapter */
+	struct netmap_pipe_adapter *mna, *sna, *req;
+	struct ifnet *ifp, *ifp2;
+	u_int pipe_id;
+	int role = nmr->nr_flags & NR_REG_MASK;
+	int error;
+
+	ND("flags %x", nmr->nr_flags);
+
+	if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) {
+		ND("not a pipe");
+		return 0;
+	}
+
+	/* first, try to find the parent adapter */
+	bzero(&pnmr, sizeof(pnmr));
+	memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ);
+	/* pass to parent the requested number of pipes */
+	pnmr.nr_arg1 = nmr->nr_arg1;
+	error = netmap_get_na(&pnmr, &pna, create);
+	if (error) {
+		ND("parent lookup failed: %d", error);
+		return error;
+	}
+	ND("found parent: %s", NM_IFPNAME(pna->ifp));
+
+	if (NETMAP_OWNED_BY_KERN(pna)) {
+		ND("parent busy");
+		error = EBUSY;
+		goto put_out;
+	}
+
+	/* next, lookup the pipe id in the parent list */
+	req = NULL;
+	pipe_id = nmr->nr_ringid & NETMAP_RING_MASK;
+	mna = netmap_pipe_find(pna, pipe_id);
+	if (mna) {
+		if (mna->role == role) {
+			ND("found %d directly at %d", pipe_id, mna->parent_slot);
+			req = mna;
+		} else {
+			ND("found %d indirectly at %d", pipe_id, mna->parent_slot);
+			req = mna->peer;
+		}
+		/* the pipe we have found already holds a ref to the parent,
+		 * so we need to drop the one we got from netmap_get_na()
+		 */
+		netmap_adapter_put(pna);
+		goto found;
+	}
+	ND("pipe %d not found, create %d", pipe_id, create);
+	if (!create) {
+		error = ENODEV;
+		goto put_out;
+	}
+	/* we create both master and slave.
+	 * The endpoint we were asked for holds a reference to
+	 * the other one.
+	 */
+	ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp) {
+		error = ENOMEM;
+		goto put_out;
+	}
+	strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp));
+
+	mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (mna == NULL) {
+		error = ENOMEM;
+		goto free_ifp;
+	}
+	mna->up.ifp = ifp;
+
+	mna->id = pipe_id;
+	mna->role = NR_REG_PIPE_MASTER;
+	mna->parent = pna;
+
+	mna->up.nm_txsync = netmap_pipe_txsync;
+	mna->up.nm_rxsync = netmap_pipe_rxsync;
+	mna->up.nm_register = netmap_pipe_reg;
+	mna->up.nm_dtor = netmap_pipe_dtor;
+	mna->up.nm_krings_create = netmap_pipe_krings_create;
+	mna->up.nm_krings_delete = netmap_pipe_krings_delete;
+	/* the pipe shares the memory allocator and lookup table of the parent */
+	mna->up.nm_mem = pna->nm_mem;
+	mna->up.na_lut = pna->na_lut;
+	mna->up.na_lut_objtotal = pna->na_lut_objtotal;
+
+	mna->up.num_tx_rings = 1;
+	mna->up.num_rx_rings = 1;
+	mna->up.num_tx_desc = nmr->nr_tx_slots;
+	nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc,
+			1, NM_PIPE_MAXSLOTS, NULL);
+	mna->up.num_rx_desc = nmr->nr_rx_slots;
+	nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc,
+			1, NM_PIPE_MAXSLOTS, NULL);
+	error = netmap_attach_common(&mna->up);
+	if (error)
+		goto free_mna;	/* mna is allocated by now: do not leak it */
+	/* register the master with the parent */
+	error = netmap_pipe_add(pna, mna);
+	if (error)
+		goto free_mna;
+
+	/* create the slave */
+	ifp2 = malloc(sizeof(*ifp2), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp2) {
+		error = ENOMEM;
+		goto unregister_mna;
+	}
+	strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp));
+
+	sna = malloc(sizeof(*sna), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (sna == NULL) {
+		error = ENOMEM;
+		goto free_ifp2;
+	}
+	/* most fields are the same, copy from master and then fix */
+	*sna = *mna;
+	sna->up.ifp = ifp2;
+	sna->role = NR_REG_PIPE_SLAVE;
+	error = netmap_attach_common(&sna->up);
+	if (error)
+		goto free_sna;
+
+	/* join the two endpoints */
+	mna->peer = sna;
+	sna->peer = mna;
+
+	/* we already have a reference to the parent, but we
+	 * need another one for the other endpoint we created
+	 */
+	netmap_adapter_get(pna);
+
+	if (role == NR_REG_PIPE_MASTER) {
+		req = mna;
+		mna->peer_ref = 1;
+		netmap_adapter_get(&sna->up);
+	} else {
+		req = sna;
+		sna->peer_ref = 1;
+		netmap_adapter_get(&mna->up);
+	}
+	ND("created master %p and slave %p", mna, sna);
+found:
+
+	ND("pipe %d %s at %p", pipe_id,
+		(req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req);
+	*na = &req->up;
+	netmap_adapter_get(*na);
+
+	/* write the configuration back */
+	nmr->nr_tx_rings = req->up.num_tx_rings;
+	nmr->nr_rx_rings = req->up.num_rx_rings;
+	nmr->nr_tx_slots = req->up.num_tx_desc;
+	nmr->nr_rx_slots = req->up.num_rx_desc;
+
+	/* keep the reference to the parent.
+	 * It will be released by the req destructor
+	 */
+
+	return 0;
+
+free_sna:
+	free(sna, M_DEVBUF);
+free_ifp2:
+	free(ifp2, M_DEVBUF);
+unregister_mna:
+	/* the master was added to the parent: take it out again
+	 * before freeing it, so the parent does not keep a stale pointer
+	 */
+	netmap_pipe_remove(pna, mna);
+free_mna:
+	/* NOTE(review): nothing done by netmap_attach_common() is undone
+	 * here, as in the original code - confirm that none is needed.
+	 */
+	free(mna, M_DEVBUF);
+free_ifp:
+	free(ifp, M_DEVBUF);
+put_out:
+	netmap_adapter_put(pna);
+	return error;
+}
+
+
+#endif /* WITH_PIPES */
diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c
index 13a725378c28..34e39126e525 100644
--- a/sys/dev/netmap/netmap_vale.c
+++ b/sys/dev/netmap/netmap_vale.c
@@ -164,21 +164,6 @@ static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
int kern_netmap_regif(struct nmreq *nmr);
/*
- * Each transmit queue accumulates a batch of packets into
- * a structure before forwarding. Packets to the same
- * destination are put in a list using ft_next as a link field.
- * ft_frags and ft_next are valid only on the first fragment.
- */
-struct nm_bdg_fwd { /* forwarding entry for a bridge */
- void *ft_buf; /* netmap or indirect buffer */
- uint8_t ft_frags; /* how many fragments (only on 1st frag) */
- uint8_t _ft_port; /* dst port (unused) */
- uint16_t ft_flags; /* flags, e.g. indirect */
- uint16_t ft_len; /* src fragment len */
- uint16_t ft_next; /* next packet to same destination */
-};
-
-/*
* For each output interface, nm_bdg_q is used to construct a list.
* bq_len is the number of output buffers (we can have coalescing
* during the copy).
@@ -381,7 +366,7 @@ nm_alloc_bdgfwd(struct netmap_adapter *na)
l += sizeof(struct nm_bdg_q) * num_dstq;
l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
- nrings = na->num_tx_rings + 1;
+ nrings = netmap_real_tx_rings(na);
kring = na->tx_rings;
for (i = 0; i < nrings; i++) {
struct nm_bdg_fwd *ft;
@@ -421,7 +406,8 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
acquire BDG_WLOCK() and copy back the array.
*/
- D("detach %d and %d (lim %d)", hw, sw, lim);
+ if (netmap_verbose)
+ D("detach %d and %d (lim %d)", hw, sw, lim);
/* make a copy of the list of active ports, update it,
* and then copy back within BDG_WLOCK().
*/
@@ -675,7 +661,7 @@ nm_bdg_attach(struct nmreq *nmr)
goto unref_exit;
}
- nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error);
+ nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
if (!nifp) {
goto unref_exit;
}
@@ -855,15 +841,23 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
NMG_UNLOCK();
break;
- case NETMAP_BDG_OFFSET:
+ case NETMAP_BDG_VNET_HDR:
+ /* Valid lengths for the virtio-net header are 0 (no header),
+ 10 and 12. */
+ if (nmr->nr_arg1 != 0 &&
+ nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
+ nmr->nr_arg1 != 12) {
+ error = EINVAL;
+ break;
+ }
NMG_LOCK();
error = netmap_get_bdg_na(nmr, &na, 0);
if (na && !error) {
vpna = (struct netmap_vp_adapter *)na;
- if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
- nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
- vpna->offset = nmr->nr_arg1;
- D("Using offset %d for %p", vpna->offset, vpna);
+ vpna->virt_hdr_len = nmr->nr_arg1;
+ if (vpna->virt_hdr_len)
+ vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
netmap_adapter_put(na);
}
NMG_UNLOCK();
@@ -877,26 +871,20 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
return error;
}
-
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
- u_int ntx, nrx, tailroom;
+ u_int tailroom;
int error, i;
uint32_t *leases;
-
- /* XXX vps do not need host rings,
- * but we crash if we don't have one
- */
- ntx = na->num_tx_rings + 1;
- nrx = na->num_rx_rings + 1;
+ u_int nrx = netmap_real_rx_rings(na);
/*
* Leases are attached to RX rings on vale ports
*/
tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
- error = netmap_krings_create(na, ntx, nrx, tailroom);
+ error = netmap_krings_create(na, tailroom);
if (error)
return error;
@@ -1212,16 +1200,16 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
u_int len = ft[i].ft_len;
ND("slot %d frags %d", i, ft[i].ft_frags);
- /* Drop the packet if the offset is not into the first
+ /* Drop the packet if the virtio-net header is not into the first
fragment nor at the very beginning of the second. */
- if (unlikely(na->offset > len))
+ if (unlikely(na->virt_hdr_len > len))
continue;
- if (len == na->offset) {
+ if (len == na->virt_hdr_len) {
buf = ft[i+1].ft_buf;
len = ft[i+1].ft_len;
} else {
- buf += na->offset;
- len -= na->offset;
+ buf += na->virt_hdr_len;
+ len -= na->virt_hdr_len;
}
dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
if (netmap_verbose > 255)
@@ -1280,13 +1268,13 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
struct netmap_vp_adapter *dst_na;
struct netmap_kring *kring;
struct netmap_ring *ring;
- u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
+ u_int dst_nr, lim, j, d_i, next, brd_next;
u_int needed, howmany;
int retry = netmap_txsync_retry;
struct nm_bdg_q *d;
uint32_t my_start = 0, lease_idx = 0;
int nrings;
- int offset_mismatch;
+ int virt_hdr_mismatch = 0;
d_i = dsts[i];
ND("second pass %d port %d", i, d_i);
@@ -1311,8 +1299,6 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
goto cleanup;
}
- offset_mismatch = (dst_na->offset != na->offset);
-
/* there is at least one either unicast or broadcast packet */
brd_next = brddst->bq_head;
next = d->bq_head;
@@ -1325,6 +1311,29 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
*/
needed = d->bq_len + brddst->bq_len;
+ if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
+ /* There is a virtio-net header/offloadings mismatch between
+ * source and destination. The slower mismatch datapath will
+ * be used to cope with all the mismatches.
+ */
+ virt_hdr_mismatch = 1;
+ if (dst_na->mfs < na->mfs) {
+ /* We may need to do segmentation offloadings, and so
+ * we may need a number of destination slots greater
+ * than the number of input slots ('needed').
+ * We look for the smallest integer 'x' which satisfies:
+ * needed * na->mfs + x * H <= x * na->mfs
+ * where 'H' is the length of the longest header that may
+ * be replicated in the segmentation process (e.g. for
+ * TCPv4 we must account for ethernet header, IP header
+ * and TCPv4 header).
+ */
+ needed = (needed * na->mfs) /
+ (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
+ ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
+ }
+ }
+
ND(5, "pass 2 dst %d is %x %s",
i, d_i, is_vp ? "virtual" : "nic/host");
dst_nr = d_i & (NM_BDG_MAXRINGS-1);
@@ -1337,6 +1346,10 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
retry:
+ if (dst_na->retry && retry) {
+ /* try to get some free slot from the previous run */
+ dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
+ }
/* reserve the buffers in the queue and an entry
* to report completion, and drop lock.
* XXX this might become a helper function.
@@ -1346,9 +1359,6 @@ retry:
mtx_unlock(&kring->q_lock);
goto cleanup;
}
- if (dst_na->retry) {
- dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
- }
my_start = j = kring->nkr_hwlease;
howmany = nm_kr_space(kring, 1);
if (needed < howmany)
@@ -1365,7 +1375,6 @@ retry:
struct netmap_slot *slot;
struct nm_bdg_fwd *ft_p, *ft_end;
u_int cnt;
- int fix_mismatch = offset_mismatch;
/* find the queue from which we pick next packet.
* NM_FT_NULL is always higher than valid indexes
@@ -1383,58 +1392,43 @@ retry:
cnt = ft_p->ft_frags; // cnt > 0
if (unlikely(cnt > howmany))
break; /* no more space */
- howmany -= cnt;
if (netmap_verbose && cnt > 1)
RD(5, "rx %d frags to %d", cnt, j);
ft_end = ft_p + cnt;
- do {
- char *dst, *src = ft_p->ft_buf;
- size_t copy_len = ft_p->ft_len, dst_len = copy_len;
-
- slot = &ring->slot[j];
- dst = BDG_NMB(&dst_na->up, slot);
-
- if (unlikely(fix_mismatch)) {
- /* We are processing the first fragment
- * and there is a mismatch between source
- * and destination offsets. Create a zeroed
- * header for the destination, independently
- * of the source header length and content.
- */
- src += na->offset;
- copy_len -= na->offset;
- bzero(dst, dst_na->offset);
- dst += dst_na->offset;
- dst_len = dst_na->offset + copy_len;
- /* fix the first fragment only */
- fix_mismatch = 0;
- /* Here it could be copy_len == dst_len == 0,
- * and so a zero length fragment is passed.
- */
- }
-
- ND("send [%d] %d(%d) bytes at %s:%d",
- i, (int)copy_len, (int)dst_len,
- NM_IFPNAME(dst_ifp), j);
- /* round to a multiple of 64 */
- copy_len = (copy_len + 63) & ~63;
-
- if (ft_p->ft_flags & NS_INDIRECT) {
- if (copyin(src, dst, copy_len)) {
- // invalid user pointer, pretend len is 0
- dst_len = 0;
- }
- } else {
- //memcpy(dst, src, copy_len);
- pkt_copy(src, dst, (int)copy_len);
- }
- slot->len = dst_len;
- slot->flags = (cnt << 8)| NS_MOREFRAG;
- j = nm_next(j, lim);
- ft_p++;
- sent++;
- } while (ft_p != ft_end);
- slot->flags = (cnt << 8); /* clear flag on last entry */
+ if (unlikely(virt_hdr_mismatch)) {
+ bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
+ } else {
+ howmany -= cnt;
+ do {
+ char *dst, *src = ft_p->ft_buf;
+ size_t copy_len = ft_p->ft_len, dst_len = copy_len;
+
+ slot = &ring->slot[j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ ND("send [%d] %d(%d) bytes at %s:%d",
+ i, (int)copy_len, (int)dst_len,
+ NM_IFPNAME(dst_ifp), j);
+ /* round to a multiple of 64 */
+ copy_len = (copy_len + 63) & ~63;
+
+ if (ft_p->ft_flags & NS_INDIRECT) {
+ if (copyin(src, dst, copy_len)) {
+ // invalid user pointer, pretend len is 0
+ dst_len = 0;
+ }
+ } else {
+ //memcpy(dst, src, copy_len);
+ pkt_copy(src, dst, (int)copy_len);
+ }
+ slot->len = dst_len;
+ slot->flags = (cnt << 8)| NS_MOREFRAG;
+ j = nm_next(j, lim);
+ needed--;
+ ft_p++;
+ } while (ft_p != ft_end);
+ slot->flags = (cnt << 8); /* clear flag on last entry */
+ }
/* are we done ? */
if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
break;
@@ -1484,9 +1478,9 @@ retry:
*/
if (likely(j != my_start)) {
kring->nr_hwtail = j;
- dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
still_locked = 0;
mtx_unlock(&kring->q_lock);
+ dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
if (dst_na->retry && retry--)
goto retry;
}
@@ -1615,6 +1609,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
struct netmap_vp_adapter *vpna;
struct netmap_adapter *na;
int error;
+ u_int npipes = 0;
vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
if (vpna == NULL)
@@ -1636,8 +1631,23 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
na->num_tx_desc = nmr->nr_tx_slots;
nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1, NM_BDG_MAXSLOTS, NULL);
+ /* validate number of pipes. We want at least 1,
+ * but probably can do with some more.
+ * So let's use 2 as default (when 0 is supplied)
+ */
+ npipes = nmr->nr_arg1;
+ nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
+ nmr->nr_arg1 = npipes; /* write back */
+ /* validate extra bufs */
+ nm_bound_var(&nmr->nr_arg3, 0, 0,
+ 128*NM_BDG_MAXSLOTS, NULL);
na->num_rx_desc = nmr->nr_rx_slots;
- vpna->offset = 0;
+ vpna->virt_hdr_len = 0;
+ vpna->mfs = 1514;
+ /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
+ vpna->mfs = netmap_buf_size; */
+ if (netmap_verbose)
+ D("max frame size %u", vpna->mfs);
na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
na->nm_txsync = bdg_netmap_txsync;
@@ -1648,14 +1658,21 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
na->nm_krings_delete = netmap_vp_krings_delete;
na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
na->num_tx_rings, na->num_tx_desc,
- na->num_rx_rings, na->num_rx_desc);
+ na->num_rx_rings, na->num_rx_desc,
+ nmr->nr_arg3, npipes, &error);
+ if (na->nm_mem == NULL)
+ goto err;
/* other nmd fields are set in the common routine */
error = netmap_attach_common(na);
- if (error) {
- free(vpna, M_DEVBUF);
- return error;
- }
+ if (error)
+ goto err;
return 0;
+
+err:
+ if (na->nm_mem != NULL)
+ netmap_mem_private_delete(na->nm_mem);
+ free(vpna, M_DEVBUF);
+ return error;
}
@@ -1763,19 +1780,17 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
ring->cur = kring->rcur;
ring->tail = kring->rtail;
- /* simulate a user wakeup on the rx ring */
if (is_host_ring) {
- netmap_rxsync_from_host(na, NULL, NULL);
vpna = hostna;
ring_nr = 0;
- } else {
- /* fetch packets that have arrived.
- * XXX maybe do this in a loop ?
- */
- error = na->nm_rxsync(na, ring_nr, 0);
- if (error)
- goto put_out;
- }
+ }
+ /* simulate a user wakeup on the rx ring */
+ /* fetch packets that have arrived.
+ * XXX maybe do this in a loop ?
+ */
+ error = kring->nm_sync(kring, 0);
+ if (error)
+ goto put_out;
if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
D("how strange, interrupt with no packets on %s",
NM_IFPNAME(ifp));
@@ -1801,7 +1816,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
ring->tail = kring->rtail;
/* another call to actually release the buffers */
if (!is_host_ring) {
- error = na->nm_rxsync(na, ring_nr, 0);
+ error = kring->nm_sync(kring, 0);
} else {
/* mark all packets as released, as in the
* second part of netmap_rxsync_from_host()
@@ -1842,11 +1857,11 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
* The original number of rings comes from hwna,
* rx rings on one side equals tx rings on the other.
*/
- for (i = 0; i <= na->num_rx_rings; i++) {
+ for (i = 0; i < na->num_rx_rings + 1; i++) {
hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
hwna->tx_rings[i].ring = na->rx_rings[i].ring;
}
- for (i = 0; i <= na->num_tx_rings; i++) {
+ for (i = 0; i < na->num_tx_rings + 1; i++) {
hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
hwna->rx_rings[i].ring = na->tx_rings[i].ring;
}
@@ -1914,8 +1929,10 @@ netmap_bwrap_krings_create(struct netmap_adapter *na)
return error;
}
- hostna->tx_rings = na->tx_rings + na->num_tx_rings;
- hostna->rx_rings = na->rx_rings + na->num_rx_rings;
+ if (na->na_flags & NAF_HOST_RINGS) {
+ hostna->tx_rings = na->tx_rings + na->num_tx_rings;
+ hostna->rx_rings = na->rx_rings + na->num_rx_rings;
+ }
return 0;
}
@@ -1957,6 +1974,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
return 0;
+ mtx_lock(&kring->q_lock);
/* first step: simulate a user wakeup on the rx ring */
netmap_vp_rxsync(na, ring_n, flags);
ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
@@ -1972,12 +1990,8 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
*/
/* set tail to what the hw expects */
ring->tail = hw_kring->rtail;
- if (ring_n == na->num_rx_rings) {
- netmap_txsync_to_host(hwna);
- } else {
- nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
- error = hwna->nm_txsync(hwna, ring_n, flags);
- }
+ nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
+ error = hw_kring->nm_sync(hw_kring, flags);
/* fourth step: now we are back the rx ring */
/* claim ownership on all hw owned bufs */
@@ -1991,7 +2005,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
ring->head, ring->cur, ring->tail,
hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
-
+ mtx_unlock(&kring->q_lock);
return error;
}
@@ -2047,18 +2061,21 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
bna->hwna = hwna;
netmap_adapter_get(hwna);
hwna->na_private = bna; /* weak reference */
-
- hostna = &bna->host.up;
- hostna->ifp = hwna->ifp;
- hostna->num_tx_rings = 1;
- hostna->num_tx_desc = hwna->num_rx_desc;
- hostna->num_rx_rings = 1;
- hostna->num_rx_desc = hwna->num_tx_desc;
- // hostna->nm_txsync = netmap_bwrap_host_txsync;
- // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
- hostna->nm_notify = netmap_bwrap_host_notify;
- hostna->nm_mem = na->nm_mem;
- hostna->na_private = bna;
+
+ if (hwna->na_flags & NAF_HOST_RINGS) {
+ na->na_flags |= NAF_HOST_RINGS;
+ hostna = &bna->host.up;
+ hostna->ifp = hwna->ifp;
+ hostna->num_tx_rings = 1;
+ hostna->num_tx_desc = hwna->num_rx_desc;
+ hostna->num_rx_rings = 1;
+ hostna->num_rx_desc = hwna->num_tx_desc;
+ // hostna->nm_txsync = netmap_bwrap_host_txsync;
+ // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
+ hostna->nm_notify = netmap_bwrap_host_notify;
+ hostna->nm_mem = na->nm_mem;
+ hostna->na_private = bna;
+ }
ND("%s<->%s txr %d txd %d rxr %d rxd %d",
fake->if_xname, real->if_xname,
diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile
index aea844bde1ce..647cd103600f 100644
--- a/sys/modules/netmap/Makefile
+++ b/sys/modules/netmap/Makefile
@@ -14,5 +14,7 @@ SRCS += netmap_generic.c
SRCS += netmap_mbq.c netmap_mbq.h
SRCS += netmap_vale.c
SRCS += netmap_freebsd.c
+SRCS += netmap_offloadings.c
+SRCS += netmap_pipe.c
.include <bsd.kmod.mk>
diff --git a/sys/net/netmap.h b/sys/net/netmap.h
index a5ee9b55edc9..f0b4c56d4e39 100644
--- a/sys/net/netmap.h
+++ b/sys/net/netmap.h
@@ -39,8 +39,10 @@
#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_
-#define NETMAP_API 10 /* current API version */
+#define NETMAP_API 11 /* current API version */
+#define NETMAP_MIN_API 11 /* min and max versions accepted */
+#define NETMAP_MAX_API 15
/*
* Some fields should be cache-aligned to reduce contention.
* The alignment is architecture and OS dependent, but rather than
@@ -73,20 +75,21 @@
+===============+ / | buf_idx, len | slot[1]
| txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
| txring_ofs[1] | +---------------+
- (tx+1+extra_tx entries) (num_slots entries)
+ (tx+1 entries) (num_slots entries)
| txring_ofs[t] | | buf_idx, len | slot[n-1]
+---------------+ | flags, ptr |
| rxring_ofs[0] | +---------------+
| rxring_ofs[1] |
- (rx+1+extra_rx entries)
+ (rx+1 entries)
| rxring_ofs[r] |
+---------------+
- * For each "interface" (NIC, host stack, VALE switch port) attached to a
- * file descriptor, the mmap()ed region contains a (logically readonly)
+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to
+ * a file descriptor, the mmap()ed region contains a (logically readonly)
* struct netmap_if pointing to struct netmap_ring's.
+ *
* There is one netmap_ring per physical NIC ring, plus one tx/rx ring
- * pair attached to the host stack (this pair is unused for VALE ports).
+ * pair attached to the host stack (this pair is unused for non-NIC ports).
*
* All physical/host stack ports share the same memory region,
* so that zero-copy can be implemented between them.
@@ -98,7 +101,42 @@
* is provided for user-supplied buffers in the tx path.
*
* In user space, the buffer address is computed as
- * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
+ * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
+ *
+ * Added in NETMAP_API 11:
+ *
+ * + NIOCREGIF can request the allocation of extra spare buffers from
+ * the same memory pool. The desired number of buffers must be in
+ * nr_arg3. The ioctl may return fewer buffers, depending on memory
+ * availability. nr_arg3 will return the actual value, and, once
+ * mapped, nifp->ni_bufs_head will be the index of the first buffer.
+ *
+ * The buffers are linked to each other using the first uint32_t
+ * as the index. On close, ni_bufs_head must point to the list of
+ * buffers to be released.
+ *
+ * + NIOCREGIF can request space for extra rings (and buffers)
+ * allocated in the same memory space. The number of extra rings
+ * is in nr_arg1, and is advisory. This is a no-op on NICs where
+ * the size of the memory space is fixed.
+ *
+ * + NIOCREGIF can attach to PIPE rings sharing the same memory
+ * space with a parent device. The ifname indicates the parent device,
+ * which must already exist. Flags in nr_flags indicate if we want to
+ * bind the master or slave side, the index (from nr_ringid)
+ * is just a cookie and does not need to be sequential.
+ *
+ * + NIOCREGIF can also attach to 'monitor' rings that replicate
+ * the content of specific rings, also from the same memory space.
+ *
+ * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme:
+ * netmap:foo all NIC ring pairs
+ * netmap:foo^ only host ring pair
+ * netmap:foo+ all NIC ring + host ring pairs
+ * netmap:foo-k the k-th NIC ring pair
+ * netmap:foo{k PIPE ring pair k, master side
+ * netmap:foo}k PIPE ring pair k, slave side
*/
/*
@@ -284,8 +322,8 @@ struct netmap_if {
const uint32_t ni_tx_rings; /* number of HW tx rings */
const uint32_t ni_rx_rings; /* number of HW rx rings */
- const uint32_t ni_extra_tx_rings;
- const uint32_t ni_extra_rx_rings;
+ uint32_t ni_bufs_head; /* head index for extra bufs */
+ uint32_t ni_spare1[5];
/*
* The following array contains the offset of each netmap ring
* from this structure, in the following order:
@@ -321,6 +359,7 @@ struct netmap_if {
*
* The actual argument (struct nmreq) has a number of options to request
* different functions.
+ * The following are used in NIOCREGIF when nr_cmd == 0:
*
* nr_name (in)
* The name of the port (em0, valeXXX:YYY, etc.)
@@ -337,6 +376,13 @@ struct netmap_if {
*
* nr_ringid (in)
* Indicates how rings should be bound to the file descriptors.
+ * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK)
+ * are used to indicate the ring number, and nr_flags specifies
+ * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected.
+ *
+ * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED:
+ * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control
+ * the binding as follows:
* 0 (default) binds all physical rings
* NETMAP_HW_RING | ring number binds a single ring pair
* NETMAP_SW_RING binds only the host tx/rx rings
@@ -345,8 +391,41 @@ struct netmap_if {
* packets on tx rings only if POLLOUT is set.
* The default is to push any pending packet.
*
- * NETMAP_PRIV_MEM is set on return for ports that use private
- * memory regions and cannot use buffer swapping.
+ * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release
+ * packets on rx rings also when POLLIN is NOT set.
+ * The default is to touch the rx ring only with POLLIN.
+ * Note that this is the opposite of TX because it
+ * reflects the common usage.
+ *
+ * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead.
+ * NETMAP_PRIV_MEM is set on return for ports that do not use
+ * the global memory allocator.
+ * This information is not significant and applications
+ * should look at the region id in nr_arg2
+ *
+ * nr_flags is the recommended mode to indicate which rings should
+ * be bound to a file descriptor. Values are NR_REG_*
+ *
+ * nr_arg1 (in) The number of extra rings to be reserved.
+ * Especially when allocating a VALE port the system only
+ * allocates the amount of memory needed for the port.
+ * If more shared memory rings are desired (e.g. for pipes),
+ * the first invocation for the same basename/allocator
+ * should specify a suitable number. Memory cannot be
+ * extended after the first allocation without closing
+ * all ports on the same region.
+ *
+ * nr_arg2 (in/out) The identity of the memory region used.
+ * On input, 0 means the system decides autonomously,
+ * other values may try to select a specific region.
+ * On return the actual value is reported.
+ * Region '1' is the global allocator, normally shared
+ * by all interfaces. Other values are private regions.
+ * If two ports share the same region, zero-copy is possible.
+ *
+ * nr_arg3 (in/out) number of extra buffers to be allocated.
+ *
+ *
*
* nr_cmd (in) if non-zero indicates a special command:
* NETMAP_BDG_ATTACH and nr_name = vale*:ifname
@@ -362,17 +441,33 @@ struct netmap_if {
* NETMAP_BDG_LIST
* list the configuration of VALE switches.
*
- * NETMAP_BDG_OFFSET XXX ?
- * Set the offset of data in packets. Used with VALE
- * switches where the clients use the vhost header.
+ * NETMAP_BDG_VNET_HDR
+ * Set the virtio-net header length used by the client
+ * of a VALE switch port.
+ *
+ * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific
*
- * nr_arg1, nr_arg2 (in/out) command specific
+ *
*
*/
/*
- * struct nmreq overlays a struct ifreq
+ * struct nmreq overlays a struct ifreq (just the name)
+ *
+ * On input, nr_ringid indicates which rings we are requesting,
+ * with the low bits (NETMAP_RING_MASK) holding the specific ring number.
+ * selection FLAGS RING INDEX
+ *
+ * all the NIC rings 0x0000 -
+ * only HOST ring 0x2000 -
+ * single NIC ring 0x4000 ring index
+ * all the NIC+HOST rings 0x6000 -
+ * one pipe ring, master 0x8000 ring index
+ * *** INVALID 0xA000
+ * one pipe ring, slave 0xC000 ring index
+ * *** INVALID 0xE000
+ *
*/
struct nmreq {
char nr_name[IFNAMSIZ];
@@ -383,27 +478,47 @@ struct nmreq {
uint32_t nr_rx_slots; /* slots in rx rings */
uint16_t nr_tx_rings; /* number of tx rings */
uint16_t nr_rx_rings; /* number of rx rings */
+
uint16_t nr_ringid; /* ring(s) we care about */
-#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */
-#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */
-#define NETMAP_SW_RING 0x2000 /* process the sw ring */
+#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */
+#define NETMAP_SW_RING 0x2000 /* only host ring pair */
+
+#define NETMAP_RING_MASK 0x0fff /* the ring number */
+
#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
-#define NETMAP_RING_MASK 0xfff /* the ring number */
+
+#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */
uint16_t nr_cmd;
#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
#define NETMAP_BDG_DETACH 2 /* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
#define NETMAP_BDG_LIST 4 /* get bridge's info */
-#define NETMAP_BDG_OFFSET 5 /* set the port offset */
+#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */
+#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */
- uint16_t nr_arg1;
+ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */
#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
-#define NETMAP_BDG_MAX_OFFSET 12
uint16_t nr_arg2;
- uint32_t spare2[3];
+ uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */
+ uint32_t nr_flags;
+ /* various modes, extends nr_ringid */
+ uint32_t spare2[1];
+};
+
+#define NR_REG_MASK 0xf /* values for nr_flags */
+enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */
+ NR_REG_ALL_NIC = 1,
+ NR_REG_SW = 2,
+ NR_REG_NIC_SW = 3,
+ NR_REG_ONE_NIC = 4,
+ NR_REG_PIPE_MASTER = 5,
+ NR_REG_PIPE_SLAVE = 6,
};
+/* monitor uses the NR_REG to select the rings to monitor */
+#define NR_MONITOR_TX 0x100
+#define NR_MONITOR_RX 0x200
/*
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
index 1bb337cf0ef7..9c3a4c1e5949 100644
--- a/sys/net/netmap_user.h
+++ b/sys/net/netmap_user.h
@@ -66,6 +66,7 @@
#define _NET_NETMAP_USER_H_
#include <stdint.h>
+#include <sys/socket.h> /* apple needs sockaddr */
#include <net/if.h> /* IFNAMSIZ */
#ifndef likely
@@ -104,12 +105,12 @@ nm_ring_next(struct netmap_ring *r, uint32_t i)
/*
* Return 1 if we have pending transmissions in the tx ring.
- * When everything is complete ring->cur = ring->tail + 1 (modulo ring size)
+ * When everything is complete ring->head = ring->tail + 1 (modulo ring size)
*/
static inline int
nm_tx_pending(struct netmap_ring *r)
{
- return nm_ring_next(r, r->tail) != r->cur;
+ return nm_ring_next(r, r->tail) != r->head;
}
@@ -142,13 +143,41 @@ nm_ring_space(struct netmap_ring *ring)
#include <signal.h>
#include <stdlib.h>
-struct nm_hdr_t { /* same as pcap_pkthdr */
+#ifndef ND /* debug macros */
+/* debug support */
+#define ND(_fmt, ...) do {} while(0)
+#define D(_fmt, ...) \
+ do { \
+ struct timeval t0; \
+ gettimeofday(&t0, NULL); \
+ fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n", \
+ (int)(t0.tv_sec % 1000), (int)t0.tv_usec, \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+/* Rate limited version of "D", lps indicates how many per second */
+#define RD(lps, format, ...) \
+ do { \
+ static int t0, __cnt; \
+ struct timeval __xxts; \
+ gettimeofday(&__xxts, NULL); \
+ if (t0 != __xxts.tv_sec) { \
+ t0 = __xxts.tv_sec; \
+ __cnt = 0; \
+ } \
+ if (__cnt++ < lps) { \
+ D(format, ##__VA_ARGS__); \
+ } \
+ } while (0)
+#endif
+
+struct nm_pkthdr { /* same as pcap_pkthdr */
struct timeval ts;
uint32_t caplen;
uint32_t len;
};
-struct nm_stat_t { // pcap_stat
+struct nm_stat { /* same as pcap_stat */
u_int ps_recv;
u_int ps_drop;
u_int ps_ifdrop;
@@ -159,19 +188,29 @@ struct nm_stat_t { // pcap_stat
#define NM_ERRBUF_SIZE 512
-struct nm_desc_t {
- struct nm_desc_t *self;
+struct nm_desc {
+ struct nm_desc *self; /* point to self if netmap. */
int fd;
void *mem;
int memsize;
- struct netmap_if *nifp;
+ int done_mmap; /* set if mem is the result of mmap */
+ struct netmap_if * const nifp;
uint16_t first_tx_ring, last_tx_ring, cur_tx_ring;
uint16_t first_rx_ring, last_rx_ring, cur_rx_ring;
struct nmreq req; /* also contains the nr_name = ifname */
- struct nm_hdr_t hdr;
-
- struct netmap_ring *tx, *rx; /* shortcuts to base hw/sw rings */
+ struct nm_pkthdr hdr;
+ /*
+ * The memory contains netmap_if, rings and then buffers.
+ * Given a pointer (e.g. to nm_inject) we can compare with
+ * mem/buf_start/buf_end to tell if it is a buffer or
+ * some other descriptor in our region.
+ * We also store a pointer to some ring as it helps in the
+ * translation from buffer indexes to addresses.
+ */
+ struct netmap_ring * const some_ring;
+ void * const buf_start;
+ void * const buf_end;
/* parameters from pcap_open_live */
int snaplen;
int promisc;
@@ -183,7 +222,7 @@ struct nm_desc_t {
uint32_t if_reqcap;
uint32_t if_curcap;
- struct nm_stat_t st;
+ struct nm_stat st;
char msg[NM_ERRBUF_SIZE];
};
@@ -191,8 +230,8 @@ struct nm_desc_t {
* when the descriptor is open correctly, d->self == d
* Eventually we should also use some magic number.
*/
-#define P2NMD(p) ((struct nm_desc_t *)(p))
-#define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d))
+#define P2NMD(p) ((struct nm_desc *)(p))
+#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d))
#define NETMAP_FD(d) (P2NMD(d)->fd)
@@ -205,7 +244,7 @@ struct nm_desc_t {
* XXX only for multiples of 64 bytes, non overlapped.
*/
static inline void
-pkt_copy(const void *_src, void *_dst, int l)
+nm_pkt_copy(const void *_src, void *_dst, int l)
{
const uint64_t *src = (const uint64_t *)_src;
uint64_t *dst = (uint64_t *)_dst;
@@ -230,7 +269,7 @@ pkt_copy(const void *_src, void *_dst, int l)
/*
* The callback, invoked on each received packet. Same as libpcap
*/
-typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d);
+typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d);
/*
*--- the pcap-like API ---
@@ -238,21 +277,49 @@ typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d);
* nm_open() opens a file descriptor, binds to a port and maps memory.
*
* ifname (netmap:foo or vale:foo) is the port name
- * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc.
- * ring_no only used if NETMAP_HW_RING is specified, is interpreted
- * as a string or integer indicating the ring number
- * ring_flags is stored in all ring flags (e.g. for transparent mode)
- * to open. If successful, t opens the fd and maps the memory.
+ * a suffix can indicate the following:
+ * ^ bind the host (sw) ring pair
+ * * bind host and NIC ring pairs (transparent)
+ * -NN bind individual NIC ring pair
+ * {NN bind master side of pipe NN
+ * }NN bind slave side of pipe NN
+ *
+ * req provides the initial values of nmreq before parsing ifname.
+ * Remember that the ifname parsing will override the ring
+ * number in nr_ringid, and part of nr_flags;
+ * flags special functions, normally 0
+ * indicates which fields of *arg are significant
+ * arg special functions, normally NULL
+ * if passed an nm_desc with mem != NULL,
+ * use that memory instead of mmap.
*/
-static struct nm_desc_t *nm_open(const char *ifname,
- const char *ring_no, int flags, int ring_flags);
+static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req,
+ uint64_t flags, const struct nm_desc *arg);
+
+/*
+ * nm_open can import some fields from the parent descriptor.
+ * These flags control which ones.
+ * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL,
+ * which set the initial value for these flags.
+ * Note that the 16 low bits of the flags are reserved for data
+ * that may go into the nmreq.
+ */
+enum {
+ NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */
+ NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */
+ NM_OPEN_ARG1 = 0x100000,
+ NM_OPEN_ARG2 = 0x200000,
+ NM_OPEN_ARG3 = 0x400000,
+ NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */
+};
+
/*
* nm_close() closes and restores the port to its previous state
*/
-static int nm_close(struct nm_desc_t *);
+static int nm_close(struct nm_desc *);
/*
* nm_inject() is the same as pcap_inject()
@@ -260,111 +327,226 @@ static int nm_close(struct nm_desc_t *);
* nm_nextpkt() is the same as pcap_next()
*/
-static int nm_inject(struct nm_desc_t *, const void *, size_t);
-static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
-static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *);
+static int nm_inject(struct nm_desc *, const void *, size_t);
+static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *);
+static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *);
/*
* Try to open, return descriptor if successful, NULL otherwise.
* An invalid netmap name will return errno = 0;
+ * You can pass a pointer to a pre-filled nm_desc to add special
+ * parameters. Flags is used as follows
+ * NM_OPEN_NO_MMAP use the memory from arg, only
+ * if the nr_arg2 (memory block) matches.
+ * NM_OPEN_ARG1 use req.nr_arg1 from arg
+ * NM_OPEN_ARG2 use req.nr_arg2 from arg
+ * NM_OPEN_RING_CFG use ring config from arg
*/
-static struct nm_desc_t *
-nm_open(const char *ifname, const char *ring_name, int flags, int ring_flags)
+static struct nm_desc *
+nm_open(const char *ifname, const struct nmreq *req,
+ uint64_t new_flags, const struct nm_desc *arg)
{
- struct nm_desc_t *d;
- u_int n, namelen;
- char *port = NULL;
+ struct nm_desc *d = NULL;
+ const struct nm_desc *parent = arg;
+ u_int namelen;
+ uint32_t nr_ringid = 0, nr_flags;
+ const char *port = NULL;
+ const char *errmsg = NULL;
if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) {
- errno = 0; /* name not recognised */
+ errno = 0; /* name not recognised, not an error */
return NULL;
}
if (ifname[0] == 'n')
ifname += 7;
- port = strchr(ifname, '-');
- if (!port) {
- namelen = strlen(ifname);
- } else {
- namelen = port - ifname;
- flags &= ~(NETMAP_SW_RING | NETMAP_HW_RING | NETMAP_RING_MASK);
- if (port[1] == 's')
- flags |= NETMAP_SW_RING;
- else
- ring_name = port;
+ /* scan for a separator: the suffix char selects the binding mode */
+ for (port = ifname; *port && !strchr("-*^{}", *port); port++)
+ ;
+ namelen = port - ifname;
+ if (namelen >= sizeof(d->req.nr_name)) {
+ errmsg = "name too long";
+ goto fail;
}
- if (namelen >= sizeof(d->req.nr_name))
- namelen = sizeof(d->req.nr_name) - 1;
+ switch (*port) {
+ default: /* '\0', no suffix */
+ nr_flags = NR_REG_ALL_NIC;
+ break;
+ case '-': /* one NIC */
+ nr_flags = NR_REG_ONE_NIC;
+ nr_ringid = atoi(port + 1);
+ break;
+ case '*': /* NIC and SW, ignore port */
+ nr_flags = NR_REG_NIC_SW;
+ if (port[1]) {
+ errmsg = "invalid port for nic+sw";
+ goto fail;
+ }
+ break;
+ case '^': /* only sw ring */
+ nr_flags = NR_REG_SW;
+ if (port[1]) {
+ errmsg = "invalid port for sw ring";
+ goto fail;
+ }
+ break;
+ case '{':
+ nr_flags = NR_REG_PIPE_MASTER;
+ nr_ringid = atoi(port + 1);
+ break;
+ case '}':
+ nr_flags = NR_REG_PIPE_SLAVE;
+ nr_ringid = atoi(port + 1);
+ break;
+ }
+
+ if (nr_ringid >= NETMAP_RING_MASK) {
+ errmsg = "invalid ringid";
+ goto fail;
+ }
+ /* add the *XPOLL flags */
+ nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL);
- d = (struct nm_desc_t *)calloc(1, sizeof(*d));
+ d = (struct nm_desc *)calloc(1, sizeof(*d));
if (d == NULL) {
+ errmsg = "nm_desc alloc failure";
errno = ENOMEM;
return NULL;
}
d->self = d; /* set this early so nm_close() works */
d->fd = open("/dev/netmap", O_RDWR);
- if (d->fd < 0)
+ if (d->fd < 0) {
+ errmsg = "cannot open /dev/netmap";
goto fail;
-
- if (flags & NETMAP_SW_RING) {
- d->req.nr_ringid = NETMAP_SW_RING;
- } else {
- u_int r;
- if (flags & NETMAP_HW_RING) /* interpret ring as int */
- r = (uintptr_t)ring_name;
- else /* interpret ring as numeric string */
- r = ring_name ? atoi(ring_name) : ~0;
- r = (r < NETMAP_RING_MASK) ? (r | NETMAP_HW_RING) : 0;
- d->req.nr_ringid = r; /* set the ring */
}
- d->req.nr_ringid |= (flags & ~NETMAP_RING_MASK);
+
+ if (req)
+ d->req = *req;
d->req.nr_version = NETMAP_API;
+ d->req.nr_ringid &= ~NETMAP_RING_MASK;
+
+ /* these fields are overridden by ifname and flags processing */
+ d->req.nr_ringid |= nr_ringid;
+ d->req.nr_flags = nr_flags;
memcpy(d->req.nr_name, ifname, namelen);
d->req.nr_name[namelen] = '\0';
+ /* optionally import info from parent */
+ if (IS_NETMAP_DESC(parent) && new_flags) {
+ if (new_flags & NM_OPEN_ARG1)
+ D("overriding ARG1 %d", parent->req.nr_arg1);
+ d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ?
+ parent->req.nr_arg1 : 4;
+ if (new_flags & NM_OPEN_ARG2)
+ D("overriding ARG2 %d", parent->req.nr_arg2);
+ d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ?
+ parent->req.nr_arg2 : 0;
+ if (new_flags & NM_OPEN_ARG3)
+ D("overriding ARG3 %d", parent->req.nr_arg3);
+ d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ?
+ parent->req.nr_arg3 : 0;
+ if (new_flags & NM_OPEN_RING_CFG) {
+ D("overriding RING_CFG");
+ d->req.nr_tx_slots = parent->req.nr_tx_slots;
+ d->req.nr_rx_slots = parent->req.nr_rx_slots;
+ d->req.nr_tx_rings = parent->req.nr_tx_rings;
+ d->req.nr_rx_rings = parent->req.nr_rx_rings;
+ }
+ if (new_flags & NM_OPEN_IFNAME) {
+ D("overriding ifname %s ringid 0x%x flags 0x%x",
+ parent->req.nr_name, parent->req.nr_ringid,
+ parent->req.nr_flags);
+ memcpy(d->req.nr_name, parent->req.nr_name,
+ sizeof(d->req.nr_name));
+ d->req.nr_ringid = parent->req.nr_ringid;
+ d->req.nr_flags = parent->req.nr_flags;
+ }
+ }
if (ioctl(d->fd, NIOCREGIF, &d->req)) {
+ errmsg = "NIOCREGIF failed";
goto fail;
}
- d->memsize = d->req.nr_memsize;
- d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED,
- d->fd, 0);
- if (d->mem == NULL)
- goto fail;
- d->nifp = NETMAP_IF(d->mem, d->req.nr_offset);
- if (d->req.nr_ringid & NETMAP_SW_RING) {
+ if ((new_flags & NM_OPEN_NO_MMAP) && IS_NETMAP_DESC(parent) &&
+ parent->mem && parent->req.nr_arg2 == d->req.nr_arg2) {
+ /* caller asked to reuse the parent's mapping of the same region */
+ d->memsize = parent->memsize;
+ d->mem = parent->mem;
+ } else {
+ d->memsize = d->req.nr_memsize;
+ d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED,
+ d->fd, 0);
+ if (d->mem == MAP_FAILED) { /* mmap() reports failure with MAP_FAILED, not NULL */
+ errmsg = "mmap failed";
+ goto fail;
+ }
+ d->done_mmap = 1;
+ }
+ {
+ struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset);
+ struct netmap_ring *r = NETMAP_RXRING(nifp, 0);
+
+ *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp;
+ *(struct netmap_ring **)(uintptr_t)&d->some_ring = r;
+ *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0);
+ *(void **)(uintptr_t)&d->buf_end =
+ (char *)d->mem + d->memsize;
+ }
+
+ if (nr_flags == NR_REG_SW) { /* host stack */
d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings;
d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings;
- } else if (d->req.nr_ringid & NETMAP_HW_RING) {
- /* XXX check validity */
- d->first_tx_ring = d->last_tx_ring =
- d->first_rx_ring = d->last_rx_ring =
- d->req.nr_ringid & NETMAP_RING_MASK;
- } else {
- d->first_tx_ring = d->last_rx_ring = 0;
+ } else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */
+ d->first_tx_ring = 0;
+ d->first_rx_ring = 0;
d->last_tx_ring = d->req.nr_tx_rings - 1;
d->last_rx_ring = d->req.nr_rx_rings - 1;
+ } else if (nr_flags == NR_REG_NIC_SW) {
+ d->first_tx_ring = 0;
+ d->first_rx_ring = 0;
+ d->last_tx_ring = d->req.nr_tx_rings;
+ d->last_rx_ring = d->req.nr_rx_rings;
+ } else if (nr_flags == NR_REG_ONE_NIC) {
+ /* XXX check validity */
+ d->first_tx_ring = d->last_tx_ring =
+ d->first_rx_ring = d->last_rx_ring = nr_ringid;
+ } else { /* pipes */
+ d->first_tx_ring = d->last_tx_ring = 0;
+ d->first_rx_ring = d->last_rx_ring = 0;
}
- d->tx = NETMAP_TXRING(d->nifp, 0);
- d->rx = NETMAP_RXRING(d->nifp, 0);
- d->cur_tx_ring = d->first_tx_ring;
- d->cur_rx_ring = d->first_rx_ring;
- for (n = d->first_tx_ring; n <= d->last_tx_ring; n++) {
- d->tx[n].flags |= ring_flags;
+
+#ifdef DEBUG_NETMAP_USER
+ { /* debugging code */
+ int i;
+
+ D("%s tx %d .. %d %d rx %d .. %d %d", ifname,
+ d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings,
+ d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings);
+ for (i = 0; i <= d->req.nr_tx_rings; i++) {
+ struct netmap_ring *r = NETMAP_TXRING(d->nifp, i);
+ D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail);
}
- for (n = d->first_rx_ring; n <= d->last_rx_ring; n++) {
- d->rx[n].flags |= ring_flags;
+ for (i = 0; i <= d->req.nr_rx_rings; i++) {
+ struct netmap_ring *r = NETMAP_RXRING(d->nifp, i);
+ D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail);
}
+ }
+#endif /* debugging */
+
+ d->cur_tx_ring = d->first_tx_ring;
+ d->cur_rx_ring = d->first_rx_ring;
return d;
fail:
nm_close(d);
+ if (errmsg)
+ D("%s %s", errmsg, ifname);
errno = EINVAL;
return NULL;
}
static int
-nm_close(struct nm_desc_t *d)
+nm_close(struct nm_desc *d)
{
/*
* ugly trick to avoid unused warnings
@@ -375,7 +557,7 @@ nm_close(struct nm_desc_t *d)
if (d == NULL || d->self != d)
return EINVAL;
- if (d->mem)
+ if (d->done_mmap && d->mem)
munmap(d->mem, d->memsize);
if (d->fd != -1)
close(d->fd);
@@ -389,7 +571,7 @@ nm_close(struct nm_desc_t *d)
* Same prototype as pcap_inject(), only need to cast.
*/
static int
-nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
+nm_inject(struct nm_desc *d, const void *buf, size_t size)
{
u_int c, n = d->last_tx_ring - d->first_tx_ring + 1;
@@ -408,7 +590,7 @@ nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
i = ring->cur;
idx = ring->slot[i].buf_idx;
ring->slot[i].len = size;
- pkt_copy(buf, NETMAP_BUF(ring, idx), size);
+ nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size);
d->cur_tx_ring = ri;
ring->head = ring->cur = nm_ring_next(ring, i);
return size;
@@ -421,7 +603,7 @@ nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
* Same prototype as pcap_dispatch(), only need to cast.
*/
static int
-nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
+nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg)
{
int n = d->last_rx_ring - d->first_rx_ring + 1;
int c, got = 0, ri = d->cur_rx_ring;
@@ -457,7 +639,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
}
static u_char *
-nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr)
+nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr)
{
int ri = d->cur_rx_ring;
diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile
index e873389c7179..c50247366b5a 100644
--- a/tools/tools/netmap/Makefile
+++ b/tools/tools/netmap/Makefile
@@ -3,11 +3,11 @@
#
# For multiple programs using a single source file each,
# we can just define 'progs' and create custom targets.
-PROGS = pkt-gen bridge vale-ctl testpcap libnetmap.so
+PROGS = pkt-gen bridge vale-ctl
-CLEANFILES = $(PROGS) pcap.o nm_util.o
+CLEANFILES = $(PROGS) *.o
NO_MAN=
-CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys
+CFLAGS += -Werror -Wall # -nostdinc -I/usr/include -I../../../sys
CFLAGS += -Wextra
LDFLAGS += -lpthread
@@ -22,12 +22,11 @@ LDFLAGS += -lpcap
all: $(PROGS)
-pkt-gen bridge: nm_util.o
- $(CC) $(CFLAGS) -o ${.TARGET} ${.TARGET:=.c} nm_util.o $(LDFLAGS)
+pkt-gen: pkt-gen.o
+ $(CC) $(CFLAGS) -o pkt-gen pkt-gen.o $(LDFLAGS)
-testpcap: pcap.c libnetmap.so
- $(CC) $(CFLAGS) -DTEST -L. -lnetmap -o ${.TARGET} pcap.c
-
-libnetmap.so: pcap.c nm_util.c
- $(CC) $(CFLAGS) -fpic -c ${.ALLSRC}
- $(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o}
+bridge: bridge.o
+ $(CC) $(CFLAGS) -o bridge bridge.o
+
+vale-ctl: vale-ctl.o
+ $(CC) $(CFLAGS) -o vale-ctl vale-ctl.o
diff --git a/tools/tools/netmap/README b/tools/tools/netmap/README
index 2bde6f2ab4d8..40378e62bbe6 100644
--- a/tools/tools/netmap/README
+++ b/tools/tools/netmap/README
@@ -6,19 +6,4 @@ This directory contains examples that use netmap
bridge a two-port jumper wire, also using the native API
- testpcap a jumper wire using libnetmap (or libpcap)
-
- click* various click examples
-
-------------------------------------------------------------
-Some performance data as of may 2012 for applications using libpcap.
-Throughput is generally in Mpps computed with the 64-byte frames,
-using 1 core on a 2.9GHz CPU and 10Gbit/s interface
-
-Libpcap version -- Application ---------------------
-BSD netmap
----------------------------------------------------
- 0.77 3.82 ports/trafshow (version 5)
- 0.94 7.7 net-mgmt/ipcad (ip accounting daemon)
- 0.9 5.0 net-mgmt/darkstat (ip accounting + graphing)
- 0.83 2.45 net-mgmt/iftop (curses traffic display)
+ vale-ctl the program to control VALE bridges
diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c
index cab545bfc919..0895d4ede676 100644
--- a/tools/tools/netmap/bridge.c
+++ b/tools/tools/netmap/bridge.c
@@ -9,14 +9,15 @@
* $FreeBSD$
*/
-#include "nm_util.h"
-
+#include <stdio.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+#include <sys/poll.h>
int verbose = 0;
-char *version = "$Id$";
-
static int do_abort = 0;
+static int zerocopy = 1; /* enable zerocopy if possible */
static void
sigint_h(int sig)
@@ -28,6 +29,26 @@ sigint_h(int sig)
/*
+ * how many packets on this set of queues ?
+ */
+int
+pkt_queued(struct nm_desc *d, int tx)
+{
+ u_int i, tot = 0;
+
+ if (tx) {
+ for (i = d->first_tx_ring; i <= d->last_tx_ring; i++) {
+ tot += nm_ring_space(NETMAP_TXRING(d->nifp, i));
+ }
+ } else {
+ for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) {
+ tot += nm_ring_space(NETMAP_RXRING(d->nifp, i));
+ }
+ }
+ return tot;
+}
+
+/*
* move up to 'limit' pkts from rxring to txring swapping buffers.
*/
static int
@@ -52,12 +73,6 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
while (limit-- > 0) {
struct netmap_slot *rs = &rxring->slot[j];
struct netmap_slot *ts = &txring->slot[k];
-#ifdef NO_SWAP
- char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx);
- char *txbuf = NETMAP_BUF(txring, ts->buf_idx);
-#else
- uint32_t pkt;
-#endif
/* swap packets */
if (ts->buf_idx < 2 || rs->buf_idx < 2) {
@@ -65,24 +80,26 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
j, rs->buf_idx, k, ts->buf_idx);
sleep(2);
}
-#ifndef NO_SWAP
- pkt = ts->buf_idx;
- ts->buf_idx = rs->buf_idx;
- rs->buf_idx = pkt;
-#endif
/* copy the packet length. */
- if (rs->len < 14 || rs->len > 2048)
+ if (rs->len > 2048) {
D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k);
- else if (verbose > 1)
+ rs->len = 0;
+ } else if (verbose > 1) {
D("%s send len %d rx[%d] -> tx[%d]", msg, rs->len, j, k);
+ }
ts->len = rs->len;
-#ifdef NO_SWAP
- pkt_copy(rxbuf, txbuf, ts->len);
-#else
- /* report the buffer change. */
- ts->flags |= NS_BUF_CHANGED;
- rs->flags |= NS_BUF_CHANGED;
-#endif /* NO_SWAP */
+ if (zerocopy) {
+ uint32_t pkt = ts->buf_idx;
+ ts->buf_idx = rs->buf_idx;
+ rs->buf_idx = pkt;
+ /* report the buffer change. */
+ ts->flags |= NS_BUF_CHANGED;
+ rs->flags |= NS_BUF_CHANGED;
+ } else {
+ char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx);
+ char *txbuf = NETMAP_BUF(txring, ts->buf_idx);
+ nm_pkt_copy(rxbuf, txbuf, ts->len);
+ }
j = nm_ring_next(rxring, j);
k = nm_ring_next(txring, k);
}
@@ -96,7 +113,7 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
/* move packts from src to destination */
static int
-move(struct nm_desc_t *src, struct nm_desc_t *dst, u_int limit)
+move(struct nm_desc *src, struct nm_desc *dst, u_int limit)
{
struct netmap_ring *txring, *rxring;
u_int m = 0, si = src->first_rx_ring, di = dst->first_tx_ring;
@@ -104,8 +121,8 @@ move(struct nm_desc_t *src, struct nm_desc_t *dst, u_int limit)
"host->net" : "net->host";
while (si <= src->last_rx_ring && di <= dst->last_tx_ring) {
- rxring = src->tx + si;
- txring = dst->tx + di;
+ rxring = NETMAP_RXRING(src->nifp, si);
+ txring = NETMAP_TXRING(dst->nifp, di);
ND("txring %p rxring %p", txring, rxring);
if (nm_ring_empty(rxring)) {
si++;
@@ -141,15 +158,16 @@ int
main(int argc, char **argv)
{
struct pollfd pollfd[2];
- int i, ch;
+ int ch;
u_int burst = 1024, wait_link = 4;
- struct nm_desc_t *pa = NULL, *pb = NULL;
+ struct nm_desc *pa = NULL, *pb = NULL;
char *ifa = NULL, *ifb = NULL;
+ char ifabuf[64] = { 0 };
- fprintf(stderr, "%s %s built %s %s\n",
- argv[0], version, __DATE__, __TIME__);
+ fprintf(stderr, "%s built %s %s\n",
+ argv[0], __DATE__, __TIME__);
- while ( (ch = getopt(argc, argv, "b:i:vw:")) != -1) {
+ while ( (ch = getopt(argc, argv, "b:ci:vw:")) != -1) {
switch (ch) {
default:
D("bad option %c %s", ch, optarg);
@@ -167,6 +185,9 @@ main(int argc, char **argv)
D("%s ignored, already have 2 interfaces",
optarg);
break;
+ case 'c':
+ zerocopy = 0; /* do not zerocopy */
+ break;
case 'v':
verbose++;
break;
@@ -202,20 +223,25 @@ main(int argc, char **argv)
}
if (!strcmp(ifa, ifb)) {
D("same interface, endpoint 0 goes to host");
- i = NETMAP_SW_RING;
+ snprintf(ifabuf, sizeof(ifabuf) - 1, "%s^", ifa);
+ ifa = ifabuf;
} else {
/* two different interfaces. Take all rings on if1 */
- i = 0; // all hw rings
}
- pa = netmap_open(ifa, i, 1);
- if (pa == NULL)
+ pa = nm_open(ifa, NULL, 0, NULL);
+ if (pa == NULL) {
+ D("cannot open %s", ifa);
return (1);
+ }
// XXX use a single mmap ?
- pb = netmap_open(ifb, 0, 1);
+ pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa);
if (pb == NULL) {
+ D("cannot open %s", ifb);
nm_close(pa);
return (1);
}
+ zerocopy = zerocopy && (pa->mem == pb->mem);
+ D("------- zerocopy %ssupported", zerocopy ? "" : "NOT ");
/* setup poll(2) variables. */
memset(pollfd, 0, sizeof(pollfd));
@@ -252,23 +278,25 @@ main(int argc, char **argv)
pollfd[0].events,
pollfd[0].revents,
pkt_queued(pa, 0),
- pa->rx->cur,
+ NETMAP_RXRING(pa->nifp, pa->cur_rx_ring)->cur,
pkt_queued(pa, 1),
pollfd[1].events,
pollfd[1].revents,
pkt_queued(pb, 0),
- pb->rx->cur,
+ NETMAP_RXRING(pb->nifp, pb->cur_rx_ring)->cur,
pkt_queued(pb, 1)
);
if (ret < 0)
continue;
if (pollfd[0].revents & POLLERR) {
- D("error on fd0, rx [%d,%d)",
- pa->rx->cur, pa->rx->tail);
+ struct netmap_ring *rx = NETMAP_RXRING(pa->nifp, pa->cur_rx_ring);
+ D("error on fd0, rx [%d,%d,%d)",
+ rx->head, rx->cur, rx->tail);
}
if (pollfd[1].revents & POLLERR) {
- D("error on fd1, rx [%d,%d)",
- pb->rx->cur, pb->rx->tail);
+ struct netmap_ring *rx = NETMAP_RXRING(pb->nifp, pb->cur_rx_ring);
+ D("error on fd1, rx [%d,%d,%d)",
+ rx->head, rx->cur, rx->tail);
}
if (pollfd[0].revents & POLLOUT) {
move(pb, pa, burst);
diff --git a/tools/tools/netmap/click-test.cfg b/tools/tools/netmap/click-test.cfg
deleted file mode 100644
index fc5759f88b1e..000000000000
--- a/tools/tools/netmap/click-test.cfg
+++ /dev/null
@@ -1,19 +0,0 @@
-//
-// $FreeBSD$
-//
-// A sample test configuration for click
-//
-//
-// create a switch
-
-myswitch :: EtherSwitch;
-
-// two input devices
-
-c0 :: FromDevice(ix0, PROMISC true);
-c1 :: FromDevice(ix1, PROMISC true);
-
-// and now pass packets around
-
-c0[0] -> [0]sw[0] -> Queue(10000) -> ToDevice(ix0);
-c1[0] -> [1]sw[1] -> Queue(10000) -> ToDevice(ix1);
diff --git a/tools/tools/netmap/nm_util.c b/tools/tools/netmap/nm_util.c
deleted file mode 100644
index deb52bbc87e4..000000000000
--- a/tools/tools/netmap/nm_util.c
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- * $Id$
- *
- * utilities to use netmap devices.
- * This does the basic functions of opening a device and issuing
- * ioctls()
- */
-
-#include "nm_util.h"
-
-extern int verbose;
-
-int
-nm_do_ioctl(struct nm_desc_t *me, u_long what, int subcmd)
-{
- struct ifreq ifr;
- int error;
- int fd;
-
-#if defined( __FreeBSD__ ) || defined (__APPLE__)
- (void)subcmd; // only used on Linux
- fd = me->fd;
-#endif
-
-#ifdef linux
- struct ethtool_value eval;
-
- bzero(&eval, sizeof(eval));
- fd = socket(AF_INET, SOCK_DGRAM, 0);
- if (fd < 0) {
- printf("Error: cannot get device control socket.\n");
- return -1;
- }
-#endif /* linux */
-
- bzero(&ifr, sizeof(ifr));
- strncpy(ifr.ifr_name, me->req.nr_name, sizeof(ifr.ifr_name));
- switch (what) {
- case SIOCSIFFLAGS:
-#ifndef __APPLE__
- ifr.ifr_flagshigh = me->if_flags >> 16;
-#endif
- ifr.ifr_flags = me->if_flags & 0xffff;
- break;
-
-#if defined( __FreeBSD__ )
- case SIOCSIFCAP:
- ifr.ifr_reqcap = me->if_reqcap;
- ifr.ifr_curcap = me->if_curcap;
- break;
-#endif
-
-#ifdef linux
- case SIOCETHTOOL:
- eval.cmd = subcmd;
- eval.data = 0;
- ifr.ifr_data = (caddr_t)&eval;
- break;
-#endif /* linux */
- }
- error = ioctl(fd, what, &ifr);
- if (error)
- goto done;
- switch (what) {
- case SIOCGIFFLAGS:
-#ifndef __APPLE__
- me->if_flags = (ifr.ifr_flagshigh << 16) |
- (0xffff & ifr.ifr_flags);
-#endif
- if (verbose)
- D("flags are 0x%x", me->if_flags);
- break;
-
-#if defined( __FreeBSD__ )
- case SIOCGIFCAP:
- me->if_reqcap = ifr.ifr_reqcap;
- me->if_curcap = ifr.ifr_curcap;
- if (verbose)
- D("curcap are 0x%x", me->if_curcap);
- break;
-#endif /* __FreeBSD__ */
- }
-done:
-#ifdef linux
- close(fd);
-#endif
- if (error)
- D("ioctl error %d %lu", error, what);
- return error;
-}
-
-/*
- * open a device. if me->mem is null then do an mmap.
- * Returns the file descriptor.
- * The extra flag checks configures promisc mode.
- */
-struct nm_desc_t *
-netmap_open(const char *name, int ringid, int promisc)
-{
- struct nm_desc_t *d = nm_open(name, NULL, ringid, 0);
-
- if (d == NULL)
- return d;
-
- if (verbose)
- D("memsize is %d MB", d->req.nr_memsize>>20);
-
- /* Set the operating mode. */
- if (ringid != NETMAP_SW_RING) {
- nm_do_ioctl(d, SIOCGIFFLAGS, 0);
- if ((d->if_flags & IFF_UP) == 0) {
- D("%s is down, bringing up...", name);
- d->if_flags |= IFF_UP;
- }
- if (promisc) {
- d->if_flags |= IFF_PPROMISC;
- nm_do_ioctl(d, SIOCSIFFLAGS, 0);
- }
-
- /* disable GSO, TSO, RXCSUM, TXCSUM...
- * TODO: set them back when done.
- */
-#ifdef __FreeBSD__
- nm_do_ioctl(d, SIOCGIFCAP, 0);
- d->if_reqcap = d->if_curcap;
- d->if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE);
- nm_do_ioctl(d, SIOCSIFCAP, 0);
-#endif
-#ifdef linux
- nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_SGSO);
- nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_STSO);
- nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_SRXCSUM);
- nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_STXCSUM);
-#endif /* linux */
- }
-
- return d;
-}
-
-
-/*
- * how many packets on this set of queues ?
- */
-int
-pkt_queued(struct nm_desc_t *d, int tx)
-{
- u_int i, tot = 0;
-
- ND("me %p begin %d end %d", me, me->begin, me->end);
- if (tx) {
- for (i = d->first_tx_ring; i <= d->last_tx_ring; i++)
- tot += nm_ring_space(d->tx + i);
- } else {
- for (i = d->first_rx_ring; i <= d->last_rx_ring; i++)
- tot += nm_ring_space(d->rx + i);
- }
- return tot;
-}
-
-#if 0
-
-/*
- *
-
-Helper routines for multiple readers from the same queue
-
-- all readers open the device in 'passive' mode (NETMAP_PRIV_RING set).
- In this mode a thread that loses the race on a poll() just continues
- without calling *xsync()
-
-- all readers share an extra 'ring' which contains the sync information.
- In particular we have a shared head+tail pointers that work
- together with cur and available
- ON RETURN FROM THE SYSCALL:
- shadow->cur = ring->cur
- shadow->tail = ring->tail
- shadow->link[i] = i for all slots // mark invalid
-
- */
-
-struct nm_q_arg {
- u_int want; /* Input */
- u_int have; /* Output, 0 on error */
- u_int cur;
- u_int tail;
- struct netmap_ring *ring;
-};
-
-/*
- * grab a number of slots from the queue.
- */
-struct nm_q_arg
-my_grab(struct nm_q_arg q)
-{
- const u_int ns = q.ring->num_slots;
-
- // lock(ring);
- for (;;) {
-
- q.cur = (volatile u_int)q.ring->head;
- q.have = ns + q.head - (volatile u_int)q.ring->tail;
- if (q.have >= ns)
- q.have -= ns;
- if (q.have == 0) /* no space; caller may ioctl/retry */
- break;
- if (q.want < q.have)
- q.have = q.want;
- q.tail = q.cur + q.have;
- if (q.tail >= ns)
- q.tail -= ns;
- if (atomic_cmpset_int(&q.ring->cur, q.cur, q.tail)
- break; /* success */
- }
- // unlock(ring);
- D("returns %d out of %d at %d,%d",
- q.have, q.want, q.cur, q.tail);
- /* the last one can clear avail ? */
- return q;
-}
-
-
-int
-my_release(struct nm_q_arg q)
-{
- u_int cur = q.cur, tail = q.tail, i;
- struct netmap_ring *r = q.ring;
-
- /* link the block to the next one.
- * there is no race here because the location is mine.
- */
- r->slot[cur].ptr = tail; /* this is mine */
- r->slot[cur].flags |= NM_SLOT_PTR; // points to next block
- // memory barrier
- // lock(ring);
- if (r->head != cur)
- goto done;
- for (;;) {
- // advance head
- r->head = head = r->slot[head].ptr;
- // barrier ?
- if (head == r->slot[head].ptr)
- break; // stop here
- }
- /* we have advanced from q.head to head (r.head might be
- * further down.
- */
- // do an ioctl/poll to flush.
-done:
- // unlock(ring);
- return; /* not my turn to release */
-}
-#endif /* unused */
diff --git a/tools/tools/netmap/nm_util.h b/tools/tools/netmap/nm_util.h
deleted file mode 100644
index 0ab2e2e81984..000000000000
--- a/tools/tools/netmap/nm_util.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * $FreeBSD$
- * $Id$
- *
- * Some utilities to build netmap-based programs.
- */
-
-#ifndef _NM_UTIL_H
-#define _NM_UTIL_H
-
-#define _GNU_SOURCE /* for CPU_SET() */
-
-#include <stdio.h> /* fprintf */
-#include <sys/poll.h> /* POLLIN */
-#include <inttypes.h> /* PRI* macros */
-#include <sys/types.h> /* u_char */
-
-#include <arpa/inet.h> /* ntohs */
-#include <sys/sysctl.h> /* sysctl */
-#include <ifaddrs.h> /* getifaddrs */
-#include <net/ethernet.h> /* ETHERTYPE_IP */
-#include <netinet/in.h> /* IPPROTO_* */
-#include <netinet/ip.h> /* struct ip */
-#include <netinet/udp.h> /* struct udp */
-
-
-#define NETMAP_WITH_LIBS
-#include <net/netmap_user.h>
-
-#include <pthread.h> /* pthread_* */
-
-#ifdef linux
-
-#define cpuset_t cpu_set_t
-
-#define ifr_flagshigh ifr_flags /* only the low 16 bits here */
-#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */
-#include <linux/ethtool.h>
-#include <linux/sockios.h>
-
-#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
-#include <netinet/ether.h> /* ether_aton */
-#include <linux/if_packet.h> /* sockaddr_ll */
-#endif /* linux */
-
-#ifdef __FreeBSD__
-#include <sys/endian.h> /* le64toh */
-#include <machine/param.h>
-
-#include <pthread_np.h> /* pthread w/ affinity */
-#include <sys/cpuset.h> /* cpu_set */
-#include <net/if_dl.h> /* LLADDR */
-#endif /* __FreeBSD__ */
-
-#ifdef __APPLE__
-
-#define cpuset_t uint64_t // XXX
-static inline void CPU_ZERO(cpuset_t *p)
-{
- *p = 0;
-}
-
-static inline void CPU_SET(uint32_t i, cpuset_t *p)
-{
- *p |= 1<< (i & 0x3f);
-}
-
-#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
-
-#define ifr_flagshigh ifr_flags // XXX
-#define IFF_PPROMISC IFF_PROMISC
-#include <net/if_dl.h> /* LLADDR */
-#define clock_gettime(a,b) \
- do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
-#endif /* __APPLE__ */
-
-static inline int min(int a, int b) { return a < b ? a : b; }
-extern int time_second;
-
-/* debug support */
-#define ND(format, ...) do {} while(0)
-#define D(format, ...) \
- fprintf(stderr, "%s [%d] " format "\n", \
- __FUNCTION__, __LINE__, ##__VA_ARGS__)
-
-#define RD(lps, format, ...) \
- do { \
- static int t0, cnt; \
- if (t0 != time_second) { \
- t0 = time_second; \
- cnt = 0; \
- } \
- if (cnt++ < lps) \
- D(format, ##__VA_ARGS__); \
- } while (0)
-
-
-
-struct nm_desc_t * netmap_open(const char *name, int ringid, int promisc);
-int nm_do_ioctl(struct nm_desc_t *me, u_long what, int subcmd);
-int pkt_queued(struct nm_desc_t *d, int tx);
-#endif /* _NM_UTIL_H */
diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c
deleted file mode 100644
index b3c2be5d23ff..000000000000
--- a/tools/tools/netmap/pcap.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/*
- * (C) 2011-2014 Luigi Rizzo
- *
- * BSD license
- *
- * A simple library that maps some pcap functions onto netmap
- * This is not 100% complete but enough to let tcpdump, trafshow
- * and other apps work.
- *
- * $FreeBSD$
- */
-
-#define MY_PCAP
-#include "nm_util.h"
-
-char *version = "$Id$";
-int verbose = 0;
-
-/*
- * We redefine here a number of structures that are in pcap.h
- * so we can compile this file without the system header.
- */
-#ifndef PCAP_ERRBUF_SIZE
-#define PCAP_ERRBUF_SIZE 128
-/*
- * Each packet is accompanied by a header including the timestamp,
- * captured size and actual size.
- */
-struct pcap_pkthdr {
- struct timeval ts; /* time stamp */
- uint32_t caplen; /* length of portion present */
- uint32_t len; /* length this packet (off wire) */
-};
-
-typedef struct pcap_if pcap_if_t;
-
-/*
- * Representation of an interface address.
- */
-struct pcap_addr {
- struct pcap_addr *next;
- struct sockaddr *addr; /* address */
- struct sockaddr *netmask; /* netmask for the above */
- struct sockaddr *broadaddr; /* broadcast addr for the above */
- struct sockaddr *dstaddr; /* P2P dest. address for the above */
-};
-
-struct pcap_if {
- struct pcap_if *next;
- char *name; /* name to hand to "pcap_open_live()" */
- char *description; /* textual description of interface, or NULL */
- struct pcap_addr *addresses;
- uint32_t flags; /* PCAP_IF_ interface flags */
-};
-
-/*
- * We do not support stats (yet)
- */
-struct pcap_stat {
- u_int ps_recv; /* number of packets received */
- u_int ps_drop; /* number of packets dropped */
- u_int ps_ifdrop; /* drops by interface XXX not yet supported */
-#ifdef WIN32
- u_int bs_capt; /* number of packets that reach the app. */
-#endif /* WIN32 */
-};
-
-typedef struct nm_desc_t pcap_t;
-typedef enum {
- PCAP_D_INOUT = 0,
- PCAP_D_IN,
- PCAP_D_OUT
-} pcap_direction_t;
-
-
-
-typedef void (*pcap_handler)(u_char *user,
- const struct pcap_pkthdr *h, const u_char *bytes);
-
-char errbuf[PCAP_ERRBUF_SIZE];
-
-pcap_t *pcap_open_live(const char *device, int snaplen,
- int promisc, int to_ms, char *errbuf);
-
-int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf);
-void pcap_close(pcap_t *p);
-int pcap_get_selectable_fd(pcap_t *p);
-int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user);
-int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf);
-int pcap_setdirection(pcap_t *p, pcap_direction_t d);
-char *pcap_lookupdev(char *errbuf);
-int pcap_inject(pcap_t *p, const void *buf, size_t size);
-int pcap_fileno(pcap_t *p);
-const char *pcap_lib_version(void);
-
-
-struct eproto {
- const char *s;
- u_short p;
-};
-#endif /* !PCAP_ERRBUF_SIZE */
-
-#ifndef TEST
-/*
- * build as a shared library
- */
-
-char pcap_version[] = "libnetmap version 0.3";
-
-
-/*
- * There is a set of functions that tcpdump expects even if probably
- * not used
- */
-struct eproto eproto_db[] = {
- { "ip", ETHERTYPE_IP },
- { "arp", ETHERTYPE_ARP },
- { (char *)0, 0 }
-};
-
-
-const char *pcap_lib_version(void)
-{
- return pcap_version;
-}
-
-int
-pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf)
-{
- pcap_if_t *top = NULL;
-#ifndef linux
- struct ifaddrs *i_head, *i;
- pcap_if_t *cur;
- struct pcap_addr *tail = NULL;
- int l;
-
- D("listing all devs");
- *alldevsp = NULL;
- i_head = NULL;
-
- if (getifaddrs(&i_head)) {
- D("cannot get if addresses");
- return -1;
- }
- for (i = i_head; i; i = i->ifa_next) {
- //struct ifaddrs *ifa;
- struct pcap_addr *pca;
- //struct sockaddr *sa;
-
- D("got interface %s", i->ifa_name);
- if (!top || strcmp(top->name, i->ifa_name)) {
- /* new interface */
- l = sizeof(*top) + strlen(i->ifa_name) + 1;
- cur = calloc(1, l);
- if (cur == NULL) {
- D("no space for if descriptor");
- continue;
- }
- cur->name = (char *)(cur + 1);
- //cur->flags = i->ifa_flags;
- strcpy(cur->name, i->ifa_name);
- cur->description = NULL;
- cur->next = top;
- top = cur;
- tail = NULL;
- }
- /* now deal with addresses */
- D("%s addr family %d len %d %s %s",
- top->name,
- i->ifa_addr->sa_family, i->ifa_addr->sa_len,
- i->ifa_netmask ? "Netmask" : "",
- i->ifa_broadaddr ? "Broadcast" : "");
- l = sizeof(struct pcap_addr) +
- (i->ifa_addr ? i->ifa_addr->sa_len:0) +
- (i->ifa_netmask ? i->ifa_netmask->sa_len:0) +
- (i->ifa_broadaddr? i->ifa_broadaddr->sa_len:0);
- pca = calloc(1, l);
- if (pca == NULL) {
- D("no space for if addr");
- continue;
- }
-#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len))
- pca->addr = (struct sockaddr *)(pca + 1);
- pkt_copy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len);
- if (i->ifa_netmask) {
- pca->netmask = SA_NEXT(pca->addr);
- bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len);
- if (i->ifa_broadaddr) {
- pca->broadaddr = SA_NEXT(pca->netmask);
- bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len);
- }
- }
- if (tail == NULL) {
- top->addresses = pca;
- } else {
- tail->next = pca;
- }
- tail = pca;
-
- }
- freeifaddrs(i_head);
-#endif /* !linux */
- (void)errbuf; /* UNUSED */
- *alldevsp = top;
- return 0;
-}
-
-void pcap_freealldevs(pcap_if_t *alldevs)
-{
- (void)alldevs; /* UNUSED */
- D("unimplemented");
-}
-
-char *
-pcap_lookupdev(char *buf)
-{
- D("%s", buf);
- strcpy(buf, "/dev/netmap");
- return buf;
-}
-
-pcap_t *
-pcap_create(const char *source, char *errbuf)
-{
- D("src %s (call open liveted)", source);
- return pcap_open_live(source, 0, 1, 100, errbuf);
-}
-
-int
-pcap_activate(pcap_t *p)
-{
- D("pcap %p running", p);
- return 0;
-}
-
-int
-pcap_can_set_rfmon(pcap_t *p)
-{
- (void)p; /* UNUSED */
- D("");
- return 0; /* no we can't */
-}
-
-int
-pcap_set_snaplen(pcap_t *p, int snaplen)
-{
- struct nm_desc_t *me = p;
-
- D("len %d", snaplen);
- me->snaplen = snaplen;
- return 0;
-}
-
-int
-pcap_snapshot(pcap_t *p)
-{
- struct nm_desc_t *me = p;
-
- D("len %d", me->snaplen);
- return me->snaplen;
-}
-
-int
-pcap_lookupnet(const char *device, uint32_t *netp,
- uint32_t *maskp, char *errbuf)
-{
-
- (void)errbuf; /* UNUSED */
- D("device %s", device);
- inet_aton("10.0.0.255", (struct in_addr *)netp);
- inet_aton("255.255.255.0",(struct in_addr *) maskp);
- return 0;
-}
-
-int
-pcap_set_promisc(pcap_t *p, int promisc)
-{
- D("promisc %d", promisc);
- if (nm_do_ioctl(p, SIOCGIFFLAGS, 0))
- D("SIOCGIFFLAGS failed");
- if (promisc) {
- p->if_flags |= IFF_PPROMISC;
- } else {
- p->if_flags &= ~IFF_PPROMISC;
- }
- if (nm_do_ioctl(p, SIOCSIFFLAGS, 0))
- D("SIOCSIFFLAGS failed");
- return 0;
-}
-
-int
-pcap_set_timeout(pcap_t *p, int to_ms)
-{
- D("%d ms", to_ms);
- p->to_ms = to_ms;
- return 0;
-}
-
-struct bpf_program;
-
-int
-pcap_compile(pcap_t *p, struct bpf_program *fp,
- const char *str, int optimize, uint32_t netmask)
-{
- (void)p; /* UNUSED */
- (void)fp; /* UNUSED */
- (void)optimize; /* UNUSED */
- (void)netmask; /* UNUSED */
- D("%s", str);
- return 0;
-}
-
-int
-pcap_setfilter(pcap_t *p, struct bpf_program *fp)
-{
- (void)p; /* UNUSED */
- (void)fp; /* UNUSED */
- D("");
- return 0;
-}
-
-int
-pcap_datalink(pcap_t *p)
-{
- (void)p; /* UNUSED */
- D("returns 1");
- return 1; // ethernet
-}
-
-const char *
-pcap_datalink_val_to_name(int dlt)
-{
- D("%d returns DLT_EN10MB", dlt);
- return "DLT_EN10MB";
-}
-
-const char *
-pcap_datalink_val_to_description(int dlt)
-{
- D("%d returns Ethernet link", dlt);
- return "Ethernet link";
-}
-
-struct pcap_stat;
-int
-pcap_stats(pcap_t *p, struct pcap_stat *ps)
-{
- *ps = *(struct pcap_stat *)(void *)&(p->st);
- return 0; /* accumulate from pcap_dispatch() */
-};
-
-char *
-pcap_geterr(pcap_t *p)
-{
- D("");
- return p->msg;
-}
-
-pcap_t *
-pcap_open_live(const char *device, int snaplen,
- int promisc, int to_ms, char *errbuf)
-{
- struct nm_desc_t *d;
- int l;
-
- if (!device) {
- D("missing device name");
- return NULL;
- }
-
- l = strlen(device) + 1;
- D("request to open %s snaplen %d promisc %d timeout %dms",
- device, snaplen, promisc, to_ms);
- d = nm_open(device, NULL, 0, 0);
- if (d == NULL) {
- D("error opening %s", device);
- return NULL;
- }
- d->to_ms = to_ms;
- d->snaplen = snaplen;
- d->errbuf = errbuf;
- d->promisc = promisc;
-
- return d;
-}
-
-void
-pcap_close(pcap_t *p)
-{
- nm_close(p);
- /* restore original flags ? */
-}
-
-int
-pcap_fileno(pcap_t *p)
-{
- struct nm_desc_t *d = p;
- D("returns %d", d->fd);
- return d->fd;
-}
-
-int
-pcap_get_selectable_fd(pcap_t *p)
-{
- struct nm_desc_t *d = p;
-
- return d->fd;
-}
-
-int
-pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf)
-{
- (void)p; /* UNUSED */
- (void)errbuf; /* UNUSED */
- D("mode is %d", nonblock);
- return 0; /* ignore */
-}
-
-int
-pcap_setdirection(pcap_t *p, pcap_direction_t d)
-{
- (void)p; /* UNUSED */
- (void)d; /* UNUSED */
- D("");
- return 0; /* ignore */
-};
-
-int
-pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
-{
- return nm_dispatch(p, cnt, (void *)callback, user);
-}
-
-int
-pcap_inject(pcap_t *p, const void *buf, size_t size)
-{
- return nm_inject(p, buf, size);
-}
-
-int
-pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
-{
- struct pollfd fds[1];
- int i;
-
- ND("cnt %d", cnt);
- memset(fds, 0, sizeof(fds));
- fds[0].fd = p->fd;
- fds[0].events = (POLLIN);
-
- while (cnt == -1 || cnt > 0) {
- if (poll(fds, 1, p->to_ms) <= 0) {
- D("poll error/timeout");
- continue;
- }
- i = nm_dispatch(p, cnt, (void *)callback, user);
- if (cnt > 0)
- cnt -= i;
- }
- return 0;
-}
-
-#endif /* !TEST */
-
-#ifdef TEST /* build test code */
-void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf)
-{
- pcap_inject((pcap_t *)user, buf, h->caplen);
-}
-
-/*
- * a simple pcap test program, bridge between two interfaces.
- */
-int
-main(int argc, char **argv)
-{
- pcap_t *p0, *p1;
- int burst = 1024;
- struct pollfd pollfd[2];
-
- fprintf(stderr, "%s %s built %s %s\n",
- argv[0], version, __DATE__, __TIME__);
-
- while (argc > 1 && !strcmp(argv[1], "-v")) {
- verbose++;
- argv++;
- argc--;
- }
-
- if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) {
- D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]);
- return (1);
- }
- if (argc > 3)
- burst = atoi(argv[3]);
-
- p0 = pcap_open_live(argv[1], 0, 1, 100, NULL);
- p1 = pcap_open_live(argv[2], 0, 1, 100, NULL);
- D("%s", version);
- D("open returns %p %p", p0, p1);
- if (!p0 || !p1)
- return(1);
- bzero(pollfd, sizeof(pollfd));
- pollfd[0].fd = pcap_fileno(p0);
- pollfd[1].fd = pcap_fileno(p1);
- pollfd[0].events = pollfd[1].events = POLLIN;
- for (;;) {
- /* do i need to reset ? */
- pollfd[0].revents = pollfd[1].revents = 0;
- int ret = poll(pollfd, 2, 1000);
- if (ret <= 0 || verbose)
- D("poll %s [0] ev %x %x [1] ev %x %x",
- ret <= 0 ? "timeout" : "ok",
- pollfd[0].events,
- pollfd[0].revents,
- pollfd[1].events,
- pollfd[1].revents);
- if (ret < 0)
- continue;
- if (pollfd[0].revents & POLLIN)
- pcap_dispatch(p0, burst, do_send, (void *)p1);
- if (pollfd[1].revents & POLLIN)
- pcap_dispatch(p1, burst, do_send, (void *)p0);
- }
-
- return (0);
-}
-#endif /* TEST */
diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c
index 3fb7702083fd..8e78fa8e24ed 100644
--- a/tools/tools/netmap/pkt-gen.c
+++ b/tools/tools/netmap/pkt-gen.c
@@ -37,26 +37,83 @@
*
*/
-#define MY_PCAP
-#include "nm_util.h"
-// #include <net/netmap_user.h>
+#define _GNU_SOURCE /* for CPU_SET() */
+#include <stdio.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+
#include <ctype.h> // isprint()
+#include <unistd.h> // sysconf()
+#include <sys/poll.h>
+#include <arpa/inet.h> /* ntohs */
+#include <sys/sysctl.h> /* sysctl */
+#include <ifaddrs.h> /* getifaddrs */
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+
+#include <pthread.h>
#ifndef NO_PCAP
#include <pcap/pcap.h>
#endif
+
+#ifdef linux
+
+#define cpuset_t cpu_set_t
+
+#define ifr_flagshigh ifr_flags /* only the low 16 bits here */
+#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+
+#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
+#include <netinet/ether.h> /* ether_aton */
+#include <linux/if_packet.h> /* sockaddr_ll */
+#endif /* linux */
+
+#ifdef __FreeBSD__
+#include <sys/endian.h> /* le64toh */
+#include <machine/param.h>
+
+#include <pthread_np.h> /* pthread w/ affinity */
+#include <sys/cpuset.h> /* cpu_set */
+#include <net/if_dl.h> /* LLADDR */
+#endif /* __FreeBSD__ */
+
+#ifdef __APPLE__
+
+#define cpuset_t uint64_t // XXX
+static inline void CPU_ZERO(cpuset_t *p)
+{
+ *p = 0;
+}
+
+static inline void CPU_SET(uint32_t i, cpuset_t *p)
+{
+ *p |= 1<< (i & 0x3f);
+}
+
+#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
+
+#define ifr_flagshigh ifr_flags // XXX
+#define IFF_PPROMISC IFF_PROMISC
+#include <net/if_dl.h> /* LLADDR */
+#define clock_gettime(a,b) \
+ do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
+#endif /* __APPLE__ */
+
const char *default_payload="netmap pkt-gen DIRECT payload\n"
"http://info.iet.unipi.it/~luigi/netmap/ ";
const char *indirect_payload="netmap pkt-gen indirect payload\n"
"http://info.iet.unipi.it/~luigi/netmap/ ";
-int time_second; // support for RD() debugging macro
-
int verbose = 0;
-#define SKIP_PAYLOAD 1 /* do not check payload. */
+#define SKIP_PAYLOAD 1 /* do not check payload. XXX unused */
#define VIRT_HDR_1 10 /* length of a base vnet-hdr */
@@ -85,6 +142,8 @@ struct mac_range {
struct ether_addr start, end;
};
+/* ifname can be netmap:foo-xxxx */
+#define MAX_IFNAMELEN 64 /* our buffer for ifname */
/*
* global arguments for all threads
*/
@@ -119,15 +178,16 @@ struct glob_arg {
int affinity;
int main_fd;
+ struct nm_desc *nmd;
+ uint64_t nmd_flags;
int report_interval; /* milliseconds between prints */
void *(*td_body)(void *);
void *mmap_addr;
- int mmap_size;
- char *ifname;
+ char ifname[MAX_IFNAMELEN];
char *nmr_config;
int dummy_send;
int virt_header; /* send also the virt_header */
- int host_ring;
+ int extra_bufs; /* goes in nr_arg3 */
};
enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
@@ -142,9 +202,7 @@ struct targ {
int completed;
int cancel;
int fd;
- struct nmreq nmr;
- struct netmap_if *nifp;
- uint16_t qfirst, qlast; /* range of queues to scan */
+ struct nm_desc *nmd;
volatile uint64_t count;
struct timespec tic, toc;
int me;
@@ -187,7 +245,7 @@ extract_ip_range(struct ip_range *r)
pp = index(ap, ':');
if (pp) {
*pp++ = '\0';
- if (*pp)
+ if (*pp)
r->port1 = strtol(pp, NULL, 0);
}
if (*ap) {
@@ -261,19 +319,17 @@ sigint_h(int sig)
static int
system_ncpus(void)
{
-#ifdef __FreeBSD__
- int mib[2], ncpus;
- size_t len;
-
- mib[0] = CTL_HW;
- mib[1] = HW_NCPU;
- len = sizeof(mib);
+ int ncpus = 1; /* value kept if the sysctl() query fails */
+#if defined (__FreeBSD__)
+ int mib[2] = { CTL_HW, HW_NCPU };
+ size_t len = sizeof(ncpus); /* size of the output buffer, not of mib */
sysctl(mib, 2, &ncpus, &len, NULL, 0);
-
+#elif defined(linux)
+ ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+#else /* others */
+ ncpus = 1;
+#endif /* others */
return (ncpus);
-#else
- return 1;
-#endif /* !__FreeBSD__ */
}
#ifdef __linux__
@@ -299,15 +355,17 @@ system_ncpus(void)
/*
* parse the vale configuration in conf and put it in nmr.
+ * Return the flag set if necessary.
* The configuration may consist of 0 to 4 numbers separated
* by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
* Missing numbers or zeroes stand for default values.
* As an additional convenience, if exactly one number
* is specified, then this is assigned to both #tx-slots and #rx-slots.
- * If there is no 4th number, then the 3rd is assigned to both #tx-rings
+ * If there is no 4th number, then the 3rd is assigned to both #tx-rings
* and #rx-rings.
*/
-void parse_nmr_config(const char* conf, struct nmreq *nmr)
+int
+parse_nmr_config(const char* conf, struct nmreq *nmr)
{
char *w, *tok;
int i, v;
@@ -315,7 +373,7 @@ void parse_nmr_config(const char* conf, struct nmreq *nmr)
nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
if (conf == NULL || ! *conf)
- return;
+ return 0;
w = strdup(conf);
for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
v = atoi(tok);
@@ -341,6 +399,9 @@ void parse_nmr_config(const char* conf, struct nmreq *nmr)
nmr->nr_tx_rings, nmr->nr_tx_slots,
nmr->nr_rx_rings, nmr->nr_rx_slots);
free(w);
+ return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
+ nmr->nr_rx_rings || nmr->nr_rx_slots) ?
+ NM_OPEN_RING_CFG : 0;
}
@@ -385,7 +446,6 @@ source_hwaddr(const char *ifname, char *buf)
static int
setaffinity(pthread_t me, int i)
{
-#if 1 // def __FreeBSD__
cpuset_t cpumask;
if (i == -1)
@@ -399,10 +459,6 @@ setaffinity(pthread_t me, int i)
D("Unable to set affinity: %s", strerror(errno));
return 1;
}
-#else
- (void)me; /* suppress 'unused' warnings */
- (void)i;
-#endif /* __FreeBSD__ */
return 0;
}
@@ -449,7 +505,7 @@ dump_payload(char *p, int len, struct netmap_ring *ring, int cur)
int i, j, i0;
/* get the length in ASCII of the length of the packet. */
-
+
printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
ring, cur, ring->slot[cur].buf_idx,
ring->slot[cur].flags, len);
@@ -632,7 +688,7 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
slot->flags |= NS_INDIRECT;
slot->ptr = (uint64_t)frame;
} else if (options & OPT_COPY) {
- pkt_copy(frame, p, size);
+ nm_pkt_copy(frame, p, size);
if (fcnt == nfrags)
update_addresses(pkt, g);
} else if (options & OPT_MEMCPY) {
@@ -671,21 +727,19 @@ static void *
pinger_body(void *data)
{
struct targ *targ = (struct targ *) data;
- struct pollfd fds[1];
- struct netmap_if *nifp = targ->nifp;
+ struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
+ struct netmap_if *nifp = targ->nmd->nifp;
int i, rx = 0, n = targ->g->npackets;
void *frame;
int size;
+ uint32_t sent = 0;
+ struct timespec ts, now, last_print;
+ uint32_t count = 0, min = 1000000000, av = 0;
frame = &targ->pkt;
frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
size = targ->g->pkt_size + targ->g->virt_header;
- fds[0].fd = targ->fd;
- fds[0].events = (POLLIN);
- static uint32_t sent;
- struct timespec ts, now, last_print;
- uint32_t count = 0, min = 1000000000, av = 0;
if (targ->g->nthreads > 1) {
D("can only ping with 1 thread");
@@ -706,7 +760,7 @@ pinger_body(void *data)
if (nm_ring_empty(ring)) {
D("-- ouch, cannot send");
} else {
- pkt_copy(frame, p, size);
+ nm_pkt_copy(frame, p, size);
clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
bcopy(&sent, p+42, sizeof(sent));
bcopy(&ts, p+46, sizeof(ts));
@@ -715,13 +769,14 @@ pinger_body(void *data)
}
}
/* should use a parameter to decide how often to send */
- if (poll(fds, 1, 3000) <= 0) {
+ if (poll(&pfd, 1, 3000) <= 0) {
D("poll error/timeout on queue %d: %s", targ->me,
strerror(errno));
continue;
}
/* see what we got back */
- for (i = targ->qfirst; i < targ->qlast; i++) {
+ for (i = targ->nmd->first_rx_ring;
+ i <= targ->nmd->last_rx_ring; i++) {
ring = NETMAP_RXRING(nifp, i);
while (!nm_ring_empty(ring)) {
uint32_t seq;
@@ -775,12 +830,10 @@ static void *
ponger_body(void *data)
{
struct targ *targ = (struct targ *) data;
- struct pollfd fds[1];
- struct netmap_if *nifp = targ->nifp;
+ struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
+ struct netmap_if *nifp = targ->nmd->nifp;
struct netmap_ring *txring, *rxring;
int i, rx = 0, sent = 0, n = targ->g->npackets;
- fds[0].fd = targ->fd;
- fds[0].events = (POLLIN);
if (targ->g->nthreads > 1) {
D("can only reply ping with 1 thread");
@@ -791,9 +844,9 @@ ponger_body(void *data)
uint32_t txcur, txavail;
//#define BUSYWAIT
#ifdef BUSYWAIT
- ioctl(fds[0].fd, NIOCRXSYNC, NULL);
+ ioctl(pfd.fd, NIOCRXSYNC, NULL);
#else
- if (poll(fds, 1, 1000) <= 0) {
+ if (poll(&pfd, 1, 1000) <= 0) {
D("poll error/timeout on queue %d: %s", targ->me,
strerror(errno));
continue;
@@ -803,7 +856,7 @@ ponger_body(void *data)
txcur = txring->cur;
txavail = nm_ring_space(txring);
/* see what we got back */
- for (i = targ->qfirst; i < targ->qlast; i++) {
+ for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
rxring = NETMAP_RXRING(nifp, i);
while (!nm_ring_empty(rxring)) {
uint16_t *spkt, *dpkt;
@@ -821,7 +874,7 @@ ponger_body(void *data)
/* copy... */
dpkt = (uint16_t *)dst;
spkt = (uint16_t *)src;
- pkt_copy(src, dst, slot->len);
+ nm_pkt_copy(src, dst, slot->len);
dpkt[0] = spkt[3];
dpkt[1] = spkt[4];
dpkt[2] = spkt[5];
@@ -838,7 +891,7 @@ ponger_body(void *data)
txring->head = txring->cur = txcur;
targ->count = sent;
#ifdef BUSYWAIT
- ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+ ioctl(pfd.fd, NIOCTXSYNC, NULL);
#endif
//D("tx %d rx %d", sent, rx);
}
@@ -924,11 +977,11 @@ static void *
sender_body(void *data)
{
struct targ *targ = (struct targ *) data;
-
- struct pollfd fds[1];
- struct netmap_if *nifp = targ->nifp;
+ struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
+ struct netmap_if *nifp = targ->nmd->nifp;
struct netmap_ring *txring;
- int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
+ int i, n = targ->g->npackets / targ->g->nthreads;
+ int64_t sent = 0;
int options = targ->g->options | OPT_COPY;
struct timespec nexttime = { 0, 0}; // XXX silence compiler
int rate_limit = targ->g->tx_rate;
@@ -943,10 +996,6 @@ sender_body(void *data)
D("start");
if (setaffinity(targ->thread, targ->affinity))
goto quit;
- /* setup poll(2) mechanism. */
- memset(fds, 0, sizeof(fds));
- fds[0].fd = targ->fd;
- fds[0].events = (POLLOUT);
/* main loop.*/
clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
@@ -956,7 +1005,7 @@ sender_body(void *data)
wait_time(targ->tic);
nexttime = targ->tic;
}
- if (targ->g->dev_type == DEV_TAP) {
+ if (targ->g->dev_type == DEV_TAP) {
D("writing to file desc %d", targ->g->main_fd);
for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
@@ -997,14 +1046,14 @@ sender_body(void *data)
/*
* wait for available room in the send queue(s)
*/
- if (poll(fds, 1, 2000) <= 0) {
+ if (poll(&pfd, 1, 2000) <= 0) {
if (targ->cancel)
break;
D("poll error/timeout on queue %d: %s", targ->me,
strerror(errno));
- goto quit;
+ // goto quit;
}
- if (fds[0].revents & POLLERR) {
+ if (pfd.revents & POLLERR) {
D("poll error");
goto quit;
}
@@ -1015,7 +1064,7 @@ sender_body(void *data)
D("drop copy");
options &= ~OPT_COPY;
}
- for (i = targ->qfirst; i < targ->qlast; i++) {
+ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
int m, limit = rate_limit ? tosend : targ->g->burst;
if (n > 0 && n - sent < limit)
limit = n - sent;
@@ -1024,10 +1073,10 @@ sender_body(void *data)
continue;
if (frags > 1)
limit = ((limit + frags - 1) / frags) * frags;
-
+
m = send_packets(txring, pkt, frame, size, targ->g,
limit, options, frags);
- ND("limit %d tail %d frags %d m %d",
+ ND("limit %d tail %d frags %d m %d",
limit, txring->tail, frags, m);
sent += m;
targ->count = sent;
@@ -1039,13 +1088,13 @@ sender_body(void *data)
}
}
/* flush any remaining packets */
- ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+ ioctl(pfd.fd, NIOCTXSYNC, NULL);
/* final part: wait all the TX queues to be empty. */
- for (i = targ->qfirst; i < targ->qlast; i++) {
+ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
txring = NETMAP_TXRING(nifp, i);
while (nm_tx_pending(txring)) {
- ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+ ioctl(pfd.fd, NIOCTXSYNC, NULL);
usleep(1); /* wait 1 tick */
}
}
@@ -1102,8 +1151,8 @@ static void *
receiver_body(void *data)
{
struct targ *targ = (struct targ *) data;
- struct pollfd fds[1];
- struct netmap_if *nifp = targ->nifp;
+ struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
+ struct netmap_if *nifp = targ->nmd->nifp;
struct netmap_ring *rxring;
int i;
uint64_t received = 0;
@@ -1111,17 +1160,13 @@ receiver_body(void *data)
if (setaffinity(targ->thread, targ->affinity))
goto quit;
- /* setup poll(2) mechanism. */
- memset(fds, 0, sizeof(fds));
- fds[0].fd = targ->fd;
- fds[0].events = (POLLIN);
-
/* unbounded wait for the first packet. */
for (;;) {
- i = poll(fds, 1, 1000);
- if (i > 0 && !(fds[0].revents & POLLERR))
+ i = poll(&pfd, 1, 1000);
+ if (i > 0 && !(pfd.revents & POLLERR))
break;
- RD(1, "waiting for initial packets, poll returns %d %d", i, fds[0].revents);
+ RD(1, "waiting for initial packets, poll returns %d %d",
+ i, pfd.revents);
}
/* main loop, exit after 1s silence */
@@ -1146,18 +1191,18 @@ receiver_body(void *data)
while (!targ->cancel) {
/* Once we started to receive packets, wait at most 1 seconds
before quitting. */
- if (poll(fds, 1, 1 * 1000) <= 0 && !targ->g->forever) {
+ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
targ->toc.tv_sec -= 1; /* Subtract timeout time. */
- break;
+ goto out;
}
- if (fds[0].revents & POLLERR) {
+ if (pfd.revents & POLLERR) {
D("poll err");
goto quit;
}
- for (i = targ->qfirst; i < targ->qlast; i++) {
+ for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
int m;
rxring = NETMAP_RXRING(nifp, i);
@@ -1168,12 +1213,12 @@ receiver_body(void *data)
received += m;
}
targ->count = received;
-
- // tell the card we have read the data
- //ioctl(fds[0].fd, NIOCRXSYNC, NULL);
}
}
+ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
+
+out:
targ->completed = 1;
targ->count = received;
@@ -1190,10 +1235,10 @@ quit:
static const char *
norm(char *buf, double val)
{
- char *units[] = { "", "K", "M", "G" };
+ char *units[] = { "", "K", "M", "G", "T" };
u_int i;
- for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *); i++)
+ for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++)
val /= 1000;
sprintf(buf, "%.2f %s", val, units[i]);
return buf;
@@ -1205,8 +1250,8 @@ tx_output(uint64_t sent, int size, double delta)
double bw, raw_bw, pps;
char b1[40], b2[80], b3[80];
- printf("Sent %" PRIu64 " packets, %d bytes each, in %.2f seconds.\n",
- sent, size, delta);
+ printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
+ (unsigned long long)sent, size, delta);
if (delta == 0)
delta = 1e-6;
if (size < 60) /* correct for min packet size */
@@ -1227,7 +1272,8 @@ rx_output(uint64_t received, double delta)
double pps;
char b1[40];
- printf("Received %" PRIu64 " packets, in %.2f seconds.\n", received, delta);
+ printf("Received %llu packets, in %.2f seconds.\n",
+ (unsigned long long) received, delta);
if (delta == 0)
delta = 1e-6;
@@ -1262,7 +1308,6 @@ usage(void)
"\t-R rate in packets per second\n"
"\t-X dump payload\n"
"\t-H len add empty virtio-net-header with size 'len'\n"
- "\t-h use host ring\n"
"",
cmd);
@@ -1280,77 +1325,57 @@ start_threads(struct glob_arg *g)
* using a single descriptor.
*/
for (i = 0; i < g->nthreads; i++) {
- bzero(&targs[i], sizeof(targs[i]));
- targs[i].fd = -1; /* default, with pcap */
- targs[i].g = g;
+ struct targ *t = &targs[i];
- if (g->dev_type == DEV_NETMAP) {
- struct nmreq tifreq;
- int tfd;
+ bzero(t, sizeof(*t));
+ t->fd = -1; /* default, with pcap */
+ t->g = g;
- /* register interface. */
- tfd = open("/dev/netmap", O_RDWR);
- if (tfd == -1) {
- D("Unable to open /dev/netmap: %s", strerror(errno));
- continue;
- }
- targs[i].fd = tfd;
+ if (g->dev_type == DEV_NETMAP) {
+ struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */
- bzero(&tifreq, sizeof(tifreq));
- strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name));
- tifreq.nr_version = NETMAP_API;
- if (g->host_ring) {
- tifreq.nr_ringid = NETMAP_SW_RING;
- } else {
- tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
+ if (g->nthreads > 1) {
+ if (nmd.req.nr_flags != NR_REG_ALL_NIC) {
+ D("invalid nthreads mode %d", nmd.req.nr_flags);
+ continue;
+ }
+ nmd.req.nr_flags = NR_REG_ONE_NIC;
+ nmd.req.nr_ringid = i;
}
- parse_nmr_config(g->nmr_config, &tifreq);
+ /* Only touch one of the rings (rx is already ok) */
+ if (g->td_body == receiver_body)
+ nmd.req.nr_ringid |= NETMAP_NO_TX_POLL;
- /*
- * if we are acting as a receiver only, do not touch the transmit ring.
- * This is not the default because many apps may use the interface
- * in both directions, but a pure receiver does not.
- */
- if (g->td_body == receiver_body) {
- tifreq.nr_ringid |= NETMAP_NO_TX_POLL;
- }
+ nmd.self = &nmd; /* register via the copy (must point to itself): override ifname, ringid etc. */
- if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
- D("Unable to register %s: %s", g->ifname, strerror(errno));
+ t->nmd = nm_open(t->g->ifname, NULL, g->nmd_flags |
+ NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd);
+ if (t->nmd == NULL) {
+ D("Unable to open %s: %s",
+ t->g->ifname, strerror(errno));
continue;
}
- D("memsize is %d MB", tifreq.nr_memsize >> 20);
- targs[i].nmr = tifreq;
- targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset);
- D("nifp flags 0x%x", targs[i].nifp->ni_flags);
- /* start threads. */
- if (g->host_ring) {
- targs[i].qfirst = (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
- targs[i].qlast = targs[i].qfirst + 1;
- } else {
- targs[i].qfirst = (g->nthreads > 1) ? i : 0;
- targs[i].qlast = (g->nthreads > 1) ? i+1 :
- (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
- }
+ t->fd = t->nmd->fd;
+
} else {
targs[i].fd = g->main_fd;
}
- targs[i].used = 1;
- targs[i].me = i;
+ t->used = 1;
+ t->me = i;
if (g->affinity >= 0) {
if (g->affinity < g->cpus)
- targs[i].affinity = g->affinity;
+ t->affinity = g->affinity;
else
- targs[i].affinity = i % g->cpus;
- } else
- targs[i].affinity = -1;
+ t->affinity = i % g->cpus;
+ } else {
+ t->affinity = -1;
+ }
/* default, init packets */
- initialize_packet(&targs[i]);
+ initialize_packet(t);
- if (pthread_create(&targs[i].thread, NULL, g->td_body,
- &targs[i]) == -1) {
+ if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) {
D("Unable to create thread %d: %s", i, strerror(errno));
- targs[i].used = 0;
+ t->used = 0;
}
}
}
@@ -1375,7 +1400,6 @@ main_thread(struct glob_arg *g)
delta.tv_usec = (g->report_interval%1000)*1000;
select(0, NULL, NULL, NULL, &delta);
gettimeofday(&now, NULL);
- time_second = now.tv_sec;
timersub(&now, &toc, &toc);
my_count = 0;
for (i = 0; i < g->nthreads; i++) {
@@ -1388,8 +1412,10 @@ main_thread(struct glob_arg *g)
continue;
npkts = my_count - prev;
pps = (npkts*1000000 + usec/2) / usec;
- D("%" PRIu64 " pps (%" PRIu64 " pkts in %" PRIu64 " usec)",
- pps, npkts, usec);
+ D("%llu pps (%llu pkts in %llu usec)",
+ (unsigned long long)pps,
+ (unsigned long long)npkts,
+ (unsigned long long)usec);
prev = my_count;
toc = now;
if (done == g->nthreads)
@@ -1433,7 +1459,7 @@ main_thread(struct glob_arg *g)
rx_output(count, delta_t);
if (g->dev_type == DEV_NETMAP) {
- munmap(g->mmap_addr, g->mmap_size);
+ munmap(g->nmd->mem, g->nmd->req.nr_memsize);
close(g->main_fd);
}
}
@@ -1521,7 +1547,6 @@ main(int arc, char **argv)
struct glob_arg g;
- struct nmreq nmr;
int ch;
int wait_link = 2;
int devqueues = 1; /* how many device queues */
@@ -1548,7 +1573,7 @@ main(int arc, char **argv)
g.virt_header = 0;
while ( (ch = getopt(arc, argv,
- "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:h")) != -1) {
+ "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:")) != -1) {
struct sf *fn;
switch(ch) {
@@ -1594,23 +1619,28 @@ main(int arc, char **argv)
* otherwise we guess
*/
D("interface is %s", optarg);
- g.ifname = optarg;
+ if (strlen(optarg) > MAX_IFNAMELEN - 8) {
+ D("ifname too long %s", optarg);
+ break;
+ }
+ strcpy(g.ifname, optarg);
if (!strcmp(optarg, "null")) {
g.dev_type = DEV_NETMAP;
g.dummy_send = 1;
} else if (!strncmp(optarg, "tap:", 4)) {
g.dev_type = DEV_TAP;
- g.ifname = optarg + 4;
+ strcpy(g.ifname, optarg + 4);
} else if (!strncmp(optarg, "pcap:", 5)) {
g.dev_type = DEV_PCAP;
- g.ifname = optarg + 5;
- } else if (!strncmp(optarg, "netmap:", 7)) {
+ strcpy(g.ifname, optarg + 5);
+ } else if (!strncmp(optarg, "netmap:", 7) ||
+ !strncmp(optarg, "vale", 4)) {
g.dev_type = DEV_NETMAP;
- g.ifname = optarg + 7;
} else if (!strncmp(optarg, "tap", 3)) {
g.dev_type = DEV_TAP;
- } else {
+ } else { /* prepend netmap: */
g.dev_type = DEV_NETMAP;
+ sprintf(g.ifname, "netmap:%s", optarg);
}
break;
@@ -1618,18 +1648,6 @@ main(int arc, char **argv)
g.options |= OPT_INDIRECT; /* XXX use indirect buffer */
break;
- case 't': /* send, deprecated */
- D("-t deprecated, please use -f tx -n %s", optarg);
- g.td_body = sender_body;
- g.npackets = atoi(optarg);
- break;
-
- case 'r': /* receive */
- D("-r deprecated, please use -f rx -n %s", optarg);
- g.td_body = receiver_body;
- g.npackets = atoi(optarg);
- break;
-
case 'l': /* pkt_size */
g.pkt_size = atoi(optarg);
break;
@@ -1686,8 +1704,8 @@ main(int arc, char **argv)
case 'H':
g.virt_header = atoi(optarg);
break;
- case 'h':
- g.host_ring = 1;
+ case 'e': /* extra bufs */
+ g.extra_bufs = atoi(optarg);
break;
}
}
@@ -1759,42 +1777,33 @@ main(int arc, char **argv)
} else if (g.dummy_send) { /* but DEV_NETMAP */
D("using a dummy send routine");
} else {
- bzero(&nmr, sizeof(nmr));
- nmr.nr_version = NETMAP_API;
+ struct nm_desc base_nmd;
+
+ bzero(&base_nmd, sizeof(base_nmd));
+
+ g.nmd_flags = 0;
+ g.nmd_flags |= parse_nmr_config(g.nmr_config, &base_nmd.req);
+ if (g.extra_bufs) {
+ base_nmd.req.nr_arg3 = g.extra_bufs;
+ g.nmd_flags |= NM_OPEN_ARG3;
+ }
+
/*
- * Open the netmap device to fetch the number of queues of our
- * interface.
+ * Open the netmap device using nm_open().
*
- * The first NIOCREGIF also detaches the card from the
- * protocol stack and may cause a reset of the card,
+ * This detaches the card from the protocol stack and may reset it,
* which in turn may take some time for the PHY to
- * reconfigure.
- */
- g.main_fd = open("/dev/netmap", O_RDWR);
- if (g.main_fd == -1) {
- D("Unable to open /dev/netmap: %s", strerror(errno));
- // fail later
- }
- /*
- * Register the interface on the netmap device: from now on,
- * we can operate on the network interface without any
- * interference from the legacy network stack.
- *
- * We decide to put the first interface registration here to
- * give time to cards that take a long time to reset the PHY.
+ * reconfigure. We do the open here to have time to reset.
*/
- bzero(&nmr, sizeof(nmr));
- nmr.nr_version = NETMAP_API;
- strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name));
- parse_nmr_config(g.nmr_config, &nmr);
- if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) {
- D("Unable to register interface %s: %s", g.ifname, strerror(errno));
- //continue, fail later
+ g.nmd = nm_open(g.ifname, NULL, g.nmd_flags, &base_nmd);
+ if (g.nmd == NULL) {
+ D("Unable to open %s: %s", g.ifname, strerror(errno));
+ goto out;
}
- ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname,
- nmr.nr_tx_rings, nmr.nr_tx_slots,
- nmr.nr_rx_rings, nmr.nr_rx_slots);
- devqueues = nmr.nr_rx_rings;
+ g.main_fd = g.nmd->fd;
+ D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem);
+
+ devqueues = g.nmd->req.nr_rx_rings;
/* validate provided nthreads. */
if (g.nthreads < 1 || g.nthreads > devqueues) {
// continue, fail later
}
- /*
- * Map the netmap shared memory: instead of issuing mmap()
- * inside the body of the threads, we prefer to keep this
- * operation here to simplify the thread logic.
- */
- D("mapping %d Kbytes", nmr.nr_memsize>>10);
- g.mmap_size = nmr.nr_memsize;
- g.mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize,
- PROT_WRITE | PROT_READ,
- MAP_SHARED, g.main_fd, 0);
- if (g.mmap_addr == MAP_FAILED) {
- D("Unable to mmap %d KB: %s", nmr.nr_memsize >> 10, strerror(errno));
- // continue, fail later
- }
-
if (verbose) {
- struct netmap_if *nifp = NETMAP_IF(g.mmap_addr, nmr.nr_offset);
+ struct netmap_if *nifp = g.nmd->nifp;
+ struct nmreq *req = &g.nmd->req;
- D("nifp at offset %d, %d tx %d rx rings %s",
- nmr.nr_offset, nmr.nr_tx_rings, nmr.nr_rx_rings,
- nmr.nr_ringid & NETMAP_PRIV_MEM ? "PRIVATE" : "common" );
- for (i = 0; i <= nmr.nr_tx_rings; i++) {
+ D("nifp at offset %d, %d tx %d rx region %d",
+ req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
+ req->nr_arg2);
+ for (i = 0; i <= req->nr_tx_rings; i++) {
D(" TX%d at 0x%lx", i,
(char *)NETMAP_TXRING(nifp, i) - (char *)nifp);
}
- for (i = 0; i <= nmr.nr_rx_rings; i++) {
+ for (i = 0; i <= req->nr_rx_rings; i++) {
D(" RX%d at 0x%lx", i,
(char *)NETMAP_RXRING(nifp, i) - (char *)nifp);
}
@@ -1846,7 +1841,8 @@ main(int arc, char **argv)
g.src_ip.name, g.dst_ip.name,
g.src_mac.name, g.dst_mac.name);
}
-
+
+out:
/* Exit if something went wrong. */
if (g.main_fd < 0) {
D("aborting");
@@ -1854,7 +1850,7 @@ main(int arc, char **argv)
}
}
-
+
if (g.options) {
D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
g.options & OPT_PREFETCH ? " prefetch" : "",
diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c
index eb6c48d15a04..e1d8da568063 100644
--- a/tools/tools/netmap/vale-ctl.c
+++ b/tools/tools/netmap/vale-ctl.c
@@ -33,6 +33,7 @@
#include <unistd.h> /* close */
#include <sys/ioctl.h> /* ioctl */
#include <sys/param.h>
+#include <sys/socket.h> /* apple needs sockaddr */
#include <net/if.h> /* ifreq */
#include <net/netmap.h>
#include <net/netmap_user.h>