about summary refs log tree commit diff
diff options
context:
space:
mode:
author Mark Johnston <markj@FreeBSD.org> 2023-04-05 16:12:30 +0000
committer Mark Johnston <markj@FreeBSD.org> 2023-04-05 16:12:30 +0000
commit ce12afaa6fff46c9027eb6b2bd515a4e46ecefc9 (patch)
tree 7a27252e13eff877ce257a95e4b4b19f1cfd0e08
parent da4068c4e1a090915a1a763c0d9fa5dc5b80842c (diff)
download src-ce12afaa6fff46c9027eb6b2bd515a4e46ecefc9.tar.gz
download src-ce12afaa6fff46c9027eb6b2bd515a4e46ecefc9.zip
netmap: Fix queue stalls with generic interfaces
In emulated mode, the FreeBSD netmap port attempts to perform zero-copy
transmission. This works as follows: the kernel ring is populated with
mbuf headers to which netmap buffers are attached. When transmitting,
the mbuf refcount is initialized to 2, and when the counter value has
been decremented to 1 netmap infers that the driver has freed the mbuf
and thus transmission is complete.

This scheme does not generalize to the situation where netmap is
attaching to a software interface which may transmit packets among
multiple "queues", as is the case with bridge or lagg interfaces. In
that case, we would be relying on backing hardware drivers to free
transmitted mbufs promptly, but this isn't guaranteed; a driver may
reasonably defer freeing a small number of transmitted buffers
indefinitely. If such a buffer ends up at the tail of a netmap transmit
ring, further transmits can end up blocked indefinitely.

Fix the problem by removing the zero-copy scheme (which is also not
implemented in the Linux port of netmap). Instead, the kernel ring is
populated with regular mbuf clusters into which netmap buffers are
copied by nm_os_generic_xmit_frame(). The refcounting scheme is
preserved, and this lets us avoid allocating a fresh cluster per
transmitted packet in the common case. If the transmit ring is full, a
callout is used to free the "stuck" mbuf, avoiding the queue deadlock
described above.

Furthermore, when recycling mbuf clusters, be sure to fully
reinitialize the mbuf header instead of simply re-setting M_PKTHDR.
Some software interfaces, like if_vlan, may set fields in the header
which should be reset before the mbuf is reused.

Reviewed by:	vmaffione
MFC after:	1 month
Sponsored by:	Zenarmor
Sponsored by:	OPNsense
Sponsored by:	Klara, Inc.
Differential Revision:	https://reviews.freebsd.org/D38065
-rw-r--r--	sys/dev/netmap/netmap_freebsd.c	| 35
-rw-r--r--	sys/dev/netmap/netmap_generic.c	| 58
-rw-r--r--	sys/dev/netmap/netmap_kern.h	| 53
3 files changed, 107 insertions(+), 39 deletions(-)
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index 4b3b1a6edacd..e67d26d6788f 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -387,15 +387,20 @@ nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept)
* addr and len identify the netmap buffer, m is the (preallocated)
* mbuf to use for transmissions.
*
- * We should add a reference to the mbuf so the m_freem() at the end
- * of the transmission does not consume resources.
+ * Zero-copy transmission is possible if netmap is attached directly to a
+ * hardware interface: when cleaning we simply wait for the mbuf cluster
+ * refcount to decrement to 1, indicating that the driver has completed
+ * transmission and is done with the buffer. However, this approach can
+ * lead to queue deadlocks when attaching to software interfaces (e.g.,
+ * if_bridge) since we cannot rely on member ports to promptly reclaim
+ * transmitted mbufs. Since there is no easy way to distinguish these
+ * cases, we currently always copy the buffer.
*
- * On FreeBSD, and on multiqueue cards, we can force the queue using
+ * On multiqueue cards, we can force the queue using
* if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
* i = m->m_pkthdr.flowid % adapter->num_queues;
* else
* i = curcpu % adapter->num_queues;
- *
*/
int
nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
@@ -405,16 +410,21 @@ nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
if_t ifp = a->ifp;
struct mbuf *m = a->m;
- /* Link the external storage to
- * the netmap buffer, so that no copy is necessary. */
- m->m_ext.ext_buf = m->m_data = a->addr;
- m->m_ext.ext_size = len;
+ M_ASSERTPKTHDR(m);
+ KASSERT((m->m_flags & M_EXT) != 0,
+ ("%s: mbuf %p has no cluster", __func__, m));
- m->m_flags |= M_PKTHDR;
- m->m_len = m->m_pkthdr.len = len;
+ if (MBUF_REFCNT(m) != 1) {
+ nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m);
+ panic("in generic_xmit_frame");
+ }
+ if (unlikely(m->m_ext.ext_size < len)) {
+ nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len);
+ len = m->m_ext.ext_size;
+ }
- /* mbuf refcnt is not contended, no need to use atomic
- * (a memory barrier is enough). */
+ m_copyback(m, 0, len, a->addr);
+ m->m_len = m->m_pkthdr.len = len;
SET_MBUF_REFCNT(m, 2);
M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
m->m_pkthdr.flowid = a->ring_nr;
@@ -425,7 +435,6 @@ nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
return ret ? -1 : 0;
}
-
struct netmap_adapter *
netmap_getna(if_t ifp)
{
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index 77c56c74df65..f1c4fdd8b6ea 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -272,6 +272,7 @@ generic_netmap_unregister(struct netmap_adapter *na)
}
for_each_tx_kring(r, kring, na) {
+ callout_drain(&kring->tx_event_callout);
mtx_destroy(&kring->tx_event_lock);
if (kring->tx_pool == NULL) {
continue;
@@ -357,6 +358,9 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
}
mtx_init(&kring->tx_event_lock, "tx_event_lock",
NULL, MTX_SPIN);
+ callout_init_mtx(&kring->tx_event_callout,
+ &kring->tx_event_lock,
+ CALLOUT_RETURNUNLOCKED);
}
}
@@ -430,7 +434,7 @@ out:
* the NIC notifies the driver that transmission is completed.
*/
static void
-generic_mbuf_destructor(struct mbuf *m)
+generic_mbuf_dtor(struct mbuf *m)
{
struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m));
struct netmap_kring *kring;
@@ -473,7 +477,14 @@ generic_mbuf_destructor(struct mbuf *m)
if (++r == na->num_tx_rings) r = 0;
if (r == r_orig) {
+#ifndef __FreeBSD__
+ /*
+ * On FreeBSD this situation can arise if the tx_event
+ * callout handler cleared a stuck packet.
+ */
nm_prlim(1, "Cannot match event %p", m);
+#endif
+ nm_generic_mbuf_dtor(m);
return;
}
}
@@ -481,9 +492,7 @@ generic_mbuf_destructor(struct mbuf *m)
/* Second, wake up clients. They will reclaim the event through
* txsync. */
netmap_generic_irq(na, r, NULL);
-#ifdef __FreeBSD__
- void_mbuf_dtor(m);
-#endif
+ nm_generic_mbuf_dtor(m);
}
/* Record completed transmissions and update hwtail.
@@ -537,7 +546,6 @@ generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc)
}
/* The event has been consumed, we can go
* ahead. */
-
} else if (MBUF_REFCNT(m) != 1) {
/* This mbuf is still busy: its refcnt is 2. */
break;
@@ -577,6 +585,18 @@ ring_middle(u_int inf, u_int sup, u_int lim)
return e;
}
+#ifdef __FreeBSD__
+static void
+generic_tx_callout(void *arg)
+{
+ struct netmap_kring *kring = arg;
+
+ kring->tx_event = NULL;
+ mtx_unlock_spin(&kring->tx_event_lock);
+ netmap_generic_irq(kring->na, kring->ring_id, NULL);
+}
+#endif
+
static void
generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
{
@@ -620,8 +640,24 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
return;
}
- SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+ SET_MBUF_DESTRUCTOR(m, generic_mbuf_dtor);
kring->tx_event = m;
+#ifdef __FreeBSD__
+ /*
+ * Handle the possibility that the transmitted buffer isn't reclaimed
+ * within a bounded period of time. This can arise when transmitting
+ * out of multiple ports via a lagg or bridge interface, since the
+ * member ports may legitimately only free transmitted buffers in
+ * batches.
+ *
+ * The callout handler clears the stuck packet from the ring, allowing
+ * transmission to proceed. In the common case we let
+ * generic_mbuf_dtor() unstick the ring, allowing mbufs to be
+ * reused most of the time.
+ */
+ callout_reset_sbt_curcpu(&kring->tx_event_callout, SBT_1MS, 0,
+ generic_tx_callout, kring, 0);
+#endif
mtx_unlock_spin(&kring->tx_event_lock);
kring->tx_pool[e] = NULL;
@@ -631,10 +667,8 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
/* Decrement the refcount. This will free it if we lose the race
* with the driver. */
m_freem(m);
- smp_mb();
}
-
/*
* generic_netmap_txsync() transforms netmap buffers into mbufs
* and passes them to the standard device driver
@@ -712,6 +746,8 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
break;
}
IFRATE(rate_ctx.new.txrepl++);
+ } else {
+ nm_os_mbuf_reinit(m);
}
a.m = m;
@@ -783,9 +819,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
#endif
}
- /*
- * Second, reclaim completed buffers
- */
if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) {
/* No more available slots? Set a notification event
* on a netmap slot that will be cleaned in the future.
@@ -795,6 +828,9 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
generic_set_tx_event(kring, nm_i);
}
+ /*
+ * Second, reclaim completed buffers
+ */
generic_netmap_tx_clean(kring, gna->txqdisc);
return 0;
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 7c68c79c61ef..04b9c199e1dc 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -501,6 +501,9 @@ struct netmap_kring {
struct mbuf **tx_pool;
struct mbuf *tx_event; /* TX event used as a notification */
NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */
+#ifdef __FreeBSD__
+ struct callout tx_event_callout;
+#endif
struct mbq rx_queue; /* intercepted rx mbufs. */
uint32_t users; /* existing bindings for this ring */
@@ -2381,39 +2384,59 @@ ptnet_sync_tail(struct nm_csb_ktoa *ktoa, struct netmap_kring *kring)
*
* We allocate mbufs with m_gethdr(), since the mbuf header is needed
* by the driver. We also attach a customly-provided external storage,
- * which in this case is a netmap buffer. When calling m_extadd(), however
- * we pass a NULL address, since the real address (and length) will be
- * filled in by nm_os_generic_xmit_frame() right before calling
- * if_transmit().
+ * which in this case is a netmap buffer.
*
* The dtor function does nothing, however we need it since mb_free_ext()
* has a KASSERT(), checking that the mbuf dtor function is not NULL.
*/
-static void void_mbuf_dtor(struct mbuf *m) { }
+static inline void
+nm_generic_mbuf_dtor(struct mbuf *m)
+{
+ uma_zfree(zone_clust, m->m_ext.ext_buf);
+}
#define SET_MBUF_DESTRUCTOR(m, fn) do { \
(m)->m_ext.ext_free = (fn != NULL) ? \
- (void *)fn : (void *)void_mbuf_dtor; \
+ (void *)fn : (void *)nm_generic_mbuf_dtor; \
} while (0)
static inline struct mbuf *
-nm_os_get_mbuf(if_t ifp, int len)
+nm_os_get_mbuf(if_t ifp __unused, int len)
{
struct mbuf *m;
+ void *buf;
- (void)ifp;
- (void)len;
+ KASSERT(len <= MCLBYTES, ("%s: len %d", __func__, len));
m = m_gethdr(M_NOWAIT, MT_DATA);
- if (m == NULL) {
- return m;
+ if (__predict_false(m == NULL))
+ return (NULL);
+ buf = uma_zalloc(zone_clust, M_NOWAIT);
+ if (__predict_false(buf == NULL)) {
+ m_free(m);
+ return (NULL);
}
+ m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0,
+ EXT_NET_DRV);
+ return (m);
+}
- m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor,
- NULL, NULL, 0, EXT_NET_DRV);
-
- return m;
+static inline void
+nm_os_mbuf_reinit(struct mbuf *m)
+{
+ void *buf;
+
+ KASSERT((m->m_flags & M_EXT) != 0,
+ ("%s: mbuf %p has no external storage", __func__, m));
+ KASSERT(m->m_ext.ext_size == MCLBYTES,
+ ("%s: mbuf %p has wrong external storage size %u", __func__, m,
+ m->m_ext.ext_size));
+
+ buf = m->m_ext.ext_buf;
+ m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
+ m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0,
+ EXT_NET_DRV);
}
#endif /* __FreeBSD__ */