aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNavdeep Parhar <np@FreeBSD.org>2020-09-18 02:37:57 +0000
committerNavdeep Parhar <np@FreeBSD.org>2020-09-18 02:37:57 +0000
commitb092fd6c973dab0ffaca8144f435c91a2588aa21 (patch)
tree9b5da4e610c2c342cf6f29213d50ce839268af94
parent72cc43df1731748c43db18cd491bc4b39f38fd83 (diff)
downloadsrc-b092fd6c973dab0ffaca8144f435c91a2588aa21.tar.gz
src-b092fd6c973dab0ffaca8144f435c91a2588aa21.zip
if_vxlan(4): add support for hardware assisted checksumming, TSO, and RSS.
This lets a VXLAN pseudo-interface take advantage of hardware checksumming (tx and rx), TSO, and RSS if the NIC is capable of performing these operations on inner VXLAN traffic. A VXLAN interface inherits the capabilities of its vxlandev interface if one is specified or of the interface that hosts the vxlanlocal address. If other interfaces will carry traffic for that VXLAN then they must have the same hardware capabilities. On transmit, if_vxlan verifies that the outbound interface has the required capabilities and then translates the CSUM_ flags to their inner equivalents. This tells the hardware ifnet that it needs to operate on the inner frame and not the outer VXLAN headers. An event is generated when a VXLAN ifnet starts. This allows hardware drivers to configure their devices to expect VXLAN traffic on the specified incoming port. On receive, the hardware does RSS and checksum verification on the inner frame. if_vxlan now does a direct netisr dispatch to take full advantage of RSS. It is not very clear why it didn't do this already. Future work: Rx: it should be possible to avoid the first trip up the protocol stack to get the frame to if_vxlan just so it can decapsulate and requeue for a second trip up the stack. The hardware NIC driver could directly call an if_vxlan receive routine for VXLAN traffic instead. Rx: LRO. depends on what happens with the previous item. There will have to be a mechanism to indicate that it's time for if_vxlan to flush its LRO state. Reviewed by: kib@ Relnotes: Yes Sponsored by: Chelsio Communications Differential Revision: https://reviews.freebsd.org/D25873
Notes
Notes: svn path=/head/; revision=365870
-rw-r--r--share/man/man4/vxlan.439
-rw-r--r--share/man/man9/EVENTHANDLER.96
-rw-r--r--sys/net/if_vxlan.c490
-rw-r--r--sys/net/if_vxlan.h7
-rw-r--r--sys/netinet/ip_output.c12
-rw-r--r--sys/netinet6/ip6_output.c3
6 files changed, 530 insertions, 27 deletions
diff --git a/share/man/man4/vxlan.4 b/share/man/man4/vxlan.4
index fcac8f8f837b..1848897d97c9 100644
--- a/share/man/man4/vxlan.4
+++ b/share/man/man4/vxlan.4
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd December 31, 2017
+.Dd September 17, 2020
.Dt VXLAN 4
.Os
.Sh NAME
@@ -182,6 +182,39 @@ command may be used to reduce the MTU size on the
.Nm
interface to allow the encapsulated frame to fit in the
current MTU of the physical network.
+.Sh HARDWARE
+The
+.Nm
+driver supports hardware checksum offload (receive and transmit) and TSO on the
+encapsulated traffic over physical interfaces that support these features.
+The
+.Nm
+interface examines the
+.Cm vxlandev
+interface, if one is specified, or the interface hosting the
+.Cm vxlanlocal
+address, and configures its capabilities based on the hardware offload
+capabilities of that physical interface.
+If multiple physical interfaces will transmit or receive traffic for the
+.Nm
+then they all must have the same hardware capabilities.
+The transmit routine of a
+.Nm
+interface may fail with
+.Er ENXIO
+if an outbound physical interface does not support
+an offload that the
+.Nm
+interface is requesting.
+This can happen if there are multiple physical interfaces involved, with
+different hardware capabilities, or an interface capability was disabled after
+the
+.Nm
+interface had already started.
+.Pp
+At present, these devices are capable of generating checksums and performing TSO
+on the inner frames in hardware:
+.Xr cxgbe 4 .
.Sh EXAMPLES
Create a
.Nm
@@ -244,3 +277,7 @@ The
.Nm
driver was written by
.An Bryan Venteicher Aq bryanv@freebsd.org .
+Support for stateless hardware offloads was added by
+.An Navdeep Parhar Aq np@freebsd.org
+in
+.Fx 13.0 .
diff --git a/share/man/man9/EVENTHANDLER.9 b/share/man/man9/EVENTHANDLER.9
index 9fca3d414eac..c38bb3d6c93a 100644
--- a/share/man/man9/EVENTHANDLER.9
+++ b/share/man/man9/EVENTHANDLER.9
@@ -23,7 +23,7 @@
.\" SUCH DAMAGE.
.\" $FreeBSD$
.\"
-.Dd October 21, 2018
+.Dd September 17, 2020
.Dt EVENTHANDLER 9
.Os
.Sh NAME
@@ -389,6 +389,10 @@ Callback invoked when the vlan configuration has changed.
Callback invoked when a vlan is destroyed.
.It Vt vm_lowmem
Callbacks invoked when virtual memory is low.
+.It Vt vxlan_start
+Callback invoked when a vxlan interface starts.
+.It Vt vxlan_stop
+Callback invoked when a vxlan interface stops.
.It Vt watchdog_list
Callbacks invoked when the system watchdog timer is reinitialized.
.El
diff --git a/sys/net/if_vxlan.c b/sys/net/if_vxlan.c
index b75cfc41ad15..fd2f1d94b29e 100644
--- a/sys/net/if_vxlan.c
+++ b/sys/net/if_vxlan.c
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
* All rights reserved.
+ * Copyright (c) 2020, Chelsio Communications.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -60,6 +61,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_types.h>
#include <net/if_vxlan.h>
#include <net/netisr.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
@@ -70,6 +73,8 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+#include <netinet/in_fib.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
@@ -77,6 +82,9 @@ __FBSDID("$FreeBSD$");
struct vxlan_softc;
LIST_HEAD(vxlan_softc_head, vxlan_softc);
+struct sx vxlan_sx;
+SX_SYSINIT(vxlan, &vxlan_sx, "VXLAN global start/stop lock");
+
struct vxlan_socket_mc_info {
union vxlan_sockaddr vxlsomc_saddr;
union vxlan_sockaddr vxlsomc_gaddr;
@@ -92,6 +100,7 @@ struct vxlan_socket_mc_info {
sizeof(struct udphdr) - \
sizeof(struct vxlan_header) - \
ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN)
+#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU)
#define VXLAN_SO_MC_MAX_GROUPS 32
@@ -146,10 +155,14 @@ LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry);
struct vxlan_statistics {
uint32_t ftable_nospace;
uint32_t ftable_lock_upgrade_failed;
+ counter_u64_t txcsum;
+ counter_u64_t tso;
+ counter_u64_t rxcsum;
};
struct vxlan_softc {
struct ifnet *vxl_ifp;
+ int vxl_reqcap;
struct vxlan_socket *vxl_sock;
uint32_t vxl_vni;
union vxlan_sockaddr vxl_src_addr;
@@ -193,6 +206,10 @@ struct vxlan_softc {
char vxl_mc_ifname[IFNAMSIZ];
LIST_ENTRY(vxlan_softc) vxl_entry;
LIST_ENTRY(vxlan_softc) vxl_ifdetach_list;
+
+ /* For rate limiting errors on the tx fast path. */
+ struct timeval err_time;
+ int err_pps;
};
#define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p))
@@ -297,7 +314,10 @@ static int vxlan_setup_multicast_interface(struct vxlan_softc *);
static int vxlan_setup_multicast(struct vxlan_softc *);
static int vxlan_setup_socket(struct vxlan_softc *);
-static void vxlan_setup_interface(struct vxlan_softc *);
+#ifdef INET6
+static void vxlan_setup_zero_checksum_port(struct vxlan_softc *);
+#endif
+static void vxlan_setup_interface_hdrlen(struct vxlan_softc *);
static int vxlan_valid_init_config(struct vxlan_softc *);
static void vxlan_init_wait(struct vxlan_softc *);
static void vxlan_init_complete(struct vxlan_softc *);
@@ -347,9 +367,13 @@ static void vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *,
static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
const struct sockaddr *);
+static int vxlan_stats_alloc(struct vxlan_softc *);
+static void vxlan_stats_free(struct vxlan_softc *);
static void vxlan_set_default_config(struct vxlan_softc *);
static int vxlan_set_user_config(struct vxlan_softc *,
struct ifvxlanparam *);
+static int vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int);
+static void vxlan_set_hwcaps(struct vxlan_softc *);
static int vxlan_clone_create(struct if_clone *, int, caddr_t);
static void vxlan_clone_destroy(struct ifnet *);
@@ -1555,8 +1579,44 @@ out:
return (error);
}
+#ifdef INET6
static void
-vxlan_setup_interface(struct vxlan_softc *sc)
+vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
+{
+
+ if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
+ return;
+
+ MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
+ MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);
+
+ if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) {
+ if_printf(sc->vxl_ifp, "port %d in src address does not match "
+ "port %d in dst address, rfc6935_port (%d) not updated.\n",
+ ntohs(sc->vxl_src_addr.in6.sin6_port),
+ ntohs(sc->vxl_dst_addr.in6.sin6_port),
+ V_zero_checksum_port);
+ return;
+ }
+
+ if (V_zero_checksum_port != 0) {
+ if (V_zero_checksum_port !=
+ ntohs(sc->vxl_src_addr.in6.sin6_port)) {
+ if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
+ "%d, cannot set it to %d.\n", V_zero_checksum_port,
+ ntohs(sc->vxl_src_addr.in6.sin6_port));
+ }
+ return;
+ }
+
+ V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port);
+ if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
+ V_zero_checksum_port);
+}
+#endif
+
+static void
+vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
{
struct ifnet *ifp;
@@ -1655,9 +1715,11 @@ vxlan_init(void *xsc)
sc = xsc;
ifp = sc->vxl_ifp;
+ sx_xlock(&vxlan_sx);
VXLAN_WLOCK(sc);
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
VXLAN_WUNLOCK(sc);
+ sx_xunlock(&vxlan_sx);
return;
}
sc->vxl_flags |= VXLAN_FLAG_INIT;
@@ -1666,11 +1728,13 @@ vxlan_init(void *xsc)
if (vxlan_valid_init_config(sc) != 0)
goto out;
- vxlan_setup_interface(sc);
-
if (vxlan_setup_socket(sc) != 0)
goto out;
+#ifdef INET6
+ vxlan_setup_zero_checksum_port(sc);
+#endif
+
/* Initialize the default forwarding entry. */
vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
&sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
@@ -1682,8 +1746,12 @@ vxlan_init(void *xsc)
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_UP);
+
+ EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
+ ntohs(sc->vxl_src_addr.in4.sin_port));
out:
vxlan_init_complete(sc);
+ sx_xunlock(&vxlan_sx);
}
static void
@@ -1725,11 +1793,11 @@ vxlan_teardown_locked(struct vxlan_softc *sc)
struct ifnet *ifp;
struct vxlan_socket *vso;
- ifp = sc->vxl_ifp;
-
+ sx_assert(&vxlan_sx, SA_XLOCKED);
VXLAN_LOCK_WASSERT(sc);
MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN);
+ ifp = sc->vxl_ifp;
ifp->if_flags &= ~IFF_UP;
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
callout_stop(&sc->vxl_callout);
@@ -1738,6 +1806,8 @@ vxlan_teardown_locked(struct vxlan_softc *sc)
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_DOWN);
+ EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
+ ntohs(sc->vxl_src_addr.in4.sin_port));
if (vso != NULL) {
vxlan_socket_remove_softc(vso, sc);
@@ -1767,15 +1837,18 @@ static void
vxlan_teardown(struct vxlan_softc *sc)
{
+ sx_xlock(&vxlan_sx);
VXLAN_WLOCK(sc);
if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) {
vxlan_teardown_wait(sc);
VXLAN_WUNLOCK(sc);
+ sx_xunlock(&vxlan_sx);
return;
}
sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
vxlan_teardown_locked(sc);
+ sx_xunlock(&vxlan_sx);
}
static void
@@ -1907,6 +1980,7 @@ vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg)
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
+ vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@@ -1936,6 +2010,7 @@ vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg)
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
+ vxlan_setup_interface_hdrlen(sc);
error = 0;
} else
error = EBUSY;
@@ -2063,6 +2138,7 @@ vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg)
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
+ vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@@ -2284,6 +2360,14 @@ vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
ifp->if_mtu = ifr->ifr_mtu;
break;
+ case SIOCSIFCAP:
+ VXLAN_WLOCK(sc);
+ error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
+ if (error == 0)
+ vxlan_set_hwcaps(sc);
+ VXLAN_WUNLOCK(sc);
+ break;
+
default:
error = ether_ioctl(ifp, cmd, data);
break;
@@ -2335,6 +2419,48 @@ vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff,
}
#endif
+/*
+ * Return the CSUM_INNER_* equivalent of CSUM_* caps.
+ */
+static uint32_t
+csum_flags_to_inner_flags(uint32_t csum_flags_in, uint32_t encap)
+{
+ uint32_t csum_flags = CSUM_ENCAP_VXLAN;
+ const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;
+
+ /*
+ * csum_flags can request either v4 or v6 offload but not both.
+ * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO)
+ * so those bits are no good to detect the IP version. Other bits are
+ * always set with CSUM_TSO and we use those to figure out the IP
+ * version.
+ */
+ if (csum_flags_in & v4) {
+ if (csum_flags_in & CSUM_IP)
+ csum_flags |= CSUM_INNER_IP;
+ if (csum_flags_in & CSUM_IP_UDP)
+ csum_flags |= CSUM_INNER_IP_UDP;
+ if (csum_flags_in & CSUM_IP_TCP)
+ csum_flags |= CSUM_INNER_IP_TCP;
+ if (csum_flags_in & CSUM_IP_TSO)
+ csum_flags |= CSUM_INNER_IP_TSO;
+ } else {
+#ifdef INVARIANTS
+ const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP;
+
+ MPASS((csum_flags_in & v6) != 0);
+#endif
+ if (csum_flags_in & CSUM_IP6_UDP)
+ csum_flags |= CSUM_INNER_IP6_UDP;
+ if (csum_flags_in & CSUM_IP6_TCP)
+ csum_flags |= CSUM_INNER_IP6_TCP;
+ if (csum_flags_in & CSUM_IP6_TSO)
+ csum_flags |= CSUM_INNER_IP6_TSO;
+ }
+
+ return (csum_flags);
+}
+
static int
vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
struct mbuf *m)
@@ -2345,6 +2471,11 @@ vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
struct in_addr srcaddr, dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
+ struct route route, *ro;
+ struct sockaddr_in *sin;
+ uint32_t csum_flags;
+
+ NET_EPOCH_ASSERT();
ifp = sc->vxl_ifp;
srcaddr = sc->vxl_src_addr.in4.sin_addr;
@@ -2376,7 +2507,57 @@ vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
m->m_flags &= ~(M_MCAST | M_BCAST);
- error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL);
+ m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
+ if (m->m_pkthdr.csum_flags != 0) {
+ /*
+ * HW checksum (L3 and/or L4) or TSO has been requested. Look
+ * up the ifnet for the outbound route and verify that the
+ * outbound ifnet can perform the requested operation on the
+ * inner frame.
+ */
+ bzero(&route, sizeof(route));
+ ro = &route;
+ sin = (struct sockaddr_in *)&ro->ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_dst;
+ ro->ro_nh = fib4_lookup(RT_DEFAULT_FIB, ip->ip_dst, 0, NHR_NONE,
+ 0);
+ if (ro->ro_nh == NULL) {
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (EHOSTUNREACH);
+ }
+
+ csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
+ CSUM_ENCAP_VXLAN);
+ if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
+ csum_flags) {
+ if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
+ const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
+
+ if_printf(ifp, "interface %s is missing hwcaps "
+ "0x%08x, csum_flags 0x%08x -> 0x%08x, "
+ "hwassist 0x%08x\n", nh_ifp->if_xname,
+ csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
+ m->m_pkthdr.csum_flags, csum_flags,
+ (uint32_t)nh_ifp->if_hwassist);
+ }
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (ENXIO);
+ }
+ m->m_pkthdr.csum_flags = csum_flags;
+ if (csum_flags &
+ (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
+ counter_u64_add(sc->vxl_stats.txcsum, 1);
+ if (csum_flags & CSUM_INNER_TSO)
+ counter_u64_add(sc->vxl_stats.tso, 1);
+ }
+ } else
+ ro = NULL;
+ error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@@ -2402,6 +2583,11 @@ vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
const struct in6_addr *srcaddr, *dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
+ struct route_in6 route, *ro;
+ struct sockaddr_in6 *sin6;
+ uint32_t csum_flags;
+
+ NET_EPOCH_ASSERT();
ifp = sc->vxl_ifp;
srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
@@ -2429,22 +2615,67 @@ vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
- /*
- * XXX BMV We need support for RFC6935 before we can send and
- * receive IPv6 UDP packets with a zero checksum.
- */
- {
+ mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
+ m->m_flags &= ~(M_MCAST | M_BCAST);
+
+ ro = NULL;
+ m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
+ if (m->m_pkthdr.csum_flags != 0) {
+ /*
+ * HW checksum (L3 and/or L4) or TSO has been requested. Look
+ * up the ifnet for the outbound route and verify that the
+ * outbound ifnet can perform the requested operation on the
+ * inner frame.
+ */
+ bzero(&route, sizeof(route));
+ ro = &route;
+ sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_addr = ip6->ip6_dst;
+ ro->ro_nh = fib6_lookup(RT_DEFAULT_FIB, &ip6->ip6_dst, 0,
+ NHR_NONE, 0);
+ if (ro->ro_nh == NULL) {
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (EHOSTUNREACH);
+ }
+
+ csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
+ CSUM_ENCAP_VXLAN);
+ if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
+ csum_flags) {
+ if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
+ const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
+
+ if_printf(ifp, "interface %s is missing hwcaps "
+ "0x%08x, csum_flags 0x%08x -> 0x%08x, "
+ "hwassist 0x%08x\n", nh_ifp->if_xname,
+ csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
+ m->m_pkthdr.csum_flags, csum_flags,
+ (uint32_t)nh_ifp->if_hwassist);
+ }
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (ENXIO);
+ }
+ m->m_pkthdr.csum_flags = csum_flags;
+ if (csum_flags &
+ (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
+ counter_u64_add(sc->vxl_stats.txcsum, 1);
+ if (csum_flags & CSUM_INNER_TSO)
+ counter_u64_add(sc->vxl_stats.tso, 1);
+ }
+ } else if (ntohs(dstport) != V_zero_checksum_port) {
struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
+
hdr->uh_sum = in6_cksum_pseudo(ip6,
m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
-
- mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
- m->m_flags &= ~(M_MCAST | M_BCAST);
-
- error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL);
+ error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@@ -2593,8 +2824,30 @@ vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
m_clrprotoflags(m);
m->m_pkthdr.rcvif = ifp;
M_SETFIB(m, ifp->if_fib);
+ if (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN &&
+ ((ifp->if_capenable & IFCAP_RXCSUM &&
+ m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
+ (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
+ !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
+ uint32_t csum_flags = 0;
+
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
+ csum_flags |= CSUM_L3_CALC;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
+ csum_flags |= CSUM_L3_VALID;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
+ csum_flags |= CSUM_L4_CALC;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
+ csum_flags |= CSUM_L4_VALID;
+ m->m_pkthdr.csum_flags = csum_flags;
+ counter_u64_add(sc->vxl_stats.rxcsum, 1);
+ } else {
+ /* clear everything */
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.csum_data = 0;
+ }
- error = netisr_queue_src(NETISR_ETHER, 0, m);
+ error = netisr_dispatch(NETISR_ETHER, m);
*m0 = NULL;
out:
@@ -2602,6 +2855,48 @@ out:
return (error);
}
+static int
+vxlan_stats_alloc(struct vxlan_softc *sc)
+{
+ struct vxlan_statistics *stats = &sc->vxl_stats;
+
+ stats->txcsum = counter_u64_alloc(M_WAITOK);
+ if (stats->txcsum == NULL)
+ goto failed;
+
+ stats->tso = counter_u64_alloc(M_WAITOK);
+ if (stats->tso == NULL)
+ goto failed;
+
+ stats->rxcsum = counter_u64_alloc(M_WAITOK);
+ if (stats->rxcsum == NULL)
+ goto failed;
+
+ return (0);
+failed:
+ vxlan_stats_free(sc);
+ return (ENOMEM);
+}
+
+static void
+vxlan_stats_free(struct vxlan_softc *sc)
+{
+ struct vxlan_statistics *stats = &sc->vxl_stats;
+
+ if (stats->txcsum != NULL) {
+ counter_u64_free(stats->txcsum);
+ stats->txcsum = NULL;
+ }
+ if (stats->tso != NULL) {
+ counter_u64_free(stats->tso);
+ stats->tso = NULL;
+ }
+ if (stats->rxcsum != NULL) {
+ counter_u64_free(stats->rxcsum);
+ stats->rxcsum = NULL;
+ }
+}
+
static void
vxlan_set_default_config(struct vxlan_softc *sc)
{
@@ -2722,6 +3017,142 @@ vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp)
}
static int
+vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
+{
+ int mask = reqcap ^ ifp->if_capenable;
+
+ /* Disable TSO if tx checksums are disabled. */
+ if (mask & IFCAP_TXCSUM && !(reqcap & IFCAP_TXCSUM) &&
+ reqcap & IFCAP_TSO4) {
+ reqcap &= ~IFCAP_TSO4;
+ if_printf(ifp, "tso4 disabled due to -txcsum.\n");
+ }
+ if (mask & IFCAP_TXCSUM_IPV6 && !(reqcap & IFCAP_TXCSUM_IPV6) &&
+ reqcap & IFCAP_TSO6) {
+ reqcap &= ~IFCAP_TSO6;
+ if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
+ }
+
+ /* Do not enable TSO if tx checksums are disabled. */
+ if (mask & IFCAP_TSO4 && reqcap & IFCAP_TSO4 &&
+ !(reqcap & IFCAP_TXCSUM)) {
+ if_printf(ifp, "enable txcsum first.\n");
+ return (EAGAIN);
+ }
+ if (mask & IFCAP_TSO6 && reqcap & IFCAP_TSO6 &&
+ !(reqcap & IFCAP_TXCSUM_IPV6)) {
+ if_printf(ifp, "enable txcsum6 first.\n");
+ return (EAGAIN);
+ }
+
+ sc->vxl_reqcap = reqcap;
+ return (0);
+}
+
+/*
+ * A VXLAN interface inherits the capabilities of the vxlandev or the interface
+ * hosting the vxlanlocal address.
+ */
+static void
+vxlan_set_hwcaps(struct vxlan_softc *sc)
+{
+ struct epoch_tracker et;
+ struct ifnet *p;
+ struct ifaddr *ifa;
+ u_long hwa;
+ int cap, ena;
+ bool rel;
+ struct ifnet *ifp = sc->vxl_ifp;
+
+ /* reset caps */
+ ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
+ ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
+ ifp->if_hwassist = 0;
+
+ NET_EPOCH_ENTER(et);
+ CURVNET_SET(ifp->if_vnet);
+
+ rel = false;
+ p = NULL;
+ if (sc->vxl_mc_ifname[0] != '\0') {
+ rel = true;
+ p = ifunit_ref(sc->vxl_mc_ifname);
+ } else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
+ if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
+ struct sockaddr_in in4 = sc->vxl_src_addr.in4;
+
+ in4.sin_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
+ if (ifa != NULL)
+ p = ifa->ifa_ifp;
+ } else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
+ struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;
+
+ in6.sin6_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
+ if (ifa != NULL)
+ p = ifa->ifa_ifp;
+ }
+ }
+ if (p == NULL)
+ goto done;
+
+ cap = ena = hwa = 0;
+
+ /* checksum offload */
+ if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
+ cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
+ if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
+ ena |= sc->vxl_reqcap & p->if_capenable &
+ (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
+ if (ena & IFCAP_TXCSUM) {
+ if (p->if_hwassist & CSUM_INNER_IP)
+ hwa |= CSUM_IP;
+ if (p->if_hwassist & CSUM_INNER_IP_UDP)
+ hwa |= CSUM_IP_UDP;
+ if (p->if_hwassist & CSUM_INNER_IP_TCP)
+ hwa |= CSUM_IP_TCP;
+ }
+ if (ena & IFCAP_TXCSUM_IPV6) {
+ if (p->if_hwassist & CSUM_INNER_IP6_UDP)
+ hwa |= CSUM_IP6_UDP;
+ if (p->if_hwassist & CSUM_INNER_IP6_TCP)
+ hwa |= CSUM_IP6_TCP;
+ }
+ }
+
+ /* hardware TSO */
+ if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
+ cap |= p->if_capabilities & IFCAP_TSO;
+ if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
+ ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
+ else
+ ifp->if_hw_tsomax = p->if_hw_tsomax;
+ /* XXX: tsomaxsegcount decrement is cxgbe specific */
+ ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
+ ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
+ }
+ if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
+ ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
+ if (ena & IFCAP_TSO) {
+ if (p->if_hwassist & CSUM_INNER_IP_TSO)
+ hwa |= CSUM_IP_TSO;
+ if (p->if_hwassist & CSUM_INNER_IP6_TSO)
+ hwa |= CSUM_IP6_TSO;
+ }
+ }
+
+ ifp->if_capabilities |= cap;
+ ifp->if_capenable |= ena;
+ ifp->if_hwassist |= hwa;
+ if (rel)
+ if_rele(p);
+done:
+ CURVNET_RESTORE();
+ NET_EPOCH_EXIT(et);
+}
+
+static int
vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct vxlan_softc *sc;
@@ -2732,6 +3163,9 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
sc->vxl_unit = unit;
vxlan_set_default_config(sc);
+ error = vxlan_stats_alloc(sc);
+ if (error != 0)
+ goto fail;
if (params != 0) {
error = copyin(params, &vxlp, sizeof(vxlp));
@@ -2764,8 +3198,10 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
ifp->if_ioctl = vxlan_ioctl;
ifp->if_transmit = vxlan_transmit;
ifp->if_qflush = vxlan_qflush;
- ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
- ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
+ ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
+ ifp->if_capenable = VXLAN_BASIC_IFCAPS;
+ sc->vxl_reqcap = -1;
+ vxlan_set_hwcaps(sc);
ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
@@ -2775,7 +3211,7 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
ether_ifattach(ifp, sc->vxl_hwaddr.octet);
ifp->if_baudrate = 0;
- ifp->if_hdrlen = 0;
+ vxlan_setup_interface_hdrlen(sc);
return (0);
@@ -2803,6 +3239,7 @@ vxlan_clone_destroy(struct ifnet *ifp)
vxlan_sysctl_destroy(sc);
rm_destroy(&sc->vxl_lock);
+ vxlan_stats_free(sc);
free(sc, M_VXLAN);
}
@@ -3087,6 +3524,15 @@ vxlan_sysctl_setup(struct vxlan_softc *sc)
"ftable_lock_upgrade_failed", CTLFLAG_RD,
&stats->ftable_lock_upgrade_failed, 0,
"Forwarding table update required lock upgrade");
+
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum",
+ CTLFLAG_RD, &stats->txcsum,
+ "# of times hardware assisted with tx checksum");
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso",
+ CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO");
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum",
+ CTLFLAG_RD, &stats->rxcsum,
+ "# of times hardware assisted with rx checksum");
}
static void
@@ -3131,10 +3577,12 @@ vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp)
LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) {
LIST_REMOVE(sc, vxl_ifdetach_list);
+ sx_xlock(&vxlan_sx);
VXLAN_WLOCK(sc);
if (sc->vxl_flags & VXLAN_FLAG_INIT)
vxlan_init_wait(sc);
vxlan_teardown_locked(sc);
+ sx_xunlock(&vxlan_sx);
}
}
diff --git a/sys/net/if_vxlan.h b/sys/net/if_vxlan.h
index 98b4d9026249..01c73a9307c3 100644
--- a/sys/net/if_vxlan.h
+++ b/sys/net/if_vxlan.h
@@ -143,4 +143,11 @@ struct ifvxlancmd {
char vxlcmd_ifname[IFNAMSIZ];
};
+#ifdef _KERNEL
+typedef void (*vxlan_event_handler_t)(void *, struct ifnet *, sa_family_t,
+ u_int);
+EVENTHANDLER_DECLARE(vxlan_start, vxlan_event_handler_t);
+EVENTHANDLER_DECLARE(vxlan_stop, vxlan_event_handler_t);
+#endif
+
#endif /* _NET_IF_VXLAN_H_ */
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index a37201ed237f..c1dd6bad6c6d 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -767,9 +767,13 @@ sendit:
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
+ * Note that if_vxlan could have requested TSO even though the outer
+ * frame is UDP. It is correct to not fragment such datagrams and
+ * instead just pass them on to the driver.
*/
if (ip_len <= mtu ||
- (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
+ (m->m_pkthdr.csum_flags & ifp->if_hwassist &
+ (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
ip->ip_sum = 0;
if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
ip->ip_sum = in_cksum(m, hlen);
@@ -783,7 +787,8 @@ sendit:
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
- if (m->m_pkthdr.csum_flags & CSUM_TSO)
+ if (m->m_pkthdr.csum_flags &
+ (CSUM_TSO | CSUM_INNER_TSO))
counter_u64_add(ia->ia_ifa.ifa_opackets,
m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
else
@@ -807,7 +812,8 @@ sendit:
}
/* Balk when DF bit is set or the interface didn't support TSO. */
- if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
+ if ((ip_off & IP_DF) ||
+ (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index 9d37cf84bde6..6be710085b87 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -1119,7 +1119,8 @@ passout:
*/
sw_csum = m->m_pkthdr.csum_flags;
if (!hdrsplit) {
- tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
+ tso = ((sw_csum & ifp->if_hwassist &
+ (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
sw_csum &= ~ifp->if_hwassist;
} else
tso = 0;