aboutsummaryrefslogtreecommitdiff
path: root/sys/netinet6/udp6_usrreq.c
diff options
context:
space:
mode:
authorBjoern A. Zeeb <bz@FreeBSD.org>2018-08-23 16:54:22 +0000
committerBjoern A. Zeeb <bz@FreeBSD.org>2018-08-23 16:54:22 +0000
commit20cc39d08565b2d8e372a50746cb1e95cd092b5c (patch)
tree0e2c1db584e7f27a1f321c9b9c78e08c99385d62 /sys/netinet6/udp6_usrreq.c
parent903eaa68f13b8a3b2d6fd035bb1376d7ac2247a7 (diff)
downloadsrc-20cc39d08565b2d8e372a50746cb1e95cd092b5c.tar.gz
src-20cc39d08565b2d8e372a50746cb1e95cd092b5c.zip
MFp4 bz_ipv6_fast:
Migrate udp6_send() v4mapped code to udp6_output() saving us a re-lock and further simplifying the address-family handling code by eliminating AF_INET checks and almost all v4mapped handling right after the start as cases could actually not happen anymore. Rework output path locking similar to UDP4 allowing for better parallelism (see r222488, and later versions). Sponsored by: The FreeBSD Foundation (2012) Sponsored by: iXsystems (2012) Differential Revision: https://reviews.freebsd.org/D3721
Notes
Notes: svn path=/head/; revision=338257
Diffstat (limited to 'sys/netinet6/udp6_usrreq.c')
-rw-r--r--sys/netinet6/udp6_usrreq.c406
1 files changed, 201 insertions, 205 deletions
diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c
index 08478d65e342..77cbc0b2ee2c 100644
--- a/sys/netinet6/udp6_usrreq.c
+++ b/sys/netinet6/udp6_usrreq.c
@@ -679,35 +679,38 @@ udp6_getcred(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
+#define UH_WLOCKED 2
+#define UH_RLOCKED 1
+#define UH_UNLOCKED 0
static int
-udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
- struct mbuf *control, struct thread *td)
+udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
+ struct sockaddr *addr6, struct mbuf *control, struct thread *td)
{
- u_int32_t ulen = m->m_pkthdr.len;
- u_int32_t plen = sizeof(struct udphdr) + ulen;
+ struct inpcbinfo *pcbinfo;
+ struct inpcb *inp;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr *laddr, *faddr, in6a;
- struct sockaddr_in6 *sin6 = NULL;
- int cscov_partial = 0;
- int scope_ambiguous = 0;
- u_short fport;
- int error = 0;
- uint8_t nxt;
- uint16_t cscov = 0;
struct ip6_pktopts *optp, opt;
- int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
- int flags;
- struct sockaddr_in6 tmp;
+ struct sockaddr_in6 *sin6, tmp;
+ struct epoch_tracker et;
+ int cscov_partial, error, flags, hlen, scope_ambiguous;
+ u_int32_t ulen, plen;
+ uint16_t cscov;
+ u_short fport;
+ uint8_t nxt, unlock_udbinfo;
- INP_WLOCK_ASSERT(inp);
- INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
+ /* addr6 has been validated in udp6_send(). */
+ sin6 = (struct sockaddr_in6 *)addr6;
- if (addr6) {
- /* addr6 has been validated in udp6_send(). */
- sin6 = (struct sockaddr_in6 *)addr6;
+ /*
+ * In contrast to to IPv4 we do not validate the max. packet length
+ * here due to IPv6 Jumbograms (RFC2675).
+ */
- /* protect *sin6 from overwrites */
+ scope_ambiguous = 0;
+ if (sin6) {
+ /* Protect *addr6 from overwrites. */
tmp = *sin6;
sin6 = &tmp;
@@ -721,22 +724,86 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
*/
if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
- if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
+ if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) {
+ if (control)
+ m_freem(control);
+ m_freem(m);
return (error);
+ }
}
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+ INP_RLOCK(inp);
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
+
+#ifdef INET
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+ int hasv4addr;
+
+ if (sin6 == NULL)
+ hasv4addr = (inp->inp_vflag & INP_IPV4);
+ else
+ hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
+ ? 1 : 0;
+ if (hasv4addr) {
+ struct pr_usrreqs *pru;
+
+ /*
+ * XXXRW: We release UDP-layer locks before calling
+ * udp_send() in order to avoid recursion. However,
+ * this does mean there is a short window where inp's
+ * fields are unstable. Could this lead to a
+ * potential race in which the factors causing us to
+ * select the UDPv4 output routine are invalidated?
+ */
+ INP_RUNLOCK(inp);
+ if (sin6)
+ in6_sin6_2_sin_in_sock((struct sockaddr *)sin6);
+ pru = inetsw[ip_protox[nxt]].pr_usrreqs;
+ /* addr will just be freed in sendit(). */
+ return ((*pru->pru_send)(so, flags_arg, m,
+ (struct sockaddr *)sin6, control, td));
+ }
+ }
+#endif
+
if (control) {
if ((error = ip6_setpktopts(control, &opt,
- inp->in6p_outputopts, td->td_ucred, nxt)) != 0)
- goto release;
+ inp->in6p_outputopts, td->td_ucred, nxt)) != 0) {
+ INP_RUNLOCK(inp);
+ ip6_clearpktopts(&opt, -1);
+ if (control)
+ m_freem(control);
+ m_freem(m);
+ return (error);
+ }
optp = &opt;
} else
optp = inp->in6p_outputopts;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
+ if (sin6 != NULL &&
+ IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) {
+ INP_RUNLOCK(inp);
+ /*
+ * XXX there is a short window here which could lead to a race;
+ * should we re-check that what got us here is still valid?
+ */
+ INP_WLOCK(inp);
+ INP_HASH_WLOCK(pcbinfo);
+ unlock_udbinfo = UH_WLOCKED;
+ } else if (sin6 != NULL &&
+ (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
+ IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
+ inp->inp_lport == 0)) {
+ INP_HASH_RLOCK_ET(pcbinfo, et);
+ unlock_udbinfo = UH_RLOCKED;
+ } else
+ unlock_udbinfo = UH_UNLOCKED;
+
if (sin6) {
- faddr = &sin6->sin6_addr;
/*
* Since we saw no essential reason for calling in_pcbconnect,
@@ -755,85 +822,47 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
goto release;
}
- fport = sin6->sin6_port; /* allow 0 port */
+ /*
+ * Given we handle the v4mapped case in the INET block above
+ * assert here that it must not happen anymore.
+ */
+ KASSERT(!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr),
+ ("%s: sin6(%p)->sin6_addr is v4mapped which we "
+ "should have handled.", __func__, sin6));
- if (IN6_IS_ADDR_V4MAPPED(faddr)) {
- if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
- /*
- * I believe we should explicitly discard the
- * packet when mapped addresses are disabled,
- * rather than send the packet as an IPv6 one.
- * If we chose the latter approach, the packet
- * might be sent out on the wire based on the
- * default route, the situation which we'd
- * probably want to avoid.
- * (20010421 jinmei@kame.net)
- */
- error = EINVAL;
- goto release;
- }
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
- !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
- /*
- * when remote addr is an IPv4-mapped address,
- * local addr should not be an IPv6 address,
- * since you cannot determine how to map IPv6
- * source address to IPv4.
- */
- error = EINVAL;
- goto release;
- }
+ /* This only requires read-locking. */
+ error = in6_selectsrc_socket(sin6, optp, inp,
+ td->td_ucred, scope_ambiguous, &in6a, NULL);
+ if (error)
+ goto release;
+ laddr = &in6a;
- af = AF_INET;
- }
+ if (inp->inp_lport == 0) {
- if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
- error = in6_selectsrc_socket(sin6, optp, inp,
- td->td_ucred, scope_ambiguous, &in6a, NULL);
- if (error)
+ INP_WLOCK_ASSERT(inp);
+ error = in6_pcbsetport(laddr, inp, td->td_ucred);
+ if (error != 0) {
+ /* Undo an address bind that may have occurred. */
+ inp->in6p_laddr = in6addr_any;
goto release;
- laddr = &in6a;
- } else
- laddr = &inp->in6p_laddr; /* XXX */
- if (laddr == NULL) {
- if (error == 0)
- error = EADDRNOTAVAIL;
- goto release;
- }
- if (inp->inp_lport == 0 &&
- (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) {
- /* Undo an address bind that may have occurred. */
- inp->in6p_laddr = in6addr_any;
- goto release;
+ }
}
+ faddr = &sin6->sin6_addr;
+ fport = sin6->sin6_port; /* allow 0 port */
+
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = ENOTCONN;
goto release;
}
- if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
- if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
- /*
- * XXX: this case would happen when the
- * application sets the V6ONLY flag after
- * connecting the foreign address.
- * Such applications should be fixed,
- * so we bark here.
- */
- log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
- "option was set for a connected socket\n");
- error = EINVAL;
- goto release;
- } else
- af = AF_INET;
- }
laddr = &inp->in6p_laddr;
faddr = &inp->in6p_faddr;
fport = inp->inp_fport;
}
- if (af == AF_INET)
- hlen = sizeof(struct ip);
+ ulen = m->m_pkthdr.len;
+ plen = sizeof(struct udphdr) + ulen;
+ hlen = sizeof(struct ip6_hdr);
/*
* Calculate data length and get a mbuf
@@ -848,6 +877,7 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
/*
* Stuff checksum and output datagram.
*/
+ cscov = cscov_partial = 0;
udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
@@ -870,59 +900,59 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
udp6->uh_ulen = 0;
udp6->uh_sum = 0;
- switch (af) {
- case AF_INET6:
- ip6 = mtod(m, struct ip6_hdr *);
- ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
- ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
- ip6->ip6_vfc |= IPV6_VERSION;
- ip6->ip6_plen = htons((u_short)plen);
- ip6->ip6_nxt = nxt;
- ip6->ip6_hlim = in6_selecthlim(inp, NULL);
- ip6->ip6_src = *laddr;
- ip6->ip6_dst = *faddr;
-
- if (cscov_partial) {
- if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
- sizeof(struct ip6_hdr), plen, cscov)) == 0)
- udp6->uh_sum = 0xffff;
- } else {
- udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
- m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
- m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
- }
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
+ ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+ ip6->ip6_vfc |= IPV6_VERSION;
+ ip6->ip6_plen = htons((u_short)plen);
+ ip6->ip6_nxt = nxt;
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ ip6->ip6_src = *laddr;
+ ip6->ip6_dst = *faddr;
+
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+ if (cscov_partial) {
+ if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
+ sizeof(struct ip6_hdr), plen, cscov)) == 0)
+ udp6->uh_sum = 0xffff;
+ } else {
+ udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
+ m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ }
+
+ flags = 0;
#ifdef RSS
- {
- uint32_t hash_val, hash_type;
- uint8_t pr;
+ {
+ uint32_t hash_val, hash_type;
+ uint8_t pr;
- pr = inp->inp_socket->so_proto->pr_protocol;
- /*
- * Calculate an appropriate RSS hash for UDP and
- * UDP Lite.
- *
- * The called function will take care of figuring out
- * whether a 2-tuple or 4-tuple hash is required based
- * on the currently configured scheme.
- *
- * Later later on connected socket values should be
- * cached in the inpcb and reused, rather than constantly
- * re-calculating it.
- *
- * UDP Lite is a different protocol number and will
- * likely end up being hashed as a 2-tuple until
- * RSS / NICs grow UDP Lite protocol awareness.
- */
- if (rss_proto_software_hash_v6(faddr, laddr, fport,
- inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
- m->m_pkthdr.flowid = hash_val;
- M_HASHTYPE_SET(m, hash_type);
- }
+ pr = inp->inp_socket->so_proto->pr_protocol;
+ /*
+ * Calculate an appropriate RSS hash for UDP and
+ * UDP Lite.
+ *
+ * The called function will take care of figuring out
+ * whether a 2-tuple or 4-tuple hash is required based
+ * on the currently configured scheme.
+ *
+ * Later later on connected socket values should be
+ * cached in the inpcb and reused, rather than constantly
+ * re-calculating it.
+ *
+ * UDP Lite is a different protocol number and will
+ * likely end up being hashed as a 2-tuple until
+ * RSS / NICs grow UDP Lite protocol awareness.
+ */
+ if (rss_proto_software_hash_v6(faddr, laddr, fport,
+ inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
+ m->m_pkthdr.flowid = hash_val;
+ M_HASHTYPE_SET(m, hash_type);
}
-#endif
- flags = 0;
-#ifdef RSS
+
/*
* Don't override with the inp cached flowid.
*
@@ -932,28 +962,43 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
flags |= IP_NODEFAULTFLOWID;
#endif
- if (nxt == IPPROTO_UDPLITE)
- UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);
- else
- UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
- UDPSTAT_INC(udps_opackets);
- error = ip6_output(m, optp, &inp->inp_route6, flags,
- inp->in6p_moptions, NULL, inp);
- break;
- case AF_INET:
- error = EAFNOSUPPORT;
- goto release;
+ UDPSTAT_INC(udps_opackets);
+ if (unlock_udbinfo == UH_WLOCKED)
+ INP_HASH_WUNLOCK(pcbinfo);
+ else if (unlock_udbinfo == UH_RLOCKED)
+ INP_HASH_RUNLOCK_ET(pcbinfo, et);
+ if (nxt == IPPROTO_UDPLITE)
+ UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);
+ else
+ UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
+ error = ip6_output(m, optp, &inp->inp_route6, flags,
+ inp->in6p_moptions, NULL, inp);
+ if (unlock_udbinfo == UH_WLOCKED)
+ INP_WUNLOCK(inp);
+ else
+ INP_RUNLOCK(inp);
+
+ if (control) {
+ ip6_clearpktopts(&opt, -1);
+ m_freem(control);
}
- goto releaseopt;
+ return (error);
release:
- m_freem(m);
-
-releaseopt:
+ if (unlock_udbinfo == UH_WLOCKED) {
+ INP_HASH_WUNLOCK(pcbinfo);
+ INP_WUNLOCK(inp);
+ } else if (unlock_udbinfo == UH_RLOCKED) {
+ INP_HASH_RUNLOCK_ET(pcbinfo, et);
+ INP_RUNLOCK(inp);
+ } else
+ INP_RUNLOCK(inp);
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
+ m_freem(m);
+
return (error);
}
@@ -1257,15 +1302,8 @@ static int
udp6_send(struct socket *so, int flags, struct mbuf *m,
struct sockaddr *addr, struct mbuf *control, struct thread *td)
{
- struct inpcb *inp;
- struct inpcbinfo *pcbinfo;
- int error = 0;
-
- pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
+ int error;
- INP_WLOCK(inp);
if (addr) {
if (addr->sa_len != sizeof(struct sockaddr_in6)) {
error = EINVAL;
@@ -1277,53 +1315,11 @@ udp6_send(struct socket *so, int flags, struct mbuf *m,
}
}
-#ifdef INET
- if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
- int hasv4addr;
- struct sockaddr_in6 *sin6 = NULL;
-
- if (addr == NULL)
- hasv4addr = (inp->inp_vflag & INP_IPV4);
- else {
- sin6 = (struct sockaddr_in6 *)addr;
- hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
- ? 1 : 0;
- }
- if (hasv4addr) {
- struct pr_usrreqs *pru;
- uint8_t nxt;
-
- nxt = (inp->inp_socket->so_proto->pr_protocol ==
- IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE;
- /*
- * XXXRW: We release UDP-layer locks before calling
- * udp_send() in order to avoid recursion. However,
- * this does mean there is a short window where inp's
- * fields are unstable. Could this lead to a
- * potential race in which the factors causing us to
- * select the UDPv4 output routine are invalidated?
- */
- INP_WUNLOCK(inp);
- if (sin6)
- in6_sin6_2_sin_in_sock(addr);
- pru = inetsw[ip_protox[nxt]].pr_usrreqs;
- /* addr will just be freed in sendit(). */
- return ((*pru->pru_send)(so, flags, m, addr, control,
- td));
- }
- }
-#endif
-#ifdef MAC
- mac_inpcb_create_mbuf(inp, m);
-#endif
- INP_HASH_WLOCK(pcbinfo);
- error = udp6_output(inp, m, addr, control, td);
- INP_HASH_WUNLOCK(pcbinfo);
- INP_WUNLOCK(inp);
- return (error);
+ return (udp6_output(so, flags, m, addr, control, td));
bad:
- INP_WUNLOCK(inp);
+ if (control)
+ m_freem(control);
m_freem(m);
return (error);
}