diff options
author | Andrew Gallatin <gallatin@FreeBSD.org> | 2016-08-01 17:02:21 +0000 |
---|---|---|
committer | Andrew Gallatin <gallatin@FreeBSD.org> | 2016-08-01 17:02:21 +0000 |
commit | d4c22202e61fe1c5cad7120ddb56d095cecf3472 (patch) | |
tree | 2e27baa9740c1dc7e388ceac824b1ca404fa8806 /sys/netinet6 | |
parent | 70a3049ea54d505a8153acf16099202075e044bf (diff) | |
download | src-d4c22202e61fe1c5cad7120ddb56d095cecf3472.tar.gz src-d4c22202e61fe1c5cad7120ddb56d095cecf3472.zip |
Rework IPv6 TCP path MTU discovery to match IPv4
- Re-write tcp_ctlinput6() to closely mimic the IPv4 tcp_ctlinput()
- Now that tcp_ctlinput6() updates t_maxseg, we can allow ip6_output()
to send TCP packets without looking at the tcp host cache for every
single transmit.
- Make the icmp6 code mimic the IPv4 code & avoid returning
PRC_HOSTDEAD because it is so expensive.
Without these changes in place, every TCP6 pmtu discovery or host
unreachable ICMP resulted in a call to in6_pcbnotify() which walks the
tcbinfo table with the write lock held. Because the tcbinfo table is
shared between IPv4 and IPv6, this causes huge scalability issues on
servers with lots of (~100K) TCP connections, to the point where even
a small percentage of IPv6 traffic had a disproportionate impact on
overall throughput.
Reviewed by: bz, rrs, ae (all earlier versions), lstewart (in Netflix's tree)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D7272
Notes
Notes:
svn path=/head/; revision=303626
Diffstat (limited to 'sys/netinet6')
-rw-r--r-- | sys/netinet6/icmp6.c | 4 | ||||
-rw-r--r-- | sys/netinet6/ip6_output.c | 21 |
2 files changed, 14 insertions, 11 deletions
diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index 69bb60fcfeaa..6b6c92e2539f 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -485,15 +485,13 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: + case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; - case ICMP6_DST_UNREACH_ADDR: - code = PRC_HOSTDEAD; - break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index a768f74d033e..a393cb51aaac 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -150,9 +150,10 @@ static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, - struct ifnet *, const struct in6_addr *, u_long *, int *, u_int); + struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, + u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, - u_long *, int *); + u_long *, int *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); @@ -718,7 +719,7 @@ again: /* Determine path MTU. 
*/ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, - &mtu, &alwaysfrag, fibnum)) != 0) + &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) goto bad; /* @@ -1250,7 +1251,7 @@ ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) ifp = nh6.nh_ifp; mtu = nh6.nh_mtu; - error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL); + error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0); fib6_free_nh_ext(fibnum, &nh6); return (error); @@ -1269,7 +1270,7 @@ ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, - int *alwaysfragp, u_int fibnum) + int *alwaysfragp, u_int fibnum, u_int proto) { struct nhop6_basic nh6; struct in6_addr kdst; @@ -1307,7 +1308,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, if (ro_pmtu->ro_rt) mtu = ro_pmtu->ro_rt->rt_mtu; - return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp)); + return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } /* @@ -1319,7 +1320,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, */ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, - u_long *mtup, int *alwaysfragp) + u_long *mtup, int *alwaysfragp, u_int proto) { u_long mtu = 0; int alwaysfrag = 0; @@ -1334,7 +1335,11 @@ ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); - mtu = tcp_hc_getmtu(&inc); + + /* TCP is known to react to pmtu changes so skip hc */ + if (proto != IPPROTO_TCP) + mtu = tcp_hc_getmtu(&inc); + if (mtu) mtu = min(mtu, rt_mtu); else |