aboutsummaryrefslogtreecommitdiff
path: root/sys/netinet6
diff options
context:
space:
mode:
authorAndrew Gallatin <gallatin@FreeBSD.org>2016-08-01 17:02:21 +0000
committerAndrew Gallatin <gallatin@FreeBSD.org>2016-08-01 17:02:21 +0000
commitd4c22202e61fe1c5cad7120ddb56d095cecf3472 (patch)
tree2e27baa9740c1dc7e388ceac824b1ca404fa8806 /sys/netinet6
parent70a3049ea54d505a8153acf16099202075e044bf (diff)
downloadsrc-d4c22202e61fe1c5cad7120ddb56d095cecf3472.tar.gz
src-d4c22202e61fe1c5cad7120ddb56d095cecf3472.zip
Rework IPv6 TCP path MTU discovery to match IPv4
- Re-write tcp_ctlinput6() to closely mimic the IPv4 tcp_ctlinput() - Now that tcp_ctlinput6() updates t_maxseg, we can allow ip6_output() to send TCP packets without looking at the tcp host cache for every single transmit. - Make the icmp6 code mimic the IPv4 code & avoid returning PRC_HOSTDEAD because it is so expensive. Without these changes in place, every TCP6 pmtu discovery or host unreachable ICMP resulted in a call to in6_pcbnotify() which walks the tcbinfo table with the write lock held. Because the tcbinfo table is shared between IPv4 and IPv6, this causes huge scalability issues on servers with lots of (~100K) TCP connections, to the point where even a small percent of IPv6 traffic had a disproportionate impact on overall throughput. Reviewed by: bz, rrs, ae (all earlier versions), lstewart (in Netflix's tree) Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D7272
Notes
Notes: svn path=/head/; revision=303626
Diffstat (limited to 'sys/netinet6')
-rw-r--r--sys/netinet6/icmp6.c4
-rw-r--r--sys/netinet6/ip6_output.c21
2 files changed, 14 insertions, 11 deletions
diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c
index 69bb60fcfeaa..6b6c92e2539f 100644
--- a/sys/netinet6/icmp6.c
+++ b/sys/netinet6/icmp6.c
@@ -485,15 +485,13 @@ icmp6_input(struct mbuf **mp, int *offp, int proto)
icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
switch (code) {
case ICMP6_DST_UNREACH_NOROUTE:
+ case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */
code = PRC_UNREACH_NET;
break;
case ICMP6_DST_UNREACH_ADMIN:
icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
break;
- case ICMP6_DST_UNREACH_ADDR:
- code = PRC_HOSTDEAD;
- break;
case ICMP6_DST_UNREACH_BEYONDSCOPE:
/* I mean "source address was incorrect." */
code = PRC_PARAMPROB;
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index a768f74d033e..a393cb51aaac 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -150,9 +150,10 @@ static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu(struct route_in6 *, int,
- struct ifnet *, const struct in6_addr *, u_long *, int *, u_int);
+ struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
+ u_int);
static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
- u_long *, int *);
+ u_long *, int *, u_int);
static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
@@ -718,7 +719,7 @@ again:
/* Determine path MTU. */
if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
- &mtu, &alwaysfrag, fibnum)) != 0)
+ &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
goto bad;
/*
@@ -1250,7 +1251,7 @@ ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
ifp = nh6.nh_ifp;
mtu = nh6.nh_mtu;
- error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL);
+ error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0);
fib6_free_nh_ext(fibnum, &nh6);
return (error);
@@ -1269,7 +1270,7 @@ ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
static int
ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
- int *alwaysfragp, u_int fibnum)
+ int *alwaysfragp, u_int fibnum, u_int proto)
{
struct nhop6_basic nh6;
struct in6_addr kdst;
@@ -1307,7 +1308,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
if (ro_pmtu->ro_rt)
mtu = ro_pmtu->ro_rt->rt_mtu;
- return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp));
+ return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
}
/*
@@ -1319,7 +1320,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
*/
static int
ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
- u_long *mtup, int *alwaysfragp)
+ u_long *mtup, int *alwaysfragp, u_int proto)
{
u_long mtu = 0;
int alwaysfrag = 0;
@@ -1334,7 +1335,11 @@ ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
inc.inc6_faddr = *dst;
ifmtu = IN6_LINKMTU(ifp);
- mtu = tcp_hc_getmtu(&inc);
+
+ /* TCP is known to react to pmtu changes so skip hc */
+ if (proto != IPPROTO_TCP)
+ mtu = tcp_hc_getmtu(&inc);
+
if (mtu)
mtu = min(mtu, rt_mtu);
else