aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Scheffenegger <rscheff@FreeBSD.org>2020-11-08 18:47:05 +0000
committerRichard Scheffenegger <rscheff@FreeBSD.org>2020-11-08 18:47:05 +0000
commit4d0770f1725f84e8bcd059e6094b6bd29bed6cc3 (patch)
treebf9ac666ec2a923e199b3e8c69da67644965565a
parent2d39824195933c173bbfc9b31773070202d2e30e (diff)
downloadsrc-4d0770f1725f84e8bcd059e6094b6bd29bed6cc3.tar.gz
src-4d0770f1725f84e8bcd059e6094b6bd29bed6cc3.zip
Prevent premature SACK block transmission during loss recovery
Under specific conditions, a window update can be sent with outdated SACK information. Some clients react to this by subsequently delaying loss recovery, making TCP perform very poorly. Reported by: chengc_netapp.com Reviewed by: rrs, jtl MFC after: 2 weeks Sponsored by: NetApp, Inc. Differential Revision: https://reviews.freebsd.org/D24237
Notes
Notes: svn path=/head/; revision=367492
-rw-r--r--sys/netinet/tcp_input.c46
-rw-r--r--sys/netinet/tcp_reass.c6
-rw-r--r--sys/netinet/tcp_stacks/bbr.c20
-rw-r--r--sys/netinet/tcp_stacks/rack.c20
-rw-r--r--sys/netinet/tcp_stacks/rack_bbr_common.c1
-rw-r--r--sys/netinet/tcp_var.h12
6 files changed, 74 insertions, 31 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index c0a3d9600ca5..cb32e2f5d27c 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1462,6 +1462,29 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
void
+tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
+{
+ /*
+ * Since tp might be gone if the session entered
+ * the TIME_WAIT state before coming here, we need
+ * to check if the socket is still connected.
+ */
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return;
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ if (tp->t_flags & TF_WAKESOR) {
+ tp->t_flags &= ~TF_WAKESOR;
+ SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
+ sorwakeup(so);
+ }
+ if (tp->t_flags & TF_WAKESOW) {
+ tp->t_flags &= ~TF_WAKESOW;
+ SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
+ sowwakeup(so);
+ }
+}
+
+void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
@@ -1811,7 +1834,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
- sowwakeup(so);
+ tp->t_flags |= TF_WAKESOW;
if (sbavail(&so->so_snd))
(void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
@@ -1876,8 +1899,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m, 0);
}
- /* NB: sorwakeup_locked() does an implicit unlock. */
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
if (DELAY_ACK(tp, tlen)) {
tp->t_flags |= TF_DELACK;
} else {
@@ -2811,8 +2834,8 @@ process_ACK:
tp->snd_wnd = 0;
ourfinisacked = 0;
}
- /* NB: sowwakeup_locked() does an implicit unlock. */
- sowwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
/* Detect una wraparound. */
if (!IN_RECOVERY(tp->t_flags) &&
@@ -3033,8 +3056,8 @@ dodata: /* XXX */
m_freem(m);
else
sbappendstream_locked(&so->so_rcv, m, 0);
- /* NB: sorwakeup_locked() does an implicit unlock. */
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
} else {
/*
* XXX: Due to the header drop above "th" is
@@ -3101,6 +3124,8 @@ dodata: /* XXX */
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
+ /* The socket upcall is handled by socantrcvmore. */
+ tp->t_flags &= ~TF_WAKESOR;
/*
* If connection is half-synchronized
* (ie NEEDSYN flag on) then delay ACK,
@@ -3164,6 +3189,7 @@ check_delack:
tp->t_flags &= ~TF_DELACK;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
+ tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
return;
@@ -3197,6 +3223,7 @@ dropafterack:
TCP_PROBE3(debug__input, tp, th, m);
tp->t_flags |= TF_ACKNOW;
(void) tp->t_fb->tfb_tcp_output(tp);
+ tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@@ -3204,6 +3231,7 @@ dropafterack:
dropwithreset:
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
} else
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
@@ -3219,8 +3247,10 @@ drop:
&tcp_savetcp, 0);
#endif
TCP_PROBE3(debug__input, tp, th, m);
- if (tp != NULL)
+ if (tp != NULL) {
+ tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
+ }
m_freem(m);
}
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index d57cb4318a94..310d7d540507 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -959,7 +959,8 @@ new_entry:
} else {
sbappendstream_locked(&so->so_rcv, m, 0);
}
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
return (flags);
}
if (tcp_new_limits) {
@@ -1107,6 +1108,7 @@ present:
#ifdef TCP_REASS_LOGGING
tcp_reass_log_dump(tp);
#endif
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
return (flags);
}
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 2e74ecbe8ea1..5114ef4d2ba3 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -7876,8 +7876,8 @@ bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
mfree = sbcut_locked(&so->so_snd, acked_amount);
- /* NB: sowwakeup_locked() does an implicit unlock. */
- sowwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
if (SEQ_GT(th->th_ack, tp->snd_una)) {
bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
@@ -8353,8 +8353,8 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
- /* NB: sorwakeup_locked() does an implicit unlock. */
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
counter_fo_release(so->so_rcv.sb_shlim,
@@ -8414,6 +8414,8 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
+ /* The socket upcall is handled by socantrcvmore. */
+ tp->t_flags &= ~TF_WAKESOR;
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@@ -8604,8 +8606,8 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
- /* NB: sorwakeup_locked() does an implicit unlock. */
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@@ -8796,7 +8798,7 @@ bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
&tcp_savetcp, 0);
#endif
/* Wake up the socket if we have room to write more */
- sowwakeup(so);
+ tp->t_flags |= TF_WAKESOW;
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
@@ -11740,8 +11742,10 @@ bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
retval = bbr_do_segment_nounlock(m, th, so, tp,
drop_hdrlen, tlen, iptos, 0, &tv);
- if (retval == 0)
+ if (retval == 0) {
+ tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
+ }
}
/*
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 635e932bb80e..7523a98db4c5 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -8344,8 +8344,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
ourfinisacked = 1;
}
- /* NB: sowwakeup_locked() does an implicit unlock. */
- sowwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
if (rack->r_ctl.rc_early_recovery == 0) {
if (IN_RECOVERY(tp->t_flags)) {
@@ -8665,8 +8665,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
- /* NB: sorwakeup_locked() does an implicit unlock. */
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
counter_fo_release(so->so_rcv.sb_shlim,
@@ -8731,6 +8731,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
+ /* The socket upcall is handled by socantrcvmore. */
+ tp->t_flags &= ~TF_WAKESOR;
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@@ -8922,8 +8924,8 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
- /* NB: sorwakeup_locked() does an implicit unlock. */
- sorwakeup_locked(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@@ -9140,7 +9142,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
}
/* Wake up the socket if we have room to write more */
- sowwakeup(so);
+ tp->t_flags |= TF_WAKESOW;
if (sbavail(&so->so_snd)) {
rack->r_wanted_output = 1;
}
@@ -11188,8 +11190,10 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_get_usecs(&tv);
}
if(rack_do_segment_nounlock(m, th, so, tp,
- drop_hdrlen, tlen, iptos, 0, &tv) == 0)
+ drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
+ tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
+ }
}
struct rack_sendmap *
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index b7d758f5fcd8..e73a3e60fd64 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -458,6 +458,7 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
/* We lost the tcpcb (maybe a RST came in)? */
return(1);
}
+ tcp_handle_wakeup(tp, so);
}
return (0);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 5fe0d4a07d60..5600d52170fe 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -381,7 +381,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */
#define TF_NOPUSH 0x00001000 /* don't push */
#define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid */
-#define TF_UNUSED1 0x00004000 /* unused */
+#define TF_WAKESOR 0x00004000 /* wake up receive socket */
#define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */
#define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */
#define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */
@@ -393,9 +393,9 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF_FORCEDATA 0x00800000 /* force out a byte */
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
-#define TF_UNUSED3 0x04000000 /* unused */
-#define TF_UNUSED4 0x08000000 /* unused */
-#define TF_UNUSED5 0x10000000 /* unused */
+#define TF_WAKESOW 0x04000000 /* wake up send socket */
+#define TF_UNUSED1 0x08000000 /* unused */
+#define TF_UNUSED2 0x10000000 /* unused */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
#define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */
@@ -931,7 +931,8 @@ char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
const void *);
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
-int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *);
+int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *,
+ struct mbuf *);
void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
void tcp_dooptions(struct tcpopt *, u_char *, int, int);
@@ -955,6 +956,7 @@ void hhook_run_tcp_est_in(struct tcpcb *tp,
int tcp_input(struct mbuf **, int *, int);
int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
struct tcpcb *, int);
+void tcp_handle_wakeup(struct tcpcb *, struct socket *);
void tcp_do_segment(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, int, int, uint8_t);