aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorRichard Scheffenegger <rscheff@FreeBSD.org>2021-02-16 11:18:43 +0000
committerRichard Scheffenegger <rscheff@FreeBSD.org>2021-02-16 12:08:37 +0000
commit3c40e1d52cd86168779cf99dbabe58df465d7e3f (patch)
treebab7eb118a0354142f68ecf41bdd0f1c1f27868e /sys
parentafcb3c4cb49f1ba9690d066c3dc1af9c7bee1ea3 (diff)
downloadsrc-3c40e1d52cd86168779cf99dbabe58df465d7e3f.tar.gz
src-3c40e1d52cd86168779cf99dbabe58df465d7e3f.zip
update the SACK loss recovery to RFC6675, with the following new features:
- improved pipe calculation which does not degrade under heavy loss
- engaging in Loss Recovery earlier under adverse conditions
- Rescue Retransmission in case some of the trailing packets of a request got lost

All of the above changes are toggled with the sysctl "rfc6675_pipe" (disabled by default).

Reviewers: #transport, tuexen, lstewart, slavash, jtl, hselasky, kib, rgrimes, chengc_netapp.com, thj, #manpages, kbowling, #netapp, rscheff
Reviewed By: #transport
Subscribers: imp, melifaro
MFC after: 2 weeks
Sponsored by: NetApp, Inc.
Differential Revision: https://reviews.freebsd.org/D18985
Diffstat (limited to 'sys')
-rw-r--r-- sys/netinet/tcp_input.c 34
-rw-r--r-- sys/netinet/tcp_sack.c 35
2 files changed, 64 insertions, 5 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index b7baef5bc0d6..dbe86e4e65c0 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1501,6 +1501,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct mbuf *mfree;
struct tcpopt to;
int tfo_syn;
+ u_int maxseg;
#ifdef TCPDEBUG
/*
@@ -2502,8 +2503,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
#endif
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
- u_int maxseg;
-
maxseg = tcp_maxseg(tp);
if (tlen == 0 &&
(tiwin == tp->snd_wnd ||
@@ -2644,7 +2643,21 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_cwnd += maxseg;
(void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
- } else if (tp->t_dupacks == tcprexmtthresh) {
+ } else if (tp->t_dupacks == tcprexmtthresh ||
+ (tp->t_flags & TF_SACK_PERMIT &&
+ V_tcp_do_rfc6675_pipe &&
+ tp->sackhint.sacked_bytes >
+ (tcprexmtthresh - 1) * maxseg)) {
+enter_recovery:
+ /*
+ * Above is the RFC6675 trigger condition of
+ * more than (dupthresh-1)*maxseg sacked data.
+ * If the count of holes in the
+ * scoreboard is >= dupthresh, we could
+ * also enter loss recovery, but don't
+ * have that value readily available.
+ */
+ tp->t_dupacks = tcprexmtthresh;
tcp_seq onxt = tp->snd_nxt;
/*
@@ -2689,6 +2702,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_recover = tp->snd_nxt;
tp->snd_cwnd = maxseg;
(void) tp->t_fb->tfb_tcp_output(tp);
+ if (SEQ_GT(th->th_ack, tp->snd_una))
+ goto resume_partialack;
goto drop;
}
tp->snd_nxt = th->th_ack;
@@ -2775,10 +2790,19 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
(to.to_flags & TOF_SACK) &&
- sack_changed)
+ sack_changed) {
tp->t_dupacks++;
+ /* limit overhead by setting maxseg last */
+ if (!IN_FASTRECOVERY(tp->t_flags) &&
+ (tp->sackhint.sacked_bytes >
+ ((tcprexmtthresh - 1) *
+ (maxseg = tcp_maxseg(tp))))) {
+ goto enter_recovery;
+ }
+ }
}
+resume_partialack:
KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
("%s: th_ack <= snd_una", __func__));
@@ -2789,7 +2813,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (IN_FASTRECOVERY(tp->t_flags)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
if (tp->t_flags & TF_SACK_PERMIT)
- if (V_tcp_do_prr)
+ if (V_tcp_do_prr && to.to_flags & TOF_SACK)
tcp_prr_partialack(tp, th);
else
tcp_sack_partialack(tp, th);
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 2cae6a560c48..28cd5c93f106 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -750,6 +750,16 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
else
sblkp--;
}
+ if (!(to->to_flags & TOF_SACK))
+ /*
+ * If this ACK did not contain any
+ * SACK blocks, and only moved the
+ * left edge right, it is a pure
+ * cumulative ACK. Do not count
+ * DupAck for this. Also required
+ * for RFC6675 rescue retransmission.
+ */
+ sack_changed = 0;
tp->sackhint.delivered_data = delivered_data;
tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
KASSERT((delivered_data >= 0), ("delivered_data < 0"));
@@ -800,6 +810,31 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_flags |= TF_ACKNOW;
+ /*
+ * RFC6675 rescue retransmission
+ * Add a hole between th_ack (snd_una is not yet set) and snd_max,
+ * if this was a pure cumulative ACK and no data was sent beyond
+ * recovery point. Since the data in the socket has not been freed
+ * at this point, we check if the scoreboard is empty, and the ACK
+ * delivered some new data, indicating a full ACK. Also, if the
+ * recovery point is still at snd_max, we are probably application
+ * limited. However, this inference might not always be true. The
+ * rescue retransmission may rarely be slightly premature
+ * compared to RFC6675.
+ * The corresponding ACK+SACK will cause any further outstanding
+ * segments to be retransmitted. This addresses a corner case, when
+ * the trailing packets of a window are lost and no further data
+ * is available for sending.
+ */
+ if ((V_tcp_do_rfc6675_pipe) &&
+ SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (tp->snd_recover == tp->snd_max) &&
+ TAILQ_EMPTY(&tp->snd_holes) &&
+ (tp->sackhint.delivered_data > 0)) {
+ struct sackhole *hole;
+ int maxseg = tcp_maxseg(tp);
+ hole = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, tp->snd_max - maxseg), tp->snd_max, NULL);
+ }
(void) tp->t_fb->tfb_tcp_output(tp);
}