aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Scheffenegger <rscheff@FreeBSD.org>2020-12-04 11:29:27 +0000
committerRichard Scheffenegger <rscheff@FreeBSD.org>2020-12-04 11:29:27 +0000
commit0e1d7c25c5ab4014eb5ddd7676a1b64041a57d17 (patch)
tree0d061d97ba236963ff0b083f1c7cd62a05877fdd
parent34af05ead3cf0eaf69d678d8025864bcad295273 (diff)
downloadsrc-0e1d7c25c5ab.tar.gz
src-0e1d7c25c5ab.zip
Add TCP feature Proportional Rate Reduction (PRR) - RFC6937
PRR improves loss recovery and avoids RTOs in a wide range of scenarios (ACK thinning) over regular SACK loss recovery. PRR is disabled by default; enable it by setting net.inet.tcp.do_prr = 1. Performance may be impeded by token bucket rate policers at the bottleneck, in which case net.inet.tcp.do_prr_conservative = 1 should be enabled in addition. Submitted by: Aris Angelogiannopoulos Sponsored by: NetApp, Inc. Differential Revision: https://reviews.freebsd.org/D18892
Notes
Notes: svn path=/head/; revision=368327
-rw-r--r--sys/netinet/tcp_input.c130
-rw-r--r--sys/netinet/tcp_var.h8
2 files changed, 131 insertions, 7 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 8fe7169b3c83..7746ccf24073 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -153,6 +153,16 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(drop_synfin), 0,
"Drop TCP packets with SYN+FIN set");
+/*
+ * net.inet.tcp.do_prr_conservative (per-VNET, read-write, initialised to 0).
+ * When non-zero, the PRR code bounds the amount sent once pipe has dropped
+ * to ssthresh by (prr_delivered - sack_bytes_rexmit) only, without the extra
+ * maxseg that the non-conservative bound adds.
+ */
+VNET_DEFINE(int, tcp_do_prr_conservative) = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(tcp_do_prr_conservative), 0,
+ "Do conservative Proportional Rate Reduction");
+
+/*
+ * net.inet.tcp.do_prr (per-VNET, read-write, initialised to 1).
+ * When non-zero, SACK-enabled connections in fast recovery compute snd_cwnd
+ * with Proportional Rate Reduction; when zero they use the regular SACK
+ * loss recovery path.
+ */
+VNET_DEFINE(int, tcp_do_prr) = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(tcp_do_prr), 1,
+ "Enable Proportional Rate Reduction per RFC 6937");
+
VNET_DEFINE(int, tcp_do_newcwv) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_newcwv), 0,
@@ -2554,7 +2564,55 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
IN_FASTRECOVERY(tp->t_flags)) {
cc_ack_received(tp, th, nsegs,
CC_DUPACK);
- if ((tp->t_flags & TF_SACK_PERMIT) &&
+ if (V_tcp_do_prr &&
+ IN_FASTRECOVERY(tp->t_flags) &&
+ (tp->t_flags & TF_SACK_PERMIT)) {
+ long snd_cnt = 0, limit = 0;
+ long del_data = 0, pipe = 0;
+ /*
+ * In a duplicate ACK del_data is only the
+ * diff_in_sack. If no SACK is used del_data
+ * will be 0. Pipe is the amount of data we
+ * estimate to be in the network.
+ */
+ del_data = tp->sackhint.delivered_data;
+ pipe = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+ tp->sackhint.prr_delivered += del_data;
+ if (pipe > tp->snd_ssthresh) {
+ snd_cnt = (tp->sackhint.prr_delivered *
+ tp->snd_ssthresh /
+ tp->sackhint.recover_fs) +
+ 1 - tp->sackhint.sack_bytes_rexmit;
+ } else {
+ if (V_tcp_do_prr_conservative)
+ limit = tp->sackhint.prr_delivered -
+ tp->sackhint.sack_bytes_rexmit;
+ else
+ if ((tp->sackhint.prr_delivered -
+ tp->sackhint.sack_bytes_rexmit) >
+ del_data)
+ limit = tp->sackhint.prr_delivered -
+ tp->sackhint.sack_bytes_rexmit +
+ maxseg;
+ else
+ limit = del_data + maxseg;
+ if ((tp->snd_ssthresh - pipe) < limit)
+ snd_cnt = tp->snd_ssthresh - pipe;
+ else
+ snd_cnt = limit;
+ }
+ snd_cnt = max((snd_cnt / maxseg), 0);
+ /*
+ * Send snd_cnt new data into the network in
+ * response to this ACK. If there is a going
+ * to be a SACK retransmission, adjust snd_cwnd
+ * accordingly.
+ */
+ tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
+ tp->sackhint.sack_bytes_rexmit +
+ (snd_cnt * maxseg);
+ } else if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp->t_flags)) {
int awnd;
@@ -2583,13 +2641,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_seq onxt = tp->snd_nxt;
/*
- * If we're doing sack, check to
- * see if we're already in sack
+ * If we're doing sack, or prr, check
+ * to see if we're already in sack
* recovery. If we're not doing sack,
* check to see if we're in newreno
* recovery.
*/
- if (tp->t_flags & TF_SACK_PERMIT) {
+ if (V_tcp_do_prr ||
+ (tp->t_flags & TF_SACK_PERMIT)) {
if (IN_FASTRECOVERY(tp->t_flags)) {
tp->t_dupacks = 0;
break;
@@ -2607,6 +2666,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
CC_DUPACK);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
+ if (V_tcp_do_prr) {
+ /*
+ * snd_ssthresh is already updated by
+ * cc_cong_signal.
+ */
+ tp->sackhint.prr_delivered = 0;
+ tp->sackhint.sack_bytes_rexmit = 0;
+ if (!(tp->sackhint.recover_fs = tp->snd_nxt - tp->snd_una))
+ tp->sackhint.recover_fs = 1;
+ }
if (tp->t_flags & TF_SACK_PERMIT) {
TCPSTAT_INC(
tcps_sack_recovery_episode);
@@ -2713,7 +2782,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (IN_FASTRECOVERY(tp->t_flags)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
if (tp->t_flags & TF_SACK_PERMIT)
- tcp_sack_partialack(tp, th);
+ if (V_tcp_do_prr)
+ tcp_prr_partialack(tp, th);
+ else
+ tcp_sack_partialack(tp, th);
else
tcp_newreno_partial_ack(tp, th);
} else
@@ -3839,6 +3911,54 @@ tcp_mssopt(struct in_conninfo *inc)
return (mss);
}
+/*
+ * Process a partial ACK during loss recovery using Proportional Rate
+ * Reduction (RFC 6937).
+ *
+ * Stops the retransmit timer, accounts the data newly reported as delivered
+ * by this ACK, derives how many full segments may be sent in response, sets
+ * tp->snd_cwnd accordingly and calls tcp_output().
+ *
+ * tp: connection control block; the inpcb write lock must be held.
+ * th: TCP header of the ACK being processed.
+ */
+void
+tcp_prr_partialack(struct tcpcb *tp, struct tcphdr *th)
+{
+	long snd_cnt, limit, del_data = 0, pipe, prr_room, ssthresh;
+	int maxseg = tcp_maxseg(tp);
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tcp_timer_activate(tp, TT_REXMT, 0);
+	tp->t_rtttime = 0;
+	/*
+	 * Compute the amount of data that this ACK is indicating
+	 * (del_data) and an estimate of how many bytes are in the
+	 * network.
+	 */
+	if (SEQ_GEQ(th->th_ack, tp->snd_una))
+		del_data = BYTES_THIS_ACK(tp, th);
+	del_data += tp->sackhint.delivered_data;
+	pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit;
+	tp->sackhint.prr_delivered += del_data;
+
+	/*
+	 * Do the remaining arithmetic in signed longs.  prr_delivered is
+	 * unsigned, so subtracting the retransmitted byte count from it
+	 * directly would wrap instead of going negative.
+	 */
+	ssthresh = (long)tp->snd_ssthresh;
+	prr_room = (long)tp->sackhint.prr_delivered -
+	    (long)tp->sackhint.sack_bytes_rexmit;
+
+	/*
+	 * Proportional Rate Reduction
+	 */
+	if (pipe > ssthresh) {
+		/*
+		 * recover_fs is only set up when PRR is enabled at the time
+		 * recovery starts; if it is still zero (e.g. PRR was switched
+		 * on afterwards) fall back to the current flight size so the
+		 * division below cannot trap.
+		 */
+		if (tp->sackhint.recover_fs == 0) {
+			tp->sackhint.recover_fs = tp->snd_nxt - tp->snd_una;
+			if (tp->sackhint.recover_fs == 0)
+				tp->sackhint.recover_fs = 1;
+		}
+		/*
+		 * sndcnt = CEIL(prr_delivered * ssthresh / RecoverFS) - prr_out
+		 * The product is formed in 64 bits so that it cannot wrap.
+		 */
+		snd_cnt = (long)(((uint64_t)tp->sackhint.prr_delivered *
+		    tp->snd_ssthresh + tp->sackhint.recover_fs - 1) /
+		    tp->sackhint.recover_fs) -
+		    (long)tp->sackhint.sack_bytes_rexmit;
+	} else {
+		if (V_tcp_do_prr_conservative)
+			limit = prr_room;
+		else if (prr_room > del_data)
+			limit = prr_room + maxseg;
+		else
+			limit = del_data + maxseg;
+		/*
+		 * Signed minimum, written out explicitly: the kernel's
+		 * min()/max() helpers take u_int arguments and would turn a
+		 * negative operand into a very large one.
+		 */
+		snd_cnt = ssthresh - pipe;
+		if (limit < snd_cnt)
+			snd_cnt = limit;
+	}
+	/* Never send a negative amount; round down to whole segments. */
+	if (snd_cnt < 0)
+		snd_cnt = 0;
+	snd_cnt /= maxseg;
+	/*
+	 * Send snd_cnt new data into the network in response to this ack.
+	 * If there is going to be a SACK retransmission, adjust snd_cwnd
+	 * accordingly.
+	 */
+	tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
+	    tp->sackhint.sack_bytes_rexmit + (snd_cnt * maxseg);
+	tp->t_flags |= TF_ACKNOW;
+	(void) tcp_output(tp);
+}
+
+
/*
* When a partial ack arrives, force the retransmission of the
* next unacknowledged segment. Do not clear tp->t_dupacks.
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 5600d52170fe..d9235f23706c 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -113,8 +113,9 @@ struct sackhint {
int32_t sacked_bytes; /* Total sacked bytes reported by the
* receiver via sack option
*/
- uint32_t _pad1[1]; /* TBD */
- uint64_t _pad[1]; /* TBD */
+ uint32_t recover_fs; /* Flight Size at the start of Loss recovery */
+ uint32_t prr_delivered; /* Total bytes delivered using PRR */
+ uint32_t _pad[1]; /* TBD */
};
#define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
@@ -866,6 +867,8 @@ VNET_DECLARE(int, tcp_sendspace);
VNET_DECLARE(struct inpcbhead, tcb);
VNET_DECLARE(struct inpcbinfo, tcbinfo);
+#define V_tcp_do_prr VNET(tcp_do_prr)
+#define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative)
#define V_tcp_do_newcwv VNET(tcp_do_newcwv)
#define V_drop_synfin VNET(drop_synfin)
#define V_path_mtu_discovery VNET(path_mtu_discovery)
@@ -1051,6 +1054,7 @@ void tcp_clean_dsack_blocks(struct tcpcb *tp);
void tcp_clean_sackreport(struct tcpcb *tp);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
+void tcp_prr_partialack(struct tcpcb *, struct tcphdr *);
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);