aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Scheffenegger <rscheff@FreeBSD.org>2024-10-29 17:47:06 +0000
committerRichard Scheffenegger <rscheff@FreeBSD.org>2024-10-29 18:04:12 +0000
commit7dc78150c730e90fa2afdaba3aa645932b30c429 (patch)
tree469e161877810b20d50dc030c24ab01fac2a5363
parent72ae04c7334743c0fec2e8c99ac13fd8ca63f51b (diff)
downloadsrc-7dc78150c730.tar.gz
src-7dc78150c730.zip
tcp: refactor cwnd during SACK transmissions to allow TSO
Refactoring of cwnd and moving the adjustment for SACKed data into tcp_output() - cwnd tracking the maximum extent starting at snd_una - allows both SACK loss recovery as well as SACK transmissions after RTO during slow start and if allowed, the use of TSO while in loss recovery. Reviewed By: tuexen, cc, #transport Sponsored by: NetApp, Inc. Differential Revision: https://reviews.freebsd.org/D43470
-rw-r--r--sys/netinet/tcp_input.c85
-rw-r--r--sys/netinet/tcp_output.c69
-rw-r--r--sys/netinet/tcp_sack.c17
3 files changed, 97 insertions, 74 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 5d14664f3fc0..e5f5e09e57d8 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -2653,13 +2653,14 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tcp_do_prr_ack(tp, th, &to,
sack_changed, &maxseg);
} else if (tcp_is_sack_recovery(tp, &to) &&
- IN_FASTRECOVERY(tp->t_flags)) {
+ IN_FASTRECOVERY(tp->t_flags) &&
+ (tp->snd_nxt == tp->snd_max)) {
int awnd;
/*
* Compute the amount of data in flight first.
* We can inject new data into the pipe iff
- * we have less than 1/2 the original window's
+ * we have less than ssthresh
* worth of data in flight.
*/
if (V_tcp_do_newsack) {
@@ -2669,10 +2670,18 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tp->sackhint.sack_bytes_rexmit;
}
if (awnd < tp->snd_ssthresh) {
- tp->snd_cwnd += maxseg;
+ tp->snd_cwnd += imax(maxseg,
+ imin(2 * maxseg,
+ tp->sackhint.delivered_data));
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
}
+ } else if (tcp_is_sack_recovery(tp, &to) &&
+ IN_FASTRECOVERY(tp->t_flags) &&
+ SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ tp->snd_cwnd += imax(maxseg,
+ imin(2 * maxseg,
+ tp->sackhint.delivered_data));
} else {
tp->snd_cwnd += maxseg;
}
@@ -2696,14 +2705,13 @@ enter_recovery:
tcp_seq onxt = tp->snd_nxt;
/*
- * If we're doing sack, or prr, check
- * to see if we're already in sack
+ * If we're doing sack, check to
+ * see if we're already in sack
* recovery. If we're not doing sack,
* check to see if we're in newreno
* recovery.
*/
- if (V_tcp_do_prr ||
- (tp->t_flags & TF_SACK_PERMIT)) {
+ if (tcp_is_sack_recovery(tp, &to)) {
if (IN_FASTRECOVERY(tp->t_flags)) {
tp->t_dupacks = 0;
break;
@@ -2723,29 +2731,40 @@ enter_recovery:
tp->t_rtttime = 0;
if (V_tcp_do_prr) {
/*
- * snd_ssthresh is already updated by
- * cc_cong_signal.
+ * snd_ssthresh and snd_recover are
+ * already updated by cc_cong_signal.
*/
if (tcp_is_sack_recovery(tp, &to)) {
/*
- * Exclude Limited Transmit
+ * Include Limited Transmit
* segments here
*/
tp->sackhint.prr_delivered =
- maxseg;
+ imin(tp->snd_max - th->th_ack,
+ (tp->snd_limited + 1) * maxseg);
} else {
tp->sackhint.prr_delivered =
- imin(tp->snd_max - tp->snd_una,
- imin(INT_MAX / 65536,
- tp->t_dupacks) * maxseg);
+ maxseg;
}
tp->sackhint.recover_fs = max(1,
tp->snd_nxt - tp->snd_una);
}
+ tp->snd_limited = 0;
if (tcp_is_sack_recovery(tp, &to)) {
TCPSTAT_INC(tcps_sack_recovery_episode);
- tp->snd_cwnd = maxseg;
+ /*
+ * When entering LR after RTO due to
+ * Duplicate ACKs, retransmit existing
+ * holes from the scoreboard.
+ */
+ tcp_resend_sackholes(tp);
+ /* Avoid inflating cwnd in tcp_output */
+ tp->snd_nxt = tp->snd_max;
+ tp->snd_cwnd = tcp_compute_pipe(tp) +
+ maxseg;
(void) tcp_output(tp);
+ /* Set cwnd to the expected flightsize */
+ tp->snd_cwnd = tp->snd_ssthresh;
if (SEQ_GT(th->th_ack, tp->snd_una)) {
goto resume_partialack;
}
@@ -2790,7 +2809,8 @@ enter_recovery:
(tp->t_rxtshift == 0))
tp->snd_cwnd =
SEQ_SUB(tp->snd_nxt,
- tp->snd_una);
+ tp->snd_una) -
+ tcp_sack_adjust(tp);
tp->snd_cwnd +=
(tp->t_dupacks - tp->snd_limited) *
maxseg;
@@ -3049,9 +3069,8 @@ process_ACK:
SEQ_GEQ(th->th_ack, tp->snd_recover)) {
cc_post_recovery(tp, th);
}
- if (tp->t_flags & TF_SACK_PERMIT) {
- if (SEQ_GT(tp->snd_una, tp->snd_recover))
- tp->snd_recover = tp->snd_una;
+ if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
+ tp->snd_recover = tp->snd_una;
}
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@@ -4138,9 +4157,7 @@ tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
*/
if (IN_FASTRECOVERY(tp->t_flags)) {
if (tcp_is_sack_recovery(tp, to)) {
- tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
- tp->sackhint.sack_bytes_rexmit +
- (snd_cnt * maxseg);
+ tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
} else {
tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
(snd_cnt * maxseg);
@@ -4168,17 +4185,19 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
- tp->snd_nxt = th->th_ack;
- /*
- * Set snd_cwnd to one segment beyond acknowledged offset.
- * (tp->snd_una has not yet been updated when this function is called.)
- */
- tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
- tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
- tp->snd_cwnd = ocwnd;
- if (SEQ_GT(onxt, tp->snd_nxt))
- tp->snd_nxt = onxt;
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ tp->snd_nxt = th->th_ack;
+ /*
+ * Set snd_cwnd to one segment beyond acknowledged offset.
+ * (tp->snd_una has not yet been updated when this function is called.)
+ */
+ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ }
/*
* Partial window deflation. Relies on fact that tp->snd_una
* not updated yet.
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index a048abf5f97a..860b785b631b 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -266,8 +266,10 @@ again:
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
- SEQ_LT(tp->snd_nxt, tp->snd_max))
+ (tp->sackhint.nexthole != NULL) &&
+ !IN_FASTRECOVERY(tp->t_flags)) {
sendwin = tcp_sack_adjust(tp);
+ }
sendalot = 0;
tso = 0;
mtu = 0;
@@ -292,10 +294,13 @@ again:
if ((tp->t_flags & TF_SACK_PERMIT) &&
(IN_FASTRECOVERY(tp->t_flags) || SEQ_LT(tp->snd_nxt, tp->snd_max)) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
- uint32_t cwin;
+ int32_t cwin;
- cwin =
- imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0);
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ cwin = imax(sendwin - tcp_compute_pipe(tp), 0);
+ } else {
+ cwin = imax(sendwin - off, 0);
+ }
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
@@ -314,19 +319,34 @@ again:
goto after_sack_rexmit;
} else {
/* Can rexmit part of the current hole */
- len = ((int32_t)ulmin(cwin,
- SEQ_SUB(tp->snd_recover, p->rxmit)));
+ len = SEQ_SUB(tp->snd_recover, p->rxmit);
+ if (cwin <= len) {
+ len = cwin;
+ } else {
+ sendalot = 1;
+ }
}
} else {
- len = ((int32_t)ulmin(cwin,
- SEQ_SUB(p->end, p->rxmit)));
+ len = SEQ_SUB(p->end, p->rxmit);
+ if (cwin <= len) {
+ len = cwin;
+ } else {
+ sendalot = 1;
+ }
}
+ /* we could have transmitted from the scoreboard,
+ * but sendwin (expected flightsize) - pipe didn't
+ * allow any transmission.
+ * Bypass recalculating the possible transmission
+ * length further down by setting sack_rxmit.
+ * Wouldn't be here if there would have been
+ * nothing in the scoreboard to transmit.
+ */
+ sack_rxmit = 1;
if (len > 0) {
off = SEQ_SUB(p->rxmit, tp->snd_una);
KASSERT(off >= 0,("%s: sack block to the left of una : %d",
__func__, off));
- sack_rxmit = 1;
- sendalot = 1;
}
}
after_sack_rexmit:
@@ -390,34 +410,15 @@ after_sack_rexmit:
*/
if (sack_rxmit == 0) {
if ((sack_bytes_rxmt == 0) || SEQ_LT(tp->snd_nxt, tp->snd_max)) {
- len = ((int32_t)min(sbavail(&so->so_snd), sendwin) -
- off);
+ len = imin(sbavail(&so->so_snd), sendwin) - off;
} else {
- int32_t cwin;
-
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
- len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) -
- off);
- /*
- * Don't remove this (len > 0) check !
- * We explicitly check for len > 0 here (although it
- * isn't really necessary), to work around a gcc
- * optimization issue - to force gcc to compute
- * len above. Without this check, the computation
- * of len is bungled by the optimizer.
- */
- if (len > 0) {
- cwin = tp->snd_cwnd - imax(0, (int32_t)
- (tp->snd_nxt - tp->snd_recover)) -
- sack_bytes_rxmt;
- if (cwin < 0)
- cwin = 0;
- len = imin(len, cwin);
- }
+ len = imin(sbavail(&so->so_snd) - off,
+ sendwin - tcp_compute_pipe(tp));
}
}
@@ -1647,6 +1648,10 @@ timer:
if ((error == 0) &&
sack_rxmit &&
SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) {
+ /*
+ * When transmitting from SACK scoreboard
+ * after an RTO, pull snd_nxt along.
+ */
tp->snd_nxt = SEQ_MIN(p->rxmit, p->end);
}
if (error) {
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 09e172ad4601..f642acd4c46a 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -961,16 +961,15 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp)
/* Send one or 2 segments based on how much new data was acked. */
if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2)
num_segs = 2;
- if (V_tcp_do_newsack) {
- tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack +
- tp->sackhint.sack_bytes_rexmit -
- tp->sackhint.sacked_bytes -
- tp->sackhint.lost_bytes, maxseg) +
- num_segs * maxseg;
- } else {
+ if (tp->snd_nxt == tp->snd_max) {
tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
- imax(0, tp->snd_nxt - tp->snd_recover) +
- num_segs * maxseg);
+ (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg);
+ } else {
+ /*
+ * Since cwnd is not the expected flightsize during
+ * SACK LR, not deflating cwnd allows the partial
+ * ACKed amount to be sent.
+ */
}
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;