authorRandall Stewart <rrs@FreeBSD.org>2018-06-07 18:18:13 +0000
committerRandall Stewart <rrs@FreeBSD.org>2018-06-07 18:18:13 +0000
commit89e560f441bb214495715039288c99442b3b5aea (patch)
tree92e58604010cc5bfd9f7e210d979ee8cfa36fcb7
parentce024bdc0c7e70c5bc32ddd2329ccd04ab747514 (diff)
downloadsrc-89e560f441bb214495715039288c99442b3b5aea.tar.gz
src-89e560f441bb214495715039288c99442b3b5aea.zip
This commit brings in a new refactored TCP stack called Rack.
Rack includes the following features:
- A different SACK processing scheme (the old sack structures are not used).
- RACK (Recent ACKnowledgment), where retransmission is no longer driven by counting dup-acks; instead, time is used to know when to retransmit (see the I-D).
- TLP (Tail Loss Probe), where we probe for tail losses to try to avoid taking a retransmit timeout (see the I-D).
- Burst mitigation using TCPHPTS.
- PRR (Proportional Rate Reduction), see the RFC.

Once built into your kernel, you can select this stack either per socket, via a socket option giving the stack name "rack", or by setting the global sysctl so that the default is rack. Note that any connection that does not support SACK will be kicked back to the base FreeBSD stack (currently known as "default").

To build this into your kernel you will need to enable:

makeoptions WITH_EXTRA_TCP_STACKS=1
options TCPHPTS

Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D15525
Notes
Notes: svn path=/head/; revision=334804
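As the commit message notes, the new stack is chosen per connection through the TCP_FUNCTION_BLK socket option (defined in sys/netinet/tcp.h, visible in the diff below) using the stack name "rack", or globally by pointing the default-stack sysctl at it. The sketch below shows the per-socket path from userland; it assumes the existing struct tcp_function_set interface exported by <netinet/tcp.h> and abbreviates error handling.

/*
 * Minimal sketch: switch an already-created TCP socket 's' over to
 * the "rack" stack.  Assumes <netinet/tcp.h> provides
 * struct tcp_function_set and TCP_FUNCTION_BLK.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <stdio.h>

static int
use_rack_stack(int s)
{
	struct tcp_function_set fs;

	memset(&fs, 0, sizeof(fs));
	strlcpy(fs.function_set_name, "rack", sizeof(fs.function_set_name));
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs)) == -1) {
		perror("setsockopt(TCP_FUNCTION_BLK)");
		return (-1);
	}
	return (0);
}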
-rw-r--r--sys/kern/uipc_sockbuf.c49
-rw-r--r--sys/modules/tcp/Makefile2
-rw-r--r--sys/modules/tcp/rack/Makefile24
-rw-r--r--sys/netinet/tcp.h56
-rw-r--r--sys/netinet/tcp_log_buf.h2
-rw-r--r--sys/netinet/tcp_output.c145
-rw-r--r--sys/netinet/tcp_stacks/fastpath.c2
-rw-r--r--sys/netinet/tcp_stacks/rack.c9164
-rw-r--r--sys/netinet/tcp_stacks/rack_bbr_common.h70
-rw-r--r--sys/netinet/tcp_stacks/sack_filter.c706
-rw-r--r--sys/netinet/tcp_stacks/sack_filter.h58
-rw-r--r--sys/netinet/tcp_stacks/tcp_rack.h321
-rw-r--r--sys/netinet/tcp_timer.c117
-rw-r--r--sys/netinet/tcp_timer.h16
-rw-r--r--sys/netinet/tcp_var.h26
-rw-r--r--sys/sys/mbuf.h2
-rw-r--r--sys/sys/queue.h11
-rw-r--r--sys/sys/sockbuf.h4
-rw-r--r--sys/sys/time.h16
19 files changed, 10766 insertions, 25 deletions
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index c19ba6c7cc81..f5da502612ba 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -1283,6 +1283,55 @@ sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
return (ret);
}
+struct mbuf *
+sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff)
+{
+ struct mbuf *m;
+
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
+ *moff = off;
+ if (sb->sb_sndptr == NULL) {
+ sb->sb_sndptr = sb->sb_mb;
+ sb->sb_sndptroff = 0;
+ }
+ return (sb->sb_mb);
+ } else {
+ m = sb->sb_sndptr;
+ off -= sb->sb_sndptroff;
+ }
+ *moff = off;
+ return (m);
+}
+
+void
+sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len)
+{
+ /*
+ * A small copy was done; advance sb_sndptr forward to cover
+ * it.
+ */
+ struct mbuf *m;
+
+ if (mb != sb->sb_sndptr) {
+ /* Did not copyout at the same mbuf */
+ return;
+ }
+ m = mb;
+ while (m && (len > 0)) {
+ if (len >= m->m_len) {
+ len -= m->m_len;
+ if (m->m_next) {
+ sb->sb_sndptroff += m->m_len;
+ sb->sb_sndptr = m->m_next;
+ }
+ m = m->m_next;
+ } else {
+ len = 0;
+ }
+ }
+}
+
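The two helpers above are meant to be used as a pair: the lookup does not move the cached send pointer, and the caller advances it once it knows how much data was actually copied. A hedged illustration follows; the caller name and the elided copy step are hypothetical.

static void
send_from_offset(struct sockbuf *sb, uint32_t off, uint32_t len)
{
	struct mbuf *mb;
	uint32_t moff;

	/* Find the mbuf/offset for 'off' without touching sb_sndptr. */
	mb = sbsndptr_noadv(sb, off, &moff);
	/* ... copy 'len' bytes starting at (mb, moff) into a packet ... */
	/* Now advance the cached pointer past what was copied. */
	sbsndptr_adv(sb, mb, len);
}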
/*
* Return the first mbuf and the mbuf data offset for the provided
* send offset without changing the "sb_sndptroff" field.
diff --git a/sys/modules/tcp/Makefile b/sys/modules/tcp/Makefile
index a5ca9034a070..46f889693d45 100644
--- a/sys/modules/tcp/Makefile
+++ b/sys/modules/tcp/Makefile
@@ -7,10 +7,12 @@ SYSDIR?=${SRCTOP}/sys
SUBDIR= \
${_tcp_fastpath} \
+ ${_tcp_rack} \
${_tcpmd5} \
.if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES)
_tcp_fastpath= fastpath
+_tcp_rack= rack
.endif
.if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
new file mode 100644
index 000000000000..de7d9333a9b8
--- /dev/null
+++ b/sys/modules/tcp/rack/Makefile
@@ -0,0 +1,24 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
+
+STACKNAME= rack
+KMOD= tcp_${STACKNAME}
+SRCS= rack.c sack_filter.c
+
+SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
+SRCS+= opt_tcpdebug.h
+SRCS+= opt_kern_tls.h
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+CFLAGS+= -DMODNAME=${KMOD}
+CFLAGS+= -DSTACKNAME=${STACKNAME}
+CFLAGS+= -DSTACKALIAS=rack_18q21
+
+.include <bsd.kmod.mk>
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index c2ad951a4604..9cd44ce80b74 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -176,6 +176,7 @@ struct tcphdr {
device */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
+#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
@@ -184,6 +185,61 @@ struct tcphdr {
#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
+/* Options for Rack and BBR */
+#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
+#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
+#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
+#define TCP_RACK_PACE_MAX_SEG 1054 /* Max segments in a pace */
+#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
+#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
+#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
+#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
+#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
+#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
+#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
+#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
+#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP threshold i.e. srtt+(srtt/N) */
+#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
+#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
+#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
+#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
+#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
+#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
+#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
+#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
+#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */
+#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
+#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
+#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
+#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
+#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
+#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
+#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
+#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
+#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
+#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
+#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
+#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
+#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
+#define TCP_BBR_PACE_PER_SEC 1086
+#define TCP_BBR_PACE_DEL_TAR 1087
+#define TCP_BBR_PACE_SEG_MAX 1088
+#define TCP_BBR_PACE_SEG_MIN 1089
+#define TCP_BBR_PACE_CROSS 1090
+#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
+#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
+#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshold */
+#define TCP_RACK_TLP_USE 1095
+#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
+#define TCP_BBR_EXTRA_GAIN 1097
+#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
+#define TCP_BBR_RETRAN_WTSO 1099
+#define TCP_DATA_AFTER_CLOSE 1100
+#define TCP_BBR_PROBE_RTT_GAIN 1101
+#define TCP_BBR_PROBE_RTT_LEN 1102
+
+
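The TCP_RACK_* values above are per-connection socket options handled by the rack stack's ctloutput routine (added later in this diff). Below is a hedged sketch of tuning one of them from userland, using the same headers as the earlier sketch; it assumes the option takes a plain int (as the rack set-sockopt handler copies in an integer), and the value 25 is only an example.

/* Sketch: lower the minimum time between RACK timeouts to 25 ms. */
static int
tune_rack_min_to(int s)
{
	int min_to = 25;	/* milliseconds, per the comment above */

	return (setsockopt(s, IPPROTO_TCP, TCP_RACK_MIN_TO,
	    &min_to, sizeof(min_to)));
}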
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 58713fe52f66..62fb45d933ce 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -94,7 +94,7 @@ struct tcp_log_bbr {
uint16_t flex7;
uint8_t bbr_state;
uint8_t bbr_substate;
- uint8_t inpacer;
+ uint8_t inhpts;
uint8_t ininput;
uint8_t use_lt_bw;
uint8_t flex8;
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index a925e6967943..2804a4eda26c 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -143,18 +143,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_R
tcp_timer_active((tp), TT_PERSIST), \
("neither rexmt nor persist timer is set"))
-#ifdef TCP_HHOOK
-static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
- struct tcphdr *th, struct tcpopt *to,
- uint32_t len, int tso);
-#endif
static void inline cc_after_idle(struct tcpcb *tp);
#ifdef TCP_HHOOK
/*
* Wrapper for the TCP established output helper hook.
*/
-static void inline
+void
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
struct tcpopt *to, uint32_t len, int tso)
{
@@ -1851,6 +1846,144 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
return (optlen);
}
+/*
+ * This is a copy of m_copym(), taking the TSO segment size/limit
+ * constraints into account, and advancing the sndptr as it goes.
+ */
+struct mbuf *
+tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
+ int32_t seglimit, int32_t segsize, struct sockbuf *sb)
+{
+ struct mbuf *n, **np;
+ struct mbuf *top;
+ int32_t off = off0;
+ int32_t len = *plen;
+ int32_t fragsize;
+ int32_t len_cp = 0;
+ int32_t *pkthdrlen;
+ uint32_t mlen, frags;
+ bool copyhdr;
+
+
+ KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off));
+ KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len));
+ if (off == 0 && m->m_flags & M_PKTHDR)
+ copyhdr = true;
+ else
+ copyhdr = false;
+ while (off > 0) {
+ KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ if ((sb) && (m == sb->sb_sndptr)) {
+ sb->sb_sndptroff += m->m_len;
+ sb->sb_sndptr = m->m_next;
+ }
+ m = m->m_next;
+ }
+ np = &top;
+ top = NULL;
+ pkthdrlen = NULL;
+ while (len > 0) {
+ if (m == NULL) {
+ KASSERT(len == M_COPYALL,
+ ("tcp_m_copym, length > size of mbuf chain"));
+ *plen = len_cp;
+ if (pkthdrlen != NULL)
+ *pkthdrlen = len_cp;
+ break;
+ }
+ mlen = min(len, m->m_len - off);
+ if (seglimit) {
+ /*
+ * For M_NOMAP mbufs, add 3 segments
+ * + 1 in case we are crossing page boundaries
+ * + 2 in case the TLS hdr/trailer are used
+ * It is cheaper to just add the segments
+ * than it is to take the cache miss to look
+ * at the mbuf ext_pgs state in detail.
+ */
+ if (m->m_flags & M_NOMAP) {
+ fragsize = min(segsize, PAGE_SIZE);
+ frags = 3;
+ } else {
+ fragsize = segsize;
+ frags = 0;
+ }
+
+ /* Break if we really can't fit anymore. */
+ if ((frags + 1) >= seglimit) {
+ *plen = len_cp;
+ if (pkthdrlen != NULL)
+ *pkthdrlen = len_cp;
+ break;
+ }
+
+ /*
+ * Reduce size if you can't copy the whole
+ * mbuf. If we can't copy the whole mbuf, also
+ * adjust len so the loop will end after this
+ * mbuf.
+ */
+ if ((frags + howmany(mlen, fragsize)) >= seglimit) {
+ mlen = (seglimit - frags - 1) * fragsize;
+ len = mlen;
+ *plen = len_cp + len;
+ if (pkthdrlen != NULL)
+ *pkthdrlen = *plen;
+ }
+ frags += howmany(mlen, fragsize);
+ if (frags == 0)
+ frags++;
+ seglimit -= frags;
+ KASSERT(seglimit > 0,
+ ("%s: seglimit went too low", __func__));
+ }
+ if (copyhdr)
+ n = m_gethdr(M_NOWAIT, m->m_type);
+ else
+ n = m_get(M_NOWAIT, m->m_type);
+ *np = n;
+ if (n == NULL)
+ goto nospace;
+ if (copyhdr) {
+ if (!m_dup_pkthdr(n, m, M_NOWAIT))
+ goto nospace;
+ if (len == M_COPYALL)
+ n->m_pkthdr.len -= off0;
+ else
+ n->m_pkthdr.len = len;
+ pkthdrlen = &n->m_pkthdr.len;
+ copyhdr = false;
+ }
+ n->m_len = mlen;
+ len_cp += n->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + off;
+ mb_dupcl(n, m);
+ } else
+ bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
+ (u_int)n->m_len);
+
+ if (sb && (sb->sb_sndptr == m) &&
+ ((n->m_len + off) >= m->m_len) && m->m_next) {
+ sb->sb_sndptroff += m->m_len;
+ sb->sb_sndptr = m->m_next;
+ }
+ off = 0;
+ if (len != M_COPYALL) {
+ len -= n->m_len;
+ }
+ m = m->m_next;
+ np = &n->m_next;
+ }
+ return (top);
+nospace:
+ m_freem(top);
+ return (NULL);
+}
+
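A hedged sketch of how a caller might drive tcp_m_copym(); the surrounding variables (the starting mbuf from the sockbuf, the requested length, and the hardware TSO limits) are illustrative, not taken from this diff. The length is passed by reference so the routine can trim it when the segment limit is reached, and passing the sockbuf lets the copy advance sb_sndptr as it walks the chain.

static struct mbuf *
copy_for_tso(struct sockbuf *sb, struct mbuf *mb, uint32_t moff,
    int32_t *len, int32_t hw_tsomaxsegcount, int32_t hw_tsomaxsegsize)
{
	struct mbuf *m;

	/* *len may come back smaller if the segment limit was hit. */
	m = tcp_m_copym(mb, moff, len,
	    hw_tsomaxsegcount, hw_tsomaxsegsize, sb);
	if (m == NULL) {
		/* Out of mbufs; caller should back off (ENOBUFS path). */
		return (NULL);
	}
	return (m);
}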
void
tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
{
diff --git a/sys/netinet/tcp_stacks/fastpath.c b/sys/netinet/tcp_stacks/fastpath.c
index c6632a22c058..d5be90fdb1a9 100644
--- a/sys/netinet/tcp_stacks/fastpath.c
+++ b/sys/netinet/tcp_stacks/fastpath.c
@@ -2392,7 +2392,7 @@ struct tcp_function_block __tcp_fastack = {
static int
tcp_addfastpaths(module_t mod, int type, void *data)
{
- int err=0;
+ int err = 0;
switch (type) {
case MOD_LOAD:
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
new file mode 100644
index 000000000000..087668c3d1d7
--- /dev/null
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -0,0 +1,9164 @@
+/*-
+ * Copyright (c) 2016-2018
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#ifdef NETFLIX_STATS
+#include <sys/stats.h>
+#endif
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/kern_prefetch.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#ifdef NETFLIX_CWV
+#include <netinet/tcp_newcwv.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+
+#include <netipsec/ipsec_support.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "sack_filter.h"
+#include "tcp_rack.h"
+#include "rack_bbr_common.h"
+
+uma_zone_t rack_zone;
+uma_zone_t rack_pcb_zone;
+
+#ifndef TICKS2SBT
+#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
+#endif
+
+struct sysctl_ctx_list rack_sysctl_ctx;
+struct sysctl_oid *rack_sysctl_root;
+
+#ifndef TCPHPTS
+fatal error missing option TCPHPTS in the build;
+#endif
+
+#define CUM_ACKED 1
+#define SACKED 2
+
+/*
+ * The RACK module incorporates a number of
+ * TCP ideas that have been put out into the IETF
+ * over the last few years:
+ * - Matt Mathis's Rate Halving which slowly drops
+ * the congestion window so that the ack clock can
+ * be maintained during a recovery.
+ * - Yuchung Cheng's RACK TCP (for which it's named), which
+ * stops us from using the number of dup acks and instead
+ * uses time as the gauge of when we retransmit.
+ * - Reorder Detection of RFC4737 and the Tail-Loss Probe draft
+ * of Dukkipati et al.
+ * RACK depends on SACK, so if an endpoint arrives that
+ * cannot do SACK the state machine below will shuttle the
+ * connection back to using the "default" TCP stack that is
+ * in FreeBSD.
+ *
+ * To implement RACK the original TCP stack was first decomposed
+ * into a functional state machine with individual states
+ * for each of the possible TCP connection states. The do_segment
+ * function's role in life is to mandate that the connection supports SACK
+ * initially, and then to assure that the RACK state matches the connection
+ * state before calling that state's do_segment function. Each
+ * state is simplified due to the fact that the original do_segment
+ * has been decomposed and we *know* what state we are in (no
+ * switches on the state) and all tests for SACK are gone. This
+ * greatly simplifies what each state does.
+ *
+ * TCP output is also over-written with a new version since it
+ * must maintain the new rack scoreboard.
+ *
+ */
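The decomposition described above boils down to a per-state dispatch: a thin wrapper checks the SACK requirement, re-syncs the cached handler when the TCP state has changed, and then calls the one handler for the current state. The sketch below is illustrative only; the field names (r_state, r_substate) and the abbreviated argument list are assumptions made for the example, not the stack's actual signatures.

/*
 * Illustrative sketch, not the actual rack code: the shape of the
 * per-state dispatch described in the comment above.
 */
static int
example_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int32_t tlen)
{
	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;

	/* RACK requires SACK; otherwise the connection is handed back. */
	if ((tp->t_flags & TF_SACK_PERMIT) == 0)
		return (EOPNOTSUPP);	/* caller would switch stacks here */

	/* Keep the cached per-state handler in sync with tp->t_state. */
	if (rack->r_state != tp->t_state)
		rack_set_state(tp, rack);

	/* Dispatch to the single handler for this connection state. */
	return (rack->r_substate(m, th, so, tp, tlen));
}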
+static int32_t rack_precache = 1;
+static int32_t rack_tlp_thresh = 1;
+static int32_t rack_reorder_thresh = 2;
+static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000
+ * - 60 seconds */
+static int32_t rack_pkt_delay = 1;
+static int32_t rack_inc_var = 0;/* For TLP */
+static int32_t rack_reduce_largest_on_idle = 0;
+static int32_t rack_min_pace_time = 0;
+static int32_t rack_min_pace_time_seg_req = 6;
+static int32_t rack_early_recovery = 1;
+static int32_t rack_early_recovery_max_seg = 6;
+static int32_t rack_send_a_lot_in_prr = 1;
+static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
+static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */
+static int32_t rack_verbose_logging = 0;
+static int32_t rack_ignore_data_after_close = 1;
+/*
+ * Currently regular tcp has a rto_min of 30ms
+ * the backoff goes 12 times so that ends up
+ * being a total of 122.850 seconds before a
+ * connection is killed.
+ */
+static int32_t rack_tlp_min = 10;
+static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */
+static int32_t rack_rto_max = 30000; /* 30 seconds */
+static const int32_t rack_free_cache = 2;
+static int32_t rack_hptsi_segments = 40;
+static int32_t rack_rate_sample_method = USE_RTT_LOW;
+static int32_t rack_pace_every_seg = 1;
+static int32_t rack_delayed_ack_time = 200; /* 200ms */
+static int32_t rack_slot_reduction = 4;
+static int32_t rack_lower_cwnd_at_tlp = 0;
+static int32_t rack_use_proportional_reduce = 0;
+static int32_t rack_proportional_rate = 10;
+static int32_t rack_tlp_max_resend = 2;
+static int32_t rack_limited_retran = 0;
+static int32_t rack_always_send_oldest = 0;
+static int32_t rack_sack_block_limit = 128;
+static int32_t rack_use_sack_filter = 1;
+static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
+
+/* Rack specific counters */
+counter_u64_t rack_badfr;
+counter_u64_t rack_badfr_bytes;
+counter_u64_t rack_rtm_prr_retran;
+counter_u64_t rack_rtm_prr_newdata;
+counter_u64_t rack_timestamp_mismatch;
+counter_u64_t rack_reorder_seen;
+counter_u64_t rack_paced_segments;
+counter_u64_t rack_unpaced_segments;
+counter_u64_t rack_saw_enobuf;
+counter_u64_t rack_saw_enetunreach;
+
+/* Tail loss probe counters */
+counter_u64_t rack_tlp_tot;
+counter_u64_t rack_tlp_newdata;
+counter_u64_t rack_tlp_retran;
+counter_u64_t rack_tlp_retran_bytes;
+counter_u64_t rack_tlp_retran_fail;
+counter_u64_t rack_to_tot;
+counter_u64_t rack_to_arm_rack;
+counter_u64_t rack_to_arm_tlp;
+counter_u64_t rack_to_alloc;
+counter_u64_t rack_to_alloc_hard;
+counter_u64_t rack_to_alloc_emerg;
+
+counter_u64_t rack_sack_proc_all;
+counter_u64_t rack_sack_proc_short;
+counter_u64_t rack_sack_proc_restart;
+counter_u64_t rack_runt_sacks;
+counter_u64_t rack_used_tlpmethod;
+counter_u64_t rack_used_tlpmethod2;
+counter_u64_t rack_enter_tlp_calc;
+counter_u64_t rack_input_idle_reduces;
+counter_u64_t rack_tlp_does_nada;
+
+/* Temp CPU counters */
+counter_u64_t rack_find_high;
+
+counter_u64_t rack_progress_drops;
+counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
+counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
+
+static void
+rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
+
+static int
+rack_process_ack(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked,
+ uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
+static int
+rack_process_data(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static void
+rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
+ struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
+static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
+static struct rack_sendmap *
+rack_check_recovery_mode(struct tcpcb *tp,
+ uint32_t tsused);
+static void
+rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
+ uint32_t type);
+static void rack_counter_destroy(void);
+static int
+rack_ctloutput(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp);
+static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
+static void
+rack_do_segment(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
+ uint8_t iptos, int32_t ti_locked);
+static void rack_dtor(void *mem, int32_t size, void *arg);
+static void
+rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
+ uint32_t t, uint32_t cts);
+static struct rack_sendmap *
+rack_find_high_nonack(struct tcp_rack *rack,
+ struct rack_sendmap *rsm);
+static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
+static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
+static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
+static int
+rack_get_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
+static int32_t rack_handoff_ok(struct tcpcb *tp);
+static int32_t rack_init(struct tcpcb *tp);
+static void rack_init_sysctls(void);
+static void
+rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
+ struct tcphdr *th);
+static void
+rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
+ uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
+ uint8_t pass, struct rack_sendmap *hintrsm);
+static void
+rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm);
+static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
+static int32_t rack_output(struct tcpcb *tp);
+static void
+rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
+ uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv);
+
+static uint32_t
+rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
+ struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
+ uint32_t cts);
+static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+static void rack_remxt_tmr(struct tcpcb *tp);
+static int
+rack_set_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
+static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
+static int32_t rack_stopall(struct tcpcb *tp);
+static void
+rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
+ uint32_t delta);
+static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
+static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
+static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
+static uint32_t
+rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
+static void
+rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, uint32_t ts);
+static int
+rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
+static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
+static void
+rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val);
+static int
+rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_do_closing(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked);
+static void
+rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val);
+static void
+rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen);
+static int
+rack_do_established(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt);
+static int
+rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_do_lastack(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
+ int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static int
+rack_drop_checks(struct tcpopt *to, struct mbuf *m,
+ struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf,
+ int32_t * drop_hdrlen, int32_t * ret_val);
+static int
+rack_process_rst(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp, int32_t * ti_locked);
+struct rack_sendmap *
+tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
+ uint32_t tsused);
+static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
+static void
+ tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
+
+static int
+rack_ts_check(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val);
+
+int32_t rack_clear_counter = 0;
+
+
+static int
+sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t stat;
+ int32_t error;
+
+ error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
+ if (error || req->newptr == NULL)
+ return error;
+
+ error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
+ if (error)
+ return (error);
+ if (stat == 1) {
+#ifdef INVARIANTS
+ printf("Clearing RACK counters\n");
+#endif
+ counter_u64_zero(rack_badfr);
+ counter_u64_zero(rack_badfr_bytes);
+ counter_u64_zero(rack_rtm_prr_retran);
+ counter_u64_zero(rack_rtm_prr_newdata);
+ counter_u64_zero(rack_timestamp_mismatch);
+ counter_u64_zero(rack_reorder_seen);
+ counter_u64_zero(rack_tlp_tot);
+ counter_u64_zero(rack_tlp_newdata);
+ counter_u64_zero(rack_tlp_retran);
+ counter_u64_zero(rack_tlp_retran_bytes);
+ counter_u64_zero(rack_tlp_retran_fail);
+ counter_u64_zero(rack_to_tot);
+ counter_u64_zero(rack_to_arm_rack);
+ counter_u64_zero(rack_to_arm_tlp);
+ counter_u64_zero(rack_paced_segments);
+ counter_u64_zero(rack_unpaced_segments);
+ counter_u64_zero(rack_saw_enobuf);
+ counter_u64_zero(rack_saw_enetunreach);
+ counter_u64_zero(rack_to_alloc_hard);
+ counter_u64_zero(rack_to_alloc_emerg);
+ counter_u64_zero(rack_sack_proc_all);
+ counter_u64_zero(rack_sack_proc_short);
+ counter_u64_zero(rack_sack_proc_restart);
+ counter_u64_zero(rack_to_alloc);
+ counter_u64_zero(rack_find_high);
+ counter_u64_zero(rack_runt_sacks);
+ counter_u64_zero(rack_used_tlpmethod);
+ counter_u64_zero(rack_used_tlpmethod2);
+ counter_u64_zero(rack_enter_tlp_calc);
+ counter_u64_zero(rack_progress_drops);
+ counter_u64_zero(rack_tlp_does_nada);
+ }
+ rack_clear_counter = 0;
+ return (0);
+}
+
+
+
+static void
+rack_init_sysctls()
+{
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "rate_sample_method", CTLFLAG_RW,
+ &rack_rate_sample_method , USE_RTT_LOW,
+ "What method should we use for rate sampling 0=high, 1=low ");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "data_after_close", CTLFLAG_RW,
+ &rack_ignore_data_after_close, 0,
+ "Do we hold off sending a RST until all pending data is ack'd");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlpmethod", CTLFLAG_RW,
+ &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
+ "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "min_pace_time", CTLFLAG_RW,
+ &rack_min_pace_time, 0,
+ "Should we enforce a minimum pace time of 1ms");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "min_pace_segs", CTLFLAG_RW,
+ &rack_min_pace_time_seg_req, 6,
+ "How many segments have to be in the len to enforce min-pace-time");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
+ &rack_reduce_largest_on_idle, 0,
+ "Should we reduce the largest cwnd seen to IW on idle reduction");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "bb_verbose", CTLFLAG_RW,
+ &rack_verbose_logging, 0,
+ "Should RACK black box logging be verbose");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "sackfiltering", CTLFLAG_RW,
+ &rack_use_sack_filter, 1,
+ "Do we use sack filtering?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "delayed_ack", CTLFLAG_RW,
+ &rack_delayed_ack_time, 200,
+ "Delayed ack time (200ms)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlpminto", CTLFLAG_RW,
+ &rack_tlp_min, 10,
+ "TLP minimum timeout per the specification (10ms)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "precache", CTLFLAG_RW,
+ &rack_precache, 0,
+ "Where should we precache the mcopy (0 is not at all)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "sblklimit", CTLFLAG_RW,
+ &rack_sack_block_limit, 128,
+ "When do we start paying attention to small sack blocks");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "send_oldest", CTLFLAG_RW,
+ &rack_always_send_oldest, 1,
+ "Should we always send the oldest TLP and RACK-TLP");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
+ &rack_tlp_in_recovery, 1,
+ "Can we do a TLP during recovery?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "rack_tlimit", CTLFLAG_RW,
+ &rack_limited_retran, 0,
+ "How many times can a rack timeout drive out sends");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "minrto", CTLFLAG_RW,
+ &rack_rto_min, 0,
+ "Minimum RTO in ms -- set with caution below 1000 due to TLP");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "maxrto", CTLFLAG_RW,
+ &rack_rto_max, 0,
+ "Maxiumum RTO in ms -- should be at least as large as min_rto");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_retry", CTLFLAG_RW,
+ &rack_tlp_max_resend, 2,
+ "How many times does TLP retry a single segment or multiple with no ACK");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
+ &rack_use_proportional_reduce, 0,
+ "Should we proportionaly reduce cwnd based on the number of losses ");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "recovery_prop", CTLFLAG_RW,
+ &rack_proportional_rate, 10,
+ "What percent reduction per loss");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
+ &rack_lower_cwnd_at_tlp, 0,
+ "When a TLP completes a retran should we enter recovery?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
+ &rack_slot_reduction, 4,
+ "When setting a slot should we reduce by divisor");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
+ &rack_pace_every_seg, 1,
+ "Should we pace out every segment hptsi");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
+ &rack_hptsi_segments, 6,
+ "Should we pace out only a limited size of segments");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "prr_sendalot", CTLFLAG_RW,
+ &rack_send_a_lot_in_prr, 1,
+ "Send a lot in prr");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "minto", CTLFLAG_RW,
+ &rack_min_to, 1,
+ "Minimum rack timeout in milliseconds");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
+ &rack_early_recovery_max_seg, 6,
+ "Max segments in early recovery");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "earlyrecovery", CTLFLAG_RW,
+ &rack_early_recovery, 1,
+ "Do we do early recovery with rack");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "reorder_thresh", CTLFLAG_RW,
+ &rack_reorder_thresh, 2,
+ "What factor for rack will be added when seeing reordering (shift right)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
+ &rack_tlp_thresh, 1,
+ "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "reorder_fade", CTLFLAG_RW,
+ &rack_reorder_fade, 0,
+ "Does reorder detection fade, if so how many ms (0 means never)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "pktdelay", CTLFLAG_RW,
+ &rack_pkt_delay, 1,
+ "Extra RACK time (in ms) besides reordering thresh");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "inc_var", CTLFLAG_RW,
+ &rack_inc_var, 0,
+ "Should rack add to the TLP timer the variance in rtt calculation");
+ rack_badfr = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "badfr", CTLFLAG_RD,
+ &rack_badfr, "Total number of bad FRs");
+ rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "badfr_bytes", CTLFLAG_RD,
+ &rack_badfr_bytes, "Total bytes of bad FRs");
+ rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "prrsndret", CTLFLAG_RD,
+ &rack_rtm_prr_retran,
+ "Total number of prr based retransmits");
+ rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "prrsndnew", CTLFLAG_RD,
+ &rack_rtm_prr_newdata,
+ "Total number of prr based new transmits");
+ rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tsnf", CTLFLAG_RD,
+ &rack_timestamp_mismatch,
+ "Total number of timestamps that we could not find the reported ts");
+ rack_find_high = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "findhigh", CTLFLAG_RD,
+ &rack_find_high,
+ "Total number of FIN causing find-high");
+ rack_reorder_seen = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "reordering", CTLFLAG_RD,
+ &rack_reorder_seen,
+ "Total number of times we added delay due to reordering");
+ rack_tlp_tot = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_to_total", CTLFLAG_RD,
+ &rack_tlp_tot,
+ "Total number of tail loss probe expirations");
+ rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_new", CTLFLAG_RD,
+ &rack_tlp_newdata,
+ "Total number of tail loss probe sending new data");
+
+ rack_tlp_retran = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_retran", CTLFLAG_RD,
+ &rack_tlp_retran,
+ "Total number of tail loss probe sending retransmitted data");
+ rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
+ &rack_tlp_retran_bytes,
+ "Total bytes of tail loss probe sending retransmitted data");
+ rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
+ &rack_tlp_retran_fail,
+ "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
+ rack_to_tot = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "rack_to_tot", CTLFLAG_RD,
+ &rack_to_tot,
+ "Total number of times the rack to expired?");
+ rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "arm_rack", CTLFLAG_RD,
+ &rack_to_arm_rack,
+ "Total number of times the rack timer armed?");
+ rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "arm_tlp", CTLFLAG_RD,
+ &rack_to_arm_tlp,
+ "Total number of times the tlp timer armed?");
+ rack_paced_segments = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "paced", CTLFLAG_RD,
+ &rack_paced_segments,
+ "Total number of times a segment send caused hptsi");
+ rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "unpaced", CTLFLAG_RD,
+ &rack_unpaced_segments,
+ "Total number of times a segment did not cause hptsi");
+ rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "saw_enobufs", CTLFLAG_RD,
+ &rack_saw_enobuf,
+ "Total number of times a segment did not cause hptsi");
+ rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
+ &rack_saw_enetunreach,
+ "Total number of times a segment did not cause hptsi");
+ rack_to_alloc = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "allocs", CTLFLAG_RD,
+ &rack_to_alloc,
+ "Total allocations of tracking structures");
+ rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "allochard", CTLFLAG_RD,
+ &rack_to_alloc_hard,
+ "Total allocations done with sleeping the hard way");
+ rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "allocemerg", CTLFLAG_RD,
+ &rack_to_alloc_emerg,
+ "Total alocations done from emergency cache");
+ rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "sack_long", CTLFLAG_RD,
+ &rack_sack_proc_all,
+ "Total times we had to walk whole list for sack processing");
+
+ rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "sack_restart", CTLFLAG_RD,
+ &rack_sack_proc_restart,
+ "Total times we had to walk whole list due to a restart");
+ rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "sack_short", CTLFLAG_RD,
+ &rack_sack_proc_short,
+ "Total times we took shortcut for sack processing");
+ rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
+ &rack_enter_tlp_calc,
+ "Total times we called calc-tlp");
+ rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
+ &rack_used_tlpmethod,
+ "Total number of runt sacks");
+ rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
+ &rack_used_tlpmethod2,
+ "Total number of runt sacks 2");
+ rack_runt_sacks = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "runtsacks", CTLFLAG_RD,
+ &rack_runt_sacks,
+ "Total number of runt sacks");
+ rack_progress_drops = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "prog_drops", CTLFLAG_RD,
+ &rack_progress_drops,
+ "Total number of progress drops");
+ rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
+ &rack_input_idle_reduces,
+ "Total number of idle reductions on input");
+ rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "tlp_nada", CTLFLAG_RD,
+ &rack_tlp_does_nada,
+ "Total number of nada tlp calls");
+ COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "outsize", CTLFLAG_RD,
+ rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
+ COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "opts", CTLFLAG_RD,
+ rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
+ SYSCTL_ADD_PROC(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
+}
+
+static inline int32_t
+rack_progress_timeout_check(struct tcpcb *tp)
+{
+ if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
+ if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
+ /*
+ * There is an assumption that the caller
+ * will drop the connection so we will
+ * increment the counters here.
+ */
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ counter_u64_add(rack_progress_drops, 1);
+#ifdef NETFLIX_STATS
+ TCPSTAT_INC(tcps_progdrops);
+#endif
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
+ return (1);
+ }
+ }
+ return (0);
+}
+
+
+static void
+rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
+ log.u_bbr.flex2 = to;
+ log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex4 = slot;
+ log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
+ log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
+ log.u_bbr.flex8 = which;
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TIMERSTAR, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex8 = to_num;
+ log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
+ log.u_bbr.flex2 = rack->rc_rack_rtt;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_RTO, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
+ uint32_t o_srtt, uint32_t o_var)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = t;
+ log.u_bbr.flex2 = o_srtt;
+ log.u_bbr.flex3 = o_var;
+ log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
+ log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
+ log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
+ log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
+ log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
+ TCP_LOG_EVENT(tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRRTT, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
+{
+ /*
+ * Log the rtt sample we are
+ * applying to the srtt algorithm in
+ * useconds.
+ */
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ /* Convert our ms to microseconds */
+ log.u_bbr.flex1 = rtt * 1000;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_LOG_RTT, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+
+static inline void
+rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
+{
+ if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = tick;
+ log.u_bbr.flex3 = tp->t_maxunacktime;
+ log.u_bbr.flex4 = tp->t_acktime;
+ log.u_bbr.flex8 = event;
+ TCP_LOG_EVENT(tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_PROGRESS, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = slot;
+ log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
+ log.u_bbr.flex8 = rack->rc_in_persist;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRSND, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = did_out;
+ log.u_bbr.flex2 = nxt_pkt;
+ log.u_bbr.flex3 = way_out;
+ log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex7 = rack->r_wanted_output;
+ log.u_bbr.flex8 = rack->rc_in_persist;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_DOSEG_DONE, 0,
+ 0, &log, false);
+ }
+}
+
+
+static void
+rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = slot;
+ log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex7 = hpts_calling;
+ log.u_bbr.flex8 = rack->rc_in_persist;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_JUSTRET, 0,
+ tlen, &log, false);
+ }
+}
+
+static void
+rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = 0;
+ log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex4 = 0;
+ log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
+ log.u_bbr.flex8 = hpts_removed;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TIMERCANC, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = timers;
+ log.u_bbr.flex2 = ret;
+ log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
+ log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex5 = cts;
+ TCP_LOG_EVENT(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TO_PROCESS, 0,
+ 0, &log, false);
+ }
+}
+
+static void
+rack_counter_destroy()
+{
+ counter_u64_free(rack_badfr);
+ counter_u64_free(rack_badfr_bytes);
+ counter_u64_free(rack_rtm_prr_retran);
+ counter_u64_free(rack_rtm_prr_newdata);
+ counter_u64_free(rack_timestamp_mismatch);
+ counter_u64_free(rack_reorder_seen);
+ counter_u64_free(rack_tlp_tot);
+ counter_u64_free(rack_tlp_newdata);
+ counter_u64_free(rack_tlp_retran);
+ counter_u64_free(rack_tlp_retran_bytes);
+ counter_u64_free(rack_tlp_retran_fail);
+ counter_u64_free(rack_to_tot);
+ counter_u64_free(rack_to_arm_rack);
+ counter_u64_free(rack_to_arm_tlp);
+ counter_u64_free(rack_paced_segments);
+ counter_u64_free(rack_unpaced_segments);
+ counter_u64_free(rack_saw_enobuf);
+ counter_u64_free(rack_saw_enetunreach);
+ counter_u64_free(rack_to_alloc_hard);
+ counter_u64_free(rack_to_alloc_emerg);
+ counter_u64_free(rack_sack_proc_all);
+ counter_u64_free(rack_sack_proc_short);
+ counter_u64_free(rack_sack_proc_restart);
+ counter_u64_free(rack_to_alloc);
+ counter_u64_free(rack_find_high);
+ counter_u64_free(rack_runt_sacks);
+ counter_u64_free(rack_enter_tlp_calc);
+ counter_u64_free(rack_used_tlpmethod);
+ counter_u64_free(rack_used_tlpmethod2);
+ counter_u64_free(rack_progress_drops);
+ counter_u64_free(rack_input_idle_reduces);
+ counter_u64_free(rack_tlp_does_nada);
+ COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
+ COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
+}
+
+static struct rack_sendmap *
+rack_alloc(struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+
+ counter_u64_add(rack_to_alloc, 1);
+ rack->r_ctl.rc_num_maps_alloced++;
+ rsm = uma_zalloc(rack_zone, M_NOWAIT);
+ if (rsm) {
+ return (rsm);
+ }
+ if (rack->rc_free_cnt) {
+ counter_u64_add(rack_to_alloc_emerg, 1);
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
+ rack->rc_free_cnt--;
+ return (rsm);
+ }
+ return (NULL);
+}
+
+static void
+rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
+{
+ rack->r_ctl.rc_num_maps_alloced--;
+ if (rack->r_ctl.rc_tlpsend == rsm)
+ rack->r_ctl.rc_tlpsend = NULL;
+ if (rack->r_ctl.rc_next == rsm)
+ rack->r_ctl.rc_next = NULL;
+ if (rack->r_ctl.rc_sacklast == rsm)
+ rack->r_ctl.rc_sacklast = NULL;
+ if (rack->rc_free_cnt < rack_free_cache) {
+ memset(rsm, 0, sizeof(struct rack_sendmap));
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+ rack->rc_free_cnt++;
+ return;
+ }
+ uma_zfree(rack_zone, rsm);
+}
+
+/*
+ * CC wrapper hook functions
+ */
+static void
+rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
+ uint16_t type, int32_t recovery)
+{
+#ifdef NETFLIX_STATS
+ int32_t gput;
+#endif
+#ifdef NETFLIX_CWV
+ u_long old_cwnd = tp->snd_cwnd;
+#endif
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tp->ccv->nsegs = nsegs;
+ tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
+ if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
+ uint32_t max;
+
+ max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
+ if (tp->ccv->bytes_this_ack > max) {
+ tp->ccv->bytes_this_ack = max;
+ }
+ }
+ if (tp->snd_cwnd <= tp->snd_wnd)
+ tp->ccv->flags |= CCF_CWND_LIMITED;
+ else
+ tp->ccv->flags &= ~CCF_CWND_LIMITED;
+
+ if (type == CC_ACK) {
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
+ ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
+ if ((tp->t_flags & TF_GPUTINPROG) &&
+ SEQ_GEQ(th->th_ack, tp->gput_ack)) {
+ gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
+ max(1, tcp_ts_getticks() - tp->gput_ts);
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
+ gput);
+ /*
+ * XXXLAS: This is a temporary hack, and should be
+ * chained off VOI_TCP_GPUT when stats(9) grows an
+ * API to deal with chained VOIs.
+ */
+ if (tp->t_stats_gput_prev > 0)
+ stats_voi_update_abs_s32(tp->t_stats,
+ VOI_TCP_GPUT_ND,
+ ((gput - tp->t_stats_gput_prev) * 100) /
+ tp->t_stats_gput_prev);
+ tp->t_flags &= ~TF_GPUTINPROG;
+ tp->t_stats_gput_prev = gput;
+
+ if (tp->t_maxpeakrate) {
+ /*
+ * We update t_peakrate_thr. This gives us roughly
+ * one update per round trip time.
+ */
+ tcp_update_peakrate_thr(tp);
+ }
+ }
+#endif
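+		/*
+		 * Appropriate Byte Counting (RFC 3465): above ssthresh we only
+		 * credit up to V_tcp_abc_l_var * maxseg worth of newly acked
+		 * bytes per ACK toward window growth.
+		 */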
+ if (tp->snd_cwnd > tp->snd_ssthresh) {
+ tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
+ nsegs * V_tcp_abc_l_var * tp->t_maxseg);
+ if (tp->t_bytes_acked >= tp->snd_cwnd) {
+ tp->t_bytes_acked -= tp->snd_cwnd;
+ tp->ccv->flags |= CCF_ABC_SENTAWND;
+ }
+ } else {
+ tp->ccv->flags &= ~CCF_ABC_SENTAWND;
+ tp->t_bytes_acked = 0;
+ }
+ }
+ if (CC_ALGO(tp)->ack_received != NULL) {
+ /* XXXLAS: Find a way to live without this */
+ tp->ccv->curack = th->th_ack;
+ CC_ALGO(tp)->ack_received(tp->ccv, type);
+ }
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
+#endif
+ if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
+ rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
+ }
+#ifdef NETFLIX_CWV
+ if (tp->cwv_enabled) {
+ /*
+ * Per RFC 7661: The behaviour in the non-validated phase is
+ * specified as: o A sender determines whether to increase
+ * the cwnd based upon whether it is cwnd-limited (see
+ * Section 4.5.3): * A sender that is cwnd-limited MAY use
+ * the standard TCP method to increase cwnd (i.e., the
+ * standard method permits a TCP sender that fully utilises
+ * the cwnd to increase the cwnd each time it receives an
+ * ACK). * A sender that is not cwnd-limited MUST NOT
+ * increase the cwnd when ACK packets are received in this
+ * phase (i.e., needs to avoid growing the cwnd when it has
+ * not recently sent using the current size of cwnd).
+ */
+ if ((tp->snd_cwnd > old_cwnd) &&
+ (tp->cwv_cwnd_valid == 0) &&
+ (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
+ tp->snd_cwnd = old_cwnd;
+ }
+ /* Try to update pipeAck and NCWV state */
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ !IN_RECOVERY(tp->t_flags)) {
+ uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
+
+ tcp_newcwv_update_pipeack(tp, data);
+ }
+ }
+#endif
+ /* we enforce max peak rate if it is set. */
+ if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
+ tp->snd_cwnd = tp->t_peakrate_thr;
+ }
+}
+
+static void
+tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
+{
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (rack->r_ctl.rc_prr_sndcnt > 0)
+ rack->r_wanted_output++;
+}
+
+static void
+rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
+{
+ struct tcp_rack *rack;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (CC_ALGO(tp)->post_recovery != NULL) {
+ tp->ccv->curack = th->th_ack;
+ CC_ALGO(tp)->post_recovery(tp->ccv);
+ }
+ /*
+ * Here we can in theory adjust cwnd to be based on the number of
+ * losses in the window (rack->r_ctl.rc_loss_count). This is done
+ * based on the rack_use_proportional flag.
+ */
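+	/*
+	 * For illustration only: with rc_prop_rate = 10 and 3 losses in the
+	 * window, reduce = 30, so cwnd is cut by 30% (the reduction is capped
+	 * at 50%).
+	 */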
+ if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
+ int32_t reduce;
+
+ reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
+ if (reduce > 50) {
+ reduce = 50;
+ }
+ tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
+ } else {
+ if (tp->snd_cwnd > tp->snd_ssthresh) {
+ /* Drop us down to the ssthresh (1/2 cwnd at loss) */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ }
+ }
+ if (rack->r_ctl.rc_prr_sndcnt > 0) {
+ /* Suck the next prr cnt back into cwnd */
+ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
+ rack->r_ctl.rc_prr_sndcnt = 0;
+ }
+ EXIT_RECOVERY(tp->t_flags);
+
+
+#ifdef NETFLIX_CWV
+ if (tp->cwv_enabled) {
+ if ((tp->cwv_cwnd_valid == 0) &&
+ (tp->snd_cwv.in_recovery))
+ tcp_newcwv_end_recovery(tp);
+ }
+#endif
+}
+
+static void
+rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
+{
+ struct tcp_rack *rack;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ switch (type) {
+ case CC_NDUPACK:
+/* rack->r_ctl.rc_ssthresh_set = 1;*/
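+		/*
+		 * Entering fast recovery: reset the PRR bookkeeping and seed
+		 * rc_prr_sndcnt with one segment so PRR can send immediately.
+		 */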
+ if (!IN_FASTRECOVERY(tp->t_flags)) {
+ rack->r_ctl.rc_tlp_rtx_out = 0;
+ rack->r_ctl.rc_prr_delivered = 0;
+ rack->r_ctl.rc_prr_out = 0;
+ rack->r_ctl.rc_loss_count = 0;
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
+ tp->snd_recover = tp->snd_max;
+ if (tp->t_flags & TF_ECN_PERMIT)
+ tp->t_flags |= TF_ECN_SND_CWR;
+ }
+ break;
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(tp->t_flags)) {
+ TCPSTAT_INC(tcps_ecn_rcwnd);
+ tp->snd_recover = tp->snd_max;
+ if (tp->t_flags & TF_ECN_PERMIT)
+ tp->t_flags |= TF_ECN_SND_CWR;
+ }
+ break;
+ case CC_RTO:
+ tp->t_dupacks = 0;
+ tp->t_bytes_acked = 0;
+ EXIT_RECOVERY(tp->t_flags);
+ tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
+ tp->t_maxseg) * tp->t_maxseg;
+ tp->snd_cwnd = tp->t_maxseg;
+ break;
+ case CC_RTO_ERR:
+ TCPSTAT_INC(tcps_sndrexmitbad);
+ /* RTO was unnecessary, so reset everything. */
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ tp->snd_recover = tp->snd_recover_prev;
+ if (tp->t_flags & TF_WASFRECOVERY)
+ ENTER_FASTRECOVERY(tp->t_flags);
+ if (tp->t_flags & TF_WASCRECOVERY)
+ ENTER_CONGRECOVERY(tp->t_flags);
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0;
+ break;
+ }
+
+ if (CC_ALGO(tp)->cong_signal != NULL) {
+ if (th != NULL)
+ tp->ccv->curack = th->th_ack;
+ CC_ALGO(tp)->cong_signal(tp->ccv, type);
+ }
+#ifdef NETFLIX_CWV
+ if (tp->cwv_enabled) {
+ if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
+ tcp_newcwv_enter_recovery(tp);
+ }
+ if (type == CC_RTO) {
+ tcp_newcwv_reset(tp);
+ }
+ }
+#endif
+}
+
+
+
+static inline void
+rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
+{
+ uint32_t i_cwnd;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+#ifdef NETFLIX_STATS
+ TCPSTAT_INC(tcps_idle_restarts);
+ if (tp->t_state == TCPS_ESTABLISHED)
+ TCPSTAT_INC(tcps_idle_estrestarts);
+#endif
+ if (CC_ALGO(tp)->after_idle != NULL)
+ CC_ALGO(tp)->after_idle(tp->ccv);
+
+ if (tp->snd_cwnd == 1)
+ i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
+ else if (V_tcp_initcwnd_segments)
+ i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
+ max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
+ else if (V_tcp_do_rfc3390)
+ i_cwnd = min(4 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 4380));
+ else {
+ /* Per RFC5681 Section 3.1 */
+ if (tp->t_maxseg > 2190)
+ i_cwnd = 2 * tp->t_maxseg;
+ else if (tp->t_maxseg > 1095)
+ i_cwnd = 3 * tp->t_maxseg;
+ else
+ i_cwnd = 4 * tp->t_maxseg;
+ }
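+	/*
+	 * For illustration only: with t_maxseg = 1460 and
+	 * V_tcp_initcwnd_segments = 10, i_cwnd = min(14600, max(2920, 14600))
+	 * = 14600 bytes.
+	 */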
+ if (reduce_largest) {
+ /*
+ * Do we reduce the largest cwnd to make
+ * rack play nice on restart hptsi wise?
+ */
+ if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd)
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
+ }
+ /*
+	 * Being idle is no different from the initial window. If the cc
+	 * clamps it down below the initial window, raise it to the initial
+	 * window.
+ */
+ if (tp->snd_cwnd < i_cwnd) {
+ tp->snd_cwnd = i_cwnd;
+ }
+}
+
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * following conditions are met:
+ * - There is no delayed ack timer in progress.
+ * - Our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ * - LRO wasn't used for this segment. We make sure by checking that the
+ * segment size is not larger than the MSS.
+ * - Delayed acks are enabled or this is a half-synchronized T/TCP
+ * connection.
+ */
+#define DELAY_ACK(tp, tlen) \
+ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
+ ((tp->t_flags & TF_DELACK) == 0) && \
+ (tlen <= tp->t_maxseg) && \
+ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
+
+static inline void
+rack_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+ int32_t win;
+
+ /*
+ * Calculate amount of space in receive window, and then do TCP
+ * input processing. Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+}
+
+static void
+rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked)
+{
+ if (*ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ }
+ /*
+ * Drop space held by incoming segment and return.
+ */
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ if (m)
+ m_freem(m);
+}
+
+static void
+rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen)
+{
+ if (*ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ }
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+static void
+rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+ /*
+ * Generate an ACK dropping incoming segment if it occupies sequence
+ * space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all paths to this
+ * code happen after packets containing RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the segment
+ * we received passes the SYN-RECEIVED ACK test. If it fails send a
+ * RST. This breaks the loop in the "LAND" DoS attack, and also
+ * prevents an ACK storm between two listening ports that have been
+ * sent forged SYN segments, each with the source address of the
+ * other.
+ */
+ struct tcp_rack *rack;
+
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ *ret_val = 1;
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return;
+ } else
+ *ret_val = 0;
+ if (*ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack->r_wanted_output++;
+ tp->t_flags |= TF_ACKNOW;
+ if (m)
+ m_freem(m);
+}
+
+
+static int
+rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked)
+{
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
+ * window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should test against
+ * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
+ * of closed window, not covered by the RFC.
+ */
+ int dropped = 0;
+
+ if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(*ti_locked == TI_RLOCKED,
+ ("%s: TH_RST ti_locked %d, th %p tp %p",
+ __func__, *ti_locked, th, tp));
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ (tp->last_ack_sent == th->th_seq) ||
+ (tp->rcv_nxt == th->th_seq) ||
+ ((tp->last_ack_sent - 1) == th->th_seq)) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ dropped = 1;
+ rack_do_drop(m, tp, ti_locked);
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ }
+ } else {
+ m_freem(m);
+ }
+ return (dropped);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+static void
+rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val)
+{
+ KASSERT(*ti_locked == TI_RLOCKED,
+ ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ *ret_val = 1;
+ rack_do_drop(m, tp, ti_locked);
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ *ret_val = 0;
+ rack_do_drop(m, NULL, ti_locked);
+ }
+}
+
+/*
+ * rack_ts_check returns 1 for you should not proceed. It places
+ * rack_ts_check returns 1 if you should not proceed. It places
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+static int
+rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+
+ /* Check to see if ts_recent is over 24 days old. */
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates ts_recent,
+ * the age will be reset later and ts_recent will get a
+ * valid value. If it does not, setting ts_recent to zero
+ * will at least satisfy the requirement that zero be placed
+ * in the timestamp echo reply when ts_recent isn't valid.
+ * The age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be dropped
+ * when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ *ret_val = 0;
+ if (tlen) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ }
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * rack_drop_checks returns 1 if you should not proceed. It places
+ * in ret_val what should be returned 1/0 by the caller. The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+static int
+rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+ int32_t todrop;
+ int32_t thflags;
+ int32_t tlen;
+
+ thflags = *thf;
+ tlen = *tlenp;
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ *drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+ /*
+ * If segment ends after window, drop trailing data (and PUSH and
+ * FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment and
+ * ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
+ return (1);
+ }
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH | TH_FIN);
+ }
+ *thf = thflags;
+ *tlenp = tlen;
+ return (0);
+}
+
+static struct rack_sendmap *
+rack_find_lowest_rsm(struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+
+ /*
+	 * Walk the time-ordered transmitted list looking for an rsm that is
+	 * not acked. This will be the one that was sent the longest time
+	 * ago and is still outstanding.
+ */
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
+ if (rsm->r_flags & RACK_ACKED) {
+ continue;
+ }
+ goto finish;
+ }
+finish:
+ return (rsm);
+}
+
+static struct rack_sendmap *
+rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
+{
+ struct rack_sendmap *prsm;
+
+ /*
+	 * Walk the sequence-ordered list backward until we arrive at the
+	 * highest seq that is not acked. In theory, when this is called, that
+	 * should be the last segment (but it was not, which is why we walk).
+ */
+ counter_u64_add(rack_find_high, 1);
+ prsm = rsm;
+ TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
+ if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
+ continue;
+ }
+ return (prsm);
+ }
+ return (NULL);
+}
+
+
+static uint32_t
+rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
+{
+ int32_t lro;
+ uint32_t thresh;
+
+ /*
+ * lro is the flag we use to determine if we have seen reordering.
+	 * If it gets set we have seen reordering. The reorder logic works in
+	 * one of two ways:
+	 *
+	 * If reorder-fade is configured, then we track the last time we saw
+	 * re-ordering occur. If we reach the point where enough time has
+	 * passed we no longer consider reordering to be occurring.
+	 *
+	 * Or if reorder-fade is 0, then once we see reordering we consider
+	 * the connection to always be subject to reordering and just set lro
+ * to 1.
+ *
+ * In the end if lro is non-zero we add the extra time for
+ * reordering in.
+ */
+ if (srtt == 0)
+ srtt = 1;
+ if (rack->r_ctl.rc_reorder_ts) {
+ if (rack->r_ctl.rc_reorder_fade) {
+ if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
+ lro = cts - rack->r_ctl.rc_reorder_ts;
+ if (lro == 0) {
+ /*
+				 * No time has passed since the last
+				 * reorder; mark it as reordering.
+ */
+ lro = 1;
+ }
+ } else {
+ /* Negative time? */
+ lro = 0;
+ }
+ if (lro > rack->r_ctl.rc_reorder_fade) {
+ /* Turn off reordering seen too */
+ rack->r_ctl.rc_reorder_ts = 0;
+ lro = 0;
+ }
+ } else {
+			/* Reordering does not fade */
+ lro = 1;
+ }
+ } else {
+ lro = 0;
+ }
+ thresh = srtt + rack->r_ctl.rc_pkt_delay;
+ if (lro) {
+		/* If the reorder shift is set, use it; if not you get 1/4 rtt */
+ if (rack->r_ctl.rc_reorder_shift)
+ thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
+ else
+ thresh += (srtt >> 2);
+ } else {
+ thresh += 1;
+ }
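+	/*
+	 * For illustration only: with srtt = 100ms, rc_pkt_delay = 1 and
+	 * reordering seen with rc_reorder_shift = 2, thresh becomes
+	 * 100 + 1 + (100 >> 2) = 126ms before the RTO caps below.
+	 */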
+ /* We don't let the rack timeout be above a RTO */
+
+ if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
+ thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
+ }
+ /* And we don't want it above the RTO max either */
+ if (thresh > rack_rto_max) {
+ thresh = rack_rto_max;
+ }
+ return (thresh);
+}
+
+static uint32_t
+rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, uint32_t srtt)
+{
+ struct rack_sendmap *prsm;
+ uint32_t thresh, len;
+ int maxseg;
+
+ if (srtt == 0)
+ srtt = 1;
+ if (rack->r_ctl.rc_tlp_threshold)
+ thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
+ else
+ thresh = (srtt * 2);
+
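+	/*
+	 * For illustration only: with srtt = 100ms and rc_tlp_threshold = 2
+	 * the base thresh is 100 + 50 = 150ms; with the threshold unset it is
+	 * 2 * srtt = 200ms. The checks below may raise or cap this value.
+	 */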
+ /* Get the previous sent packet, if any */
+ maxseg = tcp_maxseg(tp);
+ counter_u64_add(rack_enter_tlp_calc, 1);
+ len = rsm->r_end - rsm->r_start;
+ if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
+ /* Exactly like the ID */
+ if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
+ uint32_t alt_thresh;
+ /*
+ * Compensate for delayed-ack with the d-ack time.
+ */
+ counter_u64_add(rack_used_tlpmethod, 1);
+ alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
+ if (alt_thresh > thresh)
+ thresh = alt_thresh;
+ }
+ } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
+ /* 2.1 behavior */
+ prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
+ if (prsm && (len <= maxseg)) {
+ /*
+ * Two packets outstanding, thresh should be (2*srtt) +
+ * possible inter-packet delay (if any).
+ */
+ uint32_t inter_gap = 0;
+ int idx, nidx;
+
+ counter_u64_add(rack_used_tlpmethod, 1);
+ idx = rsm->r_rtr_cnt - 1;
+ nidx = prsm->r_rtr_cnt - 1;
+ if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
+ /* Yes it was sent later (or at the same time) */
+ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
+ }
+ thresh += inter_gap;
+ } else if (len <= maxseg) {
+ /*
+ * Possibly compensate for delayed-ack.
+ */
+ uint32_t alt_thresh;
+
+ counter_u64_add(rack_used_tlpmethod2, 1);
+ alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
+ if (alt_thresh > thresh)
+ thresh = alt_thresh;
+ }
+ } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
+ /* 2.2 behavior */
+ if (len <= maxseg) {
+ uint32_t alt_thresh;
+ /*
+ * Compensate for delayed-ack with the d-ack time.
+ */
+ counter_u64_add(rack_used_tlpmethod, 1);
+ alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
+ if (alt_thresh > thresh)
+ thresh = alt_thresh;
+ }
+ }
+ /* Not above an RTO */
+ if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
+ thresh = TICKS_2_MSEC(tp->t_rxtcur);
+ }
+ /* Not above a RTO max */
+ if (thresh > rack_rto_max) {
+ thresh = rack_rto_max;
+ }
+ /* Apply user supplied min TLP */
+ if (thresh < rack_tlp_min) {
+ thresh = rack_tlp_min;
+ }
+ return (thresh);
+}
+
+static struct rack_sendmap *
+rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
+{
+ /*
+	 * Check to see whether we need to fall into recovery. We will need
+	 * to do so if our oldest transmission is past the time by which we
+	 * should have had an ack.
+ */
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+ int32_t idx;
+ uint32_t srtt_cur, srtt, thresh;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
+ return (NULL);
+ }
+ srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
+ srtt = TICKS_2_MSEC(srtt_cur);
+ if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
+ srtt = rack->rc_rack_rtt;
+
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm == NULL)
+ return (NULL);
+
+ if (rsm->r_flags & RACK_ACKED) {
+ rsm = rack_find_lowest_rsm(rack);
+ if (rsm == NULL)
+ return (NULL);
+ }
+ idx = rsm->r_rtr_cnt - 1;
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ if (tsused < rsm->r_tim_lastsent[idx]) {
+ return (NULL);
+ }
+ if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
+ return (NULL);
+ }
+ /* Ok if we reach here we are over-due */
+ rack->r_ctl.rc_rsm_start = rsm->r_start;
+ rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
+ rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
+ rack_cong_signal(tp, NULL, CC_NDUPACK);
+ return (rsm);
+}
+
+static uint32_t
+rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ int32_t t;
+ int32_t tt;
+ uint32_t ret_val;
+
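+	/*
+	 * The persist interval is srtt plus four times the RTT variance
+	 * (converted to ms), scaled by tcp_backoff[t_rxtshift] and clamped to
+	 * the [tcp_persmin, tcp_persmax] range.
+	 */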
+ t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
+ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
+ tcp_persmin, tcp_persmax);
+ if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+ tp->t_rxtshift++;
+ rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
+ ret_val = (uint32_t)tt;
+ return (ret_val);
+}
+
+static uint32_t
+rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ /*
+	 * Start the FR timer; we do this based on getting the first one in
+	 * the rc_tmap. Note that if it is NULL we must stop the timer. In all
+	 * events we need to stop the running timer (if it is running) before
+	 * starting the new one.
+ */
+ uint32_t thresh, exp, to, srtt, time_since_sent;
+ uint32_t srtt_cur;
+ int32_t idx;
+ int32_t is_tlp_timer = 0;
+ struct rack_sendmap *rsm;
+
+ if (rack->t_timers_stopped) {
+		/* All timers have been stopped; none are to run */
+ return (0);
+ }
+ if (rack->rc_in_persist) {
+ /* We can't start any timer in persists */
+ return (rack_get_persists_timer_val(tp, rack));
+ }
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm == NULL) {
+ /* Nothing on the send map */
+activate_rxt:
+ if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
+ rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
+ to = TICKS_2_MSEC(tp->t_rxtcur);
+ if (to == 0)
+ to = 1;
+ return (to);
+ }
+ return (0);
+ }
+ if (rsm->r_flags & RACK_ACKED) {
+ rsm = rack_find_lowest_rsm(rack);
+ if (rsm == NULL) {
+ /* No lowest? */
+ goto activate_rxt;
+ }
+ }
+	/*
+	 * If the oldest outstanding rsm has been SACK-passed we arm the
+	 * RACK timer; otherwise we consider a TLP instead.
+	 */
+ if (rsm->r_flags & RACK_SACK_PASSED) {
+ if ((tp->t_flags & TF_SENTFIN) &&
+ ((tp->snd_max - tp->snd_una) == 1) &&
+ (rsm->r_flags & RACK_HAS_FIN)) {
+ /*
+ * We don't start a rack timer if all we have is a
+ * FIN outstanding.
+ */
+ goto activate_rxt;
+ }
+ if (tp->t_srtt) {
+ srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
+ srtt = TICKS_2_MSEC(srtt_cur);
+ } else
+ srtt = RACK_INITIAL_RTO;
+
+ thresh = rack_calc_thresh_rack(rack, srtt, cts);
+ idx = rsm->r_rtr_cnt - 1;
+ exp = rsm->r_tim_lastsent[idx] + thresh;
+ if (SEQ_GEQ(exp, cts)) {
+ to = exp - cts;
+ if (to < rack->r_ctl.rc_min_to) {
+ to = rack->r_ctl.rc_min_to;
+ }
+ } else {
+ to = rack->r_ctl.rc_min_to;
+ }
+ } else {
+ /* Ok we need to do a TLP not RACK */
+ if ((rack->rc_tlp_in_progress != 0) ||
+ (rack->r_ctl.rc_tlp_rtx_out != 0)) {
+ /*
+			 * The previous send was a TLP, or a TLP retransmission
+			 * is in progress.
+ */
+ goto activate_rxt;
+ }
+ rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
+ if (rsm == NULL) {
+ /* We found no rsm to TLP with. */
+ goto activate_rxt;
+ }
+ if (rsm->r_flags & RACK_HAS_FIN) {
+			/* If it's a FIN we don't do TLP */
+ rsm = NULL;
+ goto activate_rxt;
+ }
+ idx = rsm->r_rtr_cnt - 1;
+ if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx]))
+ time_since_sent = cts - rsm->r_tim_lastsent[idx];
+ else
+ time_since_sent = 0;
+ is_tlp_timer = 1;
+ if (tp->t_srtt) {
+ srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
+ srtt = TICKS_2_MSEC(srtt_cur);
+ } else
+ srtt = RACK_INITIAL_RTO;
+ thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
+ if (thresh > time_since_sent)
+ to = thresh - time_since_sent;
+ else
+ to = rack->r_ctl.rc_min_to;
+ if (to > TCPTV_REXMTMAX) {
+ /*
+			 * If the TLP time works out to be larger than the max
+			 * RTO, let's not do TLP; just RTO.
+ */
+ goto activate_rxt;
+ }
+ if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
+ /*
+			 * The tail is no longer the last one we did a probe
+			 * on.
+ */
+ rack->r_ctl.rc_tlp_seg_send_cnt = 0;
+ rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
+ }
+ }
+ if (is_tlp_timer == 0) {
+ rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
+ } else {
+ if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
+ (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
+ /*
+			 * We have exceeded how many times we can retransmit on
+			 * the current TLP timer; switch to the RTO timer.
+ */
+ goto activate_rxt;
+ } else {
+ rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
+ }
+ }
+ if (to == 0)
+ to = 1;
+ return (to);
+}
+
+static void
+rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ if (rack->rc_in_persist == 0) {
+ if (((tp->t_flags & TF_SENTFIN) == 0) &&
+ (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
+ /* Must need to send more data to enter persist */
+ return;
+ rack->r_ctl.rc_went_idle_time = cts;
+ rack_timer_cancel(tp, rack, cts, __LINE__);
+ tp->t_rxtshift = 0;
+ rack->rc_in_persist = 1;
+ }
+}
+
+static void
+rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ if (rack->rc_inp->inp_in_hpts) {
+ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ rack->r_ctl.rc_hpts_flags = 0;
+ }
+ rack->rc_in_persist = 0;
+ rack->r_ctl.rc_went_idle_time = 0;
+ tp->t_flags &= ~TF_FORCEDATA;
+ tp->t_rxtshift = 0;
+}
+
+static void
+rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
+ int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
+{
+ struct inpcb *inp;
+ uint32_t delayed_ack = 0;
+ uint32_t hpts_timeout;
+ uint8_t stopped;
+ uint32_t left = 0;
+
+ inp = tp->t_inpcb;
+ if (inp->inp_in_hpts) {
+ /* A previous call is already set up */
+ return;
+ }
+ if (tp->t_state == TCPS_CLOSED) {
+ return;
+ }
+ stopped = rack->rc_tmr_stopped;
+ if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
+ left = rack->r_ctl.rc_timer_exp - cts;
+ }
+ rack->r_ctl.rc_timer_exp = 0;
+ if (rack->rc_inp->inp_in_hpts == 0) {
+ rack->r_ctl.rc_hpts_flags = 0;
+ }
+ if (slot) {
+ /* We are hptsi too */
+ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
+ } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ /*
+		 * We are still left on the hpts; when the timeout goes off
+		 * it will be for output.
+ */
+ if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
+ slot = cts - rack->r_ctl.rc_last_output_to;
+ else
+ slot = 1;
+ }
+ if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
+ /* No send window.. we must enter persist */
+ rack_enter_persist(tp, rack, cts);
+ } else if ((frm_out_sbavail &&
+ (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
+ (tp->snd_wnd < tp->t_maxseg)) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ /*
+		 * If we have no window, or we can't send a segment and have
+		 * data to send (we cheat here: frm_out_sbavail is passed in
+		 * with sbavail(sb) only from bbr_output), and we are
+		 * established, then we must enter persist (if not already in
+		 * persist).
+ */
+ rack_enter_persist(tp, rack, cts);
+ }
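+	/*
+	 * Work out the protocol timer (RACK/TLP/RXT/persist) value, then fold
+	 * in a pending delayed ACK; whichever expires first drives
+	 * rc_timer_exp below.
+	 */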
+ hpts_timeout = rack_timer_start(tp, rack, cts);
+ if (tp->t_flags & TF_DELACK) {
+ delayed_ack = tcp_delacktime;
+ rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
+ }
+ if (delayed_ack && ((hpts_timeout == 0) ||
+ (delayed_ack < hpts_timeout)))
+ hpts_timeout = delayed_ack;
+ else
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
+ /*
+ * If no timers are going to run and we will fall off the hptsi
+	 * If no timers are going to run and we will fall off the hptsi
+	 * wheel, we resort to a keep-alive timer if it is configured.
+ if ((hpts_timeout == 0) &&
+ (slot == 0)) {
+ if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)) {
+ /*
+ * Ok we have no timer (persists, rack, tlp, rxt or
+ * del-ack), we don't have segments being paced. So
+ * all that is left is the keepalive timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ /* Get the established keep-alive time */
+ hpts_timeout = TP_KEEPIDLE(tp);
+ } else {
+ /* Get the initial setup keep-alive time */
+ hpts_timeout = TP_KEEPINIT(tp);
+ }
+ rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
+ }
+ }
+ if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
+ (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
+ /*
+		 * The RACK, TLP, persist and RXT timers are all restartable
+		 * based on input actions, i.e. we received a packet (ack
+		 * or sack) and that changes things (rwnd, snd_una, etc.).
+		 * Thus we can restart them with a new value. For keep-alive
+		 * and delayed ack we keep track of what was left and restart
+		 * the timer with a smaller value.
+ */
+ if (left < hpts_timeout)
+ hpts_timeout = left;
+ }
+ if (hpts_timeout) {
+ /*
+		 * Hack alert: for now we can't time out over 2,147,483
+		 * seconds (a bit more than 596 hours), which is probably ok
+		 * :).
+ */
+ if (hpts_timeout > 0x7ffffffe)
+ hpts_timeout = 0x7ffffffe;
+ rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
+ }
+ if (slot) {
+ rack->r_ctl.rc_last_output_to = cts + slot;
+ if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
+ if (rack->rc_inp->inp_in_hpts == 0)
+ tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
+ rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
+ } else {
+ /*
+ * Arrange for the hpts to kick back in after the
+ * t-o if the t-o does not cause a send.
+ */
+ if (rack->rc_inp->inp_in_hpts == 0)
+ tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
+ rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
+ }
+ } else if (hpts_timeout) {
+ if (rack->rc_inp->inp_in_hpts == 0)
+ tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
+ rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
+ } else {
+ /* No timer starting */
+#ifdef INVARIANTS
+ if (SEQ_GT(tp->snd_max, tp->snd_una)) {
+ panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
+ tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
+ }
+#endif
+ }
+ rack->rc_tmr_stopped = 0;
+ if (slot)
+ rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
+}
+
+/*
+ * RACK timer: here we simply do logging and housekeeping.
+ * The normal rack_output() function will call the
+ * appropriate thing to check if we need to do a RACK retransmit.
+ * We return 1, saying don't proceed with rack_output, only
+ * when all timers have been stopped (destroyed PCB?).
+ */
+static int
+rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ /*
+ * This timer simply provides an internal trigger to send out data.
+ * The check_recovery_mode call will see if there are needed
+ * retransmissions, if so we will enter fast-recovery. The output
+ * call may or may not do the same thing depending on sysctl
+ * settings.
+ */
+ struct rack_sendmap *rsm;
+ int32_t recovery;
+
+ if (tp->t_timers->tt_flags & TT_STOPPED) {
+ return (1);
+ }
+ if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
+ /* Its not time yet */
+ return (0);
+ }
+ rack_log_to_event(rack, RACK_TO_FRM_RACK);
+ recovery = IN_RECOVERY(tp->t_flags);
+ counter_u64_add(rack_to_tot, 1);
+ if (rack->r_state && (rack->r_state != tp->t_state))
+ rack_set_state(tp, rack);
+ rsm = rack_check_recovery_mode(tp, cts);
+ if (rsm) {
+ uint32_t rtt;
+
+ rtt = rack->rc_rack_rtt;
+ if (rtt == 0)
+ rtt = 1;
+ if ((recovery == 0) &&
+ (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
+ /*
+			 * The rack-timeout that enters us into recovery
+ * will force out one MSS and set us up so that we
+ * can do one more send in 2*rtt (transitioning the
+ * rack timeout into a rack-tlp).
+ */
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
+ ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
+ /*
+			 * When a rack timer goes off, we have to send at
+ * least one segment. They will be paced a min of 1ms
+ * apart via the next rack timer (or further
+ * if the rack timer dictates it).
+ */
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ }
+ } else {
+ /* This is a case that should happen rarely if ever */
+ counter_u64_add(rack_tlp_does_nada, 1);
+#ifdef TCP_BLACKBOX
+ tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
+#endif
+ rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ }
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
+ return (0);
+}
+
+/*
+ * TLP Timer, here we simply setup what segment we want to
+ * have the TLP expire on, the normal rack_output() will then
+ * send it out.
+ *
+ * We return 1, saying don't proceed with rack_output only
+ * when all timers have been stopped (destroyed PCB?).
+ */
+static int
+rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ /*
+ * Tail Loss Probe.
+ */
+ struct rack_sendmap *rsm = NULL;
+ struct socket *so;
+ uint32_t amm, old_prr_snd = 0;
+ uint32_t out, avail;
+
+ if (tp->t_timers->tt_flags & TT_STOPPED) {
+ return (1);
+ }
+ if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
+ /* Its not time yet */
+ return (0);
+ }
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ return (1);
+ }
+ /*
+ * A TLP timer has expired. We have been idle for 2 rtts. So we now
+ * need to figure out how to force a full MSS segment out.
+ */
+ rack_log_to_event(rack, RACK_TO_FRM_TLP);
+ counter_u64_add(rack_tlp_tot, 1);
+ if (rack->r_state && (rack->r_state != tp->t_state))
+ rack_set_state(tp, rack);
+ so = tp->t_inpcb->inp_socket;
+ avail = sbavail(&so->so_snd);
+ out = tp->snd_max - tp->snd_una;
+ rack->rc_timer_up = 1;
+ /*
+ * If we are in recovery we can jazz out a segment if new data is
+ * present simply by setting rc_prr_sndcnt to a segment.
+ */
+ if ((avail > out) &&
+ ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
+ /* New data is available */
+ amm = avail - out;
+ if (amm > tp->t_maxseg) {
+ amm = tp->t_maxseg;
+ } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
+			/* not enough to fill an MTU and no-delay is off */
+ goto need_retran;
+ }
+ if (IN_RECOVERY(tp->t_flags)) {
+ /* Unlikely */
+ old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
+ if (out + amm <= tp->snd_wnd)
+ rack->r_ctl.rc_prr_sndcnt = amm;
+ else
+ goto need_retran;
+ } else {
+ /* Set the send-new override */
+ if (out + amm <= tp->snd_wnd)
+ rack->r_ctl.rc_tlp_new_data = amm;
+ else
+ goto need_retran;
+ }
+ rack->r_ctl.rc_tlp_seg_send_cnt = 0;
+ rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
+ rack->r_ctl.rc_tlpsend = NULL;
+ counter_u64_add(rack_tlp_newdata, 1);
+ goto send;
+ }
+need_retran:
+ /*
+ * Ok we need to arrange the last un-acked segment to be re-sent, or
+ * optionally the first un-acked segment.
+ */
+ if (rack_always_send_oldest)
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ else {
+ rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
+ if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
+ rsm = rack_find_high_nonack(rack, rsm);
+ }
+ }
+ if (rsm == NULL) {
+ counter_u64_add(rack_tlp_does_nada, 1);
+#ifdef TCP_BLACKBOX
+ tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
+#endif
+ goto out;
+ }
+ if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
+ /*
+		 * We need to split this last segment in two.
+ */
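+		/*
+		 * For example, a 3000-byte rsm with t_maxseg = 1460 is split
+		 * so the new rsm covers only the final t_maxseg bytes, which
+		 * is what the TLP will (re)send.
+		 */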
+ int32_t idx;
+ struct rack_sendmap *nrsm;
+
+ nrsm = rack_alloc(rack);
+ if (nrsm == NULL) {
+ /*
+ * No memory to split, we will just exit and punt
+ * off to the RXT timer.
+ */
+ counter_u64_add(rack_tlp_does_nada, 1);
+ goto out;
+ }
+ nrsm->r_start = (rsm->r_end - tp->t_maxseg);
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ nrsm->r_sndcnt = rsm->r_sndcnt;
+ nrsm->r_rtr_bytes = 0;
+ rsm->r_end = nrsm->r_start;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ rsm = nrsm;
+ }
+ rack->r_ctl.rc_tlpsend = rsm;
+ rack->r_ctl.rc_tlp_rtx_out = 1;
+ if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
+ rack->r_ctl.rc_tlp_seg_send_cnt++;
+ tp->t_rxtshift++;
+ } else {
+ rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
+ rack->r_ctl.rc_tlp_seg_send_cnt = 1;
+ }
+send:
+ rack->r_ctl.rc_tlp_send_cnt++;
+ if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
+ /*
+		 * Can't [re]transmit a segment when we have not heard from
+		 * the peer in the maximum number of attempts. We need the
+		 * retransmit timer to take over.
+ */
+restore:
+ rack->r_ctl.rc_tlpsend = NULL;
+ if (rsm)
+ rsm->r_flags &= ~RACK_TLP;
+ rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
+ counter_u64_add(rack_tlp_retran_fail, 1);
+ goto out;
+ } else if (rsm) {
+ rsm->r_flags |= RACK_TLP;
+ }
+ if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
+ (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
+ /*
+ * We don't want to send a single segment more than the max
+ * either.
+ */
+ goto restore;
+ }
+ rack->r_timer_override = 1;
+ rack->r_tlp_running = 1;
+ rack->rc_tlp_in_progress = 1;
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
+ return (0);
+out:
+ rack->rc_timer_up = 0;
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
+ return (0);
+}
+
+/*
+ * Delayed ack timer: here we simply need to set the
+ * ACK_NOW flag and remove the DELACK flag. From there
+ * the output routine will send the ack out.
+ *
+ * We only return 1, saying don't proceed, if all timers
+ * are stopped (destroyed PCB?).
+ */
+static int
+rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ if (tp->t_timers->tt_flags & TT_STOPPED) {
+ return (1);
+ }
+ rack_log_to_event(rack, RACK_TO_FRM_DELACK);
+ tp->t_flags &= ~TF_DELACK;
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_delack);
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
+ return (0);
+}
+
+/*
+ * Persist timer: here we simply need to set the
+ * FORCE-DATA flag; the output routine will then send
+ * the one-byte probe.
+ *
+ * We only return 1, saying don't proceed, if all timers
+ * are stopped (destroyed PCB?).
+ */
+static int
+rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ struct inpcb *inp;
+ int32_t retval = 0;
+
+ inp = tp->t_inpcb;
+
+ if (tp->t_timers->tt_flags & TT_STOPPED) {
+ return (1);
+ }
+ if (rack->rc_in_persist == 0)
+ return (0);
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(inp, ETIMEDOUT);
+ return (1);
+ }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
+ /*
+ * Persistence timer into zero window. Force a byte to be output, if
+ * possible.
+ */
+ TCPSTAT_INC(tcps_persisttimeo);
+ /*
+ * Hack: if the peer is dead/unreachable, we do not time out if the
+ * window is closed. After a full backoff, drop the connection if
+ * the idle time (no responses to probes) reaches the maximum
+ * backoff that we would use if retransmitting.
+ */
+ if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+ (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
+ ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ TCPSTAT_INC(tcps_persistdrop);
+ retval = 1;
+ tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ goto out;
+ }
+ if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
+ tp->snd_una == tp->snd_max)
+ rack_exit_persist(tp, rack);
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
+ /*
+ * If the user has closed the socket then drop a persisting
+ * connection after a much reduced timeout.
+ */
+ if (tp->t_state > TCPS_CLOSE_WAIT &&
+ (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
+ retval = 1;
+ TCPSTAT_INC(tcps_persistdrop);
+ tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ goto out;
+ }
+ tp->t_flags |= TF_FORCEDATA;
+out:
+ rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
+ return (retval);
+}
+
+/*
+ * If a keepalive goes off, we had no other timers
+ * happening. We always return 1 here since this
+ * routine either drops the connection or sends
+ * out a probe segment via tcp_respond().
+ */
+static int
+rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ struct tcptemp *t_template;
+ struct inpcb *inp;
+
+ if (tp->t_timers->tt_flags & TT_STOPPED) {
+ return (1);
+ }
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
+ inp = tp->t_inpcb;
+ rack_log_to_event(rack, RACK_TO_FRM_KEEP);
+ /*
+ * Keep-alive timer went off; send something or drop connection if
+ * idle for too long.
+ */
+ TCPSTAT_INC(tcps_keeptimeo);
+ if (tp->t_state < TCPS_ESTABLISHED)
+ goto dropit;
+ if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ tp->t_state <= TCPS_CLOSING) {
+ if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
+ goto dropit;
+ /*
+ * Send a packet designed to force a response if the peer is
+ * up and reachable: either an ACK if the connection is
+ * still alive, or an RST if the peer has closed the
+ * connection due to timeout or reboot. Using sequence
+ * number tp->snd_una-1 causes the transmitted zero-length
+ * segment to lie outside the receive window; by the
+ * protocol spec, this requires the correspondent TCP to
+ * respond.
+ */
+ TCPSTAT_INC(tcps_keepprobe);
+ t_template = tcpip_maketemplate(inp);
+ if (t_template) {
+ tcp_respond(tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ tp->rcv_nxt, tp->snd_una - 1, 0);
+ free(t_template, M_TEMP);
+ }
+ }
+ rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
+ return (1);
+dropit:
+ TCPSTAT_INC(tcps_keepdrops);
+ tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ return (1);
+}
+
+/*
+ * Retransmit helper function: clear all the ack
+ * flags and take care of important bookkeeping.
+ */
+static void
+rack_remxt_tmr(struct tcpcb *tp)
+{
+ /*
+	 * The retransmit timer went off; all SACKed blocks must be
+	 * marked un-acked.
+ */
+ struct rack_sendmap *rsm, *trsm = NULL;
+ struct tcp_rack *rack;
+ int32_t cnt = 0;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
+ rack_log_to_event(rack, RACK_TO_FRM_TMR);
+ if (rack->r_state && (rack->r_state != tp->t_state))
+ rack_set_state(tp, rack);
+ /*
+ * Ideally we would like to be able to
+ * mark SACK-PASS on anything not acked here.
+ * However, if we do that we would burst out
+ * all that data 1ms apart. This would be unwise,
+ * so for now we will just let the normal rxt timer
+ * and tlp timer take care of it.
+ */
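+	/*
+	 * Walk the send map in sequence order: blocks marked ACKED are put
+	 * back on the time-ordered tmap (trsm tracks the insertion point so
+	 * sequence order is preserved), and the ACK/SACK related flags are
+	 * cleared on every block so everything is treated as outstanding
+	 * again.
+	 */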
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
+ if (rsm->r_flags & RACK_ACKED) {
+ cnt++;
+ rsm->r_sndcnt = 0;
+ if (rsm->r_in_tmap == 0) {
+ /* We must re-add it back to the tlist */
+ if (trsm == NULL) {
+ TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ } else {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
+ }
+ rsm->r_in_tmap = 1;
+ trsm = rsm;
+ }
+ }
+ rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
+ }
+ /* Clear the count (we just un-acked them) */
+ rack->r_ctl.rc_sacked = 0;
+ /* Clear the tlp rtx mark */
+ rack->r_ctl.rc_tlp_rtx_out = 0;
+ rack->r_ctl.rc_tlp_seg_send_cnt = 0;
+ rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ /* Setup so we send one segment */
+ if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_timer_override = 1;
+}
+
+/*
+ * Retransmit timeout! If we drop the PCB we will return 1, otherwise
+ * we will set up to retransmit the lowest outstanding sequence number.
+ */
+static int
+rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+{
+ int32_t rexmt;
+ struct inpcb *inp;
+ int32_t retval = 0;
+
+ inp = tp->t_inpcb;
+ if (tp->t_timers->tt_flags & TT_STOPPED) {
+ return (1);
+ }
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(inp, ETIMEDOUT);
+ return (1);
+ }
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_una == tp->snd_max)) {
+ /* Nothing outstanding .. nothing to do */
+ return (0);
+ }
+ /*
+ * Retransmission timer went off. Message has not been acked within
+ * retransmit interval. Back off to a longer retransmit interval
+ * and retransmit one segment.
+ */
+ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ tp->t_rxtshift = TCP_MAXRXTSHIFT;
+ TCPSTAT_INC(tcps_timeoutdrop);
+ retval = 1;
+ tcp_set_inp_to_drop(rack->rc_inp,
+ (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
+ goto out;
+ }
+ rack_remxt_tmr(tp);
+ if (tp->t_state == TCPS_SYN_SENT) {
+ /*
+ * If the SYN was retransmitted, indicate CWND to be limited
+ * to 1 segment in cc_conn_init().
+ */
+ tp->snd_cwnd = 1;
+ } else if (tp->t_rxtshift == 1) {
+ /*
+ * first retransmit; record ssthresh and cwnd so they can be
+ * recovered if this turns out to be a "bad" retransmit. A
+ * retransmit is considered "bad" if an ACK for this segment
+ * is received within RTT/2 interval; the assumption here is
+ * that the ACK was already in flight. See "On Estimating
+ * End-to-End Network Path Properties" by Allman and Paxson
+ * for more details.
+ */
+ tp->snd_cwnd_prev = tp->snd_cwnd;
+ tp->snd_ssthresh_prev = tp->snd_ssthresh;
+ tp->snd_recover_prev = tp->snd_recover;
+ if (IN_FASTRECOVERY(tp->t_flags))
+ tp->t_flags |= TF_WASFRECOVERY;
+ else
+ tp->t_flags &= ~TF_WASFRECOVERY;
+ if (IN_CONGRECOVERY(tp->t_flags))
+ tp->t_flags |= TF_WASCRECOVERY;
+ else
+ tp->t_flags &= ~TF_WASCRECOVERY;
+ tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ tp->t_flags |= TF_PREVVALID;
+ } else
+ tp->t_flags &= ~TF_PREVVALID;
+ TCPSTAT_INC(tcps_rexmttimeo);
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ TCPT_RANGESET(tp->t_rxtcur, rexmt,
+ max(MSEC_2_TICKS(rack_rto_min), rexmt),
+ MSEC_2_TICKS(rack_rto_max));
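+	/*
+	 * t_rxtcur is now the backed-off RTO, held to at least rack_rto_min
+	 * and at most rack_rto_max (both converted from ms to ticks).
+	 */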
+ /*
+	 * We enter the path for PLMTUD if the connection is established or
+	 * in FIN_WAIT_1 state; the reason for the latter is that if the
+	 * amount of data we send is very small, we could send it in a couple
+	 * of packets and proceed straight to FIN. In that case we won't
+	 * catch the ESTABLISHED state.
+ */
+ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
+ || (tp->t_state == TCPS_FIN_WAIT_1))) {
+#ifdef INET6
+ int32_t isipv6;
+#endif
+
+ /*
+		 * The idea here is that each stage of the MTU probe (usually
+		 * 1448 -> 1188 -> 524) should be given 2 chances to recover
+		 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
+		 * should take care of that.
+ */
+ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
+ (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
+ (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
+ tp->t_rxtshift % 2 == 0)) {
+ /*
+ * Enter Path MTU Black-hole Detection mechanism: -
+ * Disable Path MTU Discovery (IP "DF" bit). -
+ * Reduce MTU to lower value than what we negotiated
+ * with peer.
+ */
+ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
+ /* Record that we may have found a black hole. */
+ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
+ /* Keep track of previous MSS. */
+ tp->t_pmtud_saved_maxseg = tp->t_maxseg;
+ }
+
+ /*
+ * Reduce the MSS to blackhole value or to the
+ * default in an attempt to retransmit.
+ */
+#ifdef INET6
+ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
+ if (isipv6 &&
+ tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated);
+ } else if (isipv6) {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_v6mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch
+ * to minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated);
+ } else {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch
+ * to minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
+ }
+#endif
+ } else {
+ /*
+ * If further retransmissions are still unsuccessful
+ * with a lowered MTU, maybe this isn't a blackhole
+ * and we restore the previous MSS and blackhole
+ * detection flags. The limit '6' is determined by
+ * giving each probe stage (1448, 1188, 524) 2
+ * chances to recover.
+ */
+ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
+ (tp->t_rxtshift >= 6)) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
+ tp->t_maxseg = tp->t_pmtud_saved_maxseg;
+ TCPSTAT_INC(tcps_pmtud_blackhole_failed);
+ }
+ }
+ }
+ /*
+ * Disable RFC1323 and SACK if we haven't got any response to our
+ * third SYN to work-around some broken terminal servers (most of
+ * which have hopefully been retired) that have bad VJ header
+ * compression code which trashes TCP segments containing
+ * unknown-to-them TCP options.
+ */
+ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
+ (tp->t_rxtshift == 3))
+ tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
+ /*
+ * If we backed off this far, our srtt estimate is probably bogus.
+ * Clobber it so we'll take the next rtt measurement as our srtt;
+ * move the current srtt into rttvar to keep the current retransmit
+ * times until then.
+ */
+ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
+#ifdef INET6
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
+ in6_losing(tp->t_inpcb);
+ else
+#endif
+ in_losing(tp->t_inpcb);
+ tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
+ tp->t_srtt = 0;
+ }
+ if (rack_use_sack_filter)
+ sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
+ tp->snd_recover = tp->snd_max;
+ tp->t_flags |= TF_ACKNOW;
+ tp->t_rtttime = 0;
+ rack_cong_signal(tp, NULL, CC_RTO);
+out:
+ return (retval);
+}
+
+static int
+rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
+{
+ int32_t ret = 0;
+ int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
+
+ if (timers == 0) {
+ return (0);
+ }
+ if (tp->t_state == TCPS_LISTEN) {
+ /* no timers on listen sockets */
+ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
+ return (0);
+ return (1);
+ }
+ if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
+ uint32_t left;
+
+ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ ret = -1;
+ rack_log_to_processing(rack, cts, ret, 0);
+ return (0);
+ }
+ if (hpts_calling == 0) {
+ ret = -2;
+ rack_log_to_processing(rack, cts, ret, 0);
+ return (0);
+ }
+ /*
+		 * Ok, our timer went off early and we are not being paced;
+		 * false alarm, go back to sleep.
+ */
+ ret = -3;
+ left = rack->r_ctl.rc_timer_exp - cts;
+ tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
+ rack_log_to_processing(rack, cts, ret, left);
+ rack->rc_last_pto_set = 0;
+ return (1);
+ }
+ rack->rc_tmr_stopped = 0;
+ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
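+	/*
+	 * Only one timer fires per call; the dispatch order below gives
+	 * delayed-ack priority, then RACK, TLP, RXT, persist and keepalive.
+	 */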
+ if (timers & PACE_TMR_DELACK) {
+ ret = rack_timeout_delack(tp, rack, cts);
+ } else if (timers & PACE_TMR_RACK) {
+ ret = rack_timeout_rack(tp, rack, cts);
+ } else if (timers & PACE_TMR_TLP) {
+ ret = rack_timeout_tlp(tp, rack, cts);
+ } else if (timers & PACE_TMR_RXT) {
+ ret = rack_timeout_rxt(tp, rack, cts);
+ } else if (timers & PACE_TMR_PERSIT) {
+ ret = rack_timeout_persist(tp, rack, cts);
+ } else if (timers & PACE_TMR_KEEP) {
+ ret = rack_timeout_keepalive(tp, rack, cts);
+ }
+ rack_log_to_processing(rack, cts, ret, timers);
+ return (ret);
+}
+
+static void
+rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
+{
+ uint8_t hpts_removed = 0;
+
+ if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
+ TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
+ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ hpts_removed = 1;
+ }
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
+ if (rack->rc_inp->inp_in_hpts &&
+ ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
+ /*
+			 * Canceling timers when we have no output being
+			 * paced. We must also remove ourselves from the
+			 * hpts.
+ */
+ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ hpts_removed = 1;
+ }
+ rack_log_to_cancel(rack, hpts_removed, line);
+ rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
+ }
+}
+
+static void
+rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+{
+ return;
+}
+
+static int
+rack_stopall(struct tcpcb *tp)
+{
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack->t_timers_stopped = 1;
+ return (0);
+}
+
+static void
+rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
+{
+ return;
+}
+
+static int
+rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
+{
+ return (0);
+}
+
+static void
+rack_stop_all_timers(struct tcpcb *tp)
+{
+ struct tcp_rack *rack;
+
+ /*
+	 * Ensure no timers are running.
+ */
+ if (tcp_timer_active(tp, TT_PERSIST)) {
+ /* We enter in persists, set the flag appropriately */
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack->rc_in_persist = 1;
+ }
+ tcp_timer_suspend(tp, TT_PERSIST);
+ tcp_timer_suspend(tp, TT_REXMT);
+ tcp_timer_suspend(tp, TT_KEEP);
+ tcp_timer_suspend(tp, TT_DELACK);
+}
+
+static void
+rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, uint32_t ts)
+{
+ int32_t idx;
+
+ rsm->r_rtr_cnt++;
+ rsm->r_sndcnt++;
+ if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
+ rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
+ rsm->r_flags |= RACK_OVERMAX;
+ }
+ if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
+ rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
+ rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
+ }
+ idx = rsm->r_rtr_cnt - 1;
+ rsm->r_tim_lastsent[idx] = ts;
+ if (rsm->r_flags & RACK_ACKED) {
+		/* Probably MTU discovery messing with us */
+ rsm->r_flags &= ~RACK_ACKED;
+ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+ }
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ }
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ if (rsm->r_flags & RACK_SACK_PASSED) {
+ /* We have retransmitted due to the SACK pass */
+ rsm->r_flags &= ~RACK_SACK_PASSED;
+ rsm->r_flags |= RACK_WAS_SACKPASS;
+ }
+ /* Update memory for next rtr */
+ rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
+}
+
+
+static uint32_t
+rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
+{
+ /*
+ * We (re-)transmitted starting at rsm->r_start for some length
+	 * (possibly less than r_end).
+ */
+ struct rack_sendmap *nrsm;
+ uint32_t c_end;
+ int32_t len;
+ int32_t idx;
+
+ len = *lenp;
+ c_end = rsm->r_start + len;
+ if (SEQ_GEQ(c_end, rsm->r_end)) {
+ /*
+		 * We retransmitted the whole piece, or more than the whole
+		 * piece, slopping over into the next rsm.
+ */
+ rack_update_rsm(tp, rack, rsm, ts);
+ if (c_end == rsm->r_end) {
+ *lenp = 0;
+ return (0);
+ } else {
+ int32_t act_len;
+
+			/* Hangs over the end, return what's left */
+ act_len = rsm->r_end - rsm->r_start;
+ *lenp = (len - act_len);
+ return (rsm->r_end);
+ }
+ /* We don't get out of this block. */
+ }
+ /*
+ * Here we retransmitted less than the whole thing which means we
+ * have to split this into what was transmitted and what was not.
+ */
+ nrsm = rack_alloc(rack);
+ if (nrsm == NULL) {
+ /*
+ * We can't get memory, so lets not proceed.
+ */
+ *lenp = 0;
+ return (0);
+ }
+ /*
+ * So here we are going to take the original rsm and make it what we
+ * retransmitted. nrsm will be the tail portion we did not
+ * retransmit. For example say the chunk was 1, 11 (10 bytes). And
+ * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
+ * 1, 6 and the new piece will be 6, 11.
+ */
+ nrsm->r_start = c_end;
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ nrsm->r_sndcnt = rsm->r_sndcnt;
+ nrsm->r_rtr_bytes = 0;
+ rsm->r_end = c_end;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ rack_update_rsm(tp, rack, rsm, ts);
+ *lenp = 0;
+ return (0);
+}
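
To make the split above concrete: send-map entries describe half-open sequence ranges, so a partial retransmit simply cuts [r_start, r_end) at r_start + len. A minimal userland sketch of that arithmetic (a simplified stand-in, not the kernel's rack_sendmap):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for a send-map entry: covers [r_start, r_end). */
struct seg {
	uint32_t r_start;
	uint32_t r_end;
};

/*
 * Split 'orig' after 'sent' bytes were (re)transmitted from its start:
 * 'orig' shrinks to the part just sent, 'rest' becomes the untouched tail.
 */
static void
split_after_send(struct seg *orig, struct seg *rest, uint32_t sent)
{
	uint32_t c_end = orig->r_start + sent;

	assert(c_end < orig->r_end);		/* partial retransmit only */
	rest->r_start = c_end;
	rest->r_end = orig->r_end;
	orig->r_end = c_end;
}

int
main(void)
{
	struct seg orig = { 1, 11 };		/* 10 bytes: sequences 1..10 */
	struct seg rest;

	split_after_send(&orig, &rest, 5);
	printf("retransmitted [%u,%u)  still outstanding [%u,%u)\n",
	    orig.r_start, orig.r_end, rest.r_start, rest.r_end);
	/* prints: retransmitted [1,6)  still outstanding [6,11) */
	return (0);
}
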
+
+
+static void
+rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
+ uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
+ uint8_t pass, struct rack_sendmap *hintrsm)
+{
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm, *nrsm;
+ register uint32_t snd_max, snd_una;
+ int32_t idx;
+
+ /*
+ * Add to the RACK log of packets in flight or retransmitted. If
+ * there is a TS option we will use the TS echoed, if not we will
+ * grab a TS.
+ *
+ * Retransmissions will increment the count and move the ts to its
+ * proper place. Note that if options do not include TS's then we
+ * won't be able to effectively use the ACK for an RTT on a retran.
+ *
+ * Notes about r_start and r_end. Lets consider a send starting at
+ * sequence 1 for 10 bytes. In such an example the r_start would be
+ * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
+ * This means that r_end is actually the first sequence for the next
+ * slot (11).
+ *
+ */
+ /*
+ * If err is set what do we do XXXrrs? should we not add the thing?
+ * -- i.e. return if err != 0 or should we pretend we sent it? --
+ * i.e. proceed with add ** do this for now.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (err)
+ /*
+ * We don't log errors -- we could but snd_max does not
+ * advance in this case either.
+ */
+ return;
+
+ if (th_flags & TH_RST) {
+ /*
+ * We don't log resets and we return immediately from
+ * sending
+ */
+ return;
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ snd_una = tp->snd_una;
+ if (SEQ_LEQ((seq_out + len), snd_una)) {
+		/* Are we sending an old segment to induce an ack (keep-alive)? */
+ return;
+ }
+ if (SEQ_LT(seq_out, snd_una)) {
+ /* huh? should we panic? */
+ uint32_t end;
+
+ end = seq_out + len;
+ seq_out = snd_una;
+ len = end - seq_out;
+ }
+ snd_max = tp->snd_max;
+ if (th_flags & (TH_SYN | TH_FIN)) {
+ /*
+		 * The call to rack_log_output is made before bumping
+		 * snd_max. This means we must account for the extra byte of
+		 * sequence space a SYN or FIN consumes when one is present
+		 * (and we are not resending).
+ */
+ if (th_flags & TH_SYN)
+ len++;
+ if (th_flags & TH_FIN)
+ len++;
+ if (SEQ_LT(snd_max, tp->snd_nxt)) {
+ /*
+			 * The add/update has not been done for the FIN/SYN
+ * yet.
+ */
+ snd_max = tp->snd_nxt;
+ }
+ }
+ if (len == 0) {
+ /* We don't log zero window probes */
+ return;
+ }
+ rack->r_ctl.rc_time_last_sent = ts;
+ if (IN_RECOVERY(tp->t_flags)) {
+ rack->r_ctl.rc_prr_out += len;
+ }
+	/* First question: is it a retransmission? */
+ if (seq_out == snd_max) {
+again:
+ rsm = rack_alloc(rack);
+ if (rsm == NULL) {
+ /*
+ * Hmm out of memory and the tcb got destroyed while
+ * we tried to wait.
+ */
+#ifdef INVARIANTS
+ panic("Out of memory when we should not be rack:%p", rack);
+#endif
+ return;
+ }
+ if (th_flags & TH_FIN) {
+ rsm->r_flags = RACK_HAS_FIN;
+ } else {
+ rsm->r_flags = 0;
+ }
+ rsm->r_tim_lastsent[0] = ts;
+ rsm->r_rtr_cnt = 1;
+ rsm->r_rtr_bytes = 0;
+ rsm->r_start = seq_out;
+ rsm->r_end = rsm->r_start + len;
+ rsm->r_sndcnt = 0;
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ return;
+ }
+ /*
+	 * If we reach here it is a retransmission and we need to find it.
+ */
+more:
+ if (hintrsm && (hintrsm->r_start == seq_out)) {
+ rsm = hintrsm;
+ hintrsm = NULL;
+ } else if (rack->r_ctl.rc_next) {
+ /* We have a hint from a previous run */
+ rsm = rack->r_ctl.rc_next;
+ } else {
+ /* No hints sorry */
+ rsm = NULL;
+ }
+ if ((rsm) && (rsm->r_start == seq_out)) {
+ /*
+ * We used rc_next or hintrsm to retransmit, hopefully the
+ * likely case.
+ */
+ seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
+ if (len == 0) {
+ return;
+ } else {
+ goto more;
+ }
+ }
+	/* Ok it was not the last pointer, go through it the hard way. */
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
+ if (rsm->r_start == seq_out) {
+ seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
+ rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
+ if (len == 0) {
+ return;
+ } else {
+ continue;
+ }
+ }
+ if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
+ /* Transmitted within this piece */
+ /*
+ * Ok we must split off the front and then let the
+ * update do the rest
+ */
+ nrsm = rack_alloc(rack);
+ if (nrsm == NULL) {
+#ifdef INVARIANTS
+ panic("Ran out of memory that was preallocated? rack:%p", rack);
+#endif
+ rack_update_rsm(tp, rack, rsm, ts);
+ return;
+ }
+ /*
+ * copy rsm to nrsm and then trim the front of rsm
+ * to not include this part.
+ */
+ nrsm->r_start = seq_out;
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ nrsm->r_sndcnt = rsm->r_sndcnt;
+ nrsm->r_rtr_bytes = 0;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+ rsm->r_end = nrsm->r_start;
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
+ if (len == 0) {
+ return;
+ }
+ }
+ }
+ /*
+	 * Hmm, not found in the map; did they retransmit both old data and
+	 * on into new data?
+ */
+ if (seq_out == tp->snd_max) {
+ goto again;
+ } else if (SEQ_LT(seq_out, tp->snd_max)) {
+#ifdef INVARIANTS
+ printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
+ seq_out, len, tp->snd_una, tp->snd_max);
+ printf("Starting Dump of all rack entries\n");
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
+ printf("rsm:%p start:%u end:%u\n",
+ rsm, rsm->r_start, rsm->r_end);
+ }
+ printf("Dump complete\n");
+ panic("seq_out not found rack:%p tp:%p",
+ rack, tp);
+#endif
+ } else {
+#ifdef INVARIANTS
+ /*
+ * Hmm beyond sndmax? (only if we are using the new rtt-pack
+ * flag)
+ */
+ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
+ seq_out, len, tp->snd_max, tp);
+#endif
+ }
+}
+
+/*
+ * Record one of the RTT updates from an ack into
+ * our sample structure.
+ */
+static void
+tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
+{
+ if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
+ (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
+ rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
+ }
+ if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
+ (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
+ rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
+ }
+ rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
+ rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
+ rack->r_ctl.rack_rs.rs_rtt_cnt++;
+}
+
+/*
+ * Collect new round-trip time estimate
+ * and update averages and current timeout.
+ */
+static void
+tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
+{
+ int32_t delta;
+ uint32_t o_srtt, o_var;
+ int32_t rtt;
+
+ if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
+ /* No valid sample */
+ return;
+ if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
+ /* We are to use the lowest RTT seen in a single ack */
+ rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
+ } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
+ /* We are to use the highest RTT seen in a single ack */
+ rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
+ } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
+ /* We are to use the average RTT seen in a single ack */
+ rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
+ (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
+ } else {
+#ifdef INVARIANTS
+ panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
+#endif
+ return;
+ }
+ if (rtt == 0)
+ rtt = 1;
+ rack_log_rtt_sample(rack, rtt);
+ o_srtt = tp->t_srtt;
+ o_var = tp->t_rttvar;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (tp->t_srtt != 0) {
+ /*
+ * srtt is stored as fixed point with 5 bits after the
+ * binary point (i.e., scaled by 8). The following magic is
+ * equivalent to the smoothing algorithm in rfc793 with an
+ * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
+ * Adjust rtt to origin 0.
+ */
+ delta = ((rtt - 1) << TCP_DELTA_SHIFT)
+ - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+
+ tp->t_srtt += delta;
+ if (tp->t_srtt <= 0)
+ tp->t_srtt = 1;
+
+ /*
+ * We accumulate a smoothed rtt variance (actually, a
+ * smoothed mean difference), then set the retransmit timer
+ * to smoothed rtt + 4 times the smoothed variance. rttvar
+ * is stored as fixed point with 4 bits after the binary
+ * point (scaled by 16). The following is equivalent to
+ * rfc793 smoothing with an alpha of .75 (rttvar =
+ * rttvar*3/4 + |delta| / 4). This replaces rfc793's
+ * wired-in beta.
+ */
+ if (delta < 0)
+ delta = -delta;
+ delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
+ tp->t_rttvar += delta;
+ if (tp->t_rttvar <= 0)
+ tp->t_rttvar = 1;
+ if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+ } else {
+ /*
+ * No rtt measurement yet - use the unsmoothed rtt. Set the
+ * variance to half the rtt (so our first retransmit happens
+ * at 3*rtt).
+ */
+ tp->t_srtt = rtt << TCP_RTT_SHIFT;
+ tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+ }
+ TCPSTAT_INC(tcps_rttupdated);
+ rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
+ tp->t_rttupdated++;
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
+#endif
+ tp->t_rxtshift = 0;
+
+ /*
+ * the retransmit should happen at rtt + 4 * rttvar. Because of the
+ * way we do the smoothing, srtt and rttvar will each average +1/2
+ * tick of bias. When we compute the retransmit timer, we want 1/2
+ * tick of rounding and 1 extra tick because of +-1/2 tick
+ * uncertainty in the firing of the timer. The bias will give us
+ * exactly the 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below the minimum
+ * feasible timer (which is 2 ticks).
+ */
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+ max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
+ tp->t_softerror = 0;
+}
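
The smoothing above is the classic scaled-integer EWMA from the base stack: srtt and rttvar carry extra fraction bits so the alpha of 7/8 and beta of 3/4 reduce to shifts and adds. A minimal userland sketch of the same update (the shift values mirror TCP_RTT_SHIFT/TCP_RTTVAR_SHIFT/TCP_DELTA_SHIFT; treat it as illustrative, not the kernel path):

#include <stdio.h>

/* Same scaling the stack uses: srtt scaled by 32, rttvar by 16. */
#define RTT_SHIFT	5
#define RTTVAR_SHIFT	4
#define DELTA_SHIFT	2

/* One smoothing step; rtt is in ticks, srtt/rttvar are kept scaled. */
static void
rtt_sample(int *srtt, int *rttvar, int rtt)
{
	int delta;

	if (*srtt != 0) {
		delta = ((rtt - 1) << DELTA_SHIFT)
		    - (*srtt >> (RTT_SHIFT - DELTA_SHIFT));
		*srtt += delta;			/* srtt = 7/8 srtt + 1/8 rtt */
		if (*srtt <= 0)
			*srtt = 1;
		if (delta < 0)
			delta = -delta;
		delta -= *rttvar >> (RTTVAR_SHIFT - DELTA_SHIFT);
		*rttvar += delta;		/* var = 3/4 var + 1/4 |err| */
		if (*rttvar <= 0)
			*rttvar = 1;
	} else {
		/* First sample: var = rtt/2 so the first RTO is ~3*rtt. */
		*srtt = rtt << RTT_SHIFT;
		*rttvar = rtt << (RTTVAR_SHIFT - 1);
	}
}

int
main(void)
{
	int srtt = 0, rttvar = 0, rto;
	int samples[] = { 10, 12, 9, 40, 11 };

	for (int i = 0; i < 5; i++) {
		rtt_sample(&srtt, &rttvar, samples[i]);
		/* rto ~ srtt + 4 * rttvar once the scaling is undone. */
		rto = ((srtt >> (RTT_SHIFT - DELTA_SHIFT)) + rttvar)
		    >> DELTA_SHIFT;
		printf("rtt=%2d  srtt=%4d rttvar=%3d  rto~%d ticks\n",
		    samples[i], srtt, rttvar, rto);
	}
	return (0);
}
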
+
+static void
+rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
+ uint32_t t, uint32_t cts)
+{
+ /*
+ * For this RSM, we acknowledged the data from a previous
+ * transmission, not the last one we made. This means we did a false
+ * retransmit.
+ */
+ struct tcp_rack *rack;
+
+ if (rsm->r_flags & RACK_HAS_FIN) {
+ /*
+		 * The FIN is often sent multiple times when we have
+		 * everything outstanding ack'd. We ignore this case
+		 * since it is over now.
+ */
+ return;
+ }
+ if (rsm->r_flags & RACK_TLP) {
+ /*
+ * We expect TLP's to have this occur.
+ */
+ return;
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ /* should we undo cc changes and exit recovery? */
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
+ /*
+			 * Undo what we ratcheted down and exit recovery if
+ * possible
+ */
+ EXIT_RECOVERY(tp->t_flags);
+ tp->snd_recover = tp->snd_una;
+ if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
+ tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
+ if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
+ tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
+ }
+ }
+ if (rsm->r_flags & RACK_WAS_SACKPASS) {
+ /*
+		 * We retransmitted based on a sack and the ack was for an
+		 * earlier transmission - reordering is occurring.
+ */
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ counter_u64_add(rack_badfr, 1);
+ counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
+}
+
+
+static int
+rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
+{
+ int32_t i;
+ uint32_t t;
+
+ if (rsm->r_flags & RACK_ACKED)
+ /* Already done */
+ return (0);
+
+
+ if ((rsm->r_rtr_cnt == 1) ||
+ ((ack_type == CUM_ACKED) &&
+ (to->to_flags & TOF_TS) &&
+ (to->to_tsecr) &&
+ (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
+ ) {
+ /*
+		 * We will only find a matching timestamp if it is cum-acked.
+		 * But if there was only one transmission it is for-sure
+		 * matching :-)
+ */
+ t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ if ((int)t <= 0)
+ t = 1;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ if (!rack->r_ctl.rc_rack_min_rtt ||
+ SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
+ rack->r_ctl.rc_rack_min_rtt = t;
+ if (rack->r_ctl.rc_rack_min_rtt == 0) {
+ rack->r_ctl.rc_rack_min_rtt = 1;
+ }
+ }
+ tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
+ if ((rsm->r_flags & RACK_TLP) &&
+ (!IN_RECOVERY(tp->t_flags))) {
+ /* Segment was a TLP and our retrans matched */
+ if (rack->r_ctl.rc_tlp_cwnd_reduce) {
+ rack->r_ctl.rc_rsm_start = tp->snd_max;
+ rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
+ rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
+ rack_cong_signal(tp, NULL, CC_NDUPACK);
+ /*
+ * When we enter recovery we need to assure
+ * we send one packet.
+ */
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ } else
+ rack->r_ctl.rc_tlp_rtx_out = 0;
+ }
+ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
+ /* New more recent rack_tmit_time */
+ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ rack->rc_rack_rtt = t;
+ }
+ return (1);
+ }
+ /*
+ * We clear the soft/rxtshift since we got an ack.
+ * There is no assurance we will call the commit() function
+ * so we need to clear these to avoid incorrect handling.
+ */
+ tp->t_rxtshift = 0;
+ tp->t_softerror = 0;
+ if ((to->to_flags & TOF_TS) &&
+ (ack_type == CUM_ACKED) &&
+ (to->to_tsecr) &&
+ ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
+ /*
+ * Now which timestamp does it match? In this block the ACK
+ * must be coming from a previous transmission.
+ */
+ for (i = 0; i < rsm->r_rtr_cnt; i++) {
+ if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
+ t = cts - rsm->r_tim_lastsent[i];
+ if ((int)t <= 0)
+ t = 1;
+ if ((i + 1) < rsm->r_rtr_cnt) {
+ /* Likely */
+ rack_earlier_retran(tp, rsm, t, cts);
+ }
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
+ rack->r_ctl.rc_rack_min_rtt = t;
+ if (rack->r_ctl.rc_rack_min_rtt == 0) {
+ rack->r_ctl.rc_rack_min_rtt = 1;
+ }
+ }
+ /*
+ * Note the following calls to
+ * tcp_rack_xmit_timer() are being commented
+ * out for now. They give us no more accuracy
+ * and often lead to a wrong choice. We have
+ * enough samples that have not been
+ * retransmitted. I leave the commented out
+ * code in here in case in the future we
+				 * decide to add it back (though I can't foresee
+ * doing that). That way we will easily see
+ * where they need to be placed.
+ */
+ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
+ rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
+ /* New more recent rack_tmit_time */
+ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ rack->rc_rack_rtt = t;
+ }
+ return (1);
+ }
+ }
+ goto ts_not_found;
+ } else {
+ /*
+		 * Ok, it is a SACK block that we retransmitted, or a Windows
+		 * machine without timestamps. We can tell nothing from the
+		 * timestamp since it is not there, or it only reflects the
+		 * time the peer last received a segment that moved its
+		 * cum-ack point forward.
+ */
+ts_not_found:
+ i = rsm->r_rtr_cnt - 1;
+ t = cts - rsm->r_tim_lastsent[i];
+ if ((int)t <= 0)
+ t = 1;
+ if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
+ /*
+ * We retransmitted and the ack came back in less
+ * than the smallest rtt we have observed. We most
+			 * likely did an improper retransmit as outlined in
+ * 4.2 Step 3 point 2 in the rack-draft.
+ */
+ i = rsm->r_rtr_cnt - 2;
+ t = cts - rsm->r_tim_lastsent[i];
+ rack_earlier_retran(tp, rsm, t, cts);
+ } else if (rack->r_ctl.rc_rack_min_rtt) {
+ /*
+ * We retransmitted it and the retransmit did the
+ * job.
+ */
+ if (!rack->r_ctl.rc_rack_min_rtt ||
+ SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
+ rack->r_ctl.rc_rack_min_rtt = t;
+ if (rack->r_ctl.rc_rack_min_rtt == 0) {
+ rack->r_ctl.rc_rack_min_rtt = 1;
+ }
+ }
+ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
+ /* New more recent rack_tmit_time */
+ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
+ rack->rc_rack_rtt = t;
+ }
+ return (1);
+ }
+ }
+ return (0);
+}
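
The "improper retransmit" test in the block above boils down to one comparison: if the ACK for a retransmitted block arrives sooner after the retransmission than the smallest RTT ever observed on the connection, the ACK must belong to an earlier transmission. A tiny illustrative check of that rule (the names are hypothetical, not the kernel API):

#include <stdint.h>
#include <stdio.h>

/*
 * Spurious-retransmit check: an ACK that arrives less than one minimum
 * RTT after a retransmission cannot have been triggered by it.
 */
static int
ack_was_for_earlier_send(uint32_t now, uint32_t last_rxmit_ts,
    uint32_t min_rtt)
{
	return (min_rtt != 0 && (now - last_rxmit_ts) < min_rtt);
}

int
main(void)
{
	uint32_t min_rtt = 40;		/* ticks */

	/* Retransmitted at t=1000, ACK at t=1010: 10 < 40 -> spurious. */
	printf("%d\n", ack_was_for_earlier_send(1010, 1000, min_rtt));
	/* Retransmitted at t=1000, ACK at t=1055: 55 >= 40 -> genuine. */
	printf("%d\n", ack_was_for_earlier_send(1055, 1000, min_rtt));
	return (0);
}
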
+
+/*
+ * Mark the SACK_PASSED flag on all entries sent earlier than rsm.
+ */
+static void
+rack_log_sack_passed(struct tcpcb *tp,
+ struct tcp_rack *rack, struct rack_sendmap *rsm)
+{
+ struct rack_sendmap *nrsm;
+ uint32_t ts;
+ int32_t idx;
+
+ idx = rsm->r_rtr_cnt - 1;
+ ts = rsm->r_tim_lastsent[idx];
+ nrsm = rsm;
+ TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
+ rack_head, r_tnext) {
+ if (nrsm == rsm) {
+			/* Skip the original segment, it is acked */
+ continue;
+ }
+ if (nrsm->r_flags & RACK_ACKED) {
+ /* Skip ack'd segments */
+ continue;
+ }
+ idx = nrsm->r_rtr_cnt - 1;
+ if (ts == nrsm->r_tim_lastsent[idx]) {
+ /*
+ * For this case lets use seq no, if we sent in a
+ * big block (TSO) we would have a bunch of segments
+ * sent at the same time.
+ *
+ * We would only get a report if its SEQ is earlier.
+ * If we have done multiple retransmits the times
+ * would not be equal.
+ */
+ if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
+ nrsm->r_flags |= RACK_SACK_PASSED;
+ nrsm->r_flags &= ~RACK_WAS_SACKPASS;
+ }
+ } else {
+ /*
+ * Here they were sent at different times, not a big
+ * block. Since we transmitted this one later and
+ * see it sack'd then this must also be missing (or
+ * we would have gotten a sack block for it)
+ */
+ nrsm->r_flags |= RACK_SACK_PASSED;
+ nrsm->r_flags &= ~RACK_WAS_SACKPASS;
+ }
+ }
+}
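
The walk above encodes RACK's core inference: any block transmitted before a block the peer just SACKed (by send time, or by sequence when the times are equal because of a TSO burst) is presumed to have been passed over. A minimal userland sketch of that rule, using illustrative names rather than the kernel's time-ordered tmap:

#include <stdint.h>
#include <stdio.h>

#define N 4

/* Illustrative send-map entry: send time, start sequence, flags. */
struct ent {
	uint32_t sent_ts;	/* last transmit time */
	uint32_t start;		/* first sequence of the block */
	int	 acked;		/* already SACKed */
	int	 sack_passed;	/* a later send was SACKed past it */
};

/* Mark everything sent before 'sacked' (or earlier in seq at the same ts). */
static void
mark_sack_passed(struct ent *map, int n, const struct ent *sacked)
{
	for (int i = 0; i < n; i++) {
		struct ent *e = &map[i];

		if (e == sacked || e->acked)
			continue;
		if (e->sent_ts < sacked->sent_ts ||
		    (e->sent_ts == sacked->sent_ts && e->start < sacked->start))
			e->sack_passed = 1;
	}
}

int
main(void)
{
	struct ent map[N] = {
		{ 100, 1000, 0, 0 },
		{ 100, 2000, 0, 0 },	/* same TSO burst, earlier sequence */
		{ 105, 3000, 0, 0 },	/* this one gets SACKed */
		{ 110, 4000, 0, 0 },	/* sent later: not passed */
	};

	map[2].acked = 1;
	mark_sack_passed(map, N, &map[2]);
	for (int i = 0; i < N; i++)
		printf("seq %u sack_passed=%d\n", map[i].start,
		    map[i].sack_passed);
	return (0);
}
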
+
+static uint32_t
+rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
+ struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
+{
+ int32_t idx;
+ int32_t times = 0;
+ uint32_t start, end, changed = 0;
+ struct rack_sendmap *rsm, *nrsm;
+ int32_t used_ref = 1;
+
+ start = sack->start;
+ end = sack->end;
+ rsm = *prsm;
+ if (rsm && SEQ_LT(start, rsm->r_start)) {
+ TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
+ if (SEQ_GEQ(start, rsm->r_start) &&
+ SEQ_LT(start, rsm->r_end)) {
+ goto do_rest_ofb;
+ }
+ }
+ }
+ if (rsm == NULL) {
+start_at_beginning:
+ rsm = NULL;
+ used_ref = 0;
+ }
+ /* First lets locate the block where this guy is */
+ TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
+ if (SEQ_GEQ(start, rsm->r_start) &&
+ SEQ_LT(start, rsm->r_end)) {
+ break;
+ }
+ }
+do_rest_ofb:
+ if (rsm == NULL) {
+ /*
+ * This happens when we get duplicate sack blocks with the
+		 * same end. For example SACK 4: 100, SACK 3: 100. The sort
+		 * will not change their location, so we would just start at
+ * the end of the first one and get lost.
+ */
+ if (tp->t_flags & TF_SENTFIN) {
+ /*
+ * Check to see if we have not logged the FIN that
+ * went out.
+ */
+ nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
+ if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
+ /*
+ * Ok we did not get the FIN logged.
+ */
+ nrsm->r_end++;
+ rsm = nrsm;
+ goto do_rest_ofb;
+ }
+ }
+ if (times == 1) {
+#ifdef INVARIANTS
+ panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
+ tp, rack, sack, to, prsm);
+#else
+ goto out;
+#endif
+ }
+ times++;
+ counter_u64_add(rack_sack_proc_restart, 1);
+ goto start_at_beginning;
+ }
+ /* Ok we have an ACK for some piece of rsm */
+ if (rsm->r_start != start) {
+ /*
+		 * Need to split this into two pieces: the before and after.
+ */
+ nrsm = rack_alloc(rack);
+ if (nrsm == NULL) {
+ /*
+			 * failed XXXrrs what can we do but lose the sack
+ * info?
+ */
+ goto out;
+ }
+ nrsm->r_start = start;
+ nrsm->r_rtr_bytes = 0;
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ nrsm->r_sndcnt = rsm->r_sndcnt;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+ rsm->r_end = nrsm->r_start;
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ rsm = nrsm;
+ }
+ if (SEQ_GEQ(end, rsm->r_end)) {
+ /*
+ * The end of this block is either beyond this guy or right
+ * at this guy.
+ */
+
+ if ((rsm->r_flags & RACK_ACKED) == 0) {
+ rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
+ changed += (rsm->r_end - rsm->r_start);
+ rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+ rack_log_sack_passed(tp, rack, rsm);
+			/* Is reordering occurring? */
+ if (rsm->r_flags & RACK_SACK_PASSED) {
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ rsm->r_flags |= RACK_ACKED;
+ rsm->r_flags &= ~RACK_TLP;
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ }
+ if (end == rsm->r_end) {
+ /* This block only - done */
+ goto out;
+ }
+		/* There is more not covered by this rsm, move on */
+ start = rsm->r_end;
+ nrsm = TAILQ_NEXT(rsm, r_next);
+ rsm = nrsm;
+ times = 0;
+ goto do_rest_ofb;
+ }
+ /* Ok we need to split off this one at the tail */
+ nrsm = rack_alloc(rack);
+ if (nrsm == NULL) {
+		/* failed rrs what can we do but lose the sack info? */
+ goto out;
+ }
+ /* Clone it */
+ nrsm->r_start = end;
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_bytes = 0;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ nrsm->r_sndcnt = rsm->r_sndcnt;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+ /* The sack block does not cover this guy fully */
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ rsm->r_end = end;
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ if (rsm->r_flags & RACK_ACKED) {
+ /* Been here done that */
+ goto out;
+ }
+ rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
+ changed += (rsm->r_end - rsm->r_start);
+ rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+ rack_log_sack_passed(tp, rack, rsm);
+	/* Is reordering occurring? */
+ if (rsm->r_flags & RACK_SACK_PASSED) {
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ rsm->r_flags |= RACK_ACKED;
+ rsm->r_flags &= ~RACK_TLP;
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+out:
+ if (used_ref == 0) {
+ counter_u64_add(rack_sack_proc_all, 1);
+ } else {
+ counter_u64_add(rack_sack_proc_short, 1);
+ }
+ /* Save off where we last were */
+ if (rsm)
+ rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
+ else
+ rack->r_ctl.rc_sacklast = NULL;
+ *prsm = rsm;
+ return (changed);
+}
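
Seen from a distance, the function above clips one SACK block against one send-map entry: the overlapped part gets marked ACKED, and any uncovered front or tail of the entry is split off into new entries that stay outstanding. A minimal userland sketch of the clipping (illustrative only; it ignores sequence-number wraparound, which the kernel handles with the SEQ_* macros):

#include <stdint.h>
#include <stdio.h>

/* Half-open ranges, as in the send map: [start, end). */
struct range {
	uint32_t start;
	uint32_t end;
};

/*
 * Clip a SACK block against one send-map entry: the overlap is what gets
 * marked ACKED; the portions of the entry before/after the block stay
 * outstanding (in the kernel those leftovers become new map entries).
 */
static int
sack_clip(struct range rsm, struct range sack, struct range *before,
    struct range *covered, struct range *after)
{
	if (sack.end <= rsm.start || sack.start >= rsm.end)
		return (0);		/* no overlap */
	*before = (struct range){ rsm.start,
	    sack.start > rsm.start ? sack.start : rsm.start };
	*covered = (struct range){ before->end,
	    sack.end < rsm.end ? sack.end : rsm.end };
	*after = (struct range){ covered->end, rsm.end };
	return (1);
}

int
main(void)
{
	struct range rsm = { 1000, 2000 };
	struct range sack = { 1300, 1700 };
	struct range b, c, a;

	if (sack_clip(rsm, sack, &b, &c, &a))
		printf("left [%u,%u) sacked [%u,%u) right [%u,%u)\n",
		    b.start, b.end, c.start, c.end, a.start, a.end);
	return (0);
}
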
+
+static void inline
+rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
+{
+ struct rack_sendmap *tmap;
+
+ tmap = NULL;
+ while (rsm && (rsm->r_flags & RACK_ACKED)) {
+		/* It is no longer sacked, mark it so */
+ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+#ifdef INVARIANTS
+ if (rsm->r_in_tmap) {
+ panic("rack:%p rsm:%p flags:0x%x in tmap?",
+ rack, rsm, rsm->r_flags);
+ }
+#endif
+ rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
+ /* Rebuild it into our tmap */
+ if (tmap == NULL) {
+ TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ tmap = rsm;
+ } else {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
+ tmap = rsm;
+ }
+ tmap->r_in_tmap = 1;
+ rsm = TAILQ_NEXT(rsm, r_next);
+ }
+ /*
+ * Now lets possibly clear the sack filter so we start
+ * recognizing sacks that cover this area.
+ */
+ if (rack_use_sack_filter)
+ sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
+
+}
+
+static void
+rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
+{
+ uint32_t changed, last_seq, entered_recovery = 0;
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+ struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
+ register uint32_t th_ack;
+ int32_t i, j, k, num_sack_blks = 0;
+ uint32_t cts, acked, ack_point, sack_changed = 0;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (th->th_flags & TH_RST) {
+ /* We don't log resets */
+ return;
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ cts = tcp_ts_getticks();
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ changed = 0;
+ th_ack = th->th_ack;
+
+ if (SEQ_GT(th_ack, tp->snd_una)) {
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
+ tp->t_acktime = ticks;
+ }
+ if (rsm && SEQ_GT(th_ack, rsm->r_start))
+ changed = th_ack - rsm->r_start;
+ if (changed) {
+ /*
+ * The ACK point is advancing to th_ack, we must drop off
+		 * the packets in the rack log and calculate any eligible
+ * RTT's.
+ */
+ rack->r_wanted_output++;
+more:
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ if (rsm == NULL) {
+ if ((th_ack - 1) == tp->iss) {
+ /*
+ * For the SYN incoming case we will not
+ * have called tcp_output for the sending of
+ * the SYN, so there will be no map. All
+ * other cases should probably be a panic.
+ */
+ goto proc_sack;
+ }
+ if (tp->t_flags & TF_SENTFIN) {
+				/* if we sent a FIN we will not have a map */
+ goto proc_sack;
+ }
+#ifdef INVARIANTS
+ panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
+ tp,
+ th, tp->t_state, rack,
+ tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
+#endif
+ goto proc_sack;
+ }
+ if (SEQ_LT(th_ack, rsm->r_start)) {
+ /* Huh map is missing this */
+#ifdef INVARIANTS
+ printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
+ rsm->r_start,
+ th_ack, tp->t_state, rack->r_state);
+#endif
+ goto proc_sack;
+ }
+ rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
+ /* Now do we consume the whole thing? */
+ if (SEQ_GEQ(th_ack, rsm->r_end)) {
+			/* It is all consumed. */
+ uint32_t left;
+
+ rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
+ rsm->r_rtr_bytes = 0;
+ TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ if (rack->r_ctl.rc_next == rsm) {
+ /* scoot along the marker */
+ rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ }
+ if (rsm->r_flags & RACK_ACKED) {
+ /*
+ * It was acked on the scoreboard -- remove
+ * it from total
+ */
+ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+ } else if (rsm->r_flags & RACK_SACK_PASSED) {
+ /*
+ * There are acked segments ACKED on the
+ * scoreboard further up. We are seeing
+ * reordering.
+ */
+ counter_u64_add(rack_reorder_seen, 1);
+ rsm->r_flags |= RACK_ACKED;
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ left = th_ack - rsm->r_end;
+ if (rsm->r_rtr_cnt > 1) {
+ /*
+ * Technically we should make r_rtr_cnt be
+				 * monotonically increasing and just mod it to
+ * the timestamp it is replacing.. that way
+ * we would have the last 3 retransmits. Now
+ * rc_loss_count will be wrong if we
+ * retransmit something more than 2 times in
+ * recovery :(
+ */
+ rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
+ }
+ /* Free back to zone */
+ rack_free(rack, rsm);
+ if (left) {
+ goto more;
+ }
+ goto proc_sack;
+ }
+ if (rsm->r_flags & RACK_ACKED) {
+ /*
+ * It was acked on the scoreboard -- remove it from
+ * total for the part being cum-acked.
+ */
+ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
+ }
+ rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
+ rsm->r_rtr_bytes = 0;
+ rsm->r_start = th_ack;
+ }
+proc_sack:
+ /* Check for reneging */
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
+ /*
+ * The peer has moved snd_una up to
+ * the edge of this send, i.e. one
+ * that it had previously acked. The only
+		 * way that can be true is if the peer threw
+		 * away data (space issues) that it had
+		 * previously sacked (else it would have
+		 * given us snd_una up to rsm->r_end).
+ * We need to undo the acked markings here.
+ *
+ * Note we have to look to make sure th_ack is
+ * our rsm->r_start in case we get an old ack
+ * where th_ack is behind snd_una.
+ */
+ rack_peer_reneges(rack, rsm, th->th_ack);
+ }
+ if ((to->to_flags & TOF_SACK) == 0) {
+		/* We are done, nothing left to log */
+ goto out;
+ }
+ rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
+ if (rsm) {
+ last_seq = rsm->r_end;
+ } else {
+ last_seq = tp->snd_max;
+ }
+ /* Sack block processing */
+ if (SEQ_GT(th_ack, tp->snd_una))
+ ack_point = th_ack;
+ else
+ ack_point = tp->snd_una;
+ for (i = 0; i < to->to_nsacks; i++) {
+ bcopy((to->to_sacks + i * TCPOLEN_SACK),
+ &sack, sizeof(sack));
+ sack.start = ntohl(sack.start);
+ sack.end = ntohl(sack.end);
+ if (SEQ_GT(sack.end, sack.start) &&
+ SEQ_GT(sack.start, ack_point) &&
+ SEQ_LT(sack.start, tp->snd_max) &&
+ SEQ_GT(sack.end, ack_point) &&
+ SEQ_LEQ(sack.end, tp->snd_max)) {
+ if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
+ (SEQ_LT(sack.end, last_seq)) &&
+ ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
+ /*
+				 * Not the last piece and it is smaller than
+ * 1/8th of a MSS. We ignore this.
+ */
+ counter_u64_add(rack_runt_sacks, 1);
+ continue;
+ }
+ sack_blocks[num_sack_blks] = sack;
+ num_sack_blks++;
+#ifdef NETFLIX_STATS
+ } else if (SEQ_LEQ(sack.start, th_ack) &&
+ SEQ_LEQ(sack.end, th_ack)) {
+ /*
+			 * It is a D-SACK block.
+ */
+ tcp_record_dsack(sack.start, sack.end);
+#endif
+ }
+
+ }
+ if (num_sack_blks == 0)
+ goto out;
+ /*
+ * Sort the SACK blocks so we can update the rack scoreboard with
+ * just one pass.
+ */
+ if (rack_use_sack_filter) {
+ num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
+ }
+ if (num_sack_blks < 2) {
+ goto do_sack_work;
+ }
+ /* Sort the sacks */
+ for (i = 0; i < num_sack_blks; i++) {
+ for (j = i + 1; j < num_sack_blks; j++) {
+ if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
+ sack = sack_blocks[i];
+ sack_blocks[i] = sack_blocks[j];
+ sack_blocks[j] = sack;
+ }
+ }
+ }
+ /*
+ * Now are any of the sack block ends the same (yes some
+	 * implementations send these)?
+ */
+again:
+ if (num_sack_blks > 1) {
+ for (i = 0; i < num_sack_blks; i++) {
+ for (j = i + 1; j < num_sack_blks; j++) {
+ if (sack_blocks[i].end == sack_blocks[j].end) {
+ /*
+					 * Ok, these two have the same end;
+					 * keep the block that covers the
+					 * most area (the smaller start),
+					 * collapse out the other and start
+					 * again.
+ */
+ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
+ /*
+ * The second block covers
+ * more area use that
+						 * more area, use its start.
+ sack_blocks[i].start = sack_blocks[j].start;
+ }
+ /*
+ * Now collapse out the dup-sack and
+ * lower the count
+ */
+ for (k = (j + 1); k < num_sack_blks; k++) {
+ sack_blocks[j].start = sack_blocks[k].start;
+ sack_blocks[j].end = sack_blocks[k].end;
+ j++;
+ }
+ num_sack_blks--;
+ goto again;
+ }
+ }
+ }
+ }
+do_sack_work:
+ rsm = rack->r_ctl.rc_sacklast;
+ for (i = 0; i < num_sack_blks; i++) {
+ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
+ if (acked) {
+ rack->r_wanted_output++;
+ changed += acked;
+ sack_changed += acked;
+ }
+ }
+out:
+ if (changed) {
+		/* Something changed, cancel the rack timer */
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ }
+ if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
+ /*
+		 * Ok we have a high probability that we need to go into
+ * recovery since we have data sack'd
+ */
+ struct rack_sendmap *rsm;
+ uint32_t tsused;
+
+ tsused = tcp_ts_getticks();
+ rsm = tcp_rack_output(tp, rack, tsused);
+ if (rsm) {
+ /* Enter recovery */
+ rack->r_ctl.rc_rsm_start = rsm->r_start;
+ rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
+ rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
+ entered_recovery = 1;
+ rack_cong_signal(tp, NULL, CC_NDUPACK);
+ /*
+ * When we enter recovery we need to assure we send
+ * one packet.
+ */
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_timer_override = 1;
+ }
+ }
+ if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
+		/* Deal with 'changed' and PRR here (in recovery only) */
+ uint32_t pipe, snd_una;
+
+ rack->r_ctl.rc_prr_delivered += changed;
+ /* Compute prr_sndcnt */
+ if (SEQ_GT(tp->snd_una, th_ack)) {
+ snd_una = tp->snd_una;
+ } else {
+ snd_una = th_ack;
+ }
+ pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
+ if (pipe > tp->snd_ssthresh) {
+ long sndcnt;
+
+ sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
+ if (rack->r_ctl.rc_prr_recovery_fs > 0)
+ sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
+ else {
+ rack->r_ctl.rc_prr_sndcnt = 0;
+ sndcnt = 0;
+ }
+ sndcnt++;
+ if (sndcnt > (long)rack->r_ctl.rc_prr_out)
+ sndcnt -= rack->r_ctl.rc_prr_out;
+ else
+ sndcnt = 0;
+ rack->r_ctl.rc_prr_sndcnt = sndcnt;
+ } else {
+ uint32_t limit;
+
+ if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
+ limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
+ else
+ limit = 0;
+ if (changed > limit)
+ limit = changed;
+ limit += tp->t_maxseg;
+ if (tp->snd_ssthresh > pipe) {
+ rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
+ } else {
+ rack->r_ctl.rc_prr_sndcnt = min(0, limit);
+ }
+ }
+ if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
+ rack->r_timer_override = 1;
+ }
+ }
+}
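
The PRR arithmetic at the end of the function follows RFC 6937: while the estimated pipe is still above ssthresh, delivered data is paced out proportionally (roughly ssthresh/RecoverFS per delivered byte); once pipe drops below ssthresh it switches to a slow-start-like catch-up bounded by ssthresh - pipe. A minimal userland sketch of that quota calculation (parameter names are illustrative, not the r_ctl fields):

#include <stdint.h>
#include <stdio.h>

/*
 * Proportional Rate Reduction send quota.  'delivered'/'out' are cumulative
 * bytes delivered to us / sent by us since recovery started, 'recoverfs' is
 * the flight size at the start of recovery, 'newly_delivered' is what this
 * ACK just delivered.
 */
static uint32_t
prr_sndcnt(uint32_t pipe, uint32_t ssthresh, uint32_t delivered,
    uint32_t out, uint32_t recoverfs, uint32_t newly_delivered,
    uint32_t maxseg)
{
	uint64_t sndcnt;
	uint32_t limit;

	if (pipe > ssthresh) {
		/* Proportional part: ssthresh/recoverfs per delivered byte. */
		if (recoverfs == 0)
			return (0);
		sndcnt = (uint64_t)delivered * ssthresh / recoverfs;
		sndcnt++;				/* round up */
		if (sndcnt <= out)
			return (0);
		return ((uint32_t)(sndcnt - out));
	}
	/* Catch-up part: grow back toward ssthresh, never past it. */
	limit = (delivered > out) ? delivered - out : 0;
	if (newly_delivered > limit)
		limit = newly_delivered;
	limit += maxseg;
	if (ssthresh <= pipe)
		return (0);
	return ((ssthresh - pipe < limit) ? ssthresh - pipe : limit);
}

int
main(void)
{
	/* Early in recovery: pipe (50000) still above ssthresh (40000). */
	printf("%u\n", prr_sndcnt(50000, 40000, 10000, 4000, 80000,
	    10000, 1460));			/* -> 1001 */
	/* Later: pipe fell below ssthresh, catch back up toward it. */
	printf("%u\n", prr_sndcnt(30000, 40000, 20000, 18000, 80000,
	    1460, 1460));			/* -> 3460 */
	return (0);
}
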
+
+/*
+ * A return value of 1 means we do not need to call rack_process_data();
+ * a return value of 0 means rack_process_data() can be called.
+ * For ret_val: if it is 0 the TCB is locked, if it is non-zero
+ * it is unlocked and probably unsafe to touch.
+ */
+static int
+rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to,
+ int32_t * ti_locked, uint32_t tiwin, int32_t tlen,
+ int32_t * ofia, int32_t thflags, int32_t * ret_val)
+{
+ int32_t ourfinisacked = 0;
+ int32_t nsegs, acked_amount;
+ int32_t acked;
+ struct mbuf *mfree;
+ struct tcp_rack *rack;
+ int32_t recovery = 0;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
+ return (1);
+ }
+ if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
+ rack_log_ack(tp, to, th);
+ }
+ if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+ /*
+		 * Old ack, behind (or duplicate to) the last one rcv'd.
+		 * Note: we should mark that reordering is occurring! We
+		 * should also look for sack blocks arriving, e.g. ack 1,
+		 * 4-4 then ack 1, 3-3, 4-4 would be reordering. As would
+		 * ack 1, 3-3 <no retran and> ack 3.
+ */
+ return (0);
+ }
+ /*
+ * If we reach this point, ACK is not a duplicate, i.e., it ACKs
+ * something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our SYN has
+ * been ACK'd (so connection is now fully synchronized). Go
+ * to non-starred state, increment snd_una for ACK of SYN,
+ * and check if we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ /* Send window already scaled. */
+ }
+ }
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ acked = BYTES_THIS_ACK(tp, th);
+ TCPSTAT_ADD(tcps_rcvackpack, nsegs);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+
+ /*
+ * If we just performed our first retransmit, and the ACK arrives
+ * within our recovery window, then it was a mistake to do the
+ * retransmit in the first place. Recover our original cwnd and
+ * ssthresh, and proceed to transmit where we left off.
+ */
+ if (tp->t_flags & TF_PREVVALID) {
+ tp->t_flags &= ~TF_PREVVALID;
+ if (tp->t_rxtshift == 1 &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ rack_cong_signal(tp, th, CC_RTO_ERR);
+ }
+ /*
+ * If we have a timestamp reply, update smoothed round trip time. If
+ * no timestamp is present but transmit timer is running and timed
+ * sequence number was acked, update smoothed round trip time. Since
+ * we now have an rtt measurement, cancel the timer backoff (cf.,
+ * Phil Karn's retransmit alg.). Recompute the initial retransmit
+ * timer.
+ *
+ * Some boxes send broken timestamp replies during the SYN+ACK
+ * phase, ignore timestamps of 0 or we could calculate a huge RTT
+ * and blow up the retransmit timer.
+ */
+ /*
+ * If all outstanding data is acked, stop retransmit timer and
+ * remember to restart (more output or persist). If there is more
+ * data to be acked, restart retransmit timer, using current
+ * (possibly backed-off) value.
+ */
+ if (th->th_ack == tp->snd_max) {
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ rack->r_wanted_output++;
+ }
+ /*
+ * If no data (only SYN) was ACK'd, skip rest of ACK processing.
+ */
+ if (acked == 0) {
+ if (ofia)
+ *ofia = ourfinisacked;
+ return (0);
+ }
+ if (rack->r_ctl.rc_early_recovery) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ tcp_rack_partialack(tp, th);
+ } else {
+ rack_post_recovery(tp, th);
+ recovery = 1;
+ }
+ }
+ }
+ /*
+ * Let the congestion control algorithm update congestion control
+ * related information. This typically means increasing the
+ * congestion window.
+ */
+ rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
+ SOCKBUF_LOCK(&so->so_snd);
+ acked_amount = min(acked, (int)sbavail(&so->so_snd));
+ tp->snd_wnd -= acked_amount;
+ mfree = sbcut_locked(&so->so_snd, acked_amount);
+ if ((sbused(&so->so_snd) == 0) &&
+ (acked > acked_amount) &&
+ (tp->t_state >= TCPS_FIN_WAIT_1)) {
+ ourfinisacked = 1;
+ }
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
+ m_freem(mfree);
+ if (rack->r_ctl.rc_early_recovery == 0) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ tcp_rack_partialack(tp, th);
+ } else {
+ rack_post_recovery(tp, th);
+ }
+ }
+ }
+ tp->snd_una = th->th_ack;
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
+ tp->snd_nxt = tp->snd_una;
+ }
+ if (tp->snd_una == tp->snd_max) {
+ /* Nothing left outstanding */
+ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
+ tp->t_acktime = 0;
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ /* Set need output so persist might get set */
+ rack->r_wanted_output++;
+ if (rack_use_sack_filter)
+ sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
+ if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
+ (sbavail(&so->so_snd) == 0) &&
+ (tp->t_flags2 & TF2_DROP_AF_DATA)) {
+ /*
+ * The socket was gone and the
+ * peer sent data, time to
+ * reset him.
+ */
+ *ret_val = 1;
+ tp = tcp_close(tp);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen);
+ return (1);
+ }
+ }
+ if (ofia)
+ *ofia = ourfinisacked;
+ return (0);
+}
+
+
+/*
+ * A return value of 1 means the TCB is unlocked and most
+ * likely gone; a return value of 0 means the TCB is still
+ * locked.
+ */
+static int
+rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ /*
+ * Update window information. Don't look at window if no ACK: TAC's
+ * send garbage on first SYN.
+ */
+ int32_t nsegs;
+#ifdef TCP_RFC7413
+ int32_t tfo_syn;
+#else
+#define tfo_syn (FALSE)
+#endif
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ TCPSTAT_INC(tcps_rcvwinupd);
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ rack->r_wanted_output++;
+ } else if (thflags & TH_ACK) {
+ if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ }
+ }
+ /* Was persist timer active and now we have window space? */
+ if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
+ rack_exit_persist(tp, rack);
+ tp->snd_nxt = tp->snd_max;
+ /* Make sure we output to start the timer */
+ rack->r_wanted_output++;
+ }
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept random
+ * urgent pointers, we'll crash in soreceive. It's hard to
+ * imagine someone actually wanting to send this much urgent
+ * data.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer, then
+ * mark the data stream. This should not happen in
+ * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
+ * FIN has been received from the remote side. In these
+ * states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols), the urgent
+ * pointer points to the last octet of urgent data. We
+ * continue, however, to consider it to indicate the first
+ * octet of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = sbavail(&so->so_rcv) +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * Remove out of band data so doesn't get presented to user.
+ * This can happen independent of advancing the URG pointer,
+ * but if two URG's are pending at once, some out-of-band
+ * data may creep in... ick.
+ */
+ if (th->th_urp <= (uint32_t) tlen &&
+ !(so->so_options & SO_OOBINLINE)) {
+ /* hdr drop is delayed */
+ tcp_pulloutofband(so, th, m, drop_hdrlen);
+ }
+ } else {
+ /*
+ * If no out of band data is expected, pull receive urgent
+ * pointer along with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+ }
+dodata: /* XXX */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Process the segment text, merging it into the TCP sequencing
+ * queue, and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data is
+ * presented to the user (this happens in tcp_usrreq.c, case
+ * PRU_RCVD). If a FIN has already been received on this connection
+ * then we just ignore the text.
+ */
+#ifdef TCP_RFC7413
+ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
+ (tp->t_flags & TF_FASTOPEN));
+#endif
+ if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ tcp_seq save_start = th->th_seq;
+
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+ * Insert segment which includes th into TCP reassembly
+ * queue with control block tp. Set thflags to whether
+ * reassembly now includes a segment with FIN. This handles
+ * the common case inline (segment is the next to be
+ * received on an established connection, and the queue is
+ * empty), avoiding linkage into and removal from the queue
+ * and repetition of various conversions. Set DELACK for
+ * segments received in order, but ack immediately when
+ * segments are out of order (so fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state) ||
+ tfo_syn)) {
+ if (DELAY_ACK(tp, tlen) || tfo_syn) {
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ tp->t_flags |= TF_DELACK;
+ } else {
+ rack->r_wanted_output++;
+ tp->t_flags |= TF_ACKNOW;
+ }
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ TCPSTAT_ADD(tcps_rcvpack, nsegs);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ m_freem(m);
+ else
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ } else {
+ /*
+ * XXX: Due to the header drop above "th" is
+ * theoretically invalid by now. Fortunately
+ * m_adj() doesn't actually frees any mbufs when
+			 * m_adj() doesn't actually free any mbufs when
+ */
+ thflags = tcp_reass(tp, th, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+ if (tlen > 0)
+ tcp_update_sack_list(tp, save_start, save_start + tlen);
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know that the
+ * connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized (ie NEEDSYN
+ * flag on) then delay ACK, so it may be piggybacked
+ * when SYN is sent. Otherwise, since we received a
+ * FIN then no more input can be expected, send ACK
+ * now.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ tp->t_flags |= TF_DELACK;
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ }
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES enter the
+ * CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ tcp_state_change(tp, TCPS_CLOSE_WAIT);
+ break;
+
+ /*
+ * If still in FIN_WAIT_1 STATE FIN has not been
+ * acked so enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ tcp_state_change(tp, TCPS_CLOSING);
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the
+ * other standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata "
+ "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
+ *ti_locked));
+ tcp_twstart(tp);
+ *ti_locked = TI_UNLOCKED;
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return (1);
+ }
+ }
+ if (*ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ }
+ /*
+ * Return any desired output.
+ */
+ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
+ rack->r_wanted_output++;
+ }
+ KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, *ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return (0);
+}
+
+/*
+ * Here nothing is really faster; it is just that we
+ * have broken out the fast-data path, just like
+ * the fast-ack path.
+ */
+static int
+rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt)
+{
+ int32_t nsegs;
+ int32_t newsize = 0; /* automatic sockbuf scaling */
+ struct tcp_rack *rack;
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+
+#endif
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * the timestamp. NOTE that the test is modified according to the
+ * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if (__predict_false(th->th_seq != tp->rcv_nxt)) {
+ return (0);
+ }
+ if (__predict_false(tp->snd_nxt != tp->snd_max)) {
+ return (0);
+ }
+ if (tiwin && tiwin != tp->snd_wnd) {
+ return (0);
+ }
+ if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
+ return (0);
+ }
+ if (__predict_false((to->to_flags & TOF_TS) &&
+ (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
+ return (0);
+ }
+ if (__predict_false((th->th_ack != tp->snd_una))) {
+ return (0);
+ }
+ if (__predict_false(tlen > sbspace(&so->so_rcv))) {
+ return (0);
+ }
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ /*
+ * This is a pure, in-sequence data packet with nothing on the
+ * reassembly queue and we have enough buffer space to take it.
+ */
+ if (*ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ }
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+
+
+ /* Clean receiver SACK report if present */
+ if (tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+ TCPSTAT_INC(tcps_preddat);
+ tp->rcv_nxt += tlen;
+ /*
+ * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
+ */
+ tp->snd_wl1 = th->th_seq;
+ /*
+ * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
+ */
+ tp->rcv_up = tp->rcv_nxt;
+ TCPSTAT_ADD(tcps_rcvpack, nsegs);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+ newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
+
+ /* Add data to socket buffer. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ m_freem(m);
+ } else {
+ /*
+ * Set new socket buffer size. Give up when limit is
+ * reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ rack_calc_rwin(so, tp);
+ }
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ if (DELAY_ACK(tp, tlen)) {
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ tp->t_flags |= TF_DELACK;
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ rack->r_wanted_output++;
+ }
+ if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
+ sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
+ return (1);
+}
+
+/*
+ * This subfunction is used to try to highly optimize the
+ * fast path. We again allow window updates that are
+ * in sequence to remain in the fast-path. We also add
+ * in the __predict's to attempt to help the compiler.
+ * Note that if we return a 0, then we can *not* process
+ * it and the caller should push the packet into the
+ * slow-path.
+ */
+static int
+rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
+{
+ int32_t acked;
+ int32_t nsegs;
+
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+
+#endif
+ struct tcp_rack *rack;
+
+ if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+ /* Old ack, behind (or duplicate to) the last one rcv'd */
+ return (0);
+ }
+ if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
+ /* Above what we have sent? */
+ return (0);
+ }
+ if (__predict_false(tp->snd_nxt != tp->snd_max)) {
+ /* We are retransmitting */
+ return (0);
+ }
+ if (__predict_false(tiwin == 0)) {
+ /* zero window */
+ return (0);
+ }
+ if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
+ /* We need a SYN or a FIN, unlikely.. */
+ return (0);
+ }
+ if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
+ /* Timestamp is behind .. old ack with seq wrap? */
+ return (0);
+ }
+ if (__predict_false(IN_RECOVERY(tp->t_flags))) {
+ /* Still recovering */
+ return (0);
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->r_ctl.rc_sacked) {
+ /* We have sack holes on our scoreboard */
+ return (0);
+ }
+ /* Ok if we reach here, we can process a fast-ack */
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ rack_log_ack(tp, to, th);
+ /* Did the window get updated? */
+ if (tiwin != tp->snd_wnd) {
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+ if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
+ rack_exit_persist(tp, rack);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * the timestamp. NOTE that the test is modified according to the
+ * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ if (*ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ }
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_flags & TF_PREVVALID) {
+ tp->t_flags &= ~TF_PREVVALID;
+ if (tp->t_rxtshift == 1 &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ rack_cong_signal(tp, th, CC_RTO_ERR);
+ }
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies during the SYN+ACK
+ * phase, ignore timestamps of 0 or we could calculate a huge RTT
+ * and blow up the retransmit timer.
+ */
+ acked = BYTES_THIS_ACK(tp, th);
+
+#ifdef TCP_HHOOK
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+#endif
+
+ TCPSTAT_ADD(tcps_rcvackpack, nsegs);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+ /*
+ * Let the congestion control algorithm update congestion control
+ * related information. This typically means increasing the
+ * congestion window.
+ */
+ rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
+
+ tp->snd_una = th->th_ack;
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ tp->t_dupacks = 0;
+ m_freem(m);
+ /* ND6_HINT(tp); *//* Some progress has been made. */
+
+ /*
+ * If all outstanding data are acked, stop retransmit timer,
+ * otherwise restart timer using current (possibly backed-off)
+ * value. If process is waiting for space, wakeup/selwakeup/signal.
+ * If data are ready to send, let tcp_output decide between more
+ * output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp->snd_una == tp->snd_max) {
+ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
+ tp->t_acktime = 0;
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ }
+ /* Wake up the socket if we have room to write more */
+ sowwakeup(so);
+ if (sbavail(&so->so_snd)) {
+ rack->r_wanted_output++;
+ }
+ return (1);
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+ int32_t todrop;
+ int32_t ourfinisacked = 0;
+
+ rack_calc_rwin(so, tp);
+ /*
+ * If the state is SYN_SENT: if seg contains an ACK, but not for our
+ * SYN, drop the input. if seg contains a RST, then drop the
+ * connection. if seg does not contain SYN, then drop it. Otherwise
+ * this is an acceptable SYN segment initialize tp->rcv_nxt and
+ * tp->irs if seg contains ack then advance tp->snd_una if seg
+ * contains an ECE and ECN support is enabled, the stream is ECN
+ * capable. if SYN has been acked change to ESTABLISHED else
+ * SYN_RCVD state arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
+ TCP_PROBE5(connect__refused, NULL, tp,
+ mtod(m, const char *), tp, th);
+ tp = tcp_drop(tp, ECONNREFUSED);
+ rack_do_drop(m, tp, ti_locked);
+ return (1);
+ }
+ if (thflags & TH_RST) {
+ rack_do_drop(m, tp, ti_locked);
+ return (1);
+ }
+ if (!(thflags & TH_SYN)) {
+ rack_do_drop(m, tp, ti_locked);
+ return (1);
+ }
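+ /* Record the peer's initial sequence number and initialize our receive sequence state. */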
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+#ifdef MAC
+ mac_socketpeer_set_from_mbuf(m, so);
+#endif
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ tp->rcv_adv += min(tp->rcv_wnd,
+ TCP_MAXWIN << tp->rcv_scale);
+ /*
+ * If there's data, delay ACK; if there's also a FIN ACKNOW
+ * will be turned on later.
+ */
+ if (DELAY_ACK(tp, tlen) && tlen != 0) {
+ rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
+ tp->t_flags |= TF_DELACK;
+ } else {
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
+ tp->t_flags |= TF_ACKNOW;
+ }
+
+ if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ TCPSTAT_INC(tcps_ecn_shs);
+ }
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
+ * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(connect__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ cc_conn_init(tp);
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state => simultaneous
+ * open. If segment contains CC option and there is a
+ * cached CC, apply TAO test. If it succeeds, connection is *
+ * half-synchronized. Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
+ * there was no CC option, clear cached CC value.
+ */
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
+ }
+ KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
+ "ti_locked %d", __func__, *ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Advance th->th_seq to correspond to first data byte. If data,
+ * trim to stay within window, dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data. If the
+ * remote host used T/TCP to validate the SYN, our data will be
+ * ACK'd; if so, enter normal data segment processing in the middle
+ * of step 5, ack processing. Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK) {
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
+ return (ret_val);
+ /* We may have changed to FIN_WAIT_1 above */
+ if (tp->t_state == TCPS_FIN_WAIT_1) {
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now
+ * acknowledged then enter FIN_WAIT_2.
+ */
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more data, then
+ * closing user can proceed. Starting the
+ * timer is contrary to the specification,
+ * but if we don't get a FIN we'll hang
+ * forever.
+ *
+ * XXXjl: we should release the tp also, and
+ * use a compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+ int32_t ourfinisacked = 0;
+
+ rack_calc_rwin(so, tp);
+
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
+ /*
+ * When a TFO connection is in SYN_RECEIVED, the only valid
+ * packets are the initial SYN, a retransmit/copy of the
+ * initial SYN (possibly with a subset of the original
+ * data), a valid ACK, a FIN, or a RST.
+ */
+ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ } else if (thflags & TH_SYN) {
+ /* non-initial SYN is ignored */
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
+ (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
+ (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+#endif
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know the
+ * sequence numbers haven't wrapped. This is a partial fix for the
+ * "LAND" DoS attack.
+ */
+ if (SEQ_LT(th->th_seq, tp->irs)) {
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
+ tp->snd_wnd = tiwin;
+ cc_conn_init(tp);
+ }
+#endif
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+ }
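+ /* The peer's ACK completes the handshake; mark the socket connected. */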
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ tp->snd_wnd = tiwin;
+ }
+ /*
+ * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
+ * FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(accept__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+#ifdef TCP_RFC7413
+ if (tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+
+ /*
+ * Account for the ACK of our SYN prior to regular
+ * ACK processing below.
+ */
+ tp->snd_una++;
+ }
+ /*
+ * TFO connections call cc_conn_init() during SYN
+ * processing. Calling it again here for such connections
+ * is not harmless as it would undo the snd_cwnd reduction
+ * that occurs when a TFO SYN|ACK is retransmitted.
+ */
+ if (!(tp->t_flags & TF_FASTOPEN))
+#endif
+ cc_conn_init(tp);
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass() later; if
+ * not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void)tcp_reass(tp, (struct tcphdr *)0, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (tp->t_state == TCPS_FIN_WAIT_1) {
+ /* We could have gone to FIN_WAIT_1 (or EST) above */
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing for the
+ * ESTABLISHED state if our FIN is now acknowledged then
+ * enter FIN_WAIT_2.
+ */
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more data, then closing
+ * user can proceed. Starting the timer is contrary
+ * to the specification, but if we don't get a FIN
+ * we'll hang forever.
+ *
+ * XXXjl: we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+
+ /*
+ * Header prediction: check for the two common cases of a
+ * uni-directional data xfer. If the packet has no control flags,
+ * is in-sequence, the window didn't change and we're not
+ * retransmitting, it's a candidate. If the length is zero and the
+ * ack moved forward, we're the sender side of the xfer. Just free
+ * the data acked & wake any higher level process that was blocked
+ * waiting for space. If the length is non-zero and the ack didn't
+ * move, we're the receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to the socket
+ * buffer and note that we need a delayed ack. Make sure that the
+ * hidden state-flags are also off. Since we check for
+ * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
+ */
+ if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
+ __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
+ __predict_true(LIST_EMPTY(&tp->t_segq)) &&
+ __predict_true(th->th_seq == tp->rcv_nxt)) {
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (tlen == 0) {
+ if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
+ ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
+ return (0);
+ }
+ } else {
+ if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
+ ti_locked, tiwin, nxt_pkt)) {
+ return (0);
+ }
+ }
+ }
+ rack_calc_rwin(so, tp);
+
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+
+ } else if (tp->t_flags & TF_ACKNOW) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
+ return (ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ /* State changes only happen in rack_process_data() */
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+
+ rack_calc_rwin(so, tp);
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+
+ } else if (tp->t_flags & TF_ACKNOW) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
+ return (ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
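+/*
+ * Handle data that arrives after the local user has closed. If the
+ * rc_allow_data_af_clo knob is clear, or nothing remains in the send
+ * buffer, close the connection and reset the peer. Otherwise the data
+ * is ignored, a follow-up reset is noted, and output is requested.
+ */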
+static int
+rack_check_data_after_close(struct mbuf *m,
+ struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so)
+{
+ struct tcp_rack *rack;
+
+ KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
+ "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->rc_allow_data_af_clo == 0) {
+ close_now:
+ tp = tcp_close(tp);
+ TCPSTAT_INC(tcps_rcvafterclose);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen));
+ return (1);
+ }
+ if (sbavail(&so->so_snd) == 0)
+ goto close_now;
+ /* Ok we allow data that is ignored and a followup reset */
+ tp->rcv_nxt = th->th_seq + *tlen;
+ tp->t_flags2 |= TF2_DROP_AF_DATA;
+ rack->r_wanted_output = 1;
+ *tlen = 0;
+ return (0);
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+ int32_t ourfinisacked = 0;
+
+ rack_calc_rwin(so, tp);
+
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) && tlen) {
+ if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
+ return (ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more data, then closing user can
+ * proceed. Starting the timer is contrary to the
+ * specification, but if we don't get a FIN we'll hang
+ * forever.
+ *
+ * XXXjl: we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+ int32_t ourfinisacked = 0;
+
+ rack_calc_rwin(so, tp);
+
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) && tlen) {
+ if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
+ return (ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tcp_twstart(tp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ *ti_locked = TI_UNLOCKED;
+ m_freem(m);
+ return (1);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+ int32_t ourfinisacked = 0;
+
+ rack_calc_rwin(so, tp);
+
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) && tlen) {
+ if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
+ return (ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+ /*
+ * case TCPS_LAST_ACK: Ack processing.
+ */
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tp = tcp_close(tp);
+ rack_do_drop(m, tp, ti_locked);
+ return (1);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCP is still
+ * locked.
+ */
+static int
+rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ret_val = 0;
+ int32_t ourfinisacked = 0;
+
+ rack_calc_rwin(so, tp);
+
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ if (thflags & TH_RST)
+ return (rack_process_rst(m, th, so, tp, ti_locked));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tlen) {
+ if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
+ return (ret_val);
+ } else {
+ rack_do_drop(m, NULL, ti_locked);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (rack_progress_timeout_check(tp)) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags, nxt_pkt));
+}
+
+
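+/* Reset the rack RTT sample state (no samples, zero totals). */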
+static void inline
+rack_clear_rate_sample(struct tcp_rack *rack)
+{
+ rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
+ rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
+ rack->r_ctl.rack_rs.rs_rtt_tot = 0;
+}
+
+static int
+rack_init(struct tcpcb *tp)
+{
+ struct tcp_rack *rack = NULL;
+
+ tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
+ if (tp->t_fb_ptr == NULL) {
+ /*
+ * We need to allocate memory but can't. The INP and INP_INFO
+ * locks are held and they are recursive (this happens during
+ * setup), so a scheme to drop the locks fails.
+ */
+ return (ENOMEM);
+ }
+ memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
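+ /* Set up the send map, the free list and the time-ordered transmit map. */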
+ TAILQ_INIT(&rack->r_ctl.rc_map);
+ TAILQ_INIT(&rack->r_ctl.rc_free);
+ TAILQ_INIT(&rack->r_ctl.rc_tmap);
+ rack->rc_tp = tp;
+ if (tp->t_inpcb) {
+ rack->rc_inp = tp->t_inpcb;
+ }
+ /* Probably not needed but let's be sure */
+ rack_clear_rate_sample(rack);
+ rack->r_cpu = 0;
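+ /* Seed the per-connection parameters from the module's global defaults. */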
+ rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
+ rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
+ rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
+ rack->rc_pace_reduce = rack_slot_reduction;
+ if (V_tcp_delack_enabled)
+ tp->t_delayed_ack = 1;
+ else
+ tp->t_delayed_ack = 0;
+ rack->rc_pace_max_segs = rack_hptsi_segments;
+ rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
+ rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
+ rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
+ rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
+ rack->r_idle_reduce_largest = rack_reduce_largest_on_idle;
+ rack->r_enforce_min_pace = rack_min_pace_time;
+ rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
+ rack->r_ctl.rc_prop_rate = rack_proportional_rate;
+ rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
+ rack->r_ctl.rc_early_recovery = rack_early_recovery;
+ rack->rc_always_pace = rack_pace_every_seg;
+ rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
+ rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
+ rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
+ rack->r_ctl.rc_min_to = rack_min_to;
+ rack->r_ctl.rc_prr_inc_var = rack_inc_var;
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+ if (tp->snd_una != tp->snd_max) {
+ /* Create a send map for the current outstanding data */
+ struct rack_sendmap *rsm;
+
+ rsm = rack_alloc(rack);
+ if (rsm == NULL) {
+ uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
+ tp->t_fb_ptr = NULL;
+ return (ENOMEM);
+ }
+ rsm->r_flags = RACK_OVERMAX;
+ rsm->r_tim_lastsent[0] = tcp_ts_getticks();
+ rsm->r_rtr_cnt = 1;
+ rsm->r_rtr_bytes = 0;
+ rsm->r_start = tp->snd_una;
+ rsm->r_end = tp->snd_max;
+ rsm->r_sndcnt = 0;
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ }
+ return (0);
+}
+
+static int
+rack_handoff_ok(struct tcpcb *tp)
+{
+ if ((tp->t_state == TCPS_CLOSED) ||
+ (tp->t_state == TCPS_LISTEN)) {
+ /* Sure no problem though it may not stick */
+ return (0);
+ }
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED)) {
+ /*
+ * We really don't know; you have to get to ESTAB or beyond
+ * to tell.
+ */
+ return (EAGAIN);
+ }
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ return (0);
+ }
+ /*
+ * If we reach here we don't do SACK on this connection so we can
+ * never do rack.
+ */
+ return (EINVAL);
+}
+
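+/*
+ * Tear down the rack state for this connection: free every entry on
+ * the send map and on the free list, then release the rack control
+ * block itself.
+ */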
+static void
+rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
+{
+ if (tp->t_fb_ptr) {
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+#ifdef TCP_BLACKBOX
+ tcp_log_flowend(tp);
+#endif
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ while (rsm) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
+ uma_zfree(rack_zone, rsm);
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ }
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
+ while (rsm) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
+ uma_zfree(rack_zone, rsm);
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
+ }
+ rack->rc_free_cnt = 0;
+ uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
+ tp->t_fb_ptr = NULL;
+ }
+}
+
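+/* Record the current TCP state and point rack->r_substate at the matching input handler. */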
+static void
+rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ switch (tp->t_state) {
+ case TCPS_SYN_SENT:
+ rack->r_state = TCPS_SYN_SENT;
+ rack->r_substate = rack_do_syn_sent;
+ break;
+ case TCPS_SYN_RECEIVED:
+ rack->r_state = TCPS_SYN_RECEIVED;
+ rack->r_substate = rack_do_syn_recv;
+ break;
+ case TCPS_ESTABLISHED:
+ rack->r_state = TCPS_ESTABLISHED;
+ rack->r_substate = rack_do_established;
+ break;
+ case TCPS_CLOSE_WAIT:
+ rack->r_state = TCPS_CLOSE_WAIT;
+ rack->r_substate = rack_do_close_wait;
+ break;
+ case TCPS_FIN_WAIT_1:
+ rack->r_state = TCPS_FIN_WAIT_1;
+ rack->r_substate = rack_do_fin_wait_1;
+ break;
+ case TCPS_CLOSING:
+ rack->r_state = TCPS_CLOSING;
+ rack->r_substate = rack_do_closing;
+ break;
+ case TCPS_LAST_ACK:
+ rack->r_state = TCPS_LAST_ACK;
+ rack->r_substate = rack_do_lastack;
+ break;
+ case TCPS_FIN_WAIT_2:
+ rack->r_state = TCPS_FIN_WAIT_2;
+ rack->r_substate = rack_do_fin_wait_2;
+ break;
+ case TCPS_LISTEN:
+ case TCPS_CLOSED:
+ case TCPS_TIME_WAIT:
+ default:
+#ifdef INVARIANTS
+ panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
+#endif
+ break;
+ };
+}
+
+
+static void
+rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
+{
+ /*
+ * We received an ack, and then did not call send or were
+ * bounced out because the hpts was running. Now a timer is up
+ * as well; is it the right timer?
+ */
+ struct rack_sendmap *rsm;
+ int tmr_up;
+
+ tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
+ if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
+ return;
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
+ (tmr_up == PACE_TMR_RXT)) {
+ /* Should be an RXT */
+ return;
+ }
+ if (rsm == NULL) {
+ /* Nothing outstanding? */
+ if (tp->t_flags & TF_DELACK) {
+ if (tmr_up == PACE_TMR_DELACK)
+ /* We are supposed to have delayed ack up and we do */
+ return;
+ } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
+ /*
+ * if we hit enobufs then we would expect the possibility
+ * of nothing outstanding and the RXT up (and the hptsi timer).
+ */
+ return;
+ } else if (((tcp_always_keepalive ||
+ rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)) &&
+ (tmr_up == PACE_TMR_KEEP) &&
+ (tp->snd_max == tp->snd_una)) {
+ /* We should have keep alive up and we do */
+ return;
+ }
+ }
+ if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
+ if ((tp->t_flags & TF_SENTFIN) &&
+ ((tp->snd_max - tp->snd_una) == 1) &&
+ (rsm->r_flags & RACK_HAS_FIN)) {
+ /* needs to be a RXT */
+ if (tmr_up == PACE_TMR_RXT)
+ return;
+ } else if (tmr_up == PACE_TMR_RACK)
+ return;
+ } else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
+ ((tmr_up == PACE_TMR_TLP) ||
+ (tmr_up == PACE_TMR_RXT))) {
+ /*
+ * Either a TLP or RXT is fine if no sack-passed
+ * is in place and data is outstanding.
+ */
+ return;
+ } else if (tmr_up == PACE_TMR_DELACK) {
+ /*
+ * If the delayed ack was going to go off before the
+ * rtx/tlp/rack timers were going to expire, then that would be
+ * the timer in control. Note we don't check the time here,
+ * trusting that the code is correct.
+ */
+ return;
+ }
+ /*
+ * Ok, the timer originally started is not what we want now.
+ * We will force the hpts to be stopped if it is running, and
+ * restart with the slot set to what was in the saved slot.
+ */
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+}
+
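+/*
+ * Core input processing for rack: verify the expected locking, handle
+ * ECN and TCP options, and then hand the segment to the per-state
+ * handler via rack->r_substate. On return, decide whether output or a
+ * new timer is needed.
+ */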
+static void
+rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
+ int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv)
+{
+ int32_t thflags, retval, did_out = 0;
+ int32_t way_out = 0;
+ uint32_t cts;
+ uint32_t tiwin;
+ struct tcpopt to;
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+ int32_t prev_state = 0;
+
+ cts = tcp_tv_to_mssectick(tv);
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+
+ kern_prefetch(rack, &prev_state);
+ prev_state = 0;
+ thflags = th->th_flags;
+ /*
+ * If this is either a state-changing packet or current state isn't
+ * established, we require a read lock on tcbinfo. Otherwise, we
+ * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a lock due to a race.
+ */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+ "SYN/FIN/RST/!EST", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
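+ /* Record this inbound segment in the connection's TCP log. */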
+ {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
+ tlen, &log, true);
+ }
+ /*
+ * Segment received on connection. Reset idle time and keep-alive
+ * timer. XXX: This should be done after segment validation to
+ * ignore broken/spoofed segs.
+ */
+ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
+#ifdef NETFLIX_CWV
+ if ((tp->cwv_enabled) &&
+ ((tp->cwv_cwnd_valid == 0) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
+ tcp_newcwv_nvp_closedown(tp);
+ } else
+#endif
+ if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
+ counter_u64_add(rack_input_idle_reduces, 1);
+ rack_cc_after_idle(tp,
+ (rack->r_idle_reduce_largest ? 1 :0));
+ }
+ }
+ rack->r_ctl.rc_rcvtime = cts;
+ tp->t_rcvtime = ticks;
+
+#ifdef NETFLIX_CWV
+ if (tp->cwv_enabled) {
+ if ((tp->cwv_cwnd_valid == 0) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
+ tcp_newcwv_nvp_closedown(tp);
+ }
+#endif
+ /*
+ * Unscale the window into a 32-bit value. For the SYN_SENT state
+ * the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+#endif
+ /*
+ * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
+ * this to occur after we've validated the segment.
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ rack_cong_signal(tp, th, CC_ECN);
+ }
+ }
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ /*
+ * If echoed timestamp is later than the current time, fall back to
+ * non RFC1323 RTT calculation. Normalize timestamp if syncookies
+ * were used when this connection was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, cts))
+ to.to_tsecr = 0;
+ }
+ /*
+ * If it's the first time in we need to take care of options and
+ * verify we can do SACK for rack!
+ */
+ if (rack->r_state == 0) {
+ /* Should be init'd by rack_init() */
+ KASSERT(rack->rc_inp != NULL,
+ ("%s: rack->rc_inp unexpectedly NULL", __func__));
+ if (rack->rc_inp == NULL) {
+ rack->rc_inp = tp->t_inpcb;
+ }
+
+ /*
+ * Process options only when we get SYN/ACK back. The SYN
+ * case for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a
+ * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
+ * this is traditional behavior, may need to be cleaned up.
+ */
+ rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with the
+ * next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = cts;
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ }
+ /*
+ * At this point we are at the initial call. Here we decide
+ * if we are doing RACK or not. We do this by seeing if
+ * TF_SACK_PERMIT is set, if not rack is *not* possible and
+ * we switch to the default code.
+ */
+ if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
+ tcp_switch_back_to_default(tp);
+ (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
+ tlen, iptos, ti_locked);
+ return;
+ }
+ /* Set the flag */
+ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+ tcp_set_hpts(tp->t_inpcb);
+ rack_stop_all_timers(tp);
+ sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
+ }
+ /*
+ * This is the one exception case where we set the rack state
+ * always. All other times (timers etc) we must have a rack-state
+ * set (so we assure we have done the checks above for SACK).
+ */
+ if (rack->r_state != tp->t_state)
+ rack_set_state(tp, rack);
+ if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
+ kern_prefetch(rsm, &prev_state);
+ prev_state = rack->r_state;
+ rack->r_ctl.rc_tlp_send_cnt = 0;
+ rack_clear_rate_sample(rack);
+ retval = (*rack->r_substate) (m, th, so,
+ tp, &to, drop_hdrlen,
+ tlen, &ti_locked, tiwin, thflags, nxt_pkt);
+#ifdef INVARIANTS
+ if ((retval == 0) &&
+ (tp->t_inpcb == NULL)) {
+ panic("retval:%d tp:%p t_inpcb:NULL state:%d",
+ retval, tp, prev_state);
+ }
+#endif
+ if (ti_locked != TI_UNLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
+ if (retval == 0) {
+ /*
+ * If retval is 1 the tcb is unlocked and most likely the tp
+ * is gone.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tcp_rack_xmit_timer_commit(rack, tp);
+ if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
+ (rack->rc_in_persist == 0)){
+ /*
+ * The peer shrunk its window on us to the point
+ * where we have sent too much. The only thing
+ * we can do here is stop any timers and
+ * enter persist. We most likely lost the last
+ * bytes we sent but oh well, we will have to
+ * retransmit them after the peer is caught up.
+ */
+ if (rack->rc_inp->inp_in_hpts)
+ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ rack_timer_cancel(tp, rack, cts, __LINE__);
+ rack_enter_persist(tp, rack, cts);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+ way_out = 3;
+ goto done_with_input;
+ }
+ if (nxt_pkt == 0) {
+ if (rack->r_wanted_output != 0) {
+ did_out = 1;
+ (void)tp->t_fb->tfb_tcp_output(tp);
+ }
+ rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
+ }
+ if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
+ (SEQ_GT(tp->snd_max, tp->snd_una) ||
+ (tp->t_flags & TF_DELACK) ||
+ ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)))) {
+ /* We could not send (probably in the hpts but stopped the timer earlier)? */
+ if ((tp->snd_max == tp->snd_una) &&
+ ((tp->t_flags & TF_DELACK) == 0) &&
+ (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
+ /* keep alive not needed while we are still doing hptsi output */
+ ;
+ } else {
+ if (rack->rc_inp->inp_in_hpts)
+ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+ }
+ way_out = 1;
+ } else {
+ /* Do we have the correct timer running? */
+ rack_timer_audit(tp, rack, &so->so_snd);
+ way_out = 2;
+ }
+ done_with_input:
+ rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
+ if (did_out)
+ rack->r_wanted_output = 0;
+#ifdef INVARIANTS
+ if (tp->t_inpcb == NULL) {
+ panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
+ did_out,
+ retval, tp, prev_state);
+ }
+#endif
+ INP_WUNLOCK(tp->t_inpcb);
+ }
+}
+
+void
+rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
+ int32_t ti_locked)
+{
+ struct timeval tv;
+#ifdef RSS
+ struct tcp_function_block *tfb;
+ struct tcp_rack *rack;
+ struct inpcb *inp;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->r_state == 0) {
+ /*
+ * Initial input (ACK to SYN-ACK etc). Let's go ahead and get
+ * it processed.
+ */
+ if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo))
+ ti_locked = TI_RLOCKED;
+ if (ti_locked != TI_RLOCKED) {
+ inp = tp->t_inpcb;
+ tfb = tp->t_fb;
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp))
+ inp = NULL;
+ if (inp == NULL || (inp->inp_flags2 & INP_FREED) ||
+ (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
+ /* The TCPCB went away. Free the packet. */
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (inp)
+ INP_WUNLOCK(inp);
+ m_freem(m);
+ return;
+ }
+ /* If the stack changed, call the correct stack. */
+ if (tp->t_fb != tfb) {
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp,
+ drop_hdrlen, tlen, iptos, ti_locked);
+ return;
+ }
+ }
+ tcp_get_usecs(&tv);
+ rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
+ tlen, iptos, ti_locked, 0, &tv);
+ return;
+ }
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked);
+ INP_WUNLOCK(tp->t_inpcb);
+#else
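+ /* Without RSS, process the segment immediately in this context. */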
+ tcp_get_usecs(&tv);
+ rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
+ tlen, iptos, ti_locked, 0, &tv);
+#endif
+}
+
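+/*
+ * Pick the next segment eligible for a rack retransmission: the lowest
+ * outstanding entry that has been passed over by a SACK and whose last
+ * transmission is at least the rack reorder threshold in the past.
+ */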
+struct rack_sendmap *
+tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
+{
+ struct rack_sendmap *rsm = NULL;
+ int32_t idx;
+ uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
+
+ /* Return the next guy to be re-transmitted */
+ if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
+ return (NULL);
+ }
+ if (tp->t_flags & TF_SENTFIN) {
+ /* retran the end FIN? */
+ return (NULL);
+ }
+ /* ok lets look at this one */
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
+ goto check_it;
+ }
+ rsm = rack_find_lowest_rsm(rack);
+ if (rsm == NULL) {
+ return (NULL);
+ }
+check_it:
+ srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
+ srtt = TICKS_2_MSEC(srtt_cur);
+ if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
+ srtt = rack->rc_rack_rtt;
+ if (rsm->r_flags & RACK_ACKED) {
+ return (NULL);
+ }
+ if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
+ /* Its not yet ready */
+ return (NULL);
+ }
+ idx = rsm->r_rtr_cnt - 1;
+ ts_low = rsm->r_tim_lastsent[idx];
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ if (tsused <= ts_low) {
+ return (NULL);
+ }
+ if ((tsused - ts_low) >= thresh) {
+ return (rsm);
+ }
+ return (NULL);
+}
+
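+/*
+ * The rack transmit path. Honor any pacing/timer state kept on the
+ * hpts, decide what (if anything) should go out now, and build and
+ * send the segment.
+ */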
+static int
+rack_output(struct tcpcb *tp)
+{
+ struct socket *so;
+ uint32_t recwin, sendwin;
+ uint32_t sb_offset;
+ int32_t len, flags, error = 0;
+ struct mbuf *m;
+ struct mbuf *mb;
+ uint32_t if_hw_tsomaxsegcount = 0;
+ uint32_t if_hw_tsomaxsegsize;
+ long tot_len_this_send = 0;
+ struct ip *ip = NULL;
+#ifdef TCPDEBUG
+ struct ipovly *ipov = NULL;
+#endif
+ struct udphdr *udp = NULL;
+ struct tcp_rack *rack;
+ struct tcphdr *th;
+ uint8_t pass = 0;
+ u_char opt[TCP_MAXOLEN];
+ unsigned ipoptlen, optlen, hdrlen, ulen=0;
+ uint32_t rack_seq;
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ unsigned ipsec_optlen = 0;
+
+#endif
+ int32_t idle, sendalot;
+ int32_t sub_from_prr = 0;
+ volatile int32_t sack_rxmit;
+ struct rack_sendmap *rsm = NULL;
+ int32_t tso, mtu, would_have_fin = 0;
+ struct tcpopt to;
+ int32_t slot = 0;
+ uint32_t cts;
+ uint8_t hpts_calling, doing_tlp = 0;
+ int32_t do_a_prefetch;
+ int32_t prefetch_rsm = 0;
+ int32_t prefetch_so_done = 0;
+ struct tcp_log_buffer *lgb = NULL;
+ struct inpcb *inp;
+ struct sockbuf *sb;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int32_t isipv6;
+#endif
+ /* setup and take the cache hits here */
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ inp = rack->rc_inp;
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ kern_prefetch(sb, &do_a_prefetch);
+ do_a_prefetch = 1;
+
+ INP_WLOCK_ASSERT(inp);
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE)
+ return (tcp_offload_output(tp));
+#endif
+
+#ifdef TCP_RFC7413
+ /*
+ * For TFO connections in SYN_RECEIVED, only allow the initial
+ * SYN|ACK and those sent by the retransmit timer.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
+	    SEQ_GT(tp->snd_max, tp->snd_una) &&	/* initial SYN|ACK sent */
+ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
+ return (0);
+#endif
+#ifdef INET6
+ if (rack->r_state) {
+ /* Use the cache line loaded if possible */
+ isipv6 = rack->r_is_v6;
+ } else {
+ isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+ }
+#endif
+ cts = tcp_ts_getticks();
+ if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
+ inp->inp_in_hpts) {
+ /*
+ * We are on the hpts for some timer but not hptsi output.
+ * Remove from the hpts unconditionally.
+ */
+ rack_timer_cancel(tp, rack, cts, __LINE__);
+ }
+ /* Mark that we have called rack_output(). */
+ if ((rack->r_timer_override) ||
+ (tp->t_flags & TF_FORCEDATA) ||
+ (tp->t_state < TCPS_ESTABLISHED)) {
+ if (tp->t_inpcb->inp_in_hpts)
+ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
+ } else if (tp->t_inpcb->inp_in_hpts) {
+ /*
+		 * While on the hpts we can't send even if ACKNOW is on; we
+		 * will when the hpts fires.
+ */
+ counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
+ return (0);
+ }
+ hpts_calling = inp->inp_hpts_calls;
+ inp->inp_hpts_calls = 0;
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ if (rack_process_timers(tp, rack, cts, hpts_calling)) {
+ counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
+ return (0);
+ }
+ }
+ rack->r_wanted_output = 0;
+ rack->r_timer_override = 0;
+ /*
+ * Determine length of data that should be transmitted, and flags
+ * that will be used. If there is some data or critical controls
+ * (SYN, RST) to send, then transmit; otherwise, investigate
+ * further.
+ */
+ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
+#ifdef NETFLIX_CWV
+ if (tp->cwv_enabled) {
+ if ((tp->cwv_cwnd_valid == 0) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
+ tcp_newcwv_nvp_closedown(tp);
+ } else
+#endif
+ if (tp->t_idle_reduce) {
+ if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
+ rack_cc_after_idle(tp,
+ (rack->r_idle_reduce_largest ? 1 :0));
+ }
+ tp->t_flags &= ~TF_LASTIDLE;
+ if (idle) {
+ if (tp->t_flags & TF_MORETOCOME) {
+ tp->t_flags |= TF_LASTIDLE;
+ idle = 0;
+ }
+ }
+again:
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+ * snd_nxt. There may be SACK information that allows us to avoid
+ * resending already delivered data. Adjust snd_nxt accordingly.
+ */
+ sendalot = 0;
+ cts = tcp_ts_getticks();
+ tso = 0;
+ mtu = 0;
+ sb_offset = tp->snd_max - tp->snd_una;
+ sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+
+ flags = tcp_outflags[tp->t_state];
+ /*
+ * Send any SACK-generated retransmissions. If we're explicitly
+ * trying to send out new data (when sendalot is 1), bypass this
+ * function. If we retransmit in fast recovery mode, decrement
+ * snd_cwnd, since we're replacing a (future) new transmission with
+ * a retransmission now, and we previously incremented snd_cwnd in
+ * tcp_input().
+ */
+	/*
+	 * Still in SACK recovery, reset rxmit flag to zero.
+	 */
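+	/*
+	 * Top up the per-connection cache of free sendmap entries before we
+	 * commit to sending; if an allocation fails, give up on this send
+	 * attempt (retrying in a millisecond when called from the hpts).
+	 */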
+ while (rack->rc_free_cnt < rack_free_cache) {
+ rsm = rack_alloc(rack);
+ if (rsm == NULL) {
+ if (inp->inp_hpts_calls)
+ /* Retry in a ms */
+ slot = 1;
+ goto just_return_nolock;
+ }
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+ rack->rc_free_cnt++;
+ rsm = NULL;
+ }
+ if (inp->inp_hpts_calls)
+ inp->inp_hpts_calls = 0;
+ sack_rxmit = 0;
+ len = 0;
+ rsm = NULL;
+ if (flags & TH_RST) {
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+ if (rack->r_ctl.rc_tlpsend) {
+ /* Tail loss probe */
+ long cwin;
+ long tlen;
+
+ doing_tlp = 1;
+ rsm = rack->r_ctl.rc_tlpsend;
+ rack->r_ctl.rc_tlpsend = NULL;
+ sack_rxmit = 1;
+ tlen = rsm->r_end - rsm->r_start;
+ if (tlen > tp->t_maxseg)
+ tlen = tp->t_maxseg;
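+		/*
+		 * A TLP probe retransmits at most one MSS; len is further
+		 * clamped to the send window below.
+		 */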
+#ifdef INVARIANTS
+ if (SEQ_GT(tp->snd_una, rsm->r_start)) {
+ panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
+ tp, rack, tp->snd_una, rsm, rsm->r_start);
+ }
+#endif
+ sb_offset = rsm->r_start - tp->snd_una;
+ cwin = min(tp->snd_wnd, tlen);
+ len = cwin;
+ } else if (rack->r_ctl.rc_resend) {
+ /* Retransmit timer */
+ rsm = rack->r_ctl.rc_resend;
+ rack->r_ctl.rc_resend = NULL;
+ len = rsm->r_end - rsm->r_start;
+ sack_rxmit = 1;
+ sendalot = 0;
+ sb_offset = rsm->r_start - tp->snd_una;
+ if (len >= tp->t_maxseg) {
+ len = tp->t_maxseg;
+ }
+ KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+ __func__, sb_offset));
+ } else if ((rack->rc_in_persist == 0) &&
+ ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
+ long tlen;
+
+ if ((!IN_RECOVERY(tp->t_flags)) &&
+ ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
+ /* Enter recovery if not induced by a time-out */
+ rack->r_ctl.rc_rsm_start = rsm->r_start;
+ rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
+ rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
+ rack_cong_signal(tp, NULL, CC_NDUPACK);
+ /*
+			 * When we enter recovery we need to ensure we send
+ * one packet.
+ */
+ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ }
+#ifdef INVARIANTS
+ if (SEQ_LT(rsm->r_start, tp->snd_una)) {
+ panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
+ tp, rack, rsm, rsm->r_start, tp->snd_una);
+ }
+#endif
+ tlen = rsm->r_end - rsm->r_start;
+ sb_offset = rsm->r_start - tp->snd_una;
+ if (tlen > rack->r_ctl.rc_prr_sndcnt) {
+ len = rack->r_ctl.rc_prr_sndcnt;
+ } else {
+ len = tlen;
+ }
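+		/*
+		 * PRR limits how much we may retransmit here; whatever is
+		 * actually sent is subtracted from rc_prr_sndcnt after a
+		 * successful send (via sub_from_prr).
+		 */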
+ if (len >= tp->t_maxseg) {
+ sendalot = 1;
+ len = tp->t_maxseg;
+ } else {
+ sendalot = 0;
+ if ((rack->rc_timer_up == 0) &&
+ (len < tlen)) {
+ /*
+				 * If it's not a timer, don't send a partial
+ * segment.
+ */
+ len = 0;
+ goto just_return_nolock;
+ }
+ }
+ KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+ __func__, sb_offset));
+ if (len > 0) {
+ sub_from_prr = 1;
+ sack_rxmit = 1;
+ TCPSTAT_INC(tcps_sack_rexmits);
+ TCPSTAT_ADD(tcps_sack_rexmit_bytes,
+ min(len, tp->t_maxseg));
+ counter_u64_add(rack_rtm_prr_retran, 1);
+ }
+ }
+ if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
+ /* we are retransmitting the fin */
+ len--;
+ if (len) {
+ /*
+ * When retransmitting data do *not* include the
+ * FIN. This could happen from a TLP probe.
+ */
+ flags &= ~TH_FIN;
+ }
+ }
+#ifdef INVARIANTS
+ /* For debugging */
+ rack->r_ctl.rc_rsm_at_retran = rsm;
+#endif
+ /*
+ * Get standard flags, and add SYN or FIN if requested by 'hidden'
+ * state flags.
+ */
+ if (tp->t_flags & TF_NEEDFIN)
+ flags |= TH_FIN;
+ if (tp->t_flags & TF_NEEDSYN)
+ flags |= TH_SYN;
+ if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
+ void *end_rsm;
+ end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
+ if (end_rsm)
+ kern_prefetch(end_rsm, &prefetch_rsm);
+ prefetch_rsm = 1;
+ }
+ SOCKBUF_LOCK(sb);
+ /*
+	 * If in persist timeout with window of 0, send 1 byte. Otherwise,
+	 * if the window is small but nonzero and the timer has expired, we
+	 * will send what we can and go to transmit state.
+ */
+ if (tp->t_flags & TF_FORCEDATA) {
+ if (sendwin == 0) {
+ /*
+ * If we still have some data to send, then clear
+ * the FIN bit. Usually this would happen below
+ * when it realizes that we aren't sending all the
+ * data. However, if we have exactly 1 byte of
+ * unsent data, then it won't clear the FIN bit
+ * below, and if we are in persist state, we wind up
+ * sending the packet without recording that we sent
+ * the FIN bit.
+ *
+ * We can't just blindly clear the FIN bit, because
+ * if we don't have any more data to send then the
+ * probe will be the FIN itself.
+ */
+ if (sb_offset < sbused(sb))
+ flags &= ~TH_FIN;
+ sendwin = 1;
+ } else {
+ if (rack->rc_in_persist)
+ rack_exit_persist(tp, rack);
+ /*
+ * If we are dropping persist mode then we need to
+ * correct snd_nxt/snd_max and off.
+ */
+ tp->snd_nxt = tp->snd_max;
+ sb_offset = tp->snd_nxt - tp->snd_una;
+ }
+ }
+ /*
+ * If snd_nxt == snd_max and we have transmitted a FIN, the
+ * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
+ * negative length. This can also occur when TCP opens up its
+ * congestion window while receiving additional duplicate acks after
+ * fast-retransmit because TCP will reset snd_nxt to snd_max after
+ * the fast-retransmit.
+ *
+ * In the normal retransmit-FIN-only case, however, snd_nxt will be
+ * set to snd_una, the sb_offset will be 0, and the length may wind
+ * up 0.
+ *
+ * If sack_rxmit is true we are retransmitting from the scoreboard
+ * in which case len is already set.
+ */
+ if (sack_rxmit == 0) {
+ uint32_t avail;
+
+ avail = sbavail(sb);
+ if (SEQ_GT(tp->snd_nxt, tp->snd_una))
+ sb_offset = tp->snd_nxt - tp->snd_una;
+ else
+ sb_offset = 0;
+ if (IN_RECOVERY(tp->t_flags) == 0) {
+ if (rack->r_ctl.rc_tlp_new_data) {
+ /* TLP is forcing out new data */
+ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
+ rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
+ }
+ if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
+ len = tp->snd_wnd;
+ else
+ len = rack->r_ctl.rc_tlp_new_data;
+ rack->r_ctl.rc_tlp_new_data = 0;
+ doing_tlp = 1;
+ } else {
+ if (sendwin > avail) {
+ /* use the available */
+ if (avail > sb_offset) {
+ len = (int32_t)(avail - sb_offset);
+ } else {
+ len = 0;
+ }
+ } else {
+ if (sendwin > sb_offset) {
+ len = (int32_t)(sendwin - sb_offset);
+ } else {
+ len = 0;
+ }
+ }
+ }
+ } else {
+ uint32_t outstanding;
+
+ /*
+ * We are inside of a SACK recovery episode and are
+ * sending new data, having retransmitted all the
+ * data possible so far in the scoreboard.
+ */
+ outstanding = tp->snd_max - tp->snd_una;
+ if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
+ len = 0;
+ else if (avail > sb_offset)
+ len = avail - sb_offset;
+ else
+ len = 0;
+ if (len > 0) {
+ if (len > rack->r_ctl.rc_prr_sndcnt)
+ len = rack->r_ctl.rc_prr_sndcnt;
+
+ if (len > 0) {
+ sub_from_prr = 1;
+ counter_u64_add(rack_rtm_prr_newdata, 1);
+ }
+ }
+ if (len > tp->t_maxseg) {
+ /*
+				 * We should never send more than one MSS when
+				 * retransmitting or sending new data in prr
+				 * mode unless the override flag is on. Most
+				 * likely the PRR algorithm is not going to
+				 * let us send much anyway :-)
+ */
+ if (rack->r_ctl.rc_prr_sendalot == 0)
+ len = tp->t_maxseg;
+ } else if (len < tp->t_maxseg) {
+ /*
+				 * Do we send any? The idea here is that if
+				 * the send empties the socket buffer we want
+				 * to do it. However if not, let's just wait
+				 * for our prr_sndcnt to get bigger.
+ */
+ long leftinsb;
+
+ leftinsb = sbavail(sb) - sb_offset;
+ if (leftinsb > len) {
+ /* This send does not empty the sb */
+ len = 0;
+ }
+ }
+ }
+ }
+ if (prefetch_so_done == 0) {
+ kern_prefetch(so, &prefetch_so_done);
+ prefetch_so_done = 1;
+ }
+ /*
+ * Lop off SYN bit if it has already been sent. However, if this is
+ * SYN-SENT state and if segment contains data and if we don't know
+ * that foreign host supports TAO, suppress sending segment.
+ */
+ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
+ if ((tp->t_state != TCPS_SYN_RECEIVED) &&
+ (tp->t_state != TCPS_SYN_SENT))
+ flags &= ~TH_SYN;
+#ifdef TCP_RFC7413
+ /*
+ * When sending additional segments following a TFO SYN|ACK,
+ * do not include the SYN bit.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ flags &= ~TH_SYN;
+#endif
+ sb_offset--, len++;
+ if (sbavail(sb) == 0)
+ len = 0;
+ }
+ /*
+ * Be careful not to send data and/or FIN on SYN segments. This
+ * measure is needed to prevent interoperability problems with not
+ * fully conformant TCP implementations.
+ */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
+ len = 0;
+ flags &= ~TH_FIN;
+ }
+#ifdef TCP_RFC7413
+ /*
+ * When retransmitting SYN|ACK on a passively-created TFO socket,
+ * don't include data, as the presence of data may have caused the
+ * original SYN|ACK to have been dropped by a middlebox.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)))
+ len = 0;
+#endif
+ if (len <= 0) {
+ /*
+ * If FIN has been sent but not acked, but we haven't been
+ * called to retransmit, len will be < 0. Otherwise, window
+ * shrank after we sent into it. If window shrank to 0,
+ * cancel pending retransmit, pull snd_nxt back to (closed)
+ * window, and set the persist timer if it isn't already
+ * going. If the window didn't close completely, just wait
+ * for an ACK.
+ *
+ * We also do a general check here to ensure that we will
+ * set the persist timer when we have data to send, but a
+ * 0-byte window. This makes sure the persist timer is set
+ * even if the packet hits one of the "goto send" lines
+ * below.
+ */
+ len = 0;
+ if ((tp->snd_wnd == 0) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ (sb_offset < (int)sbavail(sb))) {
+ tp->snd_nxt = tp->snd_una;
+ rack_enter_persist(tp, rack, cts);
+ }
+ }
+ /* len will be >= 0 after this point. */
+ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
+ tcp_sndbuf_autoscale(tp, so, sendwin);
+ /*
+ * Decide if we can use TCP Segmentation Offloading (if supported by
+ * hardware).
+ *
+ * TSO may only be used if we are in a pure bulk sending state. The
+	 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
+ * options prevent using TSO. With TSO the TCP header is the same
+ * (except for the sequence number) for all generated packets. This
+ * makes it impossible to transmit any options which vary per
+ * generated segment or packet.
+ *
+ * IPv4 handling has a clear separation of ip options and ip header
+ * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
+ * the right thing below to provide length of just ip options and thus
+ * checking for ipoptlen is enough to decide if ip options are present.
+ */
+
+#ifdef INET6
+ if (isipv6)
+ ipoptlen = ip6_optlen(tp->t_inpcb);
+ else
+#endif
+ if (tp->t_inpcb->inp_options)
+ ipoptlen = tp->t_inpcb->inp_options->m_len -
+ offsetof(struct ipoption, ipopt_list);
+ else
+ ipoptlen = 0;
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ /*
+ * Pre-calculate here as we save another lookup into the darknesses
+ * of IPsec that way and can actually decide if TSO is ok.
+ */
+#ifdef INET6
+ if (isipv6 && IPSEC_ENABLED(ipv6))
+ ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
+#ifdef INET
+ else
+#endif
+#endif /* INET6 */
+#ifdef INET
+ if (IPSEC_ENABLED(ipv4))
+ ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
+#endif /* INET */
+#endif
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ ipoptlen += ipsec_optlen;
+#endif
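+	/*
+	 * TSO is only used when it is enabled, there is more than one MSS
+	 * of data to send, no UDP tunneling port is in use, no MD5
+	 * signature is required, there are no SACK blocks to advertise,
+	 * this is not a SACK retransmission, and no IP options are present.
+	 */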
+ if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+ (tp->t_port == 0) &&
+ ((tp->t_flags & TF_SIGNATURE) == 0) &&
+ tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
+ ipoptlen == 0)
+ tso = 1;
+ {
+ uint32_t outstanding;
+
+ outstanding = tp->snd_max - tp->snd_una;
+ if (tp->t_flags & TF_SENTFIN) {
+ /*
+ * If we sent a fin, snd_max is 1 higher than
+ * snd_una
+ */
+ outstanding--;
+ }
+ if (outstanding > 0) {
+ /*
+			 * This is sub-optimal. We only send a standalone
+ * FIN on its own segment.
+ */
+ if (flags & TH_FIN) {
+ flags &= ~TH_FIN;
+ would_have_fin = 1;
+ }
+ } else if (sack_rxmit) {
+ if ((rsm->r_flags & RACK_HAS_FIN) == 0)
+ flags &= ~TH_FIN;
+ } else {
+ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
+ sbused(sb)))
+ flags &= ~TH_FIN;
+ }
+ }
+ recwin = sbspace(&so->so_rcv);
+
+	/*
+	 * Sender silly window avoidance. We transmit under the following
+	 * conditions when len is non-zero:
+	 *
+	 * - We have a full segment (or more with TSO)
+	 * - This is the last buffer in a write()/send() and we are either
+	 *   idle or running NODELAY
+	 * - We've timed out (e.g. persist timer)
+	 * - We have more than 1/2 the maximum send window's worth of data
+	 *   (receiver may be limiting the window size)
+	 * - We need to retransmit
+	 */
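+	/*
+	 * The 'pass' value notes which of the tests below triggered the
+	 * send; it is handed to rack_log_output() for the send log record.
+	 */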
+ if (len) {
+ if (len >= tp->t_maxseg) {
+ pass = 1;
+ goto send;
+ }
+ /*
+ * NOTE! on localhost connections an 'ack' from the remote
+ * end may occur synchronously with the output and cause us
+ * to flush a buffer queued with moretocome. XXX
+ *
+ */
+ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
+ (idle || (tp->t_flags & TF_NODELAY)) &&
+ ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
+ (tp->t_flags & TF_NOPUSH) == 0) {
+ pass = 2;
+ goto send;
+ }
+ if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */
+ pass = 3;
+ goto send;
+ }
+ if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */
+ goto send;
+ }
+ if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
+ pass = 4;
+ goto send;
+ }
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
+ pass = 5;
+ goto send;
+ }
+ if (sack_rxmit) {
+ pass = 6;
+ goto send;
+ }
+ }
+ /*
+ * Sending of standalone window updates.
+ *
+ * Window updates are important when we close our window due to a
+ * full socket buffer and are opening it again after the application
+ * reads data from it. Once the window has opened again and the
+ * remote end starts to send again the ACK clock takes over and
+ * provides the most current window information.
+ *
+	 * We must avoid the silly window syndrome whereby every read from
+ * the receive buffer, no matter how small, causes a window update
+ * to be sent. We also should avoid sending a flurry of window
+ * updates when the socket buffer had queued a lot of data and the
+ * application is doing small reads.
+ *
+ * Prevent a flurry of pointless window updates by only sending an
+	 * update when we can increase the advertised window by more than
+ * 1/4th of the socket buffer capacity. When the buffer is getting
+ * full or is very small be more aggressive and send an update
+ * whenever we can increase by two mss sized segments. In all other
+ * situations the ACK's to new incoming data will carry further
+ * window increases.
+ *
+ * Don't send an independent window update if a delayed ACK is
+ * pending (it will get piggy-backed on it) or the remote side
+ * already has done a half-close and won't send more data. Skip
+ * this if the connection is in T/TCP half-open state.
+ */
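+	/*
+	 * For example, with a 64kB receive buffer and a 1460 byte MSS, a
+	 * window update goes out once the window can open by at least two
+	 * segments and either by 16kB (a quarter of the buffer) or while
+	 * no more than 8kB of receive space remains.
+	 */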
+ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
+ !(tp->t_flags & TF_DELACK) &&
+ !TCPS_HAVERCVDFIN(tp->t_state)) {
+ /*
+ * "adv" is the amount we could increase the window, taking
+ * into account that we are limited by TCP_MAXWIN <<
+ * tp->rcv_scale.
+ */
+ int32_t adv;
+ int oldwin;
+
+ adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
+ oldwin = (tp->rcv_adv - tp->rcv_nxt);
+ adv -= oldwin;
+ } else
+ oldwin = 0;
+
+ /*
+ * If the new window size ends up being the same as the old
+ * size when it is scaled, then don't force a window update.
+ */
+ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
+ goto dontupdate;
+
+ if (adv >= (int32_t)(2 * tp->t_maxseg) &&
+ (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
+ recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
+ so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
+ pass = 7;
+ goto send;
+ }
+ if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
+ goto send;
+ }
+dontupdate:
+
+ /*
+ * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
+ * is also a catch-all for the retransmit timer timeout case.
+ */
+ if (tp->t_flags & TF_ACKNOW) {
+ pass = 8;
+ goto send;
+ }
+ if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
+ pass = 9;
+ goto send;
+ }
+ if (SEQ_GT(tp->snd_up, tp->snd_una)) {
+ pass = 10;
+ goto send;
+ }
+ /*
+ * If our state indicates that FIN should be sent and we have not
+ * yet done so, then we need to send.
+ */
+ if (flags & TH_FIN) {
+ if ((tp->t_flags & TF_SENTFIN) ||
+ (((tp->t_flags & TF_SENTFIN) == 0) &&
+ (tp->snd_nxt == tp->snd_una))) {
+ pass = 11;
+ goto send;
+ }
+ }
+ /*
+ * No reason to send a segment, just return.
+ */
+just_return:
+ SOCKBUF_UNLOCK(sb);
+just_return_nolock:
+ if (tot_len_this_send == 0)
+ counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
+ rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
+ rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
+ tp->t_flags &= ~TF_FORCEDATA;
+ return (0);
+
+send:
+ if (doing_tlp == 0) {
+ /*
+		 * Data not a TLP, and it's not the rxt firing. If it is the
+ * rxt firing, we want to leave the tlp_in_progress flag on
+ * so we don't send another TLP. It has to be a rack timer
+ * or normal send (response to acked data) to clear the tlp
+ * in progress flag.
+ */
+ rack->rc_tlp_in_progress = 0;
+ }
+ SOCKBUF_LOCK_ASSERT(sb);
+ if (len > 0) {
+ if (len >= tp->t_maxseg)
+ tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
+ }
+ /*
+ * Before ESTABLISHED, force sending of initial options unless TCP
+ * set not to do any options. NOTE: we assume that the IP/TCP header
+ * plus TCP options always fit in a single mbuf, leaving room for a
+ * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
+ * + optlen <= MCLBYTES
+ */
+ optlen = 0;
+#ifdef INET6
+ if (isipv6)
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ else
+#endif
+ hdrlen = sizeof(struct tcpiphdr);
+
+ /*
+ * Compute options for segment. We only have to care about SYN and
+ * established connection segments. Options for SYN-ACK segments
+ * are handled in TCP syncache.
+ */
+ to.to_flags = 0;
+ if ((tp->t_flags & TF_NOOPT) == 0) {
+ /* Maximum segment size. */
+ if (flags & TH_SYN) {
+ tp->snd_nxt = tp->iss;
+ to.to_mss = tcp_mssopt(&inp->inp_inc);
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port)
+ to.to_mss -= V_tcp_udp_tunneling_overhead;
+#endif
+ to.to_flags |= TOF_MSS;
+#ifdef TCP_RFC7413
+ /*
+ * Only include the TFO option on the first
+ * transmission of the SYN|ACK on a
+ * passively-created TFO socket, as the presence of
+ * the TFO option may have caused the original
+ * SYN|ACK to have been dropped by a middlebox.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
+ (tp->t_rxtshift == 0)) {
+ to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN;
+ to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
+ to.to_flags |= TOF_FASTOPEN;
+ }
+#endif
+ }
+ /* Window scaling. */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
+ to.to_wscale = tp->request_r_scale;
+ to.to_flags |= TOF_SCALE;
+ }
+ /* Timestamps. */
+ if ((tp->t_flags & TF_RCVD_TSTMP) ||
+ ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
+ to.to_tsval = cts + tp->ts_offset;
+ to.to_tsecr = tp->ts_recent;
+ to.to_flags |= TOF_TS;
+ }
+ /* Set receive buffer autosizing timestamp. */
+ if (tp->rfbuf_ts == 0 &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE))
+ tp->rfbuf_ts = tcp_ts_getticks();
+ /* Selective ACK's. */
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_SACKPERM;
+ else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ tp->rcv_numsacks > 0) {
+ to.to_flags |= TOF_SACK;
+ to.to_nsacks = tp->rcv_numsacks;
+ to.to_sacks = (u_char *)tp->sackblks;
+ }
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ /* TCP-MD5 (RFC2385). */
+ if (tp->t_flags & TF_SIGNATURE)
+ to.to_flags |= TOF_SIGNATURE;
+#endif /* TCP_SIGNATURE */
+
+ /* Processing the options. */
+ hdrlen += optlen = tcp_addoptions(&to, opt);
+ }
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ if (V_tcp_udp_tunneling_port == 0) {
+ /* The port was removed?? */
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (EHOSTUNREACH);
+ }
+ hdrlen += sizeof(struct udphdr);
+ }
+#endif
+ ipoptlen = 0;
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ ipoptlen += ipsec_optlen;
+#endif
+
+ /*
+ * Adjust data length if insertion of options will bump the packet
+ * length beyond the t_maxseg length. Clear the FIN bit because we
+ * cut off the tail of the segment.
+ */
+ if (len + optlen + ipoptlen > tp->t_maxseg) {
+ if (flags & TH_FIN) {
+ would_have_fin = 1;
+ flags &= ~TH_FIN;
+ }
+ if (tso) {
+ uint32_t if_hw_tsomax;
+ uint32_t moff;
+ int32_t max_len;
+
+ /* extract TSO information */
+ if_hw_tsomax = tp->t_tsomax;
+ if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
+ if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
+ KASSERT(ipoptlen == 0,
+ ("%s: TSO can't do IP options", __func__));
+
+ /*
+ * Check if we should limit by maximum payload
+ * length:
+ */
+ if (if_hw_tsomax != 0) {
+ /* compute maximum TSO length */
+ max_len = (if_hw_tsomax - hdrlen -
+ max_linkhdr);
+ if (max_len <= 0) {
+ len = 0;
+ } else if (len > max_len) {
+ sendalot = 1;
+ len = max_len;
+ }
+ }
+ /*
+ * Prevent the last segment from being fractional
+ * unless the send sockbuf can be emptied:
+ */
+ max_len = (tp->t_maxseg - optlen);
+ if ((sb_offset + len) < sbavail(sb)) {
+ moff = len % (u_int)max_len;
+ if (moff != 0) {
+ len -= moff;
+ sendalot = 1;
+ }
+ }
+ /*
+ * In case there are too many small fragments don't
+ * use TSO:
+ */
+ if (len <= max_len) {
+ len = max_len;
+ sendalot = 1;
+ tso = 0;
+ }
+ /*
+ * Send the FIN in a separate segment after the bulk
+ * sending is done. We don't trust the TSO
+ * implementations to clear the FIN flag on all but
+ * the last segment.
+ */
+ if (tp->t_flags & TF_NEEDFIN)
+ sendalot = 1;
+
+ } else {
+ len = tp->t_maxseg - optlen - ipoptlen;
+ sendalot = 1;
+ }
+ } else
+ tso = 0;
+ KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
+ ("%s: len > IP_MAXPACKET", __func__));
+#ifdef DIAGNOSTIC
+#ifdef INET6
+ if (max_linkhdr + hdrlen > MCLBYTES)
+#else
+ if (max_linkhdr + hdrlen > MHLEN)
+#endif
+ panic("tcphdr too big");
+#endif
+
+ /*
+ * This KASSERT is here to catch edge cases at a well defined place.
+ * Before, those had triggered (random) panic conditions further
+ * down.
+ */
+ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
+ if ((len == 0) &&
+ (flags & TH_FIN) &&
+ (sbused(sb))) {
+ /*
+		 * We have outstanding data; don't send a FIN by itself.
+ */
+ goto just_return;
+ }
+ /*
+ * Grab a header mbuf, attaching a copy of data to be transmitted,
+ * and initialize the header from the template for sends on this
+ * connection.
+ */
+ if (len) {
+ uint32_t max_val;
+ uint32_t moff;
+
+ if (rack->rc_pace_max_segs)
+ max_val = rack->rc_pace_max_segs * tp->t_maxseg;
+ else
+ max_val = len;
+ /*
+ * We allow a limit on sending with hptsi.
+ */
+ if (len > max_val) {
+ len = max_val;
+ }
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr)
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ else
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (m == NULL) {
+ SOCKBUF_UNLOCK(sb);
+ error = ENOBUFS;
+ sack_rxmit = 0;
+ goto out;
+ }
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+
+ /*
+ * Start the m_copy functions from the closest mbuf to the
+ * sb_offset in the socket buffer chain.
+ */
+ mb = sbsndptr_noadv(sb, sb_offset, &moff);
+ if (len <= MHLEN - hdrlen - max_linkhdr) {
+ m_copydata(mb, moff, (int)len,
+ mtod(m, caddr_t)+hdrlen);
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ sbsndptr_adv(sb, mb, len);
+ m->m_len += len;
+ } else {
+ struct sockbuf *msb;
+
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ msb = NULL;
+ else
+ msb = sb;
+ m->m_next = tcp_m_copym(mb, moff, &len,
+ if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
+ if (len <= (tp->t_maxseg - optlen)) {
+ /*
+				 * We must have run out of mbufs for the copy;
+				 * shorten it so it no longer needs TSO. Let's
+				 * not set sendalot since we are low on
+				 * mbufs.
+ */
+ tso = 0;
+ }
+ if (m->m_next == NULL) {
+ SOCKBUF_UNLOCK(sb);
+ (void)m_free(m);
+ error = ENOBUFS;
+ sack_rxmit = 0;
+ goto out;
+ }
+ }
+ if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
+ TCPSTAT_INC(tcps_sndprobe);
+#ifdef NETFLIX_STATS
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ stats_voi_update_abs_u32(tp->t_stats,
+ VOI_TCP_RETXPB, len);
+ else
+ stats_voi_update_abs_u64(tp->t_stats,
+ VOI_TCP_TXPB, len);
+#endif
+ } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+ if (rsm && (rsm->r_flags & RACK_TLP)) {
+ /*
+ * TLP should not count in retran count, but
+ * in its own bin
+ */
+ counter_u64_add(rack_tlp_retran, 1);
+ counter_u64_add(rack_tlp_retran_bytes, len);
+ } else {
+ tp->t_sndrexmitpack++;
+ TCPSTAT_INC(tcps_sndrexmitpack);
+ TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+ }
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
+ len);
+#endif
+ } else {
+ TCPSTAT_INC(tcps_sndpack);
+ TCPSTAT_ADD(tcps_sndbyte, len);
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
+ len);
+#endif
+ }
+ /*
+ * If we're sending everything we've got, set PUSH. (This
+ * will keep happy those implementations which only give
+ * data to the user when a buffer fills or a PUSH comes in.)
+ */
+ if (sb_offset + len == sbused(sb) &&
+ sbused(sb) &&
+ !(flags & TH_SYN))
+ flags |= TH_PUSH;
+
+ /*
+		 * Are we doing hptsi? If so we must calculate the slot. We
+ * only do hptsi in ESTABLISHED and with no RESET being
+ * sent where we have data to send.
+ */
+ if (((tp->t_state == TCPS_ESTABLISHED) ||
+ (tp->t_state == TCPS_CLOSE_WAIT) ||
+ ((tp->t_state == TCPS_FIN_WAIT_1) &&
+ ((tp->t_flags & TF_SENTFIN) == 0) &&
+ ((flags & TH_FIN) == 0))) &&
+ ((flags & TH_RST) == 0) &&
+ (rack->rc_always_pace)) {
+ /*
+ * We use the most optimistic possible cwnd/srtt for
+			 * sending calculations. This will make our
+			 * calculation anticipate getting more through
+			 * quicker than possible. But that's OK; we don't want
+			 * the peer to have a gap in data sending.
+ */
+ uint32_t srtt, cwnd, tr_perms = 0;
+
+ if (rack->r_ctl.rc_rack_min_rtt)
+ srtt = rack->r_ctl.rc_rack_min_rtt;
+ else
+ srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
+ if (rack->r_ctl.rc_rack_largest_cwnd)
+ cwnd = rack->r_ctl.rc_rack_largest_cwnd;
+ else
+ cwnd = tp->snd_cwnd;
+ tr_perms = cwnd / srtt;
+ if (tr_perms == 0) {
+ tr_perms = tp->t_maxseg;
+ }
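+			/*
+			 * tr_perms approximates how many bytes we can move
+			 * per millisecond (cwnd over srtt).  For example a
+			 * 100000 byte cwnd over a 50 ms srtt gives about
+			 * 2000 bytes/ms, so a 14480 byte send below works
+			 * out to a slot of 7 before any pace reduction.
+			 */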
+ tot_len_this_send += len;
+ /*
+			 * Calculate how long this will take to drain. If
+			 * the calculation comes out to zero, that's OK; we
+			 * will use sendalot to possibly spin around for
+			 * more, increasing tot_len_this_send to the point
+			 * that it is going to require a pace, or we hit the
+			 * cwnd. In that case we are just waiting for
+			 * an ACK.
+ */
+ slot = tot_len_this_send / tr_perms;
+ /* Now do we reduce the time so we don't run dry? */
+ if (slot && rack->rc_pace_reduce) {
+ int32_t reduce;
+
+ reduce = (slot / rack->rc_pace_reduce);
+ if (reduce < slot) {
+ slot -= reduce;
+ } else
+ slot = 0;
+ }
+ if (rack->r_enforce_min_pace &&
+ (slot == 0) &&
+ (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
+ /* We are enforcing a minimum pace time of 1ms */
+ slot = rack->r_enforce_min_pace;
+ }
+ }
+ SOCKBUF_UNLOCK(sb);
+ } else {
+ SOCKBUF_UNLOCK(sb);
+ if (tp->t_flags & TF_ACKNOW)
+ TCPSTAT_INC(tcps_sndacks);
+ else if (flags & (TH_SYN | TH_FIN | TH_RST))
+ TCPSTAT_INC(tcps_sndctrl);
+ else if (SEQ_GT(tp->snd_up, tp->snd_una))
+ TCPSTAT_INC(tcps_sndurg);
+ else
+ TCPSTAT_INC(tcps_sndwinup);
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ sack_rxmit = 0;
+ goto out;
+ }
+#ifdef INET6
+ if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
+ MHLEN >= hdrlen) {
+ M_ALIGN(m, hdrlen);
+ } else
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ }
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+#ifdef INET6
+ if (isipv6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else
+#endif
+ th = (struct tcphdr *)(ip6 + 1);
+ tcpip_fillheaders(inp, ip6, th);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+#ifdef TCPDEBUG
+ ipov = (struct ipovly *)ip;
+#endif
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else
+#endif
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(inp, ip, th);
+ }
+ /*
+ * Fill in fields, remembering maximum advertised window for use in
+ * delaying messages about window sizes. If resending a FIN, be sure
+ * not to use a new sequence number.
+ */
+ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
+ tp->snd_nxt == tp->snd_max)
+ tp->snd_nxt--;
+ /*
+ * If we are starting a connection, send ECN setup SYN packet. If we
+ * are on a retransmit, we may resend those bits a number of times
+ * as per RFC 3168.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
+ if (tp->t_rxtshift >= 1) {
+ if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
+ flags |= TH_ECE | TH_CWR;
+ } else
+ flags |= TH_ECE | TH_CWR;
+ }
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & TF_ECN_PERMIT)) {
+ /*
+ * If the peer has ECN, mark data packets with ECN capable
+ * transmission (ECT). Ignore pure ack packets,
+ * retransmissions and window probes.
+ */
+ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
+ !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
+#ifdef INET6
+ if (isipv6)
+ ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+ else
+#endif
+ ip->ip_tos |= IPTOS_ECN_ECT0;
+ TCPSTAT_INC(tcps_ecn_ect0);
+ }
+ /*
+ * Reply with proper ECN notifications.
+ */
+ if (tp->t_flags & TF_ECN_SND_CWR) {
+ flags |= TH_CWR;
+ tp->t_flags &= ~TF_ECN_SND_CWR;
+ }
+ if (tp->t_flags & TF_ECN_SND_ECE)
+ flags |= TH_ECE;
+ }
+ /*
+ * If we are doing retransmissions, then snd_nxt will not reflect
+ * the first unsent octet. For ACK only packets, we do not want the
+ * sequence number of the retransmitted packet, we want the sequence
+ * number of the next unsent octet. So, if there is no data (and no
+ * SYN or FIN), use snd_max instead of snd_nxt when filling in
+ * ti_seq. But if we are in persist state, snd_max might reflect
+ * one byte beyond the right edge of the window, so use snd_nxt in
+ * that case, since we know we aren't doing a retransmission.
+ * (retransmit and persist are mutually exclusive...)
+ */
+ if (sack_rxmit == 0) {
+ if (len || (flags & (TH_SYN | TH_FIN)) ||
+ rack->rc_in_persist) {
+ th->th_seq = htonl(tp->snd_nxt);
+ rack_seq = tp->snd_nxt;
+ } else if (flags & TH_RST) {
+ /*
+ * For a Reset send the last cum ack in sequence
+ * (this like any other choice may still generate a
+ * challenge ack, if a ack-update packet is in
+ * flight).
+ */
+ th->th_seq = htonl(tp->snd_una);
+ rack_seq = tp->snd_una;
+ } else {
+ th->th_seq = htonl(tp->snd_max);
+ rack_seq = tp->snd_max;
+ }
+ } else {
+ th->th_seq = htonl(rsm->r_start);
+ rack_seq = rsm->r_start;
+ }
+ th->th_ack = htonl(tp->rcv_nxt);
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ }
+ th->th_flags = flags;
+ /*
+ * Calculate receive window. Don't shrink window, but avoid silly
+ * window syndrome.
+ */
+ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
+ recwin < (long)tp->t_maxseg)
+ recwin = 0;
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+ recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+ recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
+ if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
+ recwin = (long)TCP_MAXWIN << tp->rcv_scale;
+
+ /*
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
+ * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is
+ * handled in syncache.
+ */
+ if (flags & TH_SYN)
+ th->th_win = htons((u_short)
+ (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
+ else
+ th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
+ /*
+ * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
+ * window. This may cause the remote transmitter to stall. This
+ * flag tells soreceive() to disable delayed acknowledgements when
+ * draining the buffer. This can occur if the receiver is
+ * attempting to read more data than can be buffered prior to
+ * transmitting on the connection.
+ */
+ if (th->th_win == 0) {
+ tp->t_sndzerowin++;
+ tp->t_flags |= TF_RXWIN0SENT;
+ } else
+ tp->t_flags &= ~TF_RXWIN0SENT;
+ if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
+ th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
+ th->th_flags |= TH_URG;
+ } else
+ /*
+ * If no urgent pointer to send, then we pull the urgent
+ * pointer to the left edge of the send window so that it
+ * doesn't drift into the send window on sequence number
+ * wraparound.
+ */
+ tp->snd_up = tp->snd_una; /* drag it along */
+
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (to.to_flags & TOF_SIGNATURE) {
+ /*
+ * Calculate MD5 signature and put it into the place
+ * determined before.
+ * NOTE: since TCP options buffer doesn't point into
+ * mbuf's data, calculate offset and use it.
+ */
+ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
+ (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
+ /*
+ * Do not send segment if the calculation of MD5
+ * digest has failed.
+ */
+ goto out;
+ }
+ }
+#endif
+
+ /*
+ * Put TCP length in extended header, and then checksum extended
+ * header and data.
+ */
+ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
+#ifdef INET6
+ if (isipv6) {
+ /*
+		 * ip6_plen does not need to be filled in now; it will be
+		 * filled in by ip6_output.
+ */
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
+ th->th_sum = htons(0);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in6_cksum_pseudo(ip6,
+ sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
+ 0);
+ }
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ {
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+ th->th_sum = htons(0);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+ IPPROTO_TCP + len + optlen));
+ }
+ /* IP version must be set here for ipv4/ipv6 checking later */
+ KASSERT(ip->ip_v == IPVERSION,
+ ("%s: IP version incorrect: %d", __func__, ip->ip_v));
+ }
+#endif
+
+ /*
+ * Enable TSO and specify the size of the segments. The TCP pseudo
+ * header checksum is always provided. XXX: Fixme: This is currently
+ * not the case for IPv6.
+ */
+ if (tso) {
+ KASSERT(len > tp->t_maxseg - optlen,
+ ("%s: len <= tso_segsz", __func__));
+ m->m_pkthdr.csum_flags |= CSUM_TSO;
+ m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
+ }
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
+ ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
+ __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
+#else
+ KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
+ ("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
+ __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
+#endif
+
+#ifdef TCP_HHOOK
+ /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
+ hhook_run_tcp_est_out(tp, th, &to, len, tso);
+#endif
+
+#ifdef TCPDEBUG
+ /*
+ * Trace.
+ */
+ if (so->so_options & SO_DEBUG) {
+ u_short save = 0;
+
+#ifdef INET6
+ if (!isipv6)
+#endif
+ {
+ save = ipov->ih_len;
+ ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen +
+ * (th->th_off << 2) */ );
+ }
+ tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
+#ifdef INET6
+ if (!isipv6)
+#endif
+ ipov->ih_len = save;
+ }
+#endif /* TCPDEBUG */
+
+ /* We're getting ready to send; log now. */
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
+ if (rsm || sack_rxmit) {
+ log.u_bbr.flex8 = 1;
+ } else {
+ log.u_bbr.flex8 = 0;
+ }
+ lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
+ len, &log, false, NULL, NULL, 0, NULL);
+ } else
+ lgb = NULL;
+
+ /*
+ * Fill in IP length and desired time to live and send to IP level.
+ * There should be a better way to handle ttl and tos; we could keep
+ * them in the template, but need a way to checksum without them.
+ */
+ /*
+	 * m->m_pkthdr.len should have been set before checksum calculation,
+	 * because in6_cksum() needs it.
+ */
+#ifdef INET6
+ if (isipv6) {
+ /*
+ * we separately set hoplimit for every segment, since the
+ * user might want to change the value via setsockopt. Also,
+ * desired default hop limit might be changed via Neighbor
+ * Discovery.
+ */
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+
+ /*
+ * Set the packet size here for the benefit of DTrace
+ * probes. ip6_output() will set it properly; it's supposed
+ * to include the option header lengths as well.
+ */
+ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
+
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+
+ if (tp->t_state == TCPS_SYN_SENT)
+ TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
+
+ TCP_PROBE5(send, NULL, tp, ip6, tp, th);
+ /* TODO: IPv6 IP6TOS_ECT bit on */
+ error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
+ &inp->inp_route6,
+ ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
+ NULL, NULL, inp);
+
+ if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
+ mtu = inp->inp_route6.ro_rt->rt_mtu;
+ }
+#endif /* INET6 */
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ ip->ip_len = htons(m->m_pkthdr.len);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ ip->ip_ttl = in6_selecthlim(inp, NULL);
+#endif /* INET6 */
+ /*
+ * If we do path MTU discovery, then we set DF on every
+ * packet. This might not be the best thing to do according
+	 * to RFC3390 Section 2. However the tcp hostcache mitigates
+ * the problem so it affects only the first tcp connection
+ * with a host.
+ *
+ * NB: Don't set DF on small MTU/MSS to have a safe
+ * fallback.
+ */
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ if (tp->t_port == 0 || len < V_tcp_minmss) {
+ ip->ip_off |= htons(IP_DF);
+ }
+ } else {
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+
+ if (tp->t_state == TCPS_SYN_SENT)
+ TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
+
+ TCP_PROBE5(send, NULL, tp, ip, tp, th);
+
+ error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
+ ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
+ inp);
+ if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
+ mtu = inp->inp_route.ro_rt->rt_mtu;
+ }
+#endif /* INET */
+
+out:
+ if (lgb) {
+ lgb->tlb_errno = error;
+ lgb = NULL;
+ }
+ /*
+ * In transmit state, time the transmission and arrange for the
+ * retransmit. In persist state, just set snd_max.
+ */
+ if (error == 0) {
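+		/*
+		 * Bucket the send by size for the accounting counters:
+		 * zero-length sends (ACKs/window updates) and one byte
+		 * window probes get their own buckets, larger sends are
+		 * counted in whole-MSS buckets starting at index 3.
+		 */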
+ if (len == 0)
+ counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
+ else if (len == 1) {
+ counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
+ } else if (len > 1) {
+ int idx;
+
+ idx = (len / tp->t_maxseg) + 3;
+ if (idx >= TCP_MSS_ACCT_ATIMER)
+ counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
+ else
+ counter_u64_add(rack_out_size[idx], 1);
+ }
+ }
+ if (sub_from_prr && (error == 0)) {
+ rack->r_ctl.rc_prr_sndcnt -= len;
+ }
+ sub_from_prr = 0;
+ rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
+ pass, rsm);
+ if ((tp->t_flags & TF_FORCEDATA) == 0 ||
+ (rack->rc_in_persist == 0)) {
+#ifdef NETFLIX_STATS
+ tcp_seq startseq = tp->snd_nxt;
+#endif
+
+ /*
+ * Advance snd_nxt over sequence space of this segment.
+ */
+ if (error)
+ /* We don't log or do anything with errors */
+ goto timer;
+
+ if (flags & (TH_SYN | TH_FIN)) {
+ if (flags & TH_SYN)
+ tp->snd_nxt++;
+ if (flags & TH_FIN) {
+ tp->snd_nxt++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ /* In the ENOBUFS case we do *not* update snd_max */
+ if (sack_rxmit)
+ goto timer;
+
+ tp->snd_nxt += len;
+ if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
+ if (tp->snd_una == tp->snd_max) {
+ /*
+ * Update the time we just added data since
+ * none was outstanding.
+ */
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
+ tp->t_acktime = ticks;
+ }
+ tp->snd_max = tp->snd_nxt;
+#ifdef NETFLIX_STATS
+ if (!(tp->t_flags & TF_GPUTINPROG) && len) {
+ tp->t_flags |= TF_GPUTINPROG;
+ tp->gput_seq = startseq;
+ tp->gput_ack = startseq +
+ ulmin(sbavail(sb) - sb_offset, sendwin);
+ tp->gput_ts = tcp_ts_getticks();
+ }
+#endif
+ }
+ /*
+ * Set retransmit timer if not currently set, and not doing
+ * a pure ack or a keep-alive probe. Initial value for
+ * retransmit timer is smoothed round-trip time + 2 *
+ * round-trip time variance. Initialize shift counter which
+ * is used for backoff of retransmit time.
+ */
+timer:
+ if ((tp->snd_wnd == 0) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ /*
+			 * The persist timer may have been set above (right
+			 * before the goto send) and may still need to be on.
+			 * Make sure everything is set up: if the persist
+			 * timer is not running, we want to get it going.
+ */
+ if (rack->rc_in_persist == 0) {
+ rack_enter_persist(tp, rack, cts);
+ }
+ }
+ } else {
+ /*
+ * Persist case, update snd_max but since we are in persist
+ * mode (no window) we do not update snd_nxt.
+ */
+ int32_t xlen = len;
+
+ if (error)
+ goto nomore;
+
+ if (flags & TH_SYN)
+ ++xlen;
+ if (flags & TH_FIN) {
+ ++xlen;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ /* In the ENOBUFS case we do *not* update snd_max */
+ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
+ if (tp->snd_una == tp->snd_max) {
+ /*
+ * Update the time we just added data since
+ * none was outstanding.
+ */
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
+ tp->t_acktime = ticks;
+ }
+ tp->snd_max = tp->snd_nxt + len;
+ }
+ }
+nomore:
+ if (error) {
+ SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */
+ /*
+ * Failures do not advance the seq counter above. For the
+ * case of ENOBUFS we will fall out and retry in 1ms with
+ * the hpts. Everything else will just have to retransmit
+ * with the timer.
+ *
+ * In any case, we do not want to loop around for another
+ * send without a good reason.
+ */
+ sendalot = 0;
+ switch (error) {
+ case EPERM:
+ tp->t_flags &= ~TF_FORCEDATA;
+ tp->t_softerror = error;
+ return (error);
+ case ENOBUFS:
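+			/*
+			 * When no pacing delay was already chosen, back off:
+			 * the delay grows by one for each consecutive
+			 * ENOBUFS (up to 255), clamped to half the rack RTT
+			 * and floored at 10.
+			 */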
+ if (slot == 0) {
+ /*
+				 * Pace us right away so we retry in a
+				 * short time.
+ */
+ slot = 1 + rack->rc_enobuf;
+ if (rack->rc_enobuf < 255)
+ rack->rc_enobuf++;
+ if (slot > (rack->rc_rack_rtt / 2)) {
+ slot = rack->rc_rack_rtt / 2;
+ }
+ if (slot < 10)
+ slot = 10;
+ }
+ counter_u64_add(rack_saw_enobuf, 1);
+ error = 0;
+ goto enobufs;
+ case EMSGSIZE:
+ /*
+ * For some reason the interface we used initially
+ * to send segments changed to another or lowered
+ * its MTU. If TSO was active we either got an
+			 * interface without TSO capabilities or TSO was
+ * turned off. If we obtained mtu from ip_output()
+ * then update it and try again.
+ */
+ if (tso)
+ tp->t_flags &= ~TF_TSO;
+ if (mtu != 0) {
+ tcp_mss_update(tp, -1, mtu, NULL, NULL);
+ goto again;
+ }
+ slot = 10;
+ rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
+ tp->t_flags &= ~TF_FORCEDATA;
+ return (error);
+ case ENETUNREACH:
+ counter_u64_add(rack_saw_enetunreach, 1);
+ case EHOSTDOWN:
+ case EHOSTUNREACH:
+ case ENETDOWN:
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_softerror = error;
+ }
+ /* FALLTHROUGH */
+ default:
+ slot = 10;
+ rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
+ tp->t_flags &= ~TF_FORCEDATA;
+ return (error);
+ }
+ } else {
+ rack->rc_enobuf = 0;
+ }
+ TCPSTAT_INC(tcps_sndtotal);
+
+ /*
+ * Data sent (as far as we can tell). If this advertises a larger
+ * window than any other segment, then remember the size of the
+ * advertised window. Any pending ACK has now been sent.
+ */
+ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
+ tp->rcv_adv = tp->rcv_nxt + recwin;
+ tp->last_ack_sent = tp->rcv_nxt;
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
+enobufs:
+ rack->r_tlp_running = 0;
+ if ((flags & TH_RST) || (would_have_fin == 1)) {
+ /*
+ * We don't send again after a RST. We also do *not* send
+		 * again if we would have had a FIN, but now have
+ * outstanding data.
+ */
+ slot = 0;
+ sendalot = 0;
+ }
+ if (slot) {
+ /* set the rack tcb into the slot N */
+ counter_u64_add(rack_paced_segments, 1);
+ } else if (sendalot) {
+ if (len)
+ counter_u64_add(rack_unpaced_segments, 1);
+ sack_rxmit = 0;
+ tp->t_flags &= ~TF_FORCEDATA;
+ goto again;
+ } else if (len) {
+ counter_u64_add(rack_unpaced_segments, 1);
+ }
+ tp->t_flags &= ~TF_FORCEDATA;
+ rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
+ return (error);
+}
+
+/*
+ * rack_ctloutput() must drop the inpcb lock before performing copyin on
+ * socket option arguments. When it re-acquires the lock after the copy, it
+ * has to revalidate that the connection is still valid for the socket
+ * option.
+ */
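+/*
+ * Illustrative sketch only (not used by the stack): once a connection is
+ * running on this stack, the per-connection knobs handled below should be
+ * reachable from userland via setsockopt(2) at the IPPROTO_TCP level,
+ * e.g. with 'fd' being a connected TCP socket:
+ *
+ *	int one = 1;
+ *	(void)setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &one,
+ *	    sizeof(one));
+ */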
+static int
+rack_set_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
+{
+ int32_t error = 0, optval;
+
+ switch (sopt->sopt_name) {
+ case TCP_RACK_PROP_RATE:
+ case TCP_RACK_PROP:
+ case TCP_RACK_TLP_REDUCE:
+ case TCP_RACK_EARLY_RECOV:
+ case TCP_RACK_PACE_ALWAYS:
+ case TCP_DELACK:
+ case TCP_RACK_PACE_REDUCE:
+ case TCP_RACK_PACE_MAX_SEG:
+ case TCP_RACK_PRR_SENDALOT:
+ case TCP_RACK_MIN_TO:
+ case TCP_RACK_EARLY_SEG:
+ case TCP_RACK_REORD_THRESH:
+ case TCP_RACK_REORD_FADE:
+ case TCP_RACK_TLP_THRESH:
+ case TCP_RACK_PKT_DELAY:
+ case TCP_RACK_TLP_USE:
+ case TCP_RACK_TLP_INC_VAR:
+ case TCP_RACK_IDLE_REDUCE_HIGH:
+ case TCP_RACK_MIN_PACE:
+ case TCP_RACK_MIN_PACE_SEG:
+ case TCP_BBR_RACK_RTT_USE:
+ case TCP_DATA_AFTER_CLOSE:
+ break;
+ default:
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ if (error)
+ return (error);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
+ }
+ tp = intotcpcb(inp);
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ switch (sopt->sopt_name) {
+ case TCP_RACK_PROP_RATE:
+ if ((optval <= 0) || (optval >= 100)) {
+ error = EINVAL;
+ break;
+ }
+ RACK_OPTS_INC(tcp_rack_prop_rate);
+ rack->r_ctl.rc_prop_rate = optval;
+ break;
+ case TCP_RACK_TLP_USE:
+ if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
+ error = EINVAL;
+ break;
+ }
+ RACK_OPTS_INC(tcp_tlp_use);
+ rack->rack_tlp_threshold_use = optval;
+ break;
+ case TCP_RACK_PROP:
+ /* RACK proportional rate reduction (bool) */
+ RACK_OPTS_INC(tcp_rack_prop);
+ rack->r_ctl.rc_prop_reduce = optval;
+ break;
+ case TCP_RACK_TLP_REDUCE:
+ /* RACK TLP cwnd reduction (bool) */
+ RACK_OPTS_INC(tcp_rack_tlp_reduce);
+ rack->r_ctl.rc_tlp_cwnd_reduce = optval;
+ break;
+ case TCP_RACK_EARLY_RECOV:
+ /* Should recovery happen early (bool) */
+ RACK_OPTS_INC(tcp_rack_early_recov);
+ rack->r_ctl.rc_early_recovery = optval;
+ break;
+ case TCP_RACK_PACE_ALWAYS:
+ /* Use the always pace method (bool) */
+ RACK_OPTS_INC(tcp_rack_pace_always);
+ if (optval > 0)
+ rack->rc_always_pace = 1;
+ else
+ rack->rc_always_pace = 0;
+ break;
+ case TCP_RACK_PACE_REDUCE:
+ /* RACK Hptsi reduction factor (divisor) */
+ RACK_OPTS_INC(tcp_rack_pace_reduce);
+ if (optval)
+ /* Must be non-zero */
+ rack->rc_pace_reduce = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_RACK_PACE_MAX_SEG:
+ /* Max segments in a pace */
+ RACK_OPTS_INC(tcp_rack_max_seg);
+ rack->rc_pace_max_segs = optval;
+ break;
+ case TCP_RACK_PRR_SENDALOT:
+ /* Allow PRR to send more than one seg */
+ RACK_OPTS_INC(tcp_rack_prr_sendalot);
+ rack->r_ctl.rc_prr_sendalot = optval;
+ break;
+ case TCP_RACK_MIN_TO:
+ /* Minimum time between rack t-o's in ms */
+ RACK_OPTS_INC(tcp_rack_min_to);
+ rack->r_ctl.rc_min_to = optval;
+ break;
+ case TCP_RACK_EARLY_SEG:
+ /* If early recovery max segments */
+ RACK_OPTS_INC(tcp_rack_early_seg);
+ rack->r_ctl.rc_early_recovery_segs = optval;
+ break;
+ case TCP_RACK_REORD_THRESH:
+ /* RACK reorder threshold (shift amount) */
+ RACK_OPTS_INC(tcp_rack_reord_thresh);
+ if ((optval > 0) && (optval < 31))
+ rack->r_ctl.rc_reorder_shift = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_RACK_REORD_FADE:
+ /* Does reordering fade after ms time */
+ RACK_OPTS_INC(tcp_rack_reord_fade);
+ rack->r_ctl.rc_reorder_fade = optval;
+ break;
+ case TCP_RACK_TLP_THRESH:
+		/* RACK TLP threshold i.e. srtt+(srtt/N) */
+ RACK_OPTS_INC(tcp_rack_tlp_thresh);
+ if (optval)
+ rack->r_ctl.rc_tlp_threshold = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_RACK_PKT_DELAY:
+ /* RACK added ms i.e. rack-rtt + reord + N */
+ RACK_OPTS_INC(tcp_rack_pkt_delay);
+ rack->r_ctl.rc_pkt_delay = optval;
+ break;
+ case TCP_RACK_TLP_INC_VAR:
+ /* Does TLP include rtt variance in t-o */
+ RACK_OPTS_INC(tcp_rack_tlp_inc_var);
+ rack->r_ctl.rc_prr_inc_var = optval;
+ break;
+ case TCP_RACK_IDLE_REDUCE_HIGH:
+ RACK_OPTS_INC(tcp_rack_idle_reduce_high);
+ if (optval)
+ rack->r_idle_reduce_largest = 1;
+ else
+ rack->r_idle_reduce_largest = 0;
+ break;
+ case TCP_DELACK:
+ if (optval == 0)
+ tp->t_delayed_ack = 0;
+ else
+ tp->t_delayed_ack = 1;
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tp->t_flags |= TF_ACKNOW;
+ rack_output(tp);
+ }
+ break;
+ case TCP_RACK_MIN_PACE:
+ RACK_OPTS_INC(tcp_rack_min_pace);
+ if (optval > 3)
+ rack->r_enforce_min_pace = 3;
+ else
+ rack->r_enforce_min_pace = optval;
+ break;
+ case TCP_RACK_MIN_PACE_SEG:
+ RACK_OPTS_INC(tcp_rack_min_pace_seg);
+ if (optval >= 16)
+ rack->r_min_pace_seg_thresh = 15;
+ else
+ rack->r_min_pace_seg_thresh = optval;
+ break;
+ case TCP_BBR_RACK_RTT_USE:
+ if ((optval != USE_RTT_HIGH) &&
+ (optval != USE_RTT_LOW) &&
+ (optval != USE_RTT_AVG))
+ error = EINVAL;
+ else
+ rack->r_ctl.rc_rate_sample_method = optval;
+ break;
+ case TCP_DATA_AFTER_CLOSE:
+ if (optval)
+ rack->rc_allow_data_af_clo = 1;
+ else
+ rack->rc_allow_data_af_clo = 0;
+ break;
+ default:
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+#ifdef NETFLIX_STATS
+ tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
+#endif
+ INP_WUNLOCK(inp);
+ return (error);
+}
+
+static int
+rack_get_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
+{
+ int32_t error, optval;
+
+ /*
+ * Because all our options are either boolean or an int, we can just
+ * pull everything into optval and then unlock and copy. If we ever
+	 * add an option that is not an int, then this will have quite an
+	 * impact on this routine.
+ */
+ switch (sopt->sopt_name) {
+ case TCP_RACK_PROP_RATE:
+ optval = rack->r_ctl.rc_prop_rate;
+ break;
+ case TCP_RACK_PROP:
+ /* RACK proportional rate reduction (bool) */
+ optval = rack->r_ctl.rc_prop_reduce;
+ break;
+ case TCP_RACK_TLP_REDUCE:
+ /* RACK TLP cwnd reduction (bool) */
+ optval = rack->r_ctl.rc_tlp_cwnd_reduce;
+ break;
+ case TCP_RACK_EARLY_RECOV:
+ /* Should recovery happen early (bool) */
+ optval = rack->r_ctl.rc_early_recovery;
+ break;
+ case TCP_RACK_PACE_REDUCE:
+ /* RACK Hptsi reduction factor (divisor) */
+ optval = rack->rc_pace_reduce;
+ break;
+ case TCP_RACK_PACE_MAX_SEG:
+ /* Max segments in a pace */
+ optval = rack->rc_pace_max_segs;
+ break;
+ case TCP_RACK_PACE_ALWAYS:
+ /* Use the always pace method */
+ optval = rack->rc_always_pace;
+ break;
+ case TCP_RACK_PRR_SENDALOT:
+ /* Allow PRR to send more than one seg */
+ optval = rack->r_ctl.rc_prr_sendalot;
+ break;
+ case TCP_RACK_MIN_TO:
+ /* Minimum time between rack t-o's in ms */
+ optval = rack->r_ctl.rc_min_to;
+ break;
+ case TCP_RACK_EARLY_SEG:
+ /* If early recovery max segments */
+ optval = rack->r_ctl.rc_early_recovery_segs;
+ break;
+ case TCP_RACK_REORD_THRESH:
+ /* RACK reorder threshold (shift amount) */
+ optval = rack->r_ctl.rc_reorder_shift;
+ break;
+ case TCP_RACK_REORD_FADE:
+ /* Does reordering fade after ms time */
+ optval = rack->r_ctl.rc_reorder_fade;
+ break;
+ case TCP_RACK_TLP_THRESH:
+		/* RACK TLP threshold i.e. srtt+(srtt/N) */
+ optval = rack->r_ctl.rc_tlp_threshold;
+ break;
+ case TCP_RACK_PKT_DELAY:
+ /* RACK added ms i.e. rack-rtt + reord + N */
+ optval = rack->r_ctl.rc_pkt_delay;
+ break;
+ case TCP_RACK_TLP_USE:
+ optval = rack->rack_tlp_threshold_use;
+ break;
+ case TCP_RACK_TLP_INC_VAR:
+ /* Does TLP include rtt variance in t-o */
+ optval = rack->r_ctl.rc_prr_inc_var;
+ break;
+ case TCP_RACK_IDLE_REDUCE_HIGH:
+ optval = rack->r_idle_reduce_largest;
+ break;
+ case TCP_RACK_MIN_PACE:
+ optval = rack->r_enforce_min_pace;
+ break;
+ case TCP_RACK_MIN_PACE_SEG:
+ optval = rack->r_min_pace_seg_thresh;
+ break;
+ case TCP_BBR_RACK_RTT_USE:
+ optval = rack->r_ctl.rc_rate_sample_method;
+ break;
+ case TCP_DELACK:
+ optval = tp->t_delayed_ack;
+ break;
+ case TCP_DATA_AFTER_CLOSE:
+ optval = rack->rc_allow_data_af_clo;
+ break;
+ default:
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ return (error);
+}
+
+static int
+rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+ int32_t error = EINVAL;
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack == NULL) {
+ /* Huh? */
+ goto out;
+ }
+ if (sopt->sopt_dir == SOPT_SET) {
+ return (rack_set_sockopt(so, sopt, inp, tp, rack));
+ } else if (sopt->sopt_dir == SOPT_GET) {
+ return (rack_get_sockopt(so, sopt, inp, tp, rack));
+ }
+out:
+ INP_WUNLOCK(inp);
+ return (error);
+}
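[Editor's note] As an illustration of how these handlers are reached, here is a small userland sketch. It is not part of this commit and assumes the TCP_FUNCTION_BLK and TCP_RACK_* option definitions from netinet/tcp.h; the helper name is hypothetical. It selects the rack stack on a socket and then tunes one of the options handled by rack_set_sockopt()/rack_get_sockopt() above.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

/* Hypothetical helper, illustration only. */
static int
use_rack_and_tune(int s)
{
	struct tcp_function_set fs;
	int val;
	socklen_t len = sizeof(val);

	/* Ask for the "rack" stack by name. */
	memset(&fs, 0, sizeof(fs));
	strlcpy(fs.function_set_name, "rack", sizeof(fs.function_set_name));
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs)) == -1)
		return (-1);

	/* Minimum time between rack timeouts, in ms (TCP_RACK_MIN_TO). */
	val = 2;
	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_MIN_TO, &val, sizeof(val)) == -1)
		return (-1);

	/* Read it back through rack_get_sockopt(). */
	return (getsockopt(s, IPPROTO_TCP, TCP_RACK_MIN_TO, &val, &len));
}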
+
+
+struct tcp_function_block __tcp_rack = {
+ .tfb_tcp_block_name = __XSTRING(STACKNAME),
+ .tfb_tcp_output = rack_output,
+ .tfb_tcp_do_segment = rack_do_segment,
+ .tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
+ .tfb_tcp_ctloutput = rack_ctloutput,
+ .tfb_tcp_fb_init = rack_init,
+ .tfb_tcp_fb_fini = rack_fini,
+ .tfb_tcp_timer_stop_all = rack_stopall,
+ .tfb_tcp_timer_activate = rack_timer_activate,
+ .tfb_tcp_timer_active = rack_timer_active,
+ .tfb_tcp_timer_stop = rack_timer_stop,
+ .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
+ .tfb_tcp_handoff_ok = rack_handoff_ok
+};
+
+static const char *rack_stack_names[] = {
+ __XSTRING(STACKNAME),
+#ifdef STACKALIAS
+ __XSTRING(STACKALIAS),
+#endif
+};
+
+static int
+rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
+{
+ memset(mem, 0, size);
+ return (0);
+}
+
+static void
+rack_dtor(void *mem, int32_t size, void *arg)
+{
+
+}
+
+static bool rack_mod_inited = false;
+
+static int
+tcp_addrack(module_t mod, int32_t type, void *data)
+{
+ int32_t err = 0;
+ int num_stacks;
+
+ switch (type) {
+ case MOD_LOAD:
+ rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
+ sizeof(struct rack_sendmap),
+ rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
+
+ rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
+ sizeof(struct tcp_rack),
+ rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
+
+ sysctl_ctx_init(&rack_sysctl_ctx);
+ rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
+ OID_AUTO,
+ __XSTRING(STACKNAME),
+ CTLFLAG_RW, 0,
+ "");
+ if (rack_sysctl_root == NULL) {
+ printf("Failed to add sysctl node\n");
+ err = EFAULT;
+ goto free_uma;
+ }
+ rack_init_sysctls();
+ num_stacks = nitems(rack_stack_names);
+ err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
+ rack_stack_names, &num_stacks);
+ if (err) {
+ printf("Failed to register %s stack name for "
+ "%s module\n", rack_stack_names[num_stacks],
+ __XSTRING(MODNAME));
+ sysctl_ctx_free(&rack_sysctl_ctx);
+free_uma:
+ uma_zdestroy(rack_zone);
+ uma_zdestroy(rack_pcb_zone);
+ rack_counter_destroy();
+ printf("Failed to register rack module -- err:%d\n", err);
+ return (err);
+ }
+ rack_mod_inited = true;
+ break;
+ case MOD_QUIESCE:
+ err = deregister_tcp_functions(&__tcp_rack, true, false);
+ break;
+ case MOD_UNLOAD:
+ err = deregister_tcp_functions(&__tcp_rack, false, true);
+ if (err == EBUSY)
+ break;
+ if (rack_mod_inited) {
+ uma_zdestroy(rack_zone);
+ uma_zdestroy(rack_pcb_zone);
+ sysctl_ctx_free(&rack_sysctl_ctx);
+ rack_counter_destroy();
+ rack_mod_inited = false;
+ }
+ err = 0;
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (err);
+}
+
+static moduledata_t tcp_rack = {
+ .name = __XSTRING(MODNAME),
+ .evhand = tcp_addrack,
+ .priv = 0
+};
+
+MODULE_VERSION(MODNAME, 1);
+DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h
new file mode 100644
index 000000000000..d52022daee66
--- /dev/null
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -0,0 +1,70 @@
+#ifndef __pacer_timer_h__
+#define __pacer_timer_h__
+/*-
+ * Copyright (c) 2017
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$");
+ */
+/* Common defines and such used by both RACK and BBR */
+/* Special values for mss accounting array */
+#define TCP_MSS_ACCT_JUSTRET 0
+#define TCP_MSS_ACCT_SNDACK 1
+#define TCP_MSS_ACCT_PERSIST 2
+#define TCP_MSS_ACCT_ATIMER 60
+#define TCP_MSS_ACCT_INPACE 61
+#define TCP_MSS_ACCT_LATE 62
+#define TCP_MSS_SMALL_SIZE_OFF 63 /* Point where small sizes enter */
+#define TCP_MSS_ACCT_SIZE 70
+#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
+
+
+/* Magic flags to tell whats cooking on the pacing wheel */
+#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
+#define PACE_TMR_RACK 0x02 /* RACK timer running */
+#define PACE_TMR_TLP 0x04 /* TLP timer running */
+#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
+#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
+#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
+#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
+#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
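[Editor's note] A minimal sketch of how a stack can consult these flags (an assumption about usage, not code from this commit): rc_hpts_flags in struct rack_control holds these bits, and masking with PACE_TMR_MASK distinguishes a scheduled timer from plain packet-output pacing.

/* Hypothetical helper, illustration only. */
static inline int
pace_timer_is_running(uint32_t rc_hpts_flags)
{

	return ((rc_hpts_flags & PACE_TMR_MASK) != 0);
}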
+
+/* Magic flags for tracing progress events */
+#define PROGRESS_DROP 1
+#define PROGRESS_UPDATE 2
+#define PROGRESS_CLEAR 3
+#define PROGRESS_START 4
+
+
+/* RTT sample methods */
+#define USE_RTT_HIGH 0
+#define USE_RTT_LOW 1
+#define USE_RTT_AVG 2
+
+#ifdef _KERNEL
+/* We have only 7 bits in rack so assert its true */
+CTASSERT((PACE_TMR_MASK & 0x80) == 0);
+#endif
+#endif
diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c
new file mode 100644
index 000000000000..993d5851db79
--- /dev/null
+++ b/sys/netinet/tcp_stacks/sack_filter.c
@@ -0,0 +1,706 @@
+/*-
+ * Copyright (c) 2017
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/mbuf.h>
+#include <sys/sockopt.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_seq.h>
+#ifndef _KERNEL
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <getopt.h>
+#endif
+#include "sack_filter.h"
+
+/*
+ * Sack filter is used to filter out sacks
+ * that have already been processed. The idea
+ * is pretty simple really, consider two sacks
+ *
+ * SACK 1
+ * cum-ack A
+ * sack B - C
+ * SACK 2
+ * cum-ack A
+ * sack D - E
+ * sack B - C
+ *
+ * The previous sack information (B-C) is repeated
+ * in SACK 2. If the receiver gets SACK 1 and then
+ * SACK 2, any work associated with B-C has already
+ * been completed. This only affects cases where we
+ * walk a linked list (as in bbr or rack).
+ *
+ * The utility tries to keep everything in a single
+ * cache line. This means it is not perfect, and a
+ * large enough run of sacks can push a "remembered"
+ * processed sack off the list so that it gets
+ * re-processed. That is ok, it just means we did
+ * some extra work. We could of course take more
+ * cache line hits by expanding the size of this
+ * structure, but then that would cost more.
+ */
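[Editor's note] A short sketch of the effect described above (an illustration only, using the same blocks as the example; in userland builds these functions are static to this file and the helper name is hypothetical): the repeated B-C block is filtered out of the second call and only D-E is returned for processing.

static void
sack_filter_example(void)
{
	struct sack_filter sf;
	struct sackblk blks[TCP_MAX_SACK];
	int nblks;

	sack_filter_clear(&sf, 1000);			/* cum-ack A */

	blks[0].start = 2000; blks[0].end = 3000;	/* B - C */
	nblks = sack_filter_blks(&sf, blks, 1, 1000);	/* returns 1 */

	blks[0].start = 4000; blks[0].end = 5000;	/* D - E */
	blks[1].start = 2000; blks[1].end = 3000;	/* B - C again */
	nblks = sack_filter_blks(&sf, blks, 2, 1000);	/* returns 1, only D - E */
	(void)nblks;
}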
+
+#ifndef _KERNEL
+int detailed_dump = 0;
+uint64_t cnt_skipped_oldsack = 0;
+uint64_t cnt_used_oldsack = 0;
+int highest_used=0;
+int over_written=0;
+int empty_avail=0;
+int no_collapse = 0;
+FILE *out = NULL;
+FILE *in = NULL;
+#endif
+
+#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
+#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
+#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
+
+#ifndef _KERNEL
+static
+#endif
+void
+sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
+{
+ sf->sf_ack = seq;
+ sf->sf_bits = 0;
+ sf->sf_cur = 0;
+ sf->sf_used = 0;
+}
+/*
+ * Given a previous sack filter block, filter out
+ * any entries where the cum-ack moves over them
+ * fully or partially.
+ */
+static void
+sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
+{
+ int32_t i;
+ /* start with the oldest */
+ for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
+ if (sack_blk_used(sf, i)) {
+ if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
+ /* This block is consumed */
+ sf->sf_bits = sack_blk_clr(sf, i);
+ sf->sf_used--;
+ } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
+ /* Some of it is acked */
+ sf->sf_blks[i].start = th_ack;
+ /* We could in theory break here, but
+ * there are some broken implementations
+ * that send multiple blocks. We want
+ * to catch them all with similar seq's.
+ */
+ }
+ }
+ }
+ sf->sf_ack = th_ack;
+}
+
+/*
+ * Return true if you find that
+ * the sackblock b is on the score
+ * board. Update it along the way
+ * if part of it is on the board.
+ */
+static int32_t
+is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
+{
+ int32_t i, cnt;
+ for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
+ if (sack_blk_used(sf, i)) {
+ if (SEQ_LT(b->start, sf->sf_ack)) {
+ /* Behind cum-ack update */
+ b->start = sf->sf_ack;
+ }
+ if (SEQ_LT(b->end, sf->sf_ack)) {
+ /* End back behind too */
+ b->end = sf->sf_ack;
+ }
+ if (b->start == b->end)
+ return(1);
+			/* Jonathan's Rule 1 */
+ if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
+ SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
+ /**
+ * Our board has this entirely in
+ * whole or in part:
+ *
+ * board |-------------|
+ * sack |-------------|
+ * <or>
+ * board |-------------|
+ * sack |----|
+ *
+ */
+ return(1);
+ }
+			/* Jonathan's Rule 2 */
+ if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
+ /**
+ * Not near each other:
+ *
+ * board |---|
+ * sack |---|
+ */
+ goto nxt_blk;
+ }
+			/* Jonathan's Rule 3 */
+ if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
+ /**
+ * Not near each other:
+ *
+ * board |---|
+ * sack |---|
+ */
+ goto nxt_blk;
+ }
+ if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
+ /**
+			 * The board block partially overlaps this one:
+ *
+ * board |--------|
+ * sack |----------|
+ * <or>
+ * board |--------|
+ * sack |--------------|
+ *
+			 * We have part of it, so:
+ * 1) Update the board block to the new end
+ * and
+ * 2) Update the start of this block to my end.
+ */
+ b->start = sf->sf_blks[i].end;
+ sf->sf_blks[i].end = b->end;
+ goto nxt_blk;
+ }
+ if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
+ /**
+			 * The board block partially overlaps this one:
+ *
+ * board |--------|
+ * sack |----------|
+ * <or>
+ * board |----|
+ * sack |----------|
+			 * 1) Update the board block to the new start
+			 *      and
+			 * 2) Update the end of this block to my start.
+ */
+ b->end = sf->sf_blks[i].start;
+ sf->sf_blks[i].start = b->start;
+ goto nxt_blk;
+ }
+ }
+ nxt_blk:
+ i++;
+ i %= SACK_FILTER_BLOCKS;
+ }
+ /* Did we totally consume it in pieces? */
+ if (b->start != b->end)
+ return(0);
+ else
+ return(1);
+}
+
+static int32_t
+sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
+{
+ int32_t num, i;
+ struct sackblk blkboard[TCP_MAX_SACK];
+ /*
+	 * An old sack has arrived. It may contain data
+	 * we do not have, since we could have had a lost
+	 * ack, <or> we might already have the entire thing
+	 * on our current board. We want to prune off
+	 * anything we already have. With this function,
+	 * though, we won't add to the board.
+ */
+ for( i = 0, num = 0; i<numblks; i++ ) {
+ if (is_sack_on_board(sf, &in[i])) {
+#ifndef _KERNEL
+ cnt_skipped_oldsack++;
+#endif
+ continue;
+ }
+ /* Did not find it (or found only
+ * a piece of it). Copy it to
+ * our outgoing board.
+ */
+ memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
+#ifndef _KERNEL
+ cnt_used_oldsack++;
+#endif
+ num++;
+ }
+ if (num) {
+ memcpy(in, blkboard, (num * sizeof(struct sackblk)));
+ }
+ return (num);
+}
+
+/*
+ * Given that idx is used but there is space available,
+ * move the entry to the next free slot.
+ */
+static void
+sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
+{
+ int32_t i, cnt;
+
+ i = (idx + 1) % SACK_FILTER_BLOCKS;
+ for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
+ if (sack_blk_used(sf, i) == 0) {
+ memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
+ sf->sf_bits = sack_blk_clr(sf, idx);
+ sf->sf_bits = sack_blk_set(sf, i);
+ return;
+ }
+ i++;
+ i %= SACK_FILTER_BLOCKS;
+ }
+}
+
+static int32_t
+sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
+{
+ struct sackblk blkboard[TCP_MAX_SACK];
+ int32_t num, i;
+ /*
+	 * First let's trim against the old blocks and
+	 * possibly throw away any we already have.
+ */
+ for(i=0, num=0; i<numblks; i++) {
+ if (is_sack_on_board(sf, &in[i]))
+ continue;
+ memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
+ num++;
+ }
+ if (num == 0)
+ return(num);
+
+	/* What we are left with is either
+	 * completely merged onto the board
+	 * by the above steps, or is new
+	 * and needs to be added to the board,
+	 * with the last one updated to current.
+	 *
+	 * First copy it out; we want to return that
+	 * to our caller for processing.
+ */
+ memcpy(in, blkboard, (num * sizeof(struct sackblk)));
+ numblks = num;
+ /* Now go through and add to our board as needed */
+ for(i=(num-1); i>=0; i--) {
+ if (is_sack_on_board(sf, &blkboard[i]))
+ continue;
+		/* Add this guy, it's not listed */
+ sf->sf_cur++;
+ sf->sf_cur %= SACK_FILTER_BLOCKS;
+ if ((sack_blk_used(sf, sf->sf_cur)) &&
+ (sf->sf_used < SACK_FILTER_BLOCKS)) {
+ sack_move_to_empty(sf, sf->sf_cur);
+ }
+#ifndef _KERNEL
+ if (sack_blk_used(sf, sf->sf_cur)) {
+ over_written++;
+ if (sf->sf_used < SACK_FILTER_BLOCKS)
+ empty_avail++;
+ }
+#endif
+ memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
+ if (sack_blk_used(sf, sf->sf_cur) == 0) {
+ sf->sf_used++;
+#ifndef _KERNEL
+ if (sf->sf_used > highest_used)
+ highest_used = sf->sf_used;
+#endif
+ sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
+ }
+ }
+ return(numblks);
+}
+
+/*
+ * Given a sack block on the board (at the skip index), see if
+ * any other used entries overlap or meet it; if so, return that index.
+ */
+static int32_t
+sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
+{
+ int32_t i;
+
+ for(i=0; i<SACK_FILTER_BLOCKS; i++) {
+ if (sack_blk_used(sf, i) == 0)
+ continue;
+ if (i == skip)
+ continue;
+ if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
+ SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
+ SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
+ /**
+ * The two board blocks meet:
+ *
+ * board1 |--------|
+ * board2 |----------|
+ * <or>
+ * board1 |--------|
+ * board2 |--------------|
+ * <or>
+ * board1 |--------|
+ * board2 |--------|
+ */
+ return(i);
+ }
+ if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
+ SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
+ SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
+ /**
+			 * The two board blocks partially meet:
+			 *
+			 * board1 |--------|
+			 * board2     |----------|
+			 * <or>
+			 * board1 |----|
+			 * board2     |----------|
+			 *
+			 * We only report the overlap here; the
+			 * caller (sack_board_collapse) merges them.
+ */
+ return(i);
+ }
+ }
+ return (-1);
+}
+
+/*
+ * Collapse entry src into entry "into",
+ * and free up the src entry afterwards.
+ */
+static void
+sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
+{
+ if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
+ /* src has a lower starting point */
+ sf->sf_blks[into].start = sf->sf_blks[src].start;
+ }
+ if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
+ /* src has a higher ending point */
+ sf->sf_blks[into].end = sf->sf_blks[src].end;
+ }
+ sf->sf_bits = sack_blk_clr(sf, src);
+ sf->sf_used--;
+}
+
+static void
+sack_board_collapse(struct sack_filter *sf)
+{
+ int32_t i, j, i_d, j_d;
+
+ for(i=0; i<SACK_FILTER_BLOCKS; i++) {
+ if (sack_blk_used(sf, i) == 0)
+ continue;
+ /*
+ * Look at all other blocks but this guy
+ * to see if they overlap. If so we collapse
+ * the two blocks together.
+ */
+ j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
+ if (j == -1) {
+ /* No overlap */
+ continue;
+ }
+ /*
+		 * Ok, j and i overlap with each other; collapse the
+		 * one that is furthest away from the current position.
+ */
+ if (sf->sf_cur > i)
+ i_d = sf->sf_cur - i;
+ else
+ i_d = i - sf->sf_cur;
+ if (sf->sf_cur > j)
+ j_d = sf->sf_cur - j;
+ else
+ j_d = j - sf->sf_cur;
+ if (j_d > i_d) {
+ sack_collapse(sf, j, i);
+ } else
+ sack_collapse(sf, i, j);
+ }
+}
+
+#ifndef _KERNEL
+static
+#endif
+int
+sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
+{
+ int32_t i, ret;
+
+ if (numblks > TCP_MAX_SACK) {
+ panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
+ sf, in,
+ numblks);
+ return(numblks);
+ }
+ if ((sf->sf_used == 0) && numblks) {
+ /*
+		 * We are brand new; add the blocks in
+		 * reverse order. Note we can see more
+		 * than one new block, since acks could be lost.
+ */
+ sf->sf_ack = th_ack;
+ for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
+ memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
+ sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
+ sf->sf_cur++;
+ sf->sf_cur %= SACK_FILTER_BLOCKS;
+ sf->sf_used++;
+#ifndef _KERNEL
+ if (sf->sf_used > highest_used)
+ highest_used = sf->sf_used;
+#endif
+ }
+ if (sf->sf_cur)
+ sf->sf_cur--;
+ return(numblks);
+ }
+ if (SEQ_GT(th_ack, sf->sf_ack)) {
+ sack_filter_prune(sf, th_ack);
+ }
+ if (numblks) {
+ if (SEQ_GEQ(th_ack, sf->sf_ack)) {
+ ret = sack_filter_new(sf, in, numblks, th_ack);
+ } else {
+ ret = sack_filter_old(sf, in, numblks);
+ }
+ } else
+ ret = 0;
+#ifndef _KERNEL
+ if ((sf->sf_used > 1) && (no_collapse == 0))
+ sack_board_collapse(sf);
+
+#else
+ if (sf->sf_used > 1)
+ sack_board_collapse(sf);
+
+#endif
+ return (ret);
+}
+
+#ifndef _KERNEL
+uint64_t saved=0;
+uint64_t tot_sack_blks=0;
+
+static void
+sack_filter_dump(FILE *out, struct sack_filter *sf)
+{
+ int i;
+ fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
+ sf->sf_ack, sf->sf_bits,
+ sf->sf_cur, sf->sf_used);
+
+ for(i=0; i<SACK_FILTER_BLOCKS; i++) {
+ if (sack_blk_used(sf, i)) {
+ fprintf(out, "Entry:%d start:%u end:%u\n", i,
+ sf->sf_blks[i].start,
+ sf->sf_blks[i].end);
+ }
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ char buffer[512];
+ struct sackblk blks[TCP_MAX_SACK];
+ FILE *err;
+ tcp_seq th_ack, snd_una;
+ struct sack_filter sf;
+ int32_t numblks,i;
+ int snd_una_set=0;
+ double a, b, c;
+ int invalid_sack_print = 0;
+ uint32_t chg_remembered=0;
+ uint32_t sack_chg=0;
+ char line_buf[10][256];
+ int line_buf_at=0;
+
+ in = stdin;
+ out = stdout;
+ while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
+ switch (i) {
+ case 'n':
+ no_collapse = 1;
+ break;
+ case 'd':
+ detailed_dump = 1;
+ break;
+ case'I':
+ invalid_sack_print = 1;
+ break;
+ case 'i':
+ in = fopen(optarg, "r");
+ if (in == NULL) {
+ fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
+ exit(-1);
+ }
+ break;
+ case 'o':
+ out = fopen(optarg, "w");
+ if (out == NULL) {
+ fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
+ exit(-1);
+ }
+ break;
+ default:
+ case '?':
+ case 'h':
+ fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
+			fprintf(stderr, "Use %s [-n] [-d] [-I] [-i infile] [-o outfile]\n", argv[0]);
+ break;
+ };
+ }
+ sack_filter_clear(&sf, 0);
+ memset(buffer, 0, sizeof(buffer));
+ memset(blks, 0, sizeof(blks));
+ numblks = 0;
+ fprintf(out, "************************************\n");
+ while (fgets(buffer, sizeof(buffer), in) != NULL) {
+ sprintf(line_buf[line_buf_at], "%s", buffer);
+ line_buf_at++;
+ if (strncmp(buffer, "QUIT", 4) == 0) {
+ break;
+ } else if (strncmp(buffer, "DONE", 4) == 0) {
+ int nn, ii;
+ if (numblks) {
+ uint32_t szof, tot_chg;
+ for(ii=0; ii<line_buf_at; ii++) {
+ fprintf(out, "%s", line_buf[ii]);
+ }
+ fprintf(out, "------------------------------------\n");
+ nn = sack_filter_blks(&sf, blks, numblks, th_ack);
+ saved += numblks - nn;
+ tot_sack_blks += numblks;
+ fprintf(out, "ACK:%u\n", sf.sf_ack);
+ for(ii=0, tot_chg=0; ii<nn; ii++) {
+ szof = blks[ii].end - blks[ii].start;
+ tot_chg += szof;
+ fprintf(out, "SACK:%u:%u [%u]\n",
+ blks[ii].start,
+ blks[ii].end, szof);
+ }
+ fprintf(out,"************************************\n");
+ chg_remembered = tot_chg;
+ if (detailed_dump) {
+ sack_filter_dump(out, &sf);
+ fprintf(out,"************************************\n");
+ }
+ }
+ memset(blks, 0, sizeof(blks));
+ memset(line_buf, 0, sizeof(line_buf));
+ line_buf_at=0;
+ numblks = 0;
+ } else if (strncmp(buffer, "CHG:", 4) == 0) {
+ sack_chg = strtoul(&buffer[4], NULL, 0);
+ if ((sack_chg != chg_remembered) &&
+ (sack_chg > chg_remembered)){
+ fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
+ sack_chg, chg_remembered
+ );
+ }
+ sack_chg = chg_remembered = 0;
+ } else if (strncmp(buffer, "RXT", 3) == 0) {
+ sack_filter_clear(&sf, snd_una);
+ } else if (strncmp(buffer, "ACK:", 4) == 0) {
+ th_ack = strtoul(&buffer[4], NULL, 0);
+ if (snd_una_set == 0) {
+ snd_una = th_ack;
+ snd_una_set = 1;
+ } else if (SEQ_GT(th_ack, snd_una)) {
+ snd_una = th_ack;
+ }
+ } else if (strncmp(buffer, "EXIT", 4) == 0) {
+ sack_filter_clear(&sf, snd_una);
+ sack_chg = chg_remembered = 0;
+ } else if (strncmp(buffer, "SACK:", 5) == 0) {
+ char *end=NULL;
+ uint32_t start;
+ uint32_t endv;
+ start = strtoul(&buffer[5], &end, 0);
+ if (end) {
+ endv = strtoul(&end[1], NULL, 0);
+ } else {
+ fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
+ continue;
+ }
+ if (SEQ_LT(endv, start)) {
+ fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
+ continue;
+ }
+ if (numblks == TCP_MAX_SACK) {
+ fprintf(out, "--Exceeded max %d\n", numblks);
+ exit(0);
+ }
+ blks[numblks].start = start;
+ blks[numblks].end = endv;
+ numblks++;
+ }
+ memset(buffer, 0, sizeof(buffer));
+ }
+ if (in != stdin) {
+ fclose(in);
+ }
+ if (out != stdout) {
+ fclose(out);
+ }
+ a = saved * 100.0;
+ b = tot_sack_blks * 1.0;
+ if (b > 0.0)
+ c = a/b;
+ else
+ c = 0.0;
+ if (out != stdout)
+ err = stdout;
+ else
+ err = stderr;
+ fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
+ saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);
+ return(0);
+}
+#endif
diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h
new file mode 100644
index 000000000000..3ef0f1693f5b
--- /dev/null
+++ b/sys/netinet/tcp_stacks/sack_filter.h
@@ -0,0 +1,58 @@
+#ifndef __sack_filter_h__
+#define __sack_filter_h__
+/*-
+ * Copyright (c) 2017
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$");
+ */
+
+/*
+ * Seven entries are carefully chosen to
+ * fit in one cache line. We can easily
+ * change this to 15 (but that gains very
+ * little extra filtering). Going larger
+ * than 15 would require either making
+ * sf_bits a uint32_t (allowing up to 31)
+ * or changing it to a full bitstring.
+ * It is really doubtful you will get much
+ * benefit beyond 7; in testing there was
+ * a small amount, but very, very small.
+ */
+#define SACK_FILTER_BLOCKS 7
+
+struct sack_filter {
+ tcp_seq sf_ack;
+ uint16_t sf_bits;
+ uint8_t sf_cur;
+ uint8_t sf_used;
+ struct sackblk sf_blks[SACK_FILTER_BLOCKS];
+};
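[Editor's note] A sketch of a compile-time check one could add (an assumption, not in this commit): with 7 blocks the structure is 4 + 2 + 1 + 1 + 7 * 8 = 64 bytes, exactly one cache line on the common 64-byte targets.

#ifdef _KERNEL
/* Illustration only: tcp_seq is 32 bits and struct sackblk is 8 bytes. */
CTASSERT(sizeof(struct sack_filter) == 64);
#endif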
+#ifdef _KERNEL
+void sack_filter_clear(struct sack_filter *sf, tcp_seq seq);
+int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack);
+
+#endif
+#endif
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
new file mode 100644
index 000000000000..36bc1e6ae0b8
--- /dev/null
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -0,0 +1,321 @@
+/*-
+ * Copyright (c) 2016
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_RACK_H_
+#define _NETINET_TCP_RACK_H_
+
+#define RACK_ACKED 0x0001/* The remote endpoint acked this */
+#define RACK_TO_MIXED 	0x0002/* A timeout occurred that mixed the send order */
+#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */
+#define RACK_OVERMAX 	0x0008/* We have more retransmits than we can fit */
+#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
+#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */
+#define RACK_HAS_FIN 0x0040/* segment is sent with fin */
+#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */
+
+#define RACK_NUM_OF_RETRANS 3
+
+#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */
+
+struct rack_sendmap {
+ TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */
+ TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
+ uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
+ uint32_t r_start; /* Sequence number of the segment */
+	uint32_t r_end;		/* End seq, this is one beyond the actual end */
+ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
+ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time
+ * sent */
+ uint8_t r_flags; /* Flags as defined above */
+ uint8_t r_sndcnt; /* Retran count, not limited by
+ * RACK_NUM_OF_RETRANS */
+ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
+ uint8_t r_resv[3];
+};
+
+TAILQ_HEAD(rack_head, rack_sendmap);
+
+
+/*
+ * We use the rate sample structure to
+ * assist in single sack/ack rate and rtt
+ * calculation. In the future we will expand
+ * this in BBR to do forward rate sample
+ * b/w estimation.
+ */
+#define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */
+#define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */
+struct rack_rtt_sample {
+ uint32_t rs_flags;
+ uint32_t rs_rtt_lowest;
+ uint32_t rs_rtt_highest;
+ uint32_t rs_rtt_cnt;
+ uint64_t rs_rtt_tot;
+};
+
+#define RACK_LOG_TYPE_ACK 0x01
+#define RACK_LOG_TYPE_OUT 0x02
+#define RACK_LOG_TYPE_TO 0x03
+#define RACK_LOG_TYPE_ALLOC 0x04
+#define RACK_LOG_TYPE_FREE 0x05
+
+
+struct rack_log {
+ union {
+ struct rack_sendmap *rsm; /* For alloc/free */
+ uint64_t sb_acc;/* For out/ack or t-o */
+ };
+ uint32_t th_seq;
+ uint32_t th_ack;
+ uint32_t snd_una;
+ uint32_t snd_nxt; /* th_win for TYPE_ACK */
+ uint32_t snd_max;
+ uint32_t blk_start[4];
+ uint32_t blk_end[4];
+ uint8_t type;
+ uint8_t n_sackblks;
+ uint16_t len; /* Timeout T3=1, TLP=2, RACK=3 */
+};
+
+/*
+ * Magic numbers for logging timeout events if the
+ * logging is enabled.
+ */
+#define RACK_TO_FRM_TMR 1
+#define RACK_TO_FRM_TLP 2
+#define RACK_TO_FRM_RACK 3
+#define RACK_TO_FRM_KEEP 4
+#define RACK_TO_FRM_PERSIST 5
+#define RACK_TO_FRM_DELACK 6
+
+struct rack_opts_stats {
+ uint64_t tcp_rack_prop_rate;
+ uint64_t tcp_rack_prop;
+ uint64_t tcp_rack_tlp_reduce;
+ uint64_t tcp_rack_early_recov;
+ uint64_t tcp_rack_pace_always;
+ uint64_t tcp_rack_pace_reduce;
+ uint64_t tcp_rack_max_seg;
+ uint64_t tcp_rack_prr_sendalot;
+ uint64_t tcp_rack_min_to;
+ uint64_t tcp_rack_early_seg;
+ uint64_t tcp_rack_reord_thresh;
+ uint64_t tcp_rack_reord_fade;
+ uint64_t tcp_rack_tlp_thresh;
+ uint64_t tcp_rack_pkt_delay;
+ uint64_t tcp_rack_tlp_inc_var;
+ uint64_t tcp_tlp_use;
+ uint64_t tcp_rack_idle_reduce;
+ uint64_t tcp_rack_idle_reduce_high;
+ uint64_t rack_no_timer_in_hpts;
+ uint64_t tcp_rack_min_pace_seg;
+ uint64_t tcp_rack_min_pace;
+};
+
+#define TLP_USE_ID 1 /* Internet draft behavior */
+#define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */
+#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */
+
+#ifdef _KERNEL
+#define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t))
+extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
+#define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm))
+#define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1)
+#endif
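[Editor's note] For reference, an illustrative expansion derived from the macros above (not additional code): RACK_OPTS_INC(tcp_rack_min_to) bumps the counter whose slot in rack_opts_arry matches the field's position in struct rack_opts_stats, i.e. it is equivalent to

	counter_u64_add(rack_opts_arry[offsetof(struct rack_opts_stats,
	    tcp_rack_min_to) / sizeof(uint64_t)], 1);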
+/*
+ * As we get each SACK we wade through the
+ * rc_map and mark off what is acked.
+ * We also increment rc_sacked as well.
+ *
+ * We also pay attention to missing entries
+ * based on the time and possibly mark them
+ * for retransmit. If we do and we are not already
+ * in recovery we enter recovery. In doing
+ * so we clear prr_delivered/holes_rxt and prr_sent_dur_rec.
+ * We also setup rc_next/rc_snd_nxt/rc_send_end so
+ * we will know where to send from. When not in
+ * recovery rc_next will be NULL and rc_snd_nxt should
+ * equal snd_max.
+ *
+ * Whenever we retransmit from recovery we increment
+ * rc_holes_rxt as we retransmit a block and mark it as retransmitted
+ * with the time it was sent. During non-recovery sending we
+ * add to our map and note down the time of any send, expanding
+ * the rc_map at the tail and moving rc_snd_nxt up with snd_max.
+ *
+ * In recovery during SACK/ACK processing if a chunk has
+ * been retransmitted and it is now acked, we decrement rc_holes_rxt.
+ * When we retransmit from the scoreboard we use
+ * rc_next and rc_snd_nxt/rc_send_end to help us
+ * find what needs to be retransmitted.
+ *
+ * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt
+ * This gets us the effect of RFC6675 pipe, counting twice for
+ * bytes retransmitted.
+ */
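[Editor's note] A worked example of the pipe calculation described above, with illustrative numbers only:

	snd_una      = 1000
	snd_max      = 41000	(40 segments of 1000 bytes outstanding)
	rc_holes_rxt = 3000	(3 of them retransmitted during recovery)

	pipe = (snd_max - snd_una) + rc_holes_rxt
	     = (41000 - 1000) + 3000 = 43000 bytes

so the retransmitted bytes are counted twice, as noted above.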
+
+#define TT_RACK_FR_TMR 0x2000
+
+/*
+ * Locking for the rack control block.
+ * a) Locked by INP_WLOCK
+ * b) Locked by the hpts-mutex
+ *
+ */
+
+struct rack_control {
+ /* Second cache line 0x40 from tcp_rack */
+ struct rack_head rc_map;/* List of all segments Lock(a) */
+ struct rack_head rc_tmap; /* List in transmit order Lock(a) */
+ struct rack_sendmap *rc_tlpsend; /* Remembered place for
+ * tlp_sending Lock(a) */
+ struct rack_sendmap *rc_resend; /* something we have been asked to
+ * resend */
+ uint32_t rc_hpts_flags;
+	uint32_t rc_timer_exp;	/* If a timer ticks, the time of expiry */
+ uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */
+ uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */
+
+ /* Third Cache line 0x80 */
+ struct rack_head rc_free; /* Allocation array */
+ uint32_t rc_time_last_sent; /* Time we last sent some data and
+ * logged it Lock(a). */
+ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
+
+ uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP
+ * Lock(a) */
+ uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */
+
+ uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */
+
+ uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */
+
+ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */
+ uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */
+
+ uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */
+ uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done
+ * since peer spoke to us Lock(a) */
+ uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent
+ * rc_last_tlp_seq Lock(a) */
+
+ uint32_t rc_loss_count; /* During recovery how many segments were lost
+ * Lock(a) */
+ uint32_t rc_reorder_fade; /* Socket option value Lock(a) */
+
+	/* Fourth cache line 0xc0 */
+ /* Times */
+
+ uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */
+	uint32_t rc_holes_rxt;	/* Tot retransmitted from scoreboard Lock(a) */
+
+ /* Variables to track bad retransmits and recover */
+ uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */
+ uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */
+
+ uint32_t rc_ssthresh_at;/* ssthresh at the retransmit Lock(a) */
+ uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we
+ * have allocated */
+ uint32_t rc_rcvtime; /* When we last received data */
+ uint32_t rc_notused;
+ uint32_t rc_last_output_to;
+ uint32_t rc_went_idle_time;
+
+ struct rack_sendmap *rc_sacklast; /* sack remembered place
+ * Lock(a) */
+
+ struct rack_sendmap *rc_next; /* remembered place where we next
+ * retransmit at Lock(a) */
+ struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for
+ * cache line alignment
+ * Lock(a) */
+ /* Cache line split 0x100 */
+ struct sack_filter rack_sf;
+ /* Cache line split 0x140 */
+ /* Flags for various things */
+ struct rack_rtt_sample rack_rs;
+ uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
+ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
+ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
+ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
+ uint8_t rc_prop_rate; /* Socket option value Lock(a) */
+ uint8_t rc_prop_reduce; /* Socket option value Lock(a) */
+ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
+ uint8_t rc_early_recovery; /* Socket option value Lock(a) */
+ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
+ uint8_t rc_min_to; /* Socket option value Lock(a) */
+ uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */
+ uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */
+ uint8_t rc_rate_sample_method;
+};
+
+#ifdef _KERNEL
+
+struct tcp_rack {
+ /* First cache line 0x00 */
+ TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */
+ int32_t(*r_substate) (struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, struct tcpopt *,
+ int32_t, int32_t, int32_t *, uint32_t, int, int); /* Lock(a) */
+ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
+ struct inpcb *rc_inp; /* The inpcb Lock(a) */
+ uint32_t rc_free_cnt; /* Number of free entries on the rc_free list
+ * Lock(a) */
+ uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
+ uint16_t r_wanted_output; /* Output routine wanted to be called */
+ uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */
+ uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */
+ uint16_t rc_pace_reduce;/* Socket option value Lock(a) */
+
+ uint8_t r_state; /* Current rack state Lock(a) */
+ uint8_t rc_tmr_stopped : 7,
+ t_timers_stopped : 1;
+ uint8_t rc_enobuf; /* count of enobufs on connection provides
+ * backoff Lock(a) */
+ uint8_t r_timer_override : 1, /* hpts override Lock(a) */
+ r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */
+ r_is_v6 : 1, /* V6 pcb Lock(a) */
+ rc_in_persist : 1,
+ rc_last_pto_set : 1, /* XXX not used */
+ rc_tlp_in_progress : 1,
+ rc_always_pace : 1, /* Socket option value Lock(a) */
+ rc_timer_up : 1; /* The rack timer is up flag Lock(a) */
+ uint8_t r_idle_reduce_largest : 1,
+ r_enforce_min_pace : 2,
+ r_min_pace_seg_thresh : 5;
+ uint8_t rack_tlp_threshold_use;
+ uint8_t rc_allow_data_af_clo: 1,
+ delayed_ack : 1,
+ rc_avail : 6;
+ uint8_t r_resv[2]; /* Fill to cache line boundary */
+ /* Cache line 2 0x40 */
+ struct rack_control r_ctl;
+} __aligned(CACHE_LINE_SIZE);
+
+#endif
+#endif
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 50c6a8e99f0c..799cd37f0db7 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
@@ -139,7 +140,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
/* max idle probes */
int tcp_maxpersistidle;
-static int tcp_rexmit_drop_options = 0;
+int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
&tcp_rexmit_drop_options, 0,
"Drop TCP options from 3rd and later retransmitted SYN");
@@ -174,18 +175,13 @@ static int per_cpu_timers = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
&per_cpu_timers , 0, "run tcp timers on all cpus");
-#if 0
-#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
- ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
-#endif
-
/*
* Map the given inp to a CPU id.
*
* This queries RSS if it's compiled in, else it defaults to the current
* CPU ID.
*/
-static inline int
+inline int
inp_to_cpuid(struct inpcb *inp)
{
u_int cpuid;
@@ -243,7 +239,7 @@ int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
-static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
+int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
/*
* TCP timer processing.
@@ -948,6 +944,111 @@ tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
return callout_active(t_callout);
}
+/*
+ * Stop the timer from running, and apply a flag
+ * in the timer_flags that will force the
+ * timer never to run again. The flag is needed to ensure
+ * a race does not leave it running and cause
+ * the timer to possibly restart itself (keep and persist
+ * especially do this).
+ */
+int
+tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
+{
+ struct callout *t_callout;
+ uint32_t t_flags;
+
+ switch (timer_type) {
+ case TT_DELACK:
+ t_flags = TT_DELACK_SUS;
+ t_callout = &tp->t_timers->tt_delack;
+ break;
+ case TT_REXMT:
+ t_flags = TT_REXMT_SUS;
+ t_callout = &tp->t_timers->tt_rexmt;
+ break;
+ case TT_PERSIST:
+ t_flags = TT_PERSIST_SUS;
+ t_callout = &tp->t_timers->tt_persist;
+ break;
+ case TT_KEEP:
+ t_flags = TT_KEEP_SUS;
+ t_callout = &tp->t_timers->tt_keep;
+ break;
+ case TT_2MSL:
+ t_flags = TT_2MSL_SUS;
+ t_callout = &tp->t_timers->tt_2msl;
+ break;
+ default:
+ panic("tp:%p bad timer_type 0x%x", tp, timer_type);
+ }
+ tp->t_timers->tt_flags |= t_flags;
+ return (callout_stop(t_callout));
+}
+
+void
+tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
+{
+ switch (timer_type) {
+ case TT_DELACK:
+ if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
+ tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
+ if (tp->t_flags & TF_DELACK) {
+				/* Delayed ack is pending, activate the delack timer */
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK,
+ tcp_delacktime);
+ }
+ }
+ break;
+ case TT_REXMT:
+ if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
+ tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
+ if (SEQ_GT(tp->snd_max, tp->snd_una) &&
+ (tcp_timer_active((tp), TT_PERSIST) == 0) &&
+ tp->snd_wnd) {
+				/* We have outstanding data, activate the rexmt timer */
+ tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
+ }
+ }
+ break;
+ case TT_PERSIST:
+ if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
+ tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
+ if (tp->snd_wnd == 0) {
+ /* Activate the persists timer */
+ tp->t_rxtshift = 0;
+ tcp_setpersist(tp);
+ }
+ }
+ break;
+ case TT_KEEP:
+ if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
+ tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
+ tcp_timer_activate(tp, TT_KEEP,
+ TCPS_HAVEESTABLISHED(tp->t_state) ?
+ TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
+ }
+ break;
+ case TT_2MSL:
+		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
+ tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
+ if ((tp->t_state == TCPS_FIN_WAIT_2) &&
+ ((tp->t_inpcb->inp_socket == NULL) ||
+ (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) {
+				/* Start the 2MSL timer */
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : TP_MAXIDLE(tp));
+ }
+ }
+ break;
+ default:
+ panic("tp:%p bad timer_type 0x%x", tp, timer_type);
+ }
+}
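[Editor's note] A sketch of the intended pairing (an assumption about usage by an alternate stack such as rack, not code from this commit; the helper names are hypothetical): suspend the base callout-driven timers while the stack drives its own pacing timers, and unsuspend them when the connection is handed back.

/* Hypothetical helpers, illustration only. */
static void
example_suspend_base_timers(struct tcpcb *tp)
{

	(void)tcp_timer_suspend(tp, TT_PERSIST);
	(void)tcp_timer_suspend(tp, TT_REXMT);
}

static void
example_resume_base_timers(struct tcpcb *tp)
{

	tcp_timers_unsuspend(tp, TT_REXMT);
	tcp_timers_unsuspend(tp, TT_PERSIST);
}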
+
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index b0ff38091a0b..9250196200dd 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -168,11 +168,15 @@ struct tcp_timer {
#define TT_2MSL 0x0010
#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
-#define TT_DELACK_RST 0x0100
-#define TT_REXMT_RST 0x0200
-#define TT_PERSIST_RST 0x0400
-#define TT_KEEP_RST 0x0800
-#define TT_2MSL_RST 0x1000
+/*
+ * Suspend flags - used when suspending a timer
+ * from ever running again.
+ */
+#define TT_DELACK_SUS 0x0100
+#define TT_REXMT_SUS 0x0200
+#define TT_PERSIST_SUS 0x0400
+#define TT_KEEP_SUS 0x0800
+#define TT_2MSL_SUS 0x1000
#define TT_STOPPED 0x00010000
@@ -196,6 +200,8 @@ extern int tcp_msl;
extern int tcp_ttl; /* time to live for TCP segs */
extern int tcp_backoff[];
extern int tcp_syn_backoff[];
+extern int tcp_totbackoff;
+extern int tcp_rexmit_drop_options;
extern int tcp_always_keepalive;
extern int tcp_finwait2_timeout;
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index adaaff614484..07ddfac3bfa0 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -93,8 +93,11 @@ struct tcpcb {
void *t_fb_ptr; /* Pointer to t_fb specific data */
uint32_t t_maxseg:24, /* maximum segment size */
t_logstate:8; /* State of "black box" logging */
- uint32_t t_state:4, /* state of this connection */
- bits_spare : 24;
+ uint32_t t_port:16, /* Tunneling (over udp) port */
+ t_state:4, /* state of this connection */
+ t_idle_reduce : 1,
+ t_delayed_ack: 7, /* Delayed ack variable */
+ bits_spare : 4;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@@ -104,7 +107,7 @@ struct tcpcb {
tcp_seq snd_up; /* send urgent pointer */
uint32_t snd_wnd; /* send window */
uint32_t snd_cwnd; /* congestion-controlled window */
- uint32_t cl1_spare; /* Spare to round out CL 1 */
+ uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */
/* Cache line 2 */
u_int32_t ts_offset; /* our timestamp offset */
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
@@ -189,6 +192,7 @@ struct tcpcb {
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
int t_bytes_acked; /* # bytes acked during current RTT */
+ u_int t_maxunacktime;
u_int t_keepinit; /* time to establish connection */
u_int t_keepidle; /* time before keepalive probes begin */
u_int t_keepintvl; /* interval between keepalives */
@@ -361,6 +365,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
#define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */
+#define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */
/*
* Structure to hold TCP options that are only used during segment
@@ -649,6 +654,11 @@ struct tcp_hhook_data {
int tso;
tcp_seq curack;
};
+#ifdef TCP_HHOOK
+void hhook_run_tcp_est_out(struct tcpcb *tp,
+ struct tcphdr *th, struct tcpopt *to,
+ uint32_t len, int tso);
+#endif
#endif
/*
@@ -801,6 +811,9 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes)
#define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail)
#define V_tcp_sendspace VNET(tcp_sendspace)
+#define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead)
+#define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port)
+
#ifdef TCP_HHOOK
VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
@@ -893,9 +906,12 @@ struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+int tcp_timer_suspend(struct tcpcb *, uint32_t);
+void tcp_timers_unsuspend(struct tcpcb *, uint32_t);
int tcp_timer_active(struct tcpcb *, uint32_t);
void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
+int inp_to_cpuid(struct inpcb *inp);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
*/
@@ -921,6 +937,10 @@ void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
int tcp_compute_pipe(struct tcpcb *);
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
+struct mbuf *
+ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
+ int32_t seglimit, int32_t segsize, struct sockbuf *sb);
+
static inline void
tcp_fields_to_host(struct tcphdr *th)
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index 280c863acc49..2b7040b784cd 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -304,7 +304,7 @@ struct mbuf {
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
-#define M_UNUSED_8 0x00000100 /* --available-- */
+#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
diff --git a/sys/sys/queue.h b/sys/sys/queue.h
index 732141288ee7..2e961ddc7e08 100644
--- a/sys/sys/queue.h
+++ b/sys/sys/queue.h
@@ -95,6 +95,7 @@
* _NEXT + + + +
* _PREV - + - +
* _LAST - - + +
+ * _LAST_FAST - - - +
* _FOREACH + + + +
* _FOREACH_FROM + + + +
* _FOREACH_SAFE + + + +
@@ -817,6 +818,16 @@ struct { \
#define TAILQ_LAST(head, headname) \
(*(((struct headname *)((head)->tqh_last))->tqh_last))
+/*
+ * The FAST function is fast in that it causes no data access other
+ * than the access to the head. The standard LAST function above
+ * will cause a data access of both the element you want and
+ * the previous element. FAST is very useful for instances when
+ * you may want to prefetch the last data element.
+ */
+#define TAILQ_LAST_FAST(head, type, field) \
+ (TAILQ_EMPTY(head) ? NULL : __containerof((head)->tqh_last, QUEUE_TYPEOF(type), field.tqe_next))
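[Editor's note] An illustrative use (an assumption based on the rack types added in this commit; the helper name is hypothetical): fetch the most recently transmitted rack_sendmap by touching only the queue head, which makes it cheap to prefetch before it is needed.

static inline struct rack_sendmap *
rack_last_sent(struct tcp_rack *rack)
{
	struct rack_sendmap *last;

	last = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
	if (last != NULL)
		__builtin_prefetch(last);
	return (last);
}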
+
#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
#define TAILQ_PREV(elm, headname, field) \
diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h
index 175538071a18..ff7863da6055 100644
--- a/sys/sys/sockbuf.h
+++ b/sys/sys/sockbuf.h
@@ -166,6 +166,10 @@ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
struct mbuf *
sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff);
struct mbuf *
+ sbsndptr_noadv(struct sockbuf *sb, u_int off, u_int *moff);
+void
+ sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, u_int len);
+struct mbuf *
sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff);
int sbwait(struct sockbuf *sb);
int sblock(struct sockbuf *sb, int flags);
diff --git a/sys/sys/time.h b/sys/sys/time.h
index f278e6315d68..cd517d856d76 100644
--- a/sys/sys/time.h
+++ b/sys/sys/time.h
@@ -289,6 +289,22 @@ tvtosbt(struct timeval _tv)
#endif /* __BSD_VISIBLE */
#ifdef _KERNEL
+/*
+ * Simple macros to convert ticks to milliseconds
+ * or microseconds and vice-versa. The answer
+ * will always be at least 1. Note the return
+ * value is a uint32_t however we step up the
+ * operations to 64 bit to avoid any overflow/underflow
+ * problems.
+ */
+#define TICKS_2_MSEC(t) max(1, (uint32_t)(hz == 1000) ? \
+ (t) : (((uint64_t)(t) * (uint64_t)1000)/(uint64_t)hz))
+#define TICKS_2_USEC(t) max(1, (uint32_t)(hz == 1000) ? \
+ ((t) * 1000) : (((uint64_t)(t) * (uint64_t)1000000)/(uint64_t)hz))
+#define MSEC_2_TICKS(m) max(1, (uint32_t)((hz == 1000) ? \
+ (m) : ((uint64_t)(m) * (uint64_t)hz)/(uint64_t)1000))
+#define USEC_2_TICKS(u) max(1, (uint32_t)((hz == 1000) ? \
+ ((u) / 1000) : ((uint64_t)(u) * (uint64_t)hz)/(uint64_t)1000000))
/* Operations on timespecs */
#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0)