author     Randall Stewart <rrs@FreeBSD.org>    2023-04-01 05:46:38 +0000
committer  Randall Stewart <rrs@FreeBSD.org>    2023-04-01 05:46:38 +0000
commit     73ee5756dee6b2110eb6fb2b2ef3cde39a1fcb4f (patch)
tree       32251ae7fe68feb89a31e29ff45abcaba7750f1a
parent     63b113af5706420b149b5b8b2189d1e4d0b9782d (diff)
download   src-73ee5756dee6b2110eb6fb2b2ef3cde39a1fcb4f.tar.gz
           src-73ee5756dee6b2110eb6fb2b2ef3cde39a1fcb4f.zip
Fixes in the tcp infrastructure with respect to stack changes as well as other infrastructure updates for incoming rack features.
Stack switching has always been a bit of an issue. We currently use a break-before-make setup, which means that if something goes wrong you have to try to get back to a working stack. This patch, among a lot of other things, changes that so that it is make-before-break. We also expand some of the function blocks in prep for new features in rack that will allow more controlled pacing. We also add other abilities, such as a pathway for a stack to query the previous stack and acquire critical state information from it, so things in flight don't get dropped or mishandled when switching stacks. We also add the concept of a timer granularity. This allows an alternate stack to change from the old ticks granularity to microseconds, and of course this even gives us a pathway to go to nanosecond timekeeping if we need to (something for the data center to consider for sure). Once all this lands I will then update rack to begin using all these new features.

Reviewed by: tuexen
Sponsored by: Netflix Inc
Differential Revision: https://reviews.freebsd.org/D39210
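The make-before-break flow described above is the pattern used in tcp_switch_back_to_default(), syncache_socket() and tcp_ctloutput_set() in the diff below: the candidate stack's tfb_tcp_fb_init() now takes a void ** and is run into a local pointer first; the old stack is only finalized and released once that init has succeeded. A minimal sketch of the pattern follows (locking, epoch tracking and the tfb_switch_failed recovery path are omitted, and switch_stack_sketch() is an illustrative name, not a function in the tree):

/*
 * Make-before-break stack switch, sketched from the pattern this commit
 * uses in tcp_switch_back_to_default() and syncache_socket().
 */
static int
switch_stack_sketch(struct tcpcb *tp, struct tcp_function_block *new_fb)
{
	void *ptr = NULL;
	int error;

	/* 1. "Make": initialize the new stack; nothing has been torn down yet. */
	if (new_fb->tfb_tcp_fb_init != NULL &&
	    (error = (*new_fb->tfb_tcp_fb_init)(tp, &ptr)) != 0) {
		/* Init failed: the old stack is still fully in place. */
		refcount_release(&new_fb->tfb_refcnt);
		return (error);
	}

	/* 2. "Break": only now release the old stack. */
	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);	/* 0 == tcb is not going away */
	refcount_release(&tp->t_fb->tfb_refcnt);

	/* 3. Install the new function block and its private state. */
	tp->t_fb = new_fb;
	tp->t_fb_ptr = ptr;
	return (0);
}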
-rw-r--r--sys/conf/options1
-rw-r--r--sys/kern/kern_sendfile.c9
-rw-r--r--sys/modules/tcp/rack/Makefile2
-rw-r--r--sys/netinet/tcp.h66
-rw-r--r--sys/netinet/tcp_hpts.h9
-rw-r--r--sys/netinet/tcp_log_buf.c89
-rw-r--r--sys/netinet/tcp_stacks/bbr.c110
-rw-r--r--sys/netinet/tcp_stacks/rack.c12
-rw-r--r--sys/netinet/tcp_subr.c593
-rw-r--r--sys/netinet/tcp_syncache.c29
-rw-r--r--sys/netinet/tcp_usrreq.c66
-rw-r--r--sys/netinet/tcp_var.h274
-rw-r--r--sys/sys/mbuf.h14
13 files changed, 1172 insertions, 102 deletions
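Before the patch body, one note on the timer-granularity piece: an alternate stack opts into microsecond timers from its tfb_tcp_fb_init() via the new tcp_change_time_units() helper (added in tcp_subr.c below), which converts t_rttlow/t_srtt/t_rttvar in place and records the result in t_tmr_granularity. A hedged sketch; tcp_change_time_units(), TCP_TMR_GRANULARITY_USEC and the init signature come from this commit, while example_pcb and example_stack_init() are made-up names for illustration:

/*
 * Illustrative-only example of a stack init adopting microsecond timers.
 */
struct example_pcb {
	uint32_t dummy_state;		/* stand-in for real per-connection state */
};

static int
example_stack_init(struct tcpcb *tp, void **ptr)
{
	struct example_pcb *pcb;

	pcb = malloc(sizeof(*pcb), M_TEMP, M_NOWAIT | M_ZERO);
	if (pcb == NULL)
		return (ENOMEM);
	*ptr = pcb;	/* the caller stores this into tp->t_fb_ptr on success */

	/* This stack keeps srtt/rttvar in microseconds rather than ticks. */
	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
	return (0);
}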
diff --git a/sys/conf/options b/sys/conf/options
index 173c56229084..40bb1e56e8b0 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -227,6 +227,7 @@ SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS opt_inet.h
+TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 12842e3476e1..9804d14d675d 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
@@ -1188,6 +1191,12 @@ prepend_header:
NULL, NULL, td);
sendfile_iodone(sfio, NULL, 0, error);
}
+#ifdef TCP_REQUEST_TRK
+ if (so->so_proto->pr_protocol == IPPROTO_TCP) {
+ /* log the sendfile call to the TCP log, if enabled */
+ tcp_log_sendfile(so, offset, nbytes, flags);
+ }
+#endif
CURVNET_RESTORE();
m = NULL;
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
index 68ce40cc074e..cf95faa7fcfd 100644
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c rack_bbr_common.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 1c34442f2617..bec1dc3552d1 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -217,15 +217,15 @@ struct tcphdr {
/* Options for Rack and BBR */
#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
-#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
+#define TCP_RACK_PROP 1051 /* Not used */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
-#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
+#define TCP_RACK_PROP_RATE 1056 /* Not used */
#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
-#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
+#define TCP_RACK_EARLY_RECOV 1059 /* Not used */
#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
@@ -309,12 +309,22 @@ struct tcphdr {
#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */
#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
-#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
+#define TCP_FAST_RSM_HACK 1137 /* Not used in modern stacks */
#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
#define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */
#define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */
#define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */
+#define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */
+#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */
+#define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */
+#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
+#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
+#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
+#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
+#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
+#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
+
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -448,6 +458,53 @@ struct tcp_function_set {
#define TLS_GET_RECORD 2
/*
+ * TCP log user opaque
+ */
+struct http_req {
+ uint64_t timestamp;
+ uint64_t start;
+ uint64_t end;
+ uint32_t flags;
+};
+
+union tcp_log_userdata {
+ struct http_req http_req;
+};
+
+struct tcp_log_user {
+ uint32_t type;
+ uint32_t subtype;
+ union tcp_log_userdata data;
+};
+
+/* user types, i.e. apps */
+#define TCP_LOG_USER_HTTPD 1
+
+/* user subtypes */
+#define TCP_LOG_HTTPD_TS 1 /* client timestamp */
+#define TCP_LOG_HTTPD_TS_REQ 2 /* client timestamp and request info */
+
+/* HTTPD REQ flags */
+#define TCP_LOG_HTTPD_RANGE_START 0x0001
+#define TCP_LOG_HTTPD_RANGE_END 0x0002
+
+/* Flags for hybrid pacing */
+#define TCP_HYBRID_PACING_CU 0x0001 /* Enable catch-up mode */
+#define TCP_HYBRID_PACING_DTL 0x0002 /* Enable Detailed logging */
+#define TCP_HYBRID_PACING_CSPR 0x0004 /* A client suggested rate is present */
+#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */
+#define TCP_HYBRID_PACING_ENABLE 0x0010 /* Enable hybrid pacing (otherwise disable it) */
+#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Client wants us to set the mss, overriding the gp estimate in CU */
+#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */
+
+struct tcp_hybrid_req {
+ struct http_req req;
+ uint64_t cspr;
+ uint32_t hint_maxseg;
+ uint32_t hybrid_flags;
+};
+
+/*
* TCP specific variables of interest for tp->t_stats stats(9) accounting.
*/
#define VOI_TCP_TXPB 0 /* Transmit payload bytes */
@@ -460,6 +517,7 @@ struct tcp_function_set {
#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */
#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */
#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */
+#define VOI_TCP_PATHRTT 10 /* The path RTT based on ACK arrival */
#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */
#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index ebee6a01b983..51e6d62929d6 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -187,6 +187,15 @@ tcp_tv_to_lusectick(const struct timeval *sv)
}
#ifdef _KERNEL
+
+extern int32_t tcp_min_hptsi_time;
+
+static __inline int32_t
+get_hpts_min_sleep_time(void)
+{
+ return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
+}
+
static __inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index 491e1c23588c..5a16c7593cfc 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_seq.h>
#include <netinet/tcp_hpts.h>
/* Default expiry time */
@@ -2844,6 +2845,10 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
{
struct inpcb *inp;
struct tcpcb *tp;
+#ifdef TCP_REQUEST_TRK
+ struct http_sendfile_track *ent;
+ int i, fnd;
+#endif
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
@@ -2873,6 +2878,90 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
&tptosocket(tp)->so_snd,
TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
}
+#ifdef TCP_REQUEST_TRK
+ if (tp->t_http_req == 0) {
+ /* No http requests to track */
+ goto done;
+ }
+ fnd = 0;
+ if (tp->t_http_closed == 0) {
+ /* No closed end req to track */
+ goto skip_closed_req;
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ /* Lets see if this one can be found */
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ /* Not used */
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ /* This pass does not consider open requests */
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) {
+ /* Don't look at what we have completed */
+ continue;
+ }
+ /* If we reach here it's an allocated closed-end request */
+ if ((ent->start == offset) ||
+ ((offset > ent->start) && (offset < ent->end))) {
+ /* It's within this request */
+ fnd = 1;
+ }
+ if (fnd) {
+ /*
+ * Sendfile has touched this closed-end request; note it and check for completion.
+ */
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ /*
+ * When an entry completes we can take (snd_una + sb_ccc) and know where
+ * the end of the range really is. Note that this works since two
+ * requests must be sequential and sendfile is now complete for *this* request.
+ * We must use sb_ccc since the data may still be in-flight in TLS.
+ *
+ * We always cautiously move the end_seq only if our calculations
+ * show it happened (just in case sendfile calls here at the wrong
+ * place). When we go COMP we will stop coming here and hopefully be
+ * left with the correct end_seq.
+ */
+ if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
+ ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
+ if ((offset + nbytes) >= ent->end) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_COMP;
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_COMPLETE, offset, nbytes);
+ } else {
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_MOREYET, offset, nbytes);
+ }
+ /* We assume that sendfile never sends overlapping requests */
+ goto done;
+ }
+ }
+skip_closed_req:
+ if (!fnd) {
+ /* Ok now lets look for open requests */
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ /* Not used */
+ continue;
+ }
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0)
+ continue;
+ /* If we reach here its an allocated open request */
+ if (ent->start == offset) {
+ /* It begins this request */
+ ent->start_seq = tp->snd_una +
+ tptosocket(tp)->so_snd.sb_ccc;
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ break;
+ } else if (offset > ent->start) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ break;
+ }
+ }
+ }
+#endif
done:
INP_WUNLOCK(inp);
}
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 66f19ccd6c2b..621357494a02 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -500,7 +500,7 @@ static void
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
int32_t line);
static void
-bbr_stop_all_timers(struct tcpcb *tp);
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr);
static void
bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
@@ -1970,7 +1970,7 @@ bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
static void
bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2669,7 +2669,7 @@ bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
uint32_t newbw, uint32_t obw, uint32_t diff,
uint32_t tim)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (/*bbr_verbose_logging && */tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2697,7 +2697,7 @@ bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
static inline void
bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (bbr_verbose_logging && tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
@@ -6281,6 +6281,9 @@ tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
else
apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
}
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rtt));
+#endif
if (bbr->rc_ack_was_delayed)
rtt += bbr->r_ctl.rc_ack_hdwr_delay;
@@ -9850,16 +9853,13 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
static void
-bbr_stop_all_timers(struct tcpcb *tp)
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr)
{
- struct tcp_bbr *bbr;
-
/*
* Assure no timers are running.
*/
if (tcp_timer_active(tp, TT_PERSIST)) {
/* We enter in persists, set the flag appropriately */
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
bbr->rc_in_persist = 1;
}
}
@@ -9927,14 +9927,14 @@ bbr_google_mode_off(struct tcp_bbr *bbr)
* which indicates the error (usually no memory).
*/
static int
-bbr_init(struct tcpcb *tp)
+bbr_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_bbr *bbr = NULL;
uint32_t cts;
- tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
- if (tp->t_fb_ptr == NULL) {
+ *ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
+ if (*ptr == NULL) {
/*
* We need to allocate memory but cant. The INP and INP_INFO
* locks and they are recursive (happens during setup. So a
@@ -9943,10 +9943,16 @@ bbr_init(struct tcpcb *tp)
*/
return (ENOMEM);
}
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ bbr = (struct tcp_bbr *)*ptr;
bbr->rtt_valid = 0;
inp->inp_flags2 |= INP_CANNOT_DO_ECN;
inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ /* Take off any undesired flags */
+ inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ inp->inp_flags2 &= ~INP_MBUF_ACKCMP;
+ inp->inp_flags2 &= ~INP_MBUF_L_ACKS;
+
TAILQ_INIT(&bbr->r_ctl.rc_map);
TAILQ_INIT(&bbr->r_ctl.rc_free);
TAILQ_INIT(&bbr->r_ctl.rc_tmap);
@@ -10074,8 +10080,8 @@ bbr_init(struct tcpcb *tp)
rsm = bbr_alloc(bbr);
if (rsm == NULL) {
- uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
- tp->t_fb_ptr = NULL;
+ uma_zfree(bbr_pcb_zone, *ptr);
+ *ptr = NULL;
return (ENOMEM);
}
rsm->r_rtt_not_allowed = 1;
@@ -10128,7 +10134,17 @@ bbr_init(struct tcpcb *tp)
* the TCB on the hptsi wheel if a timer is needed with appropriate
* flags.
*/
- bbr_stop_all_timers(tp);
+ bbr_stop_all_timers(tp, bbr);
+ /*
+ * Validate the timers are not in usec; if they are, convert.
+ * BBR should in theory move to USEC and get rid of a
+ * lot of the TICKS_2 calls... but for now we stay
+ * with tick timers.
+ */
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
return (0);
}
@@ -10172,7 +10188,6 @@ static void
bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
if (tp->t_fb_ptr) {
- struct inpcb *inp = tptoinpcb(tp);
uint32_t calc;
struct tcp_bbr *bbr;
struct bbr_sendmap *rsm;
@@ -10182,10 +10197,6 @@ bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
bbr_log_flowend(bbr);
bbr->rc_tp = NULL;
- /* Backout any flags2 we applied */
- inp->inp_flags2 &= ~INP_CANNOT_DO_ECN;
- inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
- inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
if (bbr->bbr_hdrw_pacing)
counter_u64_add(bbr_flows_whdwr_pacing, -1);
else
@@ -11853,7 +11864,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
int32_t isipv6;
#endif
uint8_t app_limited = BBR_JR_SENT_DATA;
- uint8_t filled_all = 0;
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
/* We take a cache hit here */
memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
@@ -13162,7 +13172,7 @@ send:
if_hw_tsomaxsegsize, msb,
((rsm == NULL) ? hw_tls : 0)
#ifdef NETFLIX_COPY_ARGS
- , &filled_all
+ , NULL, NULL
#endif
);
if (len <= maxseg) {
@@ -13474,7 +13484,7 @@ send:
#endif
/* Log to the black box */
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -13483,13 +13493,10 @@ send:
log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
log.u_bbr.flex3 = maxseg;
log.u_bbr.flex4 = delay_calc;
- /* Encode filled_all into the upper flex5 bit */
log.u_bbr.flex5 = bbr->rc_past_init_win;
log.u_bbr.flex5 <<= 1;
log.u_bbr.flex5 |= bbr->rc_no_pacing;
log.u_bbr.flex5 <<= 29;
- if (filled_all)
- log.u_bbr.flex5 |= 0x80000000;
log.u_bbr.flex5 |= tp->t_maxseg;
log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
@@ -14073,6 +14080,56 @@ bbr_pru_options(struct tcpcb *tp, int flags)
return (0);
}
+static void
+bbr_switch_failed(struct tcpcb *tp)
+{
+ /*
+ * If a switch fails we only need to
+ * make sure mbuf_queuing is still in place.
+ * We also need to make sure we are still in
+ * ticks granularity (though we should probably
+ * change bbr to go to USECs).
+ *
+ * For timers we need to see if we are still in the
+ * pacer (if our flags are up); if so we are good, if
+ * not we need to get back into the pacer.
+ */
+ struct inpcb *inp = tptoinpcb(tp);
+ struct timeval tv;
+ uint32_t cts;
+ uint32_t toval;
+ struct tcp_bbr *bbr;
+ struct hpts_diag diag;
+
+ inp->inp_flags2 |= INP_CANNOT_DO_ECN;
+ inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ if (inp->inp_in_hpts) {
+ return;
+ }
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ cts = tcp_get_usecs(&tv);
+ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ if (TSTMP_GT(bbr->rc_pacer_started, cts)) {
+ toval = bbr->rc_pacer_started - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
+ toval = bbr->r_ctl.rc_timer_exp - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else
+ toval = HPTS_TICKS_PER_SLOT;
+ (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+ __LINE__, &diag);
+ bbr_log_hpts_diag(bbr, cts, &diag);
+}
+
struct tcp_function_block __tcp_bbr = {
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = bbr_output,
@@ -14087,6 +14144,7 @@ struct tcp_function_block __tcp_bbr = {
.tfb_tcp_handoff_ok = bbr_handoff_ok,
.tfb_tcp_mtu_chg = bbr_mtu_chg,
.tfb_pru_options = bbr_pru_options,
+ .tfb_switch_failed = bbr_switch_failed,
.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
};
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index d4ba3771ab6e..8b205d12d7f7 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -458,7 +458,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
-static int32_t rack_init(struct tcpcb *tp);
+static int32_t rack_init(struct tcpcb *tp, void **ptr);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
@@ -12344,7 +12344,7 @@ rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
}
static int
-rack_init(struct tcpcb *tp)
+rack_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_rack *rack = NULL;
@@ -12354,8 +12354,8 @@ rack_init(struct tcpcb *tp)
uint32_t iwin, snt, us_cts;
int err;
- tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
- if (tp->t_fb_ptr == NULL) {
+ *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
+ if (*ptr == NULL) {
/*
* We need to allocate memory but cant. The INP and INP_INFO
* locks and they are recursive (happens during setup. So a
@@ -12364,9 +12364,9 @@ rack_init(struct tcpcb *tp)
*/
return (ENOMEM);
}
- memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
+ memset(*ptr, 0, sizeof(struct tcp_rack));
- rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack = (struct tcp_rack *)*ptr;
RB_INIT(&rack->r_ctl.rc_mtree);
TAILQ_INIT(&rack->r_ctl.rc_free);
TAILQ_INIT(&rack->r_ctl.rc_tmap);
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 4abc0776b14e..1f2256c6b6f9 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -109,6 +109,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_fastopen.h>
@@ -152,6 +153,11 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
CTLFLAG_RW,
&tcp_force_detection, 0,
"Do we force detection even if the INP has it off?");
+int32_t tcp_sad_limit = 10000;
+SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
+ CTLFLAG_RW,
+ &tcp_sad_limit, 10000,
+ "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?");
int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
CTLFLAG_RW,
@@ -363,7 +369,7 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
#define V_ts_offset_secret VNET(ts_offset_secret)
-static int tcp_default_fb_init(struct tcpcb *tp);
+static int tcp_default_fb_init(struct tcpcb *tp, void **ptr);
static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -519,19 +525,12 @@ void
tcp_switch_back_to_default(struct tcpcb *tp)
{
struct tcp_function_block *tfb;
+ void *ptr = NULL;
KASSERT(tp->t_fb != &tcp_def_funcblk,
("%s: called by the built-in default stack", __func__));
/*
- * Release the old stack. This function will either find a new one
- * or panic.
- */
- if (tp->t_fb->tfb_tcp_fb_fini != NULL)
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- refcount_release(&tp->t_fb->tfb_refcnt);
-
- /*
* Now, we'll find a new function block to use.
* Start by trying the current user-selected
* default, unless this stack is the user-selected
@@ -551,14 +550,20 @@ tcp_switch_back_to_default(struct tcpcb *tp)
/* Try to use that stack. */
if (tfb != NULL) {
/* Initialize the new stack. If it succeeds, we are done. */
- tp->t_fb = tfb;
- if (tp->t_fb->tfb_tcp_fb_init == NULL ||
- (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ if (tfb->tfb_tcp_fb_init == NULL ||
+ (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+ /* Release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* Now set in all the pointers */
+ tp->t_fb = tfb;
+ tp->t_fb_ptr = ptr;
return;
-
+ }
/*
* Initialization failed. Release the reference count on
- * the stack.
+ * the looked up default stack.
*/
refcount_release(&tfb->tfb_refcnt);
}
@@ -578,12 +583,18 @@ tcp_switch_back_to_default(struct tcpcb *tp)
panic("Default stack rejects a new session?");
}
}
- tp->t_fb = tfb;
- if (tp->t_fb->tfb_tcp_fb_init != NULL &&
- (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ if (tfb->tfb_tcp_fb_init != NULL &&
+ (*tfb->tfb_tcp_fb_init)(tp, &ptr)) {
/* The default stack cannot fail */
panic("Default stack initialization failed");
}
+ /* Now release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* And set in the pointers to the new */
+ tp->t_fb = tfb;
+ tp->t_fb_ptr = ptr;
}
static bool
@@ -1040,16 +1051,37 @@ tcp_default_handoff_ok(struct tcpcb *tp)
* it is required to always succeed since it is the stack of last resort!
*/
static int
-tcp_default_fb_init(struct tcpcb *tp)
+tcp_default_fb_init(struct tcpcb *tp, void **ptr)
{
struct socket *so = tptosocket(tp);
+ int rexmt;
INP_WLOCK_ASSERT(tptoinpcb(tp));
+ /* We don't use the pointer */
+ *ptr = NULL;
KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
("%s: connection %p in unexpected state %d", __func__, tp,
tp->t_state));
+ /* Make sure we get no interesting mbuf queuing behavior */
+ /* All mbuf queue/ack compress flags should be off */
+ tcp_lro_features_off(tptoinpcb(tp));
+
+ /* Cancel the GP measurement in progress */
+ tp->t_flags &= ~TF_GPUTINPROG;
+ /* Validate the timers are not in usec, if they are convert */
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ if (tp->t_rxtshift == 0)
+ tp->t_rxtcur = rexmt;
+ else
+ TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
+
/*
* Nothing to do for ESTABLISHED or LISTEN states. And, we don't
* know what to do for unexpected states (which includes TIME_WAIT).
@@ -2240,6 +2272,8 @@ tcp_newtcpcb(struct inpcb *inp)
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
+ /* We always start with ticks granularity */
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -2265,7 +2299,7 @@ tcp_newtcpcb(struct inpcb *inp)
#endif
tp->t_pacing_rate = -1;
if (tp->t_fb->tfb_tcp_fb_init) {
- if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
refcount_release(&tp->t_fb->tfb_refcnt);
return (NULL);
}
@@ -4019,3 +4053,524 @@ tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, ui
}
}
#endif
+
+void
+tcp_change_time_units(struct tcpcb *tp, int granularity)
+{
+ if (tp->t_tmr_granularity == granularity) {
+ /* We are there */
+ return;
+ }
+ if (granularity == TCP_TMR_GRANULARITY_USEC) {
+ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS),
+ ("Granularity is not TICKS its %u in tp:%p",
+ tp->t_tmr_granularity, tp));
+ tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = tp->t_srtt >> TCP_RTT_SHIFT;
+ frac = tp->t_srtt & 0x1f;
+ tp->t_srtt = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a hz.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
+ frac = tp->t_rttvar & 0x1f;
+ tp->t_rttvar = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a hz.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC;
+ } else if (granularity == TCP_TMR_GRANULARITY_TICKS) {
+ /* Convert back to ticks, with */
+ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC),
+ ("Granularity is not USEC its %u in tp:%p",
+ tp->t_tmr_granularity, tp));
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_srtt);
+ frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
+ tp->t_srtt = val << TCP_RTT_SHIFT;
+ /*
+ * frac here is the fractional part left
+ * over from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_rttvar);
+ frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
+ tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
+ /*
+ * frac here is the fractional part left
+ * over from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
+ }
+#ifdef INVARIANTS
+ else {
+ panic("Unknown granularity:%d tp:%p",
+ granularity, tp);
+ }
+#endif
+}
+
+void
+tcp_handle_orphaned_packets(struct tcpcb *tp)
+{
+ struct mbuf *save, *m, *prev;
+ /*
+ * Called when a stack switch is occurring from the fini()
+ * of the old stack. We assume the init() of the new stack
+ * has already been run and that it has set inp_flags2 to
+ * what it supports. This function then deals with any
+ * differences, i.e. cleans up packets that may be queued
+ * which the new stack does not support.
+ */
+
+ if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_L_ACKS)
+ return;
+ if ((tptoinpcb(tp)->inp_flags2 & INP_SUPPORTS_MBUFQ) == 0) {
+ /*
+ * It is unsafe to process the packets since a
+ * reset may be lurking in them (it's rare but it
+ * can occur). If we were to find a RST, then we
+ * would end up dropping the connection and the
+ * INP lock, so when we return the caller (tcp_usrreq)
+ * will blow up when it tries to unlock the inp.
+ * This new stack does not do any fancy LRO features
+ * so all we can do is toss the packets.
+ */
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ while (m) {
+ save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = save;
+ }
+ } else {
+ /*
+ * Here we have a stack that does mbuf queuing but
+ * does not support compressed ACKs. We must
+ * walk all the mbufs and discard any compressed acks.
+ */
+ m = tp->t_in_pkt;
+ prev = NULL;
+ while (m) {
+ if (m->m_flags & M_ACKCMP) {
+ /* We must toss this packet */
+ if (tp->t_tail_pkt == m)
+ tp->t_tail_pkt = prev;
+ if (prev)
+ prev->m_nextpkt = m->m_nextpkt;
+ else
+ tp->t_in_pkt = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ /* move forward */
+ if (prev)
+ m = prev->m_nextpkt;
+ else
+ m = tp->t_in_pkt;
+ } else {
+ /* this one is ok */
+ prev = m;
+ m = m->m_nextpkt;
+ }
+ }
+ }
+}
+
+#ifdef TCP_REQUEST_TRK
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes)
+{
+#ifdef KERN_TLS
+ struct ktls_session *tls;
+ uint32_t rec_oh, records;
+
+ tls = so->so_snd.sb_tls_info;
+ if (tls == NULL)
+ return (0);
+
+ rec_oh = tls->params.tls_hlen + tls->params.tls_tlen;
+ records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len);
+ return (records * rec_oh);
+#else
+ return (0);
+#endif
+}
+
+extern uint32_t tcp_stale_entry_time;
+uint32_t tcp_stale_entry_time = 250000;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW,
+ &tcp_stale_entry_time, 250000, "Time that an http entry without a sendfile ages out");
+
+void
+tcp_http_log_req_info(struct tcpcb *tp, struct http_sendfile_track *http,
+ uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes)
+{
+ if (tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+#ifdef TCPHPTS
+ log.u_bbr.inhpts = tcp_in_hpts(tptoinpcb(tp));
+#endif
+ log.u_bbr.flex8 = val;
+ log.u_bbr.rttProp = http->timestamp;
+ log.u_bbr.delRate = http->start;
+ log.u_bbr.cur_del_rate = http->end;
+ log.u_bbr.flex1 = http->start_seq;
+ log.u_bbr.flex2 = http->end_seq;
+ log.u_bbr.flex3 = http->flags;
+ log.u_bbr.flex4 = ((http->localtime >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex5 = (http->localtime & 0x00000000ffffffff);
+ log.u_bbr.flex7 = slot;
+ log.u_bbr.bw_inuse = offset;
+ /* nbytes = flex6 | epoch */
+ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff);
+ log.u_bbr.epoch = (nbytes & 0x00000000ffffffff);
+ /* cspr = lt_epoch | pkts_out */
+ log.u_bbr.lt_epoch = ((http->cspr >> 32) & 0x00000000ffffffff);
+ log.u_bbr.pkts_out |= (http->cspr & 0x00000000ffffffff);
+ log.u_bbr.applimited = tp->t_http_closed;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_open;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_req;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ TCP_LOG_HTTP_T, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+void
+tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent)
+{
+ if (tp->t_http_req > 0)
+ tp->t_http_req--;
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ if (tp->t_http_open > 0)
+ tp->t_http_open--;
+ } else {
+ if (tp->t_http_closed > 0)
+ tp->t_http_closed--;
+ }
+ ent->flags = TCP_HTTP_TRACK_FLG_EMPTY;
+}
+
+static void
+tcp_http_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest)
+{
+ struct http_sendfile_track *ent;
+ uint64_t time_delta, oldest_delta;
+ int i, oldest, oldest_set = 0, cnt_rm = 0;
+
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags != TCP_HTTP_TRACK_FLG_USED) {
+ /*
+ * We only care about closed end ranges
+ * that are allocated and have no sendfile
+ * ever touching them. They would be in
+ * state USED.
+ */
+ continue;
+ }
+ if (ts >= ent->localtime)
+ time_delta = ts - ent->localtime;
+ else
+ time_delta = 0;
+ if (time_delta &&
+ ((oldest_delta < time_delta) || (oldest_set == 0))) {
+ oldest_set = 1;
+ oldest = i;
+ oldest_delta = time_delta;
+ }
+ if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) {
+ /*
+ * No sendfile within our time limit;
+ * time to purge it.
+ */
+ cnt_rm++;
+ tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE,
+ time_delta, 0);
+ tcp_http_free_a_slot(tp, ent);
+ }
+ }
+ if ((cnt_rm == 0) && rm_oldest && oldest_set) {
+ ent = &tp->t_http_info[oldest];
+ tcp_http_log_req_info(tp, ent, oldest, TCP_HTTP_REQ_LOG_STALE,
+ oldest_delta, 1);
+ tcp_http_free_a_slot(tp, ent);
+ }
+}
+
+int
+tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point)
+{
+ int i, ret=0;
+ struct http_sendfile_track *ent;
+
+ /* Clean up any old closed end requests that are now completed */
+ if (tp->t_http_req == 0)
+ return(0);
+ if (tp->t_http_closed == 0)
+ return(0);
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ /* Skip empty ones */
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ continue;
+ /* Skip open ones */
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN)
+ continue;
+ if (SEQ_GEQ(ack_point, ent->end_seq)) {
+ /* We are past it -- free it */
+ tcp_http_log_req_info(tp, ent,
+ i, TCP_HTTP_REQ_LOG_FREED, 0, 0);
+ tcp_http_free_a_slot(tp, ent);
+ ret++;
+ }
+ }
+ return (ret);
+}
+
+int
+tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point)
+{
+ if (tp->t_http_req == 0)
+ return(-1);
+ if (tp->t_http_closed == 0)
+ return(-1);
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ return(-1);
+ if (SEQ_GEQ(ack_point, ent->end_seq)) {
+ return (1);
+ }
+ return (0);
+}
+
+struct http_sendfile_track *
+tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip)
+{
+ /*
+ * Given an ack point (th_ack) walk through our entries and
+ * return the first one found that th_ack goes past the
+ * end_seq.
+ */
+ struct http_sendfile_track *ent;
+ int i;
+
+ if (tp->t_http_req == 0) {
+ /* none open */
+ return (NULL);
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ continue;
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) {
+ if (SEQ_GEQ(th_ack, ent->end_seq)) {
+ *ip = i;
+ return (ent);
+ }
+ }
+ }
+ return (NULL);
+}
+
+struct http_sendfile_track *
+tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq)
+{
+ struct http_sendfile_track *ent;
+ int i;
+
+ if (tp->t_http_req == 0) {
+ /* none open */
+ return (NULL);
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_SEARCH,
+ (uint64_t)seq, 0);
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ /*
+ * An open end request only needs to
+ * match the beginning seq or be
+ * all we have (once we keep going on
+ * an open end request we may have a seq
+ * wrap).
+ */
+ if ((SEQ_GEQ(seq, ent->start_seq)) ||
+ (tp->t_http_closed == 0))
+ return (ent);
+ } else {
+ /*
+ * For this one we need to
+ * be a bit more careful if it's
+ * completed at least.
+ */
+ if ((SEQ_GEQ(seq, ent->start_seq)) &&
+ (SEQ_LT(seq, ent->end_seq))) {
+ return (ent);
+ }
+ }
+ }
+ return (NULL);
+}
+
+/* Should this be in its own file tcp_http.c ? */
+struct http_sendfile_track *
+tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups)
+{
+ struct http_sendfile_track *fil;
+ int i, allocated;
+
+ /* In case the stack does not check for completions do so now */
+ tcp_http_check_for_comp(tp, tp->snd_una);
+ /* Check for stale entries */
+ if (tp->t_http_req)
+ tcp_http_check_for_stale_entries(tp, ts,
+ (tp->t_http_req >= MAX_TCP_HTTP_REQ));
+ /* Check to see if this is a duplicate of one not started */
+ if (tp->t_http_req) {
+ for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ fil = &tp->t_http_info[i];
+ if (fil->flags != TCP_HTTP_TRACK_FLG_USED)
+ continue;
+ if ((fil->timestamp == req->timestamp) &&
+ (fil->start == req->start) &&
+ ((fil->flags & TCP_HTTP_TRACK_FLG_OPEN) ||
+ (fil->end == req->end))) {
+ /*
+ * We already have this request
+ * and it has not been started with sendfile.
+ * This probably means the user was returned
+ * a 4xx of some sort and it's going to age
+ * out; let's not duplicate it.
+ */
+ return(fil);
+ }
+ }
+ }
+ /* Ok if there is no room at the inn we are in trouble */
+ if (tp->t_http_req >= MAX_TCP_HTTP_REQ) {
+ tcp_trace_point(tp, TCP_TP_HTTP_LOG_FAIL);
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ tcp_http_log_req_info(tp, &tp->t_http_info[i],
+ i, TCP_HTTP_REQ_LOG_ALLOCFAIL, 0, 0);
+ }
+ return (NULL);
+ }
+ for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ fil = &tp->t_http_info[i];
+ if (fil->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ allocated = 1;
+ fil->flags = TCP_HTTP_TRACK_FLG_USED;
+ fil->timestamp = req->timestamp;
+ fil->localtime = ts;
+ fil->start = req->start;
+ if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
+ fil->end = req->end;
+ } else {
+ fil->end = 0;
+ fil->flags |= TCP_HTTP_TRACK_FLG_OPEN;
+ }
+ /*
+ * We can set the min boundaries to the TCP Sequence space,
+ * but it might be found to be further up when sendfile
+ * actually runs on this range (if it ever does).
+ */
+ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
+ fil->start_seq = tp->snd_una +
+ tptosocket(tp)->so_snd.sb_ccc;
+ fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ if (tptosocket(tp)->so_snd.sb_tls_info) {
+ /*
+ * This session is doing TLS. Take a swag guess
+ * at the overhead.
+ */
+ fil->end_seq += tcp_estimate_tls_overhead(
+ tptosocket(tp), (fil->end - fil->start));
+ }
+ tp->t_http_req++;
+ if (fil->flags & TCP_HTTP_TRACK_FLG_OPEN)
+ tp->t_http_open++;
+ else
+ tp->t_http_closed++;
+ tcp_http_log_req_info(tp, fil, i,
+ TCP_HTTP_REQ_LOG_NEW, 0, 0);
+ break;
+ } else
+ fil = NULL;
+ }
+ return (fil);
+}
+
+void
+tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts)
+{
+ (void)tcp_http_alloc_req_full(tp, &user->http_req, ts, 1);
+}
+#endif
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index e45fc457f4d6..6a3561b179c2 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -932,22 +932,27 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
* pickup one on the new entry.
*/
struct tcp_function_block *rblk;
+ void *ptr = NULL;
rblk = find_and_ref_tcp_fb(blk);
KASSERT(rblk != NULL,
("cannot find blk %p out of syncache?", blk));
- if (tp->t_fb->tfb_tcp_fb_fini)
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- refcount_release(&tp->t_fb->tfb_refcnt);
- tp->t_fb = rblk;
- /*
- * XXXrrs this is quite dangerous, it is possible
- * for the new function to fail to init. We also
- * are not asking if the handoff_is_ok though at
- * the very start thats probalbly ok.
- */
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
+
+ if (rblk->tfb_tcp_fb_init == NULL ||
+ (*rblk->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+ /* Release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* Now set in all the pointers */
+ tp->t_fb = rblk;
+ tp->t_fb_ptr = ptr;
+ } else {
+ /*
+ * Initialization failed. Release the reference count on
+ * the looked-up stack.
+ */
+ refcount_release(&rblk->tfb_refcnt);
}
}
tp->snd_wl1 = sc->sc_irs;
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 7508b596bad3..7abf4c215102 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1659,6 +1659,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
*/
struct tcp_function_set fsn;
struct tcp_function_block *blk;
+ void *ptr = NULL;
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
@@ -1666,10 +1667,6 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
return (error);
INP_WLOCK(inp);
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
tp = intotcpcb(inp);
blk = find_and_ref_tcp_functions(&fsn);
@@ -1710,41 +1707,57 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
return (ENOENT);
}
/*
- * Release the old refcnt, the
- * lookup acquired a ref on the
- * new one already.
+ * Ensure the new stack takes ownership with a
+ * clean slate on peak rate threshold.
*/
- if (tp->t_fb->tfb_tcp_fb_fini) {
- struct epoch_tracker et;
- /*
- * Tell the stack to cleanup with 0 i.e.
- * the tcb is not going away.
- */
- NET_EPOCH_ENTER(et);
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- NET_EPOCH_EXIT(et);
- }
+ tp->t_peakrate_thr = 0;
#ifdef TCPHPTS
/* Assure that we are not on any hpts */
tcp_hpts_remove(tptoinpcb(tp));
#endif
if (blk->tfb_tcp_fb_init) {
- error = (*blk->tfb_tcp_fb_init)(tp);
+ error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
if (error) {
+ /*
+ * Release the ref count the lookup
+ * acquired.
+ */
refcount_release(&blk->tfb_refcnt);
- if (tp->t_fb->tfb_tcp_fb_init) {
- if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
- /* Fall back failed, drop the connection */
- INP_WUNLOCK(inp);
- soabort(so);
- return (error);
- }
+ /*
+ * Now there is a chance that the
+ * init() function mucked with some
+ * things before it failed, such as
+ * hpts or inp_flags2 or timer granularity.
+ * It should not have, but let's give the old
+ * stack a chance to reset to a known good state.
+ */
+ if (tp->t_fb->tfb_switch_failed) {
+ (*tp->t_fb->tfb_switch_failed)(tp);
}
- goto err_out;
+ goto err_out;
}
}
+ if (tp->t_fb->tfb_tcp_fb_fini) {
+ struct epoch_tracker et;
+ /*
+ * Tell the stack to cleanup with 0 i.e.
+ * the tcb is not going away.
+ */
+ NET_EPOCH_ENTER(et);
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ NET_EPOCH_EXIT(et);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquired a ref on the
+ * new one already.
+ */
refcount_release(&tp->t_fb->tfb_refcnt);
+ /*
+ * Set in the new stack.
+ */
tp->t_fb = blk;
+ tp->t_fb_ptr = ptr;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
@@ -1754,6 +1767,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
err_out:
INP_WUNLOCK(inp);
return (error);
+
}
/* Pass in the INP locked, callee must unlock it. */
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 57fbf6851f4b..a86c52ad90a0 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -61,6 +61,15 @@
#define TCP_EI_STATUS_2MSL 0xb
#define TCP_EI_STATUS_MAX_VALUE 0xb
+#define TCP_HTTP_REQ_LOG_NEW 0x01
+#define TCP_HTTP_REQ_LOG_COMPLETE 0x02
+#define TCP_HTTP_REQ_LOG_FREED 0x03
+#define TCP_HTTP_REQ_LOG_ALLOCFAIL 0x04
+#define TCP_HTTP_REQ_LOG_MOREYET 0x05
+#define TCP_HTTP_REQ_LOG_FORCEFREE 0x06
+#define TCP_HTTP_REQ_LOG_STALE 0x07
+#define TCP_HTTP_REQ_LOG_SEARCH 0x08
+
/************************************************/
/* Status bits we track to assure no duplicates,
* the bits here are not used by the code but
@@ -126,6 +135,154 @@ struct sackhint {
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
+#define TCP_HTTP_TRACK_FLG_EMPTY 0x00 /* Available */
+#define TCP_HTTP_TRACK_FLG_USED 0x01 /* In use */
+#define TCP_HTTP_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */
+#define TCP_HTTP_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */
+#define TCP_HTTP_TRACK_FLG_COMP 0x08 /* Sendfile has placed the last bits (range req only) */
+#define TCP_HTTP_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define MAX_TCP_HTTP_REQ 5 /* Max we will have at once */
+
+#ifdef TCP_REQUEST_TRK
+struct http_sendfile_track {
+ uint64_t timestamp; /* User sent timestamp */
+ uint64_t start; /* Start of sendfile offset */
+ uint64_t end; /* End if not open-range req */
+ uint64_t localtime; /* Time we actually got the req */
+ uint64_t deadline; /* If in CU mode, deadline for delivery */
+ uint64_t first_send; /* Time of first send in the range */
+ uint64_t cspr; /* Client suggested pace rate */
+ uint64_t sent_at_fs; /* What was t_sndbytes when we began sending */
+ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes when we began sending */
+ tcp_seq start_seq; /* First TCP Seq assigned */
+ tcp_seq end_seq; /* If range req last seq */
+ uint32_t flags; /* Type of request open etc */
+ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */
+ uint32_t hint_maxseg; /* Client hinted maxseg */
+ uint32_t hybrid_flags; /* Hybrid flags on this request */
+};
+
+#endif
+
+/*
+ * Change Query responses for a stack switch: we create a structure
+ * that allows the new stack to query the old one for state, if
+ * supported.
+ *
+ * There are three queries currently defined.
+ * - sendmap
+ * - timers
+ * - rack_times
+ *
+ * For the sendmap query the caller fills in the
+ * req and the req_param as the first seq (usually
+ * snd_una). When the response comes back indicating
+ * that there was data (return value 1), then the caller
+ * can build a sendmap entry based on the range and the
+ * times. The next query would then be done at the
+ * newly created sendmap_end. Repeated until sendmap_end == snd_max.
+ *
+ * Flags in sendmap_flags are defined below as well.
+ *
+ * For timers the standard PACE_TMR_XXXX flags are returned indicating
+ * a pacing timer (possibly) and one other timer. If pacing timer then
+ * the expiration timeout time in microseconds is in timer_pacing_to.
+ * And the value used with whatever timer (if a flag is set) is in
+ * timer_timer_exp. If no timers are running a 0 is returned and of
+ * course no flags are set in timer_hpts_flags.
+ *
+ * The rack_times are a misc collection of information that
+ * the old stack might possibly fill in. Of course it's possible
+ * that an old stack may not have a piece of information. If so
+ * then setting that value to zero is advised. Setting any
+ * timestamp passed should only place a zero in it when it
+ * is unfilled. This may mean that a time is off by a micro-second
+ * but this is ok in the grand scheme of things.
+ *
+ * When switching stacks it is desirable to get as much information
+ * from the old stack to the new stack as possible, though the stacks
+ * will not always be compatible in the types of information. The
+ * init() function needs to take care when it begins changing
+ * things such as inp_flags2 and the timer units, positioning those
+ * changes at a point where they are unlikely to fail afterwards.
+ * A stack can optionally provide an "undo" function, tfb_switch_failed(),
+ * which is called if the switch to it fails.
+ *
+ * To transfer information from the new stack to the old with
+ * respect to LRO and the inp_flags2, the new stack should set
+ * inp_flags2 to what it supports. The old stack, in its
+ * fini() function, should call tcp_handle_orphaned_packets()
+ * to clean up any queued packets that the new stack cannot handle.
+ */
+
+/* Query types */
+#define TCP_QUERY_SENDMAP 1
+#define TCP_QUERY_TIMERS_UP 2
+#define TCP_QUERY_RACK_TIMES 3
+
+/* Flags returned in sendmap_flags */
+#define SNDMAP_ACKED 0x000001 /* The remote endpoint acked this */
+#define SNDMAP_OVERMAX 0x000008 /* We have more retransmissions than we can fit */
+#define SNDMAP_SACK_PASSED 0x000010 /* A sack was done above this block */
+#define SNDMAP_HAS_FIN 0x000040 /* segment is sent with fin */
+#define SNDMAP_TLP 0x000080 /* segment sent as tail-loss-probe */
+#define SNDMAP_HAS_SYN 0x000800 /* SYN is on this guy */
+#define SNDMAP_HAD_PUSH 0x008000 /* Push was sent on original send */
+#define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\
+ |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH)
+#define SNDMAP_NRTX 3
+
+struct tcp_query_resp {
+ int req;
+ uint32_t req_param;
+ union {
+ struct {
+ tcp_seq sendmap_start;
+ tcp_seq sendmap_end;
+ int sendmap_send_cnt;
+ uint64_t sendmap_time[SNDMAP_NRTX];
+ uint64_t sendmap_ack_arrival;
+ int sendmap_flags;
+ uint32_t sendmap_r_rtr_bytes;
+ /* FAS if available, otherwise 0 */
+ uint32_t sendmap_fas;
+ uint8_t sendmap_dupacks;
+ };
+ struct {
+ uint32_t timer_hpts_flags;
+ uint32_t timer_pacing_to;
+ uint32_t timer_timer_exp;
+ };
+ struct {
+ /* Timestamps and rtt's */
+ uint32_t rack_reorder_ts; /* Last uscts that reordering was seen */
+ uint32_t rack_num_dsacks; /* Num of dsacks seen */
+ uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */
+ uint32_t rack_min_rtt; /* never 0 smallest rtt seen */
+ uint32_t rack_rtt; /* Last rtt used by rack */
+ uint32_t rack_tmit_time; /* The time the rtt seg was transmitted */
+ uint32_t rack_time_went_idle; /* If in persist the time we went idle */
+ /* Prr data */
+ uint32_t rack_sacked;
+ uint32_t rack_holes_rxt;
+ uint32_t rack_prr_delivered;
+ uint32_t rack_prr_recovery_fs;
+ uint32_t rack_prr_out;
+ uint32_t rack_prr_sndcnt;
+ /* TLP data */
+ uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */
+ /* Various bits */
+ uint8_t rack_tlp_out; /* Is a TLP outstanding */
+ uint8_t rack_srtt_measured; /* The previous stack has measured srtt */
+ uint8_t rack_in_persist; /* Is the old stack in persists? */
+ uint8_t rack_wanted_output; /* Did the previous stack have a want output set */
+ };
+ };
+};
+
+#define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */
+#define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */
+
typedef enum {
TT_REXMT = 0,
TT_PERSIST,
@@ -277,6 +434,11 @@ struct tcpcb {
uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS];
#endif
+#ifdef TCP_REQUEST_TRK
+ uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */
+ uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */
+ uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */
+#endif
uint32_t t_logsn; /* Log "serial number" */
uint32_t gput_ts; /* Time goodput measurement started */
tcp_seq gput_seq; /* Outbound measurement seq */
@@ -290,6 +452,7 @@ struct tcpcb {
uint32_t t_dsack_bytes; /* dsack bytes received */
uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */
uint32_t t_dsack_pack; /* dsack packets we have eceived */
+ uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */
uint8_t t_rttupdated; /* number of times rtt sampled */
/* TCP Fast Open */
uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */
@@ -311,6 +474,13 @@ struct tcpcb {
struct osd t_osd; /* storage for Khelp module data */
#endif
uint8_t _t_logpoint; /* Used when a BB log points is enabled */
+#ifdef TCP_REQUEST_TRK
+ /* Response tracking addons. */
+ uint8_t t_http_req; /* Request count */
+ uint8_t t_http_open; /* Number of open range requests */
+ uint8_t t_http_closed; /* Number of closed range requests */
+ struct http_sendfile_track t_http_info[MAX_TCP_HTTP_REQ];
+#endif
};
#endif /* _KERNEL || _WANT_TCPCB */
@@ -346,7 +516,7 @@ struct tcptemp {
#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
#define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */
-/*
+/**
* If defining the optional tcp_timers, in the
* tfb_tcp_timer_stop call you must use the
* callout_async_drain() function with the
@@ -356,6 +526,7 @@ struct tcptemp {
* does not know your callbacks you must provide a
* stop_all function that loops through and calls
* tcp_timer_stop() with each of your defined timers.
+ *
* Adding a tfb_tcp_handoff_ok function allows the socket
* option to change stacks to query you even if the
* connection is in a later stage. You return 0 to
@@ -363,16 +534,67 @@ struct tcptemp {
* non-zero (an error number) to say no you can't.
* If the function is undefined you can only change
* in the early states (before connect or listen).
+ *
+ * tfb_tcp_fb_init is used to allow the new stack to
+ * setup its control block. Among the things it must
+ * do is:
+ * a) Make sure that the inp_flags2 is setup correctly
+ * for LRO. There are two flags that the previous
+ * stack may have set INP_MBUF_ACKCMP and
+ * INP_SUPPORTS_MBUFQ. If the new stack does not
+ * support these it *should* clear the flags.
+ * b) Make sure that the timers are in the proper
+ * granularity that the stack wants. The stack
+ * should check the t_tmr_granularity field. Currently
+ * there are two values that it may hold
+ * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC.
+ * Use the function tcp_change_time_units(tp, granularity)
+ * to move the timers to the correct format for your stack.
+ *
+ * The new stack may also optionally query the tfb_chg_query
+ * function if the old stack has one. The new stack may ask
+ * for one of three entries and can also state to the old
+ * stack its support for the INP_MBUF_ACKCMP and
+ * INP_SUPPORTS_MBUFQ. This is important since if there are
+ * queued ack's without that statement the old stack will
+ * be forced to discard the queued acks. The requests that
+ * can be made for information by the new stacks are:
+ *
+ * Note also that the tfb_tcp_fb_init() when called can
+ * determine if a query is needed by looking at the
+ * value passed in the ptr. The ptr is designed to be
+ * set in with any allocated memory, but the condition
+ * (ptr == &tp->t_fb_ptr) will be
+ * true if this is not a stack switch but the initial
+ * setup of a tcb (which means no query would be needed).
+ * If, however, the value is not t_fb_ptr, then the caller
+ * is in the middle of a stack switch and is the new stack.
+ * A query would be appropriate (if the new stack supports
+ * the query mechanism).
+ *
+ * TCP_QUERY_SENDMAP - Query of outstanding data.
+ * TCP_QUERY_TIMERS_UP - Query about running timers.
+ * TCP_SUPPORTED_LRO - Declaration in req_param of
+ * the inp_flags2 supported by
+ * the new stack.
+ * TCP_QUERY_RACK_TIMES - Enquire about various timestamps
+ * and states the old stack may be in.
+ *
* tfb_tcp_fb_fini is changed to add a flag to tell
* the old stack if the tcb is being destroyed or
* not. A one in the flag means the TCB is being
* destroyed, a zero indicates its transitioning to
- * another stack (via socket option).
+ * another stack (via socket option). The
+ * tfb_tcp_fb_fini() function itself should not change timers
+ * or inp_flags2 (the tfb_tcp_fb_init() must do that). However,
+ * if the old stack supports LRO mbuf queuing, and the new
+ * stack does not communicate via chg messages that it too does,
+ * it must assume it does not and free any queued mbufs.
+ *
*/
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
- int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
@@ -387,15 +609,18 @@ struct tcp_function_block {
int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt);
/* Optional memory allocation/free routine */
- int (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *, void **);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
- void (*tfb_tcp_mtu_chg)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *tp);
int (*tfb_pru_options)(struct tcpcb *, int);
void (*tfb_hwtls_change)(struct tcpcb *, int);
+ int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *);
+ void (*tfb_switch_failed)(struct tcpcb *);
+ bool (*tfb_early_wake_check)(struct tcpcb *);
int (*tfb_compute_pipe)(struct tcpcb *tp);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
@@ -445,6 +670,16 @@ tcp_output(struct tcpcb *tp)
return (rv);
}
+static inline void
+tcp_lro_features_off(struct inpcb *inp)
+{
+ inp->inp_flags2 &= ~(INP_SUPPORTS_MBUFQ|
+ INP_MBUF_QUEUE_READY|
+ INP_DONT_SACK_QUEUE|
+ INP_MBUF_ACKCMP|
+ INP_MBUF_L_ACKS);
+}
+
/*
* tcp_output_unlock()
* Always returns unlocked, handles drop request from advanced stacks.
@@ -1169,6 +1404,7 @@ extern counter_u64_t tcp_bad_csums;
#ifdef NETFLIX_EXP_DETECTION
/* Various SACK attack thresholds */
extern int32_t tcp_force_detection;
+extern int32_t tcp_sad_limit;
extern int32_t tcp_sack_to_ack_thresh;
extern int32_t tcp_sack_to_move_thresh;
extern int32_t tcp_restoral_thresh;
@@ -1176,6 +1412,7 @@ extern int32_t tcp_sad_decay_val;
extern int32_t tcp_sad_pacing_interval;
extern int32_t tcp_sad_low_pps;
extern int32_t tcp_map_minimum;
+extern int32_t tcp_attack_on_turns_on_logging;
#endif
extern uint32_t tcp_ack_war_time_window;
extern uint32_t tcp_ack_war_cnt;
@@ -1246,6 +1483,8 @@ int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
int tcp_can_enable_pacing(void);
void tcp_decrement_paced_conn(void);
+void tcp_change_time_units(struct tcpcb *, int);
+void tcp_handle_orphaned_packets(struct tcpcb *);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
@@ -1253,6 +1492,31 @@ struct mbuf *
int tcp_stats_init(void);
void tcp_log_end_status(struct tcpcb *tp, uint8_t status);
+#ifdef TCP_REQUEST_TRK
+void tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent);
+struct http_sendfile_track *
+tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip);
+int tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point);
+int
+tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point);
+struct http_sendfile_track *
+tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq);
+void
+tcp_http_log_req_info(struct tcpcb *tp,
+ struct http_sendfile_track *http, uint16_t slot,
+ uint8_t val, uint64_t offset, uint64_t nbytes);
+
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes);
+void
+tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user,
+ uint64_t ts);
+
+struct http_sendfile_track *
+tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups);
+
+
+#endif
#ifdef TCP_ACCOUNTING
int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss);
#endif
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index a6a28291123d..4798c9c2a9ab 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -1236,6 +1236,16 @@ m_align(struct mbuf *m, int len)
(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
/*
+ * M_TRAILINGROOM() is for when you want to know how much space
+ * would be there if the mbuf were writable. This can be used to
+ * detect changes in an mbuf by recording the value at one point
+ * and comparing it later to the current M_TRAILINGROOM().
+ * The M_TRAILINGSPACE() macro is not suitable for this since an mbuf
+ * might not be writable at one point and later become writable
+ * even though the space at the back of it has not changed.
+ */
+#define M_TRAILINGROOM(m) ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len))
+/*
* Compute the amount of space available after the end of data in an mbuf.
*
* The M_WRITABLE() is a temporary, conservative safety measure: the burden
@@ -1245,9 +1255,7 @@ m_align(struct mbuf *m, int len)
* for mbufs with external storage. We now allow mbuf-embedded data to be
* read-only as well.
*/
-#define M_TRAILINGSPACE(m) \
- (M_WRITABLE(m) ? \
- ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
+#define M_TRAILINGSPACE(m) (M_WRITABLE(m) ? M_TRAILINGROOM(m) : 0)
/*
* Arrange to prepend space of size plen to mbuf m. If a new mbuf must be