author     Randall Stewart <rrs@FreeBSD.org>	2019-09-24 18:18:11 +0000
committer  Randall Stewart <rrs@FreeBSD.org>	2019-09-24 18:18:11 +0000
commit     35c7bb340788f0ce9347b7066619d8afb31e2123 (patch)
tree       86d8e5b0cf3413e884c83015ec43bfc66f071641 /sys/netinet
parent     749597dc1d21dce46fb94bfbe34cdb20ec1d9ab3 (diff)
This commit adds BBR (Bottleneck Bandwidth and RTT) congestion control. This
is a completely separate TCP stack (tcp_bbr.ko) that will be built only if you
add the make option WITH_EXTRA_TCP_STACKS=1 and also include the option
TCPHPTS. You can also include the RATELIMIT option if you have a NIC interface
that supports hardware pacing; BBR understands how to use such a feature.

Note that this commit also adds a general-purpose time filter which allows you
to have a min-filter or a max-filter. A filter lets you hold a low (or high)
value for some period of time and degrade slowly to another value as time
passes. You can find the details of BBR in the original paper at
https://queue.acm.org/detail.cfm?id=3022184 or in the many other web resources
referenced by "BBR congestion control".

It should be noted that BBRv1 (which this is) does tend toward unfairness on
small-buffered paths, and it will usually get less bandwidth on large-BDP
paths when competing with new-reno or cubic flows. BBR is still an active
research area and we do plan on implementing V2 of BBR to see if it is an
improvement over V1.

Sponsored by:	Netflix Inc.
Differential Revision:	https://reviews.freebsd.org/D21582
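The time filter mentioned above can be illustrated with a minimal sketch. The
struct and function names below are hypothetical and are not the
sys/tim_filter.h interface this commit adds; the idea is simply that a
min-filter remembers the lowest value seen together with when it was recorded,
and a worse (higher) value can only take over once the stored sample has aged
past the filter window. The real filter keeps several samples, so the value
degrades gradually instead of jumping straight to the newest measurement.

    #include <stdint.h>

    /* Hypothetical windowed min-filter, for illustration only. */
    struct min_filter_sketch {
            uint32_t value; /* current filtered (minimum) value */
            uint32_t stamp; /* time the value was recorded, in usecs */
            uint32_t win;   /* how long a sample may dominate, in usecs */
    };

    static void
    min_filter_update(struct min_filter_sketch *f, uint32_t meas, uint32_t now)
    {
            /* Accept a new minimum, or replace a sample that has aged out. */
            if (meas <= f->value || (now - f->stamp) > f->win) {
                    f->value = meas;
                    f->stamp = now;
            }
    }

A max-filter is the same shape with the comparison reversed.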
Notes:
    svn path=/head/; revision=352657
Diffstat (limited to 'sys/netinet')
-rw-r--r--  sys/netinet/ip_output.c                  |     9
-rw-r--r--  sys/netinet/ip_var.h                     |     1
-rw-r--r--  sys/netinet/tcp.h                        |     1
-rw-r--r--  sys/netinet/tcp_stacks/bbr.c             | 15189
-rw-r--r--  sys/netinet/tcp_stacks/rack.c            |  4154
-rw-r--r--  sys/netinet/tcp_stacks/rack_bbr_common.c |    48
-rw-r--r--  sys/netinet/tcp_stacks/rack_bbr_common.h |     2
-rw-r--r--  sys/netinet/tcp_stacks/sack_filter.c     |   166
-rw-r--r--  sys/netinet/tcp_stacks/sack_filter.h     |     7
-rw-r--r--  sys/netinet/tcp_stacks/tcp_bbr.h         |   845
-rw-r--r--  sys/netinet/tcp_stacks/tcp_rack.h        |    80
11 files changed, 18994 insertions, 1508 deletions
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index 085040f25e64..cbd2d72188fa 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -212,7 +212,7 @@ ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
static int
ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
- const struct sockaddr_in *gw, struct route *ro)
+ const struct sockaddr_in *gw, struct route *ro, bool stamp_tag)
{
#ifdef KERN_TLS
struct ktls_session *tls = NULL;
@@ -256,7 +256,7 @@ ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
mst = inp->inp_snd_tag;
}
#endif
- if (mst != NULL) {
+ if (stamp_tag && mst != NULL) {
KASSERT(m->m_pkthdr.rcvif == NULL,
("trying to add a send tag to a forwarded packet"));
if (mst->ifp != ifp) {
@@ -791,7 +791,8 @@ sendit:
*/
m_clrprotoflags(m);
IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
- error = ip_output_send(inp, ifp, m, gw, ro);
+ error = ip_output_send(inp, ifp, m, gw, ro,
+ (flags & IP_NO_SND_TAG_RL) ? false : true);
goto done;
}
@@ -827,7 +828,7 @@ sendit:
IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
mtod(m, struct ip *), NULL);
- error = ip_output_send(inp, ifp, m, gw, ro);
+ error = ip_output_send(inp, ifp, m, gw, ro, true);
} else
m_freem(m);
}
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
index 7580a7b45212..b6693eb58200 100644
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -166,6 +166,7 @@ void kmod_ipstat_dec(int statnum);
#define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */
#define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */
#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */
+#define IP_NO_SND_TAG_RL 0x80 /* Don't send down the ratelimit tag */
#ifdef __NO_STRICT_ALIGNMENT
#define IP_HDR_ALIGNED_P(ip) 1
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 508d4b5fbc17..37ba3bb55741 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -239,6 +239,7 @@ struct tcphdr {
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */
#define TCP_BBR_EXTRA_GAIN 1097
+#define TCP_RACK_DO_DETECTION 1097 /* Recycle of extra gain for rack, attack detection */
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
#define TCP_DATA_AFTER_CLOSE 1100
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
new file mode 100644
index 000000000000..dccb2894ea68
--- /dev/null
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -0,0 +1,15189 @@
+/*-
+ * Copyright (c) 2016-2019
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include "opt_kern_tls.h"
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/ktls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/qmath.h>
+#include <sys/tree.h>
+#ifdef NETFLIX_STATS
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
+#endif
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/eventhandler.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/tim_filter.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_ratelimit.h>
+#include <netinet/tcp_lro.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+#include "sack_filter.h"
+#include "tcp_bbr.h"
+#include "rack_bbr_common.h"
+uma_zone_t bbr_zone;
+uma_zone_t bbr_pcb_zone;
+
+struct sysctl_ctx_list bbr_sysctl_ctx;
+struct sysctl_oid *bbr_sysctl_root;
+
+#define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
+ (tv) = (value); \
+ if ((u_long)(tv) < (u_long)(tvmin)) \
+ (tv) = (tvmin); \
+ if ((u_long)(tv) > (u_long)(tvmax)) \
+ (tv) = (tvmax); \
+} while(0)
+
+/*#define BBR_INVARIANT 1*/
+
+/*
+ * initial window
+ */
+static uint32_t bbr_def_init_win = 10;
+static int32_t bbr_persist_min = 250000; /* 250ms */
+static int32_t bbr_persist_max = 1000000; /* 1 Second */
+static int32_t bbr_cwnd_may_shrink = 0;
+static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
+static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
+static int32_t bbr_hardware_pacing_limit = 8000;
+static int32_t bbr_quanta = 3; /* How much extra quanta do we get? */
+static int32_t bbr_no_retran = 0;
+static int32_t bbr_tcp_map_entries_limit = 1500;
+static int32_t bbr_tcp_map_split_limit = 256;
+
+static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
+static int32_t bbr_max_net_error_cnt = 10;
+/* Should the following be dynamic too -- loss wise */
+static int32_t bbr_rtt_gain_thresh = 0;
+/* Measurement controls */
+static int32_t bbr_use_google_algo = 1;
+static int32_t bbr_ts_limiting = 1;
+static int32_t bbr_ts_can_raise = 0;
+static int32_t bbr_do_red = 600;
+static int32_t bbr_red_scale = 20000;
+static int32_t bbr_red_mul = 1;
+static int32_t bbr_red_div = 2;
+static int32_t bbr_red_growth_restrict = 1;
+static int32_t bbr_target_is_bbunit = 0;
+static int32_t bbr_drop_limit = 0;
+/*
+ * How much gain do we need to see to
+ * stay in startup?
+ */
+static int32_t bbr_marks_rxt_sack_passed = 0;
+static int32_t bbr_start_exit = 25;
+static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */
+static int32_t bbr_startup_loss_thresh = 2000; /* 20.00% loss */
+static int32_t bbr_hptsi_max_mul = 1; /* These two mul/div assure a min pacing */
+static int32_t bbr_hptsi_max_div = 2; /* time, 0 means turned off. We need this
+ * if we go back ever to where the pacer
+ * has priority over timers.
+ */
+static int32_t bbr_policer_call_from_rack_to = 0;
+static int32_t bbr_policer_detection_enabled = 1;
+static int32_t bbr_min_measurements_req = 1; /* We need at least 2
+ * measurements before we are
+ * "good" note that 2 == 1.
+ * This is because we use a >
+ * comparison. This means if
+ * min_measure was 0, it takes
+ * num-measures > min(0) and
+ * you get 1 measurement and
+ * you are good. Set to 1, you
+ * have to have two
+ * measurements (this is done
+ * to prevent it from being ok
+ * to have no measurements). */
+static int32_t bbr_no_pacing_until = 4;
+
+static int32_t bbr_min_usec_delta = 20000; /* 20,000 usecs */
+static int32_t bbr_min_peer_delta = 20; /* 20 units */
+static int32_t bbr_delta_percent = 150; /* 15.0 % */
+
+static int32_t bbr_target_cwnd_mult_limit = 8;
+/*
+ * bbr_cwnd_min_val is the number of
+ * segments we hold to in the RTT probe
+ * state typically 4.
+ */
+static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;
+
+
+static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;
+
+static int32_t bbr_gain_to_target = 1;
+static int32_t bbr_gain_gets_extra_too = 1;
+/*
+ * bbr_high_gain is the 2/ln(2) value we need
+ * to double the sending rate in startup. This
+ * is used for both cwnd and hptsi gain's.
+ */
+static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
+static int32_t bbr_use_lower_gain_in_startup = 1;
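+/*
+ * Illustrative note (not part of the original commit): BBR_UNIT is the
+ * fixed-point unity used for gains, so 2/ln(2) ~= 2.885 is encoded above
+ * as BBR_UNIT * 2885 / 1000 (+1 to round up), and bbr_drain_gain defined
+ * below is simply its reciprocal, BBR_UNIT * 1000 / 2885.
+ */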
+
+/* thresholds for reduction on drain in sub-states/drain */
+static int32_t bbr_drain_rtt = BBR_SRTT;
+static int32_t bbr_drain_floor = 88;
+static int32_t google_allow_early_out = 1;
+static int32_t google_consider_lost = 1;
+static int32_t bbr_drain_drop_mul = 4;
+static int32_t bbr_drain_drop_div = 5;
+static int32_t bbr_rand_ot = 50;
+static int32_t bbr_can_force_probertt = 0;
+static int32_t bbr_can_adjust_probertt = 1;
+static int32_t bbr_probertt_sets_rtt = 0;
+static int32_t bbr_can_use_ts_for_rtt = 1;
+static int32_t bbr_is_ratio = 0;
+static int32_t bbr_sub_drain_app_limit = 1;
+static int32_t bbr_prtt_slam_cwnd = 1;
+static int32_t bbr_sub_drain_slam_cwnd = 1;
+static int32_t bbr_slam_cwnd_in_main_drain = 1;
+static int32_t bbr_filter_len_sec = 6; /* How long does the rttProp filter
+ * hold */
+static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
+/*
+ * bbr_drain_gain is the reverse of the high_gain
+ * designed to drain back out the standing queue
+ * that is formed in startup by causing a larger
+ * hptsi gain and thus draining the packets
+ * in flight.
+ */
+static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+static int32_t bbr_rttprobe_gain = 192;
+
+/*
+ * The cwnd_gain is the default cwnd gain applied when
+ * calculating a target cwnd. Note that the cwnd is
+ * a secondary factor in the way BBR works (see the
+ * paper and think about it, it will take some time).
+ * Basically the hptsi_gain spreads the packets out
+ * so you never get more than BDP to the peer even
+ * if the cwnd is high. In our implementation that
+ * means in non-recovery/retransmission scenarios
+ * cwnd will never be reached by the flight-size.
+ */
+static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
+static int32_t bbr_tlp_type_to_use = BBR_SRTT;
+static int32_t bbr_delack_time = 100000; /* 100ms in useconds */
+static int32_t bbr_sack_not_required = 0; /* set to one to allow non-sack to use bbr */
+static int32_t bbr_initial_bw_bps = 62500; /* 500kbps in bytes ps */
+static int32_t bbr_ignore_data_after_close = 1;
+static int16_t bbr_hptsi_gain[] = {
+ (BBR_UNIT *5 / 4),
+ (BBR_UNIT * 3 / 4),
+ BBR_UNIT,
+ BBR_UNIT,
+ BBR_UNIT,
+ BBR_UNIT,
+ BBR_UNIT,
+ BBR_UNIT
+};
+int32_t bbr_use_rack_resend_cheat = 1;
+int32_t bbr_sends_full_iwnd = 1;
+
+#define BBR_HPTSI_GAIN_MAX 8
+/*
+ * The BBR module incorporates a number of
+ * TCP ideas that have been put out into the IETF
+ * over the last few years:
+ * - Yuchung Cheng's RACK TCP (for which it is named) that
+ * will stop us using the number of dup acks and instead
+ * use time as the gauge of when we retransmit.
+ * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
+ * of Dukkipati et al.
+ * - Van Jacobson et al.'s BBR.
+ *
+ * RACK depends on SACK, so if an endpoint arrives that
+ * cannot do SACK the state machine below will shuttle the
+ * connection back to using the "default" TCP stack that is
+ * in FreeBSD.
+ *
+ * To implement BBR and RACK the original TCP stack was first decomposed
+ * into a functional state machine with individual states
+ * for each of the possible TCP connection states. The do_segment
+ * function's role in life is to mandate that the connection supports SACK
+ * initially and then assure that the RACK state matches the connection
+ * state before calling the state's do_segment function. Data processing
+ * of inbound segments also now happens in the hpts_do_segment in general
+ * with only one exception. This is so we can keep the connection on
+ * a single CPU.
+ *
+ * Each state is simplified due to the fact that the original do_segment
+ * has been decomposed and we *know* what state we are in (no
+ * switches on the state) and all tests for SACK are gone. This
+ * greatly simplifies what each state does.
+ *
+ * TCP output is also over-written with a new version since it
+ * must maintain the new rack scoreboard and has had hptsi
+ * integrated as a requirement. Still to do is to eliminate the
+ * use of the callout_() system and use the hpts for all
+ * timers as well.
+ */
+static uint32_t bbr_rtt_probe_time = 200000; /* 200ms in micro seconds */
+static uint32_t bbr_rtt_probe_cwndtarg = 4; /* How many mss's outstanding */
+static const int32_t bbr_min_req_free = 2; /* The min we must have on the
+ * free list */
+static int32_t bbr_tlp_thresh = 1;
+static int32_t bbr_reorder_thresh = 2;
+static int32_t bbr_reorder_fade = 60000000; /* 0 - never fade, def
+ * 60,000,000 - 60 seconds */
+static int32_t bbr_pkt_delay = 1000;
+static int32_t bbr_min_to = 1000; /* Number of usec's minimum timeout */
+static int32_t bbr_incr_timers = 1;
+
+static int32_t bbr_tlp_min = 10000; /* 10ms in usecs */
+static int32_t bbr_delayed_ack_time = 200000; /* 200ms in usecs */
+static int32_t bbr_exit_startup_at_loss = 1;
+
+/*
+ * bbr_lt_bw_ratio is 1/8th
+ * bbr_lt_bw_diff is < 4 Kbit/sec
+ */
+static uint64_t bbr_lt_bw_diff = 4000 / 8; /* In bytes per second */
+static uint64_t bbr_lt_bw_ratio = 8; /* For 1/8th */
+static uint32_t bbr_lt_bw_max_rtts = 48; /* How many rtt's do we use
+ * the lt_bw for */
+static uint32_t bbr_lt_intvl_min_rtts = 4; /* Min num of RTT's to measure
+ * lt_bw */
+static int32_t bbr_lt_intvl_fp = 0; /* False positive epoch diff */
+static int32_t bbr_lt_loss_thresh = 196; /* Lost vs delivered % */
+static int32_t bbr_lt_fd_thresh = 100; /* false detection % */
+
+static int32_t bbr_verbose_logging = 0;
+/*
+ * Currently regular TCP has an rto_min of 30ms;
+ * the backoff goes 12 times, so that ends up
+ * being a total of 122.850 seconds before a
+ * connection is killed.
+ */
+static int32_t bbr_rto_min_ms = 30; /* 30ms same as main freebsd */
+static int32_t bbr_rto_max_sec = 4; /* 4 seconds */
+
+/****************************************************/
+/* DEFAULT TSO SIZING (cpu performance impacting) */
+/****************************************************/
+/* What amount is our formula using to get TSO size */
+static int32_t bbr_hptsi_per_second = 1000;
+
+/*
+ * For hptsi on connections under bbr_cross_over, a delay
+ * target of 7ms (in usec) combined with a seg_max of 2
+ * gets us close to identical google behavior in
+ * TSO size selection (possibly more 1MSS sends).
+ */
+static int32_t bbr_hptsi_segments_delay_tar = 7000;
+
+/* Does pacing delay include overheads in its time calculations? */
+static int32_t bbr_include_enet_oh = 0;
+static int32_t bbr_include_ip_oh = 1;
+static int32_t bbr_include_tcp_oh = 1;
+static int32_t bbr_google_discount = 10;
+
+/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
+static int32_t bbr_state_is_pkt_epoch = 0;
+static int32_t bbr_state_drain_2_tar = 1;
+/* What is the max the 0 - bbr_cross_over MBPS TSO target
+ * can reach using our delay target. Note that this
+ * value becomes the floor for the cross over
+ * algorithm.
+ */
+static int32_t bbr_hptsi_segments_max = 2;
+static int32_t bbr_hptsi_segments_floor = 1;
+static int32_t bbr_hptsi_utter_max = 0;
+
+/* What is the min the 0 - bbr_cross-over MBPS TSO target can be */
+static int32_t bbr_hptsi_bytes_min = 1460;
+static int32_t bbr_all_get_min = 0;
+
+/* Cross over point from algo-a to algo-b */
+static uint32_t bbr_cross_over = TWENTY_THREE_MBPS;
+
+/* Do we deal with our restart state? */
+static int32_t bbr_uses_idle_restart = 0;
+static int32_t bbr_idle_restart_threshold = 100000; /* 100ms in useconds */
+
+/* Do we allow hardware pacing? */
+static int32_t bbr_allow_hdwr_pacing = 0;
+static int32_t bbr_hdwr_pace_adjust = 2; /* multiplier when we calc the tso size */
+static int32_t bbr_hdwr_pace_floor = 1;
+static int32_t bbr_hdwr_pacing_delay_cnt = 10;
+
+/****************************************************/
+static int32_t bbr_resends_use_tso = 0;
+static int32_t bbr_tlp_max_resend = 2;
+static int32_t bbr_sack_block_limit = 128;
+
+#define BBR_MAX_STAT 19
+counter_u64_t bbr_state_time[BBR_MAX_STAT];
+counter_u64_t bbr_state_lost[BBR_MAX_STAT];
+counter_u64_t bbr_state_resend[BBR_MAX_STAT];
+counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
+counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
+counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE];
+counter_u64_t bbr_flows_whdwr_pacing;
+counter_u64_t bbr_flows_nohdwr_pacing;
+
+counter_u64_t bbr_nohdwr_pacing_enobuf;
+counter_u64_t bbr_hdwr_pacing_enobuf;
+
+static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr);
+
+/*
+ * Static definitions we need for forward declarations.
+ */
+static uint32_t
+bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain,
+ uint32_t useconds_time, uint64_t bw);
+static uint32_t
+bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain);
+static void
+ bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win);
+static void
+bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses);
+static void
+bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line,
+ int dolog);
+static uint32_t
+bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain);
+static void
+bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch,
+ int32_t pkt_epoch, uint32_t losses);
+static uint32_t
+bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm);
+static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp);
+static uint32_t
+bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
+ struct bbr_sendmap *rsm, uint32_t srtt,
+ uint32_t cts);
+static void
+bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
+ int32_t line);
+static void
+ bbr_set_state_target(struct tcp_bbr *bbr, int line);
+static void
+ bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line);
+
+static void
+ bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line);
+
+static void
+ tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts);
+
+static void
+ bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts);
+
+static void
+ bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt,
+ uint32_t line, uint8_t is_start, uint16_t set);
+
+static struct bbr_sendmap *
+ bbr_find_lowest_rsm(struct tcp_bbr *bbr);
+static __inline uint32_t
+bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
+static void
+ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which);
+
+static void
+bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
+ uint32_t thresh, uint32_t to);
+static void
+ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);
+
+static void
+bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
+ uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay);
+
+static void
+bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr,
+ uint32_t cts, int32_t line);
+static void
+ bbr_stop_all_timers(struct tcpcb *tp);
+static void
+ bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
+static void
+ bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts);
+static void
+ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts);
+
+
+static void
+bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
+ uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod);
+
+static inline uint8_t
+bbr_state_val(struct tcp_bbr *bbr)
+{
+ return(bbr->rc_bbr_substate);
+}
+
+static inline uint32_t
+get_min_cwnd(struct tcp_bbr *bbr)
+{
+ int mss;
+
+ mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
+ if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED)
+ return (bbr_cwnd_min_val_hs * mss);
+ else
+ return (bbr_cwnd_min_val * mss);
+}
+
+static uint32_t
+bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr)
+{
+ uint64_t srtt, var;
+ uint64_t ret_val;
+
+ bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
+ if (tp->t_srtt == 0) {
+ srtt = (uint64_t)BBR_INITIAL_RTO;
+ var = 0;
+ } else {
+ srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
+ var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT);
+ }
+ TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]),
+ bbr_persist_min, bbr_persist_max);
+ return ((uint32_t)ret_val);
+}
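+
+/*
+ * Illustrative example (hypothetical numbers, not from the original
+ * commit): with an effective srtt of 100000 usec and rttvar of 20000
+ * usec after the shifts above, and t_rxtshift == 2 (tcp_backoff[2] is
+ * 4), the raw value is (100000 + 20000) * 4 = 480000 usec, which lies
+ * inside [bbr_persist_min, bbr_persist_max] and is returned as-is; a
+ * raw value above 1000000 usec would be clamped to bbr_persist_max.
+ */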
+
+static uint32_t
+bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ /*
+ * Start the FR timer, we do this based on getting the first one in
+ * the rc_tmap. Note that if it's NULL we must stop the timer. In all
+ * events we need to stop the running timer (if it's running) before
+ * starting the new one.
+ */
+ uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
+ int32_t idx;
+ int32_t is_tlp_timer = 0;
+ struct bbr_sendmap *rsm;
+
+ if (bbr->rc_all_timers_stopped) {
+ /* All timers have been stopped none are to run */
+ return (0);
+ }
+ if (bbr->rc_in_persist) {
+ /* We can't start any timer in persists */
+ return (bbr_get_persists_timer_val(tp, bbr));
+ }
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
+ if ((rsm == NULL) ||
+ ((tp->t_flags & TF_SACK_PERMIT) == 0) ||
+ (tp->t_state < TCPS_ESTABLISHED)) {
+ /* Nothing on the send map */
+activate_rxt:
+ if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
+ uint64_t tov;
+
+ time_since_sent = 0;
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
+ if (rsm) {
+ idx = rsm->r_rtr_cnt - 1;
+ if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
+ tstmp_touse = rsm->r_tim_lastsent[idx];
+ else
+ tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
+ if (TSTMP_GT(tstmp_touse, cts))
+ time_since_sent = cts - tstmp_touse;
+ }
+ bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
+ if (tp->t_srtt == 0)
+ tov = BBR_INITIAL_RTO;
+ else
+ tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) +
+ ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT);
+ if (tp->t_rxtshift)
+ tov *= tcp_backoff[tp->t_rxtshift];
+ if (tov > time_since_sent)
+ tov -= time_since_sent;
+ else
+ tov = bbr->r_ctl.rc_min_to;
+ TCPT_RANGESET_NOSLOP(to, tov,
+ (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC),
+ (bbr->rc_max_rto_sec * USECS_IN_SECOND));
+ bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to);
+ return (to);
+ }
+ return (0);
+ }
+ if (rsm->r_flags & BBR_ACKED) {
+ rsm = bbr_find_lowest_rsm(bbr);
+ if (rsm == NULL) {
+ /* No lowest? */
+ goto activate_rxt;
+ }
+ }
+ /* Convert from ms to usecs */
+ if (rsm->r_flags & BBR_SACK_PASSED) {
+ if ((tp->t_flags & TF_SENTFIN) &&
+ ((tp->snd_max - tp->snd_una) == 1) &&
+ (rsm->r_flags & BBR_HAS_FIN)) {
+ /*
+ * We don't start a bbr rack timer if all we have is
+ * a FIN outstanding.
+ */
+ goto activate_rxt;
+ }
+ srtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
+ thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm);
+ idx = rsm->r_rtr_cnt - 1;
+ exp = rsm->r_tim_lastsent[idx] + thresh;
+ if (SEQ_GEQ(exp, cts)) {
+ to = exp - cts;
+ if (to < bbr->r_ctl.rc_min_to) {
+ to = bbr->r_ctl.rc_min_to;
+ }
+ } else {
+ to = bbr->r_ctl.rc_min_to;
+ }
+ } else {
+ /* Ok we need to do a TLP not RACK */
+ if (bbr->rc_tlp_in_progress != 0) {
+ /*
+ * The previous send was a TLP.
+ */
+ goto activate_rxt;
+ }
+ rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
+ if (rsm == NULL) {
+ /* We found no rsm to TLP with. */
+ goto activate_rxt;
+ }
+ if (rsm->r_flags & BBR_HAS_FIN) {
+ /* If its a FIN we don't do TLP */
+ rsm = NULL;
+ goto activate_rxt;
+ }
+ time_since_sent = 0;
+ idx = rsm->r_rtr_cnt - 1;
+ if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
+ tstmp_touse = rsm->r_tim_lastsent[idx];
+ else
+ tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
+ if (TSTMP_GT(tstmp_touse, cts))
+ time_since_sent = cts - tstmp_touse;
+ is_tlp_timer = 1;
+ srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use);
+ thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts);
+ if (thresh > time_since_sent)
+ to = thresh - time_since_sent;
+ else
+ to = bbr->r_ctl.rc_min_to;
+ if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
+ /*
+ * If the TLP time works out to larger than the max
+ * RTO, let's not do TLP, just RTO.
+ */
+ goto activate_rxt;
+ }
+ if ((bbr->rc_tlp_rtx_out == 1) &&
+ (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) {
+ /*
+ * Second retransmit of the same TLP
+ * let's not.
+ */
+ bbr->rc_tlp_rtx_out = 0;
+ goto activate_rxt;
+ }
+ if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) {
+ /*
+ * The tail is no longer the last one I did a probe
+ * on
+ */
+ bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
+ bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
+ }
+ }
+ if (is_tlp_timer == 0) {
+ BBR_STAT_INC(bbr_to_arm_rack);
+ bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
+ } else {
+ bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to);
+ if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
+ /*
+ * We have exceeded how many times we can retran the
+ * current TLP timer, switch to the RTO timer.
+ */
+ goto activate_rxt;
+ } else {
+ BBR_STAT_INC(bbr_to_arm_tlp);
+ bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
+ }
+ }
+ return (to);
+}
+
+static inline int32_t
+bbr_minseg(struct tcp_bbr *bbr)
+{
+ return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options);
+}
+
+static void
+bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
+{
+ struct inpcb *inp;
+ struct hpts_diag diag;
+ uint32_t delayed_ack = 0;
+ uint32_t left = 0;
+ uint32_t hpts_timeout;
+ uint8_t stopped;
+ int32_t delay_calc = 0;
+ uint32_t prev_delay = 0;
+
+ inp = tp->t_inpcb;
+ if (inp->inp_in_hpts) {
+ /* A previous call is already set up */
+ return;
+ }
+ if ((tp->t_state == TCPS_CLOSED) ||
+ (tp->t_state == TCPS_LISTEN)) {
+ return;
+ }
+ stopped = bbr->rc_tmr_stopped;
+ if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
+ left = bbr->r_ctl.rc_timer_exp - cts;
+ }
+ bbr->r_ctl.rc_hpts_flags = 0;
+ bbr->r_ctl.rc_timer_exp = 0;
+ prev_delay = bbr->r_ctl.rc_last_delay_val;
+ if (bbr->r_ctl.rc_last_delay_val &&
+ (slot == 0)) {
+ /*
+ * If a previous pacer delay was in place we
+ * are not coming from the output side (where
+ * we calculate a delay); more likely from a timer.
+ */
+ slot = bbr->r_ctl.rc_last_delay_val;
+ if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
+ /* Compensate for time passed */
+ delay_calc = cts - bbr->rc_pacer_started;
+ if (delay_calc <= slot)
+ slot -= delay_calc;
+ }
+ }
+ /* Do we have early to make up for by pushing out the pacing time? */
+ if (bbr->r_agg_early_set) {
+ bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
+ slot += bbr->r_ctl.rc_agg_early;
+ bbr->r_ctl.rc_agg_early = 0;
+ bbr->r_agg_early_set = 0;
+ }
+ /* Are we running a total debt that needs to be compensated for? */
+ if (bbr->r_ctl.rc_hptsi_agg_delay) {
+ if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
+ /* We nuke the delay */
+ slot -= bbr->r_ctl.rc_hptsi_agg_delay;
+ bbr->r_ctl.rc_hptsi_agg_delay = 0;
+ } else {
+ /* We nuke some of the delay, put in a minimal 100usecs */
+ bbr->r_ctl.rc_hptsi_agg_delay -= slot;
+ bbr->r_ctl.rc_last_delay_val = slot = 100;
+ }
+ }
+ bbr->r_ctl.rc_last_delay_val = slot;
+ hpts_timeout = bbr_timer_start(tp, bbr, cts);
+ if (tp->t_flags & TF_DELACK) {
+ if (bbr->rc_in_persist == 0) {
+ delayed_ack = bbr_delack_time;
+ } else {
+ /*
+ * We are in persists and have
+ * gotten a new data element.
+ */
+ if (hpts_timeout > bbr_delack_time) {
+ /*
+ * Lets make the persists timer (which acks)
+ * be the smaller of hpts_timeout and bbr_delack_time.
+ */
+ hpts_timeout = bbr_delack_time;
+ }
+ }
+ }
+ if (delayed_ack &&
+ ((hpts_timeout == 0) ||
+ (delayed_ack < hpts_timeout))) {
+ /* We need a Delayed ack timer */
+ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
+ hpts_timeout = delayed_ack;
+ }
+ if (slot) {
+ /* Mark that we have a pacing timer up */
+ BBR_STAT_INC(bbr_paced_segments);
+ bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
+ }
+ /*
+ * If no timers are going to run and we will fall off the hptsi
+ * wheel, we resort to a keep-alive timer if it's configured.
+ */
+ if ((hpts_timeout == 0) &&
+ (slot == 0)) {
+ if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)) {
+ /*
+ * Ok we have no timer (persists, rack, tlp, rxt or
+ * del-ack), we don't have segments being paced. So
+ * all that is left is the keepalive timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
+ } else {
+ hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
+ }
+ bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
+ }
+ }
+ if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
+ (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
+ /*
+ * RACK, TLP, persists and RXT timers all are restartable
+ * based on input actions, i.e. we received a packet (ack
+ * or sack) and that changes things (rw, or snd_una etc).
+ * Thus we can restart them with a new value. For
+ * keep-alive, delayed_ack we keep track of what was left
+ * and restart the timer with a smaller value.
+ */
+ if (left < hpts_timeout)
+ hpts_timeout = left;
+ }
+ if (bbr->r_ctl.rc_incr_tmrs && slot &&
+ (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
+ /*
+ * If configured to do so, and the timer is either
+ * the TLP or RXT timer, we need to increase the timeout
+ * by the pacing time. Consider the bottleneck at my
+ * machine as an example, we are sending something
+ * to start a TLP on. The last packet won't be emitted
+ * fully until the pacing time (the bottleneck will hold
+ * the data in place). Once the packet is emitted that
+ * is when we want to start waiting for the TLP. This
+ * is most evident with hardware pacing (where the nic
+ * is holding the packet(s) before emitting). But it
+ * can also show up in the network so we do it for all
+ * cases. Technically we would take off one packet from
+ * this extra delay but this is easier and being more
+ * conservative is probably better.
+ */
+ hpts_timeout += slot;
+ }
+ if (hpts_timeout) {
+ /*
+ * Hack alert for now we can't time-out over 2147 seconds (a
+ * bit more than 35min)
+ */
+ if (hpts_timeout > 0x7ffffffe)
+ hpts_timeout = 0x7ffffffe;
+ bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
+ } else
+ bbr->r_ctl.rc_timer_exp = 0;
+ if ((slot) &&
+ (bbr->rc_use_google ||
+ bbr->output_error_seen ||
+ (slot <= hpts_timeout)) ) {
+ /*
+ * Tell LRO that it can queue packets while
+ * we pace.
+ */
+ bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
+ if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
+ (bbr->rc_cwnd_limited == 0)) {
+ /*
+ * If we are not cwnd limited and we
+ * are running a rack timer we put on
+ * the do-not-disturb, even for sack.
+ */
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ } else
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ bbr->rc_pacer_started = cts;
+
+ (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
+ __LINE__, &diag);
+ bbr->rc_timer_first = 0;
+ bbr->bbr_timer_src = frm;
+ bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
+ bbr_log_hpts_diag(bbr, cts, &diag);
+ } else if (hpts_timeout) {
+ (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
+ __LINE__, &diag);
+ /*
+ * We add the flag here as well if the slot is set,
+ * since hpts will call in to clear the queue first before
+ * calling the output routine (which does our timers).
+ * We don't want to set the flag if it's just a timer,
+ * else the arrival of data (that causes us
+ * to send more) might get delayed. Imagine being
+ * on a keep-alive timer and a request comes in for
+ * more data.
+ */
+ if (slot)
+ bbr->rc_pacer_started = cts;
+ if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
+ (bbr->rc_cwnd_limited == 0)) {
+ /*
+ * For a rack timer, don't wake us even
+ * if a sack arrives as long as we are
+ * not cwnd limited.
+ */
+ bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ } else {
+ /* All other timers wake us up */
+ bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ }
+ bbr->bbr_timer_src = frm;
+ bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
+ bbr_log_hpts_diag(bbr, cts, &diag);
+ bbr->rc_timer_first = 1;
+ }
+ bbr->rc_tmr_stopped = 0;
+ bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
+}
+
+static void
+bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb)
+{
+ /*
+ * We received an ack, and then did not call send or were bounced
+ * out due to the hpts was running. Now a timer is up as well, is it
+ * the right timer?
+ */
+ struct inpcb *inp;
+ struct bbr_sendmap *rsm;
+ uint32_t hpts_timeout;
+ int tmr_up;
+
+ tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
+ if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
+ return;
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
+ if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
+ (tmr_up == PACE_TMR_RXT)) {
+ /* Should be an RXT */
+ return;
+ }
+ inp = bbr->rc_inp;
+ if (rsm == NULL) {
+ /* Nothing outstanding? */
+ if (tp->t_flags & TF_DELACK) {
+ if (tmr_up == PACE_TMR_DELACK)
+ /*
+ * We are supposed to have delayed ack up
+ * and we do
+ */
+ return;
+ } else if (sbavail(&inp->inp_socket->so_snd) &&
+ (tmr_up == PACE_TMR_RXT)) {
+ /*
+ * if we hit enobufs then we would expect the
+ * possibility of nothing outstanding and the RXT up
+ * (and the hptsi timer).
+ */
+ return;
+ } else if (((tcp_always_keepalive ||
+ inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)) &&
+ (tmr_up == PACE_TMR_KEEP) &&
+ (tp->snd_max == tp->snd_una)) {
+ /* We should have keep alive up and we do */
+ return;
+ }
+ }
+ if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) {
+ if ((tp->t_flags & TF_SENTFIN) &&
+ ((tp->snd_max - tp->snd_una) == 1) &&
+ (rsm->r_flags & BBR_HAS_FIN)) {
+ /* needs to be a RXT */
+ if (tmr_up == PACE_TMR_RXT)
+ return;
+ else
+ goto wrong_timer;
+ } else if (tmr_up == PACE_TMR_RACK)
+ return;
+ else
+ goto wrong_timer;
+ } else if (rsm && (tmr_up == PACE_TMR_RACK)) {
+ /* Rack timer has priority if we have data out */
+ return;
+ } else if (SEQ_GT(tp->snd_max, tp->snd_una) &&
+ ((tmr_up == PACE_TMR_TLP) ||
+ (tmr_up == PACE_TMR_RXT))) {
+ /*
+ * Either a TLP or RXT is fine if no sack-passed is in place
+ * and data is outstanding.
+ */
+ return;
+ } else if (tmr_up == PACE_TMR_DELACK) {
+ /*
+ * If the delayed ack was going to go off before the
+ * rtx/tlp/rack timer were going to expire, then that would
+ * be the timer in control. Note we don't check the time
+ * here trusting the code is correct.
+ */
+ return;
+ }
+ if (SEQ_GT(tp->snd_max, tp->snd_una) &&
+ ((tmr_up == PACE_TMR_RXT) ||
+ (tmr_up == PACE_TMR_TLP) ||
+ (tmr_up == PACE_TMR_RACK))) {
+ /*
+ * We have outstanding data and
+ * we *do* have a RACK, TLP or RXT
+ * timer running. We won't restart
+ * anything here since that's probably ok; we
+ * will get called with some timer here shortly.
+ */
+ return;
+ }
+ /*
+ * Ok the timer originally started is not what we want now. We will
+ * force the hpts to be stopped if any, and restart with the slot
+ * set to what was in the saved slot.
+ */
+wrong_timer:
+ if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
+ if (inp->inp_in_hpts)
+ tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ bbr_timer_cancel(bbr, __LINE__, cts);
+ bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
+ 0);
+ } else {
+ /*
+ * Output is hptsi so we just need to switch the type of
+ * timer. We don't bother with keep-alive, since when we
+ * jump through the output, it will start the keep-alive if
+ * nothing is sent.
+ *
+ * We only need a delayed-ack added and or the hpts_timeout.
+ */
+ hpts_timeout = bbr_timer_start(tp, bbr, cts);
+ if (tp->t_flags & TF_DELACK) {
+ if (hpts_timeout == 0) {
+ hpts_timeout = bbr_delack_time;
+ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
+ }
+ else if (hpts_timeout > bbr_delack_time) {
+ hpts_timeout = bbr_delack_time;
+ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
+ }
+ }
+ if (hpts_timeout) {
+ if (hpts_timeout > 0x7ffffffe)
+ hpts_timeout = 0x7ffffffe;
+ bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
+ }
+ }
+}
+
+int32_t bbr_clear_lost = 0;
+
+/*
+ * Considers the two time values now (cts) and earlier.
+ * If cts is smaller than earlier, we could have
+ * had a sequence wrap (our counter wraps every
+ * 70 min or so) or it could be just clock skew
+ * getting us two different time values. Clock skew
+ * will show up within 10ms or so. So in such
+ * a case (where cts is behind earlier time by
+ * less than 10ms) we return 0. Otherwise we
+ * return the true difference between them.
+ */
+static inline uint32_t
+bbr_calc_time(uint32_t cts, uint32_t earlier_time) {
+ /*
+ * Given two timestamps, the current time stamp cts, and some other
+ * time-stamp taken in theory earlier return the difference. The
+ * trick is here sometimes locking will get the other timestamp
+ * after the cts. If this occurs we need to return 0.
+ */
+ if (TSTMP_GEQ(cts, earlier_time))
+ return (cts - earlier_time);
+ /*
+ * cts is behind earlier_time; if it's less than 10ms consider it 0.
+ * If it's more than a 10ms difference then we had a time wrap. Else
+ * it's just the normal locking foo. I wonder if we should not go to
+ * 64bit TS and get rid of this issue.
+ */
+ if (TSTMP_GEQ((cts + 10000), earlier_time))
+ return (0);
+ /*
+ * Ok the time must have wrapped. So we need to answer a large
+ * amount of time, which the normal subtraction should do.
+ */
+ return (cts - earlier_time);
+}
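+
+/*
+ * Illustrative example (hypothetical numbers, not from the original
+ * commit): if cts == 5000 and earlier_time == 9000 because the second
+ * timestamp was taken after cts under a lock, the first check fails,
+ * but TSTMP_GEQ(cts + 10000, earlier_time) holds, so 0 is returned
+ * instead of a huge wrapped difference.
+ */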
+
+
+
+static int
+sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t stat;
+ int32_t error;
+
+ error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t));
+ if (error || req->newptr == NULL)
+ return error;
+
+ error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
+ if (error)
+ return (error);
+ if (stat == 1) {
+#ifdef BBR_INVARIANTS
+ printf("Clearing BBR lost counters\n");
+#endif
+ COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT);
+ COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT);
+ COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT);
+ } else if (stat == 2) {
+#ifdef BBR_INVARIANTS
+ printf("Clearing BBR option counters\n");
+#endif
+ COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE);
+ } else if (stat == 3) {
+#ifdef BBR_INVARIANTS
+ printf("Clearing BBR stats counters\n");
+#endif
+ COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE);
+ } else if (stat == 4) {
+#ifdef BBR_INVARIANTS
+ printf("Clearing BBR out-size counters\n");
+#endif
+ COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE);
+ }
+ bbr_clear_lost = 0;
+ return (0);
+}
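+
+/*
+ * Usage sketch (assumption, not shown in this excerpt: the handler above
+ * is registered as a read/write "clear lost" sysctl under this stack's
+ * net.inet.tcp tree). Writing 1 zeroes the per-state lost/time/resend
+ * arrays, 2 the option counters, 3 the general stat array and 4 the
+ * out-size counters, e.g.:
+ *   sysctl net.inet.tcp.bbr.clrlost=1
+ */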
+
+static void
+bbr_init_sysctls()
+{
+ struct sysctl_oid *bbr_probertt;
+ struct sysctl_oid *bbr_hptsi;
+ struct sysctl_oid *bbr_measure;
+ struct sysctl_oid *bbr_cwnd;
+ struct sysctl_oid *bbr_timeout;
+ struct sysctl_oid *bbr_states;
+ struct sysctl_oid *bbr_startup;
+ struct sysctl_oid *bbr_policer;
+
+ /* Probe rtt controls */
+ bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "probertt",
+ CTLFLAG_RW, 0,
+ "");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "gain", CTLFLAG_RW,
+ &bbr_rttprobe_gain, 192,
+ "What is the filter gain drop in probe_rtt (0=disable)?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "cwnd", CTLFLAG_RW,
+ &bbr_rtt_probe_cwndtarg, 4,
+ "How many mss's are outstanding during probe-rtt");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "int", CTLFLAG_RW,
+ &bbr_rtt_probe_limit, 4000000,
+ "If RTT has not shrank in this many micro-seconds enter probe-rtt");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "mintime", CTLFLAG_RW,
+ &bbr_rtt_probe_time, 200000,
+ "How many microseconds in probe-rtt");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "filter_len_sec", CTLFLAG_RW,
+ &bbr_filter_len_sec, 6,
+ "How long in seconds does the rttProp filter run?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "drain_rtt", CTLFLAG_RW,
+ &bbr_drain_rtt, BBR_SRTT,
+ "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "can_force", CTLFLAG_RW,
+ &bbr_can_force_probertt, 0,
+ "If we keep setting new low rtt's but delay going in probe-rtt can we force in??");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "enter_sets_force", CTLFLAG_RW,
+ &bbr_probertt_sets_rtt, 0,
+ "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "can_adjust", CTLFLAG_RW,
+ &bbr_can_adjust_probertt, 1,
+ "Can we dynamically adjust the probe-rtt limits and times?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "is_ratio", CTLFLAG_RW,
+ &bbr_is_ratio, 0,
+ "is the limit to filter a ratio?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "use_cwnd", CTLFLAG_RW,
+ &bbr_prtt_slam_cwnd, 0,
+ "Should we set/recover cwnd?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_probertt),
+ OID_AUTO, "can_use_ts", CTLFLAG_RW,
+ &bbr_can_use_ts_for_rtt, 1,
+ "Can we use the ms timestamp if available for retransmistted rtt calculations?");
+
+ /* Pacing controls */
+ bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "pacing",
+ CTLFLAG_RW, 0,
+ "");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "hw_pacing", CTLFLAG_RW,
+ &bbr_allow_hdwr_pacing, 1,
+ "Do we allow hardware pacing?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "hw_pacing_limit", CTLFLAG_RW,
+ &bbr_hardware_pacing_limit, 4000,
+ "Do we have a limited number of connections for pacing chelsio (0=no limit)?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "hw_pacing_adj", CTLFLAG_RW,
+ &bbr_hdwr_pace_adjust, 2,
+ "Multiplier to calculated tso size?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "hw_pacing_floor", CTLFLAG_RW,
+ &bbr_hdwr_pace_floor, 1,
+ "Do we invoke the hardware pacing floor?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW,
+ &bbr_hdwr_pacing_delay_cnt, 10,
+ "How many packets must be sent after hdwr pacing is enabled");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "bw_cross", CTLFLAG_RW,
+ &bbr_cross_over, 3000000,
+ "What is the point where we cross over to linux like TSO size set");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "seg_deltarg", CTLFLAG_RW,
+ &bbr_hptsi_segments_delay_tar, 7000,
+ "What is the worse case delay target for hptsi < 48Mbp connections");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "enet_oh", CTLFLAG_RW,
+ &bbr_include_enet_oh, 0,
+ "Do we include the ethernet overhead in calculating pacing delay?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "ip_oh", CTLFLAG_RW,
+ &bbr_include_ip_oh, 1,
+ "Do we include the IP overhead in calculating pacing delay?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "tcp_oh", CTLFLAG_RW,
+ &bbr_include_tcp_oh, 0,
+ "Do we include the TCP overhead in calculating pacing delay?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "google_discount", CTLFLAG_RW,
+ &bbr_google_discount, 10,
+ "What is the default google discount percentage wise for pacing (11 = 1.1%%)?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "all_get_min", CTLFLAG_RW,
+ &bbr_all_get_min, 0,
+ "If you are less than a MSS do you just get the min?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "tso_min", CTLFLAG_RW,
+ &bbr_hptsi_bytes_min, 1460,
+ "For 0 -> 24Mbps what is floor number of segments for TSO");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "seg_tso_max", CTLFLAG_RW,
+ &bbr_hptsi_segments_max, 6,
+ "For 0 -> 24Mbps what is top number of segments for TSO");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "seg_floor", CTLFLAG_RW,
+ &bbr_hptsi_segments_floor, 1,
+ "Minimum TSO size we will fall too in segments");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "utter_max", CTLFLAG_RW,
+ &bbr_hptsi_utter_max, 0,
+ "The absolute maximum that any pacing (outside of hardware) can be");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "seg_divisor", CTLFLAG_RW,
+ &bbr_hptsi_per_second, 100,
+ "What is the divisor in our hptsi TSO calculation 512Mbps < X > 24Mbps ");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "srtt_mul", CTLFLAG_RW,
+ &bbr_hptsi_max_mul, 1,
+ "The multiplier for pace len max");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_hptsi),
+ OID_AUTO, "srtt_div", CTLFLAG_RW,
+ &bbr_hptsi_max_div, 2,
+ "The divisor for pace len max");
+ /* Measurement controls */
+ bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "measure",
+ CTLFLAG_RW, 0,
+ "Measurement controls");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "min_i_bw", CTLFLAG_RW,
+ &bbr_initial_bw_bps, 62500,
+ "Minimum initial b/w in bytes per second");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "no_sack_needed", CTLFLAG_RW,
+ &bbr_sack_not_required, 0,
+ "Do we allow bbr to run on connections not supporting SACK?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "use_google", CTLFLAG_RW,
+ &bbr_use_google_algo, 0,
+ "Use has close to google V1.0 has possible?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "ts_limiting", CTLFLAG_RW,
+ &bbr_ts_limiting, 1,
+ "Do we attempt to use the peers timestamp to limit b/w caculations?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "ts_can_raise", CTLFLAG_RW,
+ &bbr_ts_can_raise, 0,
+ "Can we raise the b/w via timestamp b/w calculation?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "ts_delta", CTLFLAG_RW,
+ &bbr_min_usec_delta, 20000,
+ "How long in usec between ts of our sends in ts validation code?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
+ &bbr_min_peer_delta, 20,
+ "What min numerical value should be between the peer deltas?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
+ &bbr_delta_percent, 150,
+ "What percentage (150 = 15.0) do we allow variance for?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
+ &bbr_min_measurements_req, 1,
+ "What is the minimum measurment count we need before we switch to our b/w estimate");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
+ &bbr_no_pacing_until, 4,
+ "How many pkt-epoch's (0 is off) do we need before pacing is on?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "quanta", CTLFLAG_RW,
+ &bbr_quanta, 2,
+ "Extra quanta to add when calculating the target (ID section 4.2.3.2).");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_measure),
+ OID_AUTO, "noretran", CTLFLAG_RW,
+ &bbr_no_retran, 0,
+ "Should google mode not use retransmission measurements for the b/w estimation?");
+ /* State controls */
+ bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "states",
+ CTLFLAG_RW, 0,
+ "State controls");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "idle_restart", CTLFLAG_RW,
+ &bbr_uses_idle_restart, 0,
+ "Do we use a new special idle_restart state to ramp back up quickly?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "idle_restart_threshold", CTLFLAG_RW,
+ &bbr_idle_restart_threshold, 100000,
+ "How long must we be idle before we restart??");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "use_pkt_epoch", CTLFLAG_RW,
+ &bbr_state_is_pkt_epoch, 0,
+ "Do we use a pkt-epoch for substate if 0 rttProp?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "startup_rtt_gain", CTLFLAG_RW,
+ &bbr_rtt_gain_thresh, 0,
+ "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "drain_floor", CTLFLAG_RW,
+ &bbr_drain_floor, 88,
+ "What is the lowest we can drain (pg) too?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "drain_2_target", CTLFLAG_RW,
+ &bbr_state_drain_2_tar, 1,
+ "Do we drain to target in drain substate?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "gain_2_target", CTLFLAG_RW,
+ &bbr_gain_to_target, 1,
+ "Does probe bw gain to target??");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "gain_extra_time", CTLFLAG_RW,
+ &bbr_gain_gets_extra_too, 1,
+ "Does probe bw gain get the extra time too?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "ld_div", CTLFLAG_RW,
+ &bbr_drain_drop_div, 5,
+ "Long drain drop divider?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "ld_mul", CTLFLAG_RW,
+ &bbr_drain_drop_mul, 4,
+ "Long drain drop multiplier?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "rand_ot_disc", CTLFLAG_RW,
+ &bbr_rand_ot, 50,
+ "Random discount of the ot?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "dr_filter_life", CTLFLAG_RW,
+ &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT,
+ "How many packet-epochs does the b/w delivery rate last?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "subdrain_applimited", CTLFLAG_RW,
+ &bbr_sub_drain_app_limit, 0,
+ "Does our sub-state drain invoke app limited if its long?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW,
+ &bbr_sub_drain_slam_cwnd, 0,
+ "Should we set/recover cwnd for sub-state drain?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW,
+ &bbr_slam_cwnd_in_main_drain, 0,
+ "Should we set/recover cwnd for main-state drain?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "google_gets_earlyout", CTLFLAG_RW,
+ &google_allow_early_out, 1,
+ "Should we allow google probe-bw/drain to exit early at flight target?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_states),
+ OID_AUTO, "google_exit_loss", CTLFLAG_RW,
+ &google_consider_lost, 1,
+ "Should we have losses exit gain of probebw in google mode??");
+ /* Startup controls */
+ bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "startup",
+ CTLFLAG_RW, 0,
+ "Startup controls");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_startup),
+ OID_AUTO, "cheat_iwnd", CTLFLAG_RW,
+ &bbr_sends_full_iwnd, 1,
+ "Do we not pace but burst out initial windows has our TSO size?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_startup),
+ OID_AUTO, "loss_threshold", CTLFLAG_RW,
+ &bbr_startup_loss_thresh, 2000,
+ "In startup what is the loss threshold in a pe that will exit us from startup?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_startup),
+ OID_AUTO, "use_lowerpg", CTLFLAG_RW,
+ &bbr_use_lower_gain_in_startup, 1,
+ "Should we use a lower hptsi gain if we see loss in startup?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_startup),
+ OID_AUTO, "gain", CTLFLAG_RW,
+ &bbr_start_exit, 25,
+ "What gain percent do we need to see to stay in startup??");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_startup),
+ OID_AUTO, "low_gain", CTLFLAG_RW,
+ &bbr_low_start_exit, 15,
+ "What gain percent do we need to see to stay in the lower gain startup??");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_startup),
+ OID_AUTO, "loss_exit", CTLFLAG_RW,
+ &bbr_exit_startup_at_loss, 1,
+ "Should we exit startup at loss in an epoch if we are not gaining?");
+ /* CWND controls */
+ bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "cwnd",
+ CTLFLAG_RW, 0,
+ "Cwnd controls");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "tar_rtt", CTLFLAG_RW,
+ &bbr_cwndtarget_rtt_touse, 0,
+	    "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "may_shrink", CTLFLAG_RW,
+ &bbr_cwnd_may_shrink, 0,
+ "Can the cwnd shrink if it would grow to more than the target?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "max_target_limit", CTLFLAG_RW,
+ &bbr_target_cwnd_mult_limit, 8,
+	    "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink (0=no)?");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "highspeed_min", CTLFLAG_RW,
+ &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS,
+ "What is the high-speed min cwnd (rttProp under 1ms)");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "lowspeed_min", CTLFLAG_RW,
+ &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS,
+ "What is the min cwnd (rttProp > 1ms)");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "initwin", CTLFLAG_RW,
+ &bbr_def_init_win, 10,
+ "What is the BBR initial window, if 0 use tcp version");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "do_loss_red", CTLFLAG_RW,
+ &bbr_do_red, 600,
+ "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "red_scale", CTLFLAG_RW,
+ &bbr_red_scale, 20000,
+ "What RTT do we scale with?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "red_growslow", CTLFLAG_RW,
+ &bbr_red_growth_restrict, 1,
+	    "Do we restrict cwnd growth for what's in flight?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "red_div", CTLFLAG_RW,
+ &bbr_red_div, 2,
+	    "If we reduce, what's the divisor?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "red_mul", CTLFLAG_RW,
+ &bbr_red_mul, 1,
+	    "If we reduce, what's the multiplier?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "target_is_unit", CTLFLAG_RW,
+ &bbr_target_is_bbunit, 0,
+ "Is the state target the pacing_gain or BBR_UNIT?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_cwnd),
+ OID_AUTO, "drop_limit", CTLFLAG_RW,
+ &bbr_drop_limit, 0,
+ "Number of segments limit for drop (0=use min_cwnd w/flight)?");
+
+ /* Timeout controls */
+ bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "timeout",
+ CTLFLAG_RW, 0,
+ "Time out controls");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "delack", CTLFLAG_RW,
+ &bbr_delack_time, 100000,
+ "BBR's delayed ack time");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "tlp_uses", CTLFLAG_RW,
+ &bbr_tlp_type_to_use, 3,
+ "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "persmin", CTLFLAG_RW,
+ &bbr_persist_min, 250000,
+ "What is the minimum time in microseconds between persists");
+ SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "persmax", CTLFLAG_RW,
+ &bbr_persist_max, 1000000,
+ "What is the largest delay in microseconds between persists");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "tlp_minto", CTLFLAG_RW,
+ &bbr_tlp_min, 10000,
+ "TLP Min timeout in usecs");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "tlp_dack_time", CTLFLAG_RW,
+ &bbr_delayed_ack_time, 200000,
+ "TLP delayed ack compensation value");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "minrto", CTLFLAG_RW,
+ &bbr_rto_min_ms, 30,
+ "Minimum RTO in ms");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "maxrto", CTLFLAG_RW,
+ &bbr_rto_max_sec, 4,
+	    "Maximum RTO in seconds -- should be at least as large as min_rto");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "tlp_retry", CTLFLAG_RW,
+ &bbr_tlp_max_resend, 2,
+ "How many times does TLP retry a single segment or multiple with no ACK");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "minto", CTLFLAG_RW,
+ &bbr_min_to, 1000,
+ "Minimum rack timeout in useconds");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "pktdelay", CTLFLAG_RW,
+ &bbr_pkt_delay, 1000,
+ "Extra RACK time (in useconds) besides reordering thresh");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "incr_tmrs", CTLFLAG_RW,
+ &bbr_incr_timers, 1,
+ "Increase the RXT/TLP timer by the pacing time used?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_timeout),
+ OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW,
+ &bbr_marks_rxt_sack_passed, 0,
+ "Mark sack passed on all those not ack'd when a RXT hits?");
+ /* Policer controls */
+ bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO,
+ "policer",
+ CTLFLAG_RW, 0,
+ "Policer controls");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "detect_enable", CTLFLAG_RW,
+ &bbr_policer_detection_enabled, 1,
+	    "Is policer detection enabled?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "min_pes", CTLFLAG_RW,
+ &bbr_lt_intvl_min_rtts, 4,
+	    "Minimum number of packet-epochs (PEs)?");
+ SYSCTL_ADD_U64(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "bwdiff", CTLFLAG_RW,
+ &bbr_lt_bw_diff, (4000/8),
+ "Minimal bw diff?");
+ SYSCTL_ADD_U64(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "bwratio", CTLFLAG_RW,
+ &bbr_lt_bw_ratio, 8,
+	    "Minimal bw ratio?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "from_rack_rxt", CTLFLAG_RW,
+ &bbr_policer_call_from_rack_to, 0,
+ "Do we call the policer detection code from a rack-timeout?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "false_postive", CTLFLAG_RW,
+ &bbr_lt_intvl_fp, 0,
+	    "What packet epoch do we do false-positive detection at (0=no)?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "loss_thresh", CTLFLAG_RW,
+ &bbr_lt_loss_thresh, 196,
+ "Loss threshold 196 = 19.6%?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_policer),
+ OID_AUTO, "false_postive_thresh", CTLFLAG_RW,
+ &bbr_lt_fd_thresh, 100,
+ "What percentage is the false detection threshold (150=15.0)?");
+ /* All the rest */
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "cheat_rxt", CTLFLAG_RW,
+ &bbr_use_rack_resend_cheat, 0,
+ "Do we burst 1ms between sends on retransmissions (like rack)?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "error_paceout", CTLFLAG_RW,
+ &bbr_error_base_paceout, 10000,
+	    "When we hit an error, what is the min to pace out in usecs?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "kill_paceout", CTLFLAG_RW,
+ &bbr_max_net_error_cnt, 10,
+ "When we hit this many errors in a row, kill the session?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "data_after_close", CTLFLAG_RW,
+ &bbr_ignore_data_after_close, 1,
+ "Do we hold off sending a RST until all pending data is ack'd");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "resend_use_tso", CTLFLAG_RW,
+ &bbr_resends_use_tso, 0,
+ "Can resends use TSO?");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "sblklimit", CTLFLAG_RW,
+ &bbr_sack_block_limit, 128,
+ "When do we start ignoring small sack blocks");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "bb_verbose", CTLFLAG_RW,
+ &bbr_verbose_logging, 0,
+ "Should BBR black box logging be verbose");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "reorder_thresh", CTLFLAG_RW,
+ &bbr_reorder_thresh, 2,
+ "What factor for rack will be added when seeing reordering (shift right)");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "reorder_fade", CTLFLAG_RW,
+ &bbr_reorder_fade, 0,
+ "Does reorder detection fade, if so how many ms (0 means never)");
+ SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
+ &bbr_tlp_thresh, 1,
+	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt, etc)");
+ /* Stats and counters */
+ /* The pacing counters for hdwr/software can't be in the array */
+ bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
+ bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD,
+ &bbr_hdwr_pacing_enobuf,
+ "Total number of enobufs for hardware paced flows");
+ SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD,
+ &bbr_nohdwr_pacing_enobuf,
+ "Total number of enobufs for non-hardware paced flows");
+
+
+ bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "hdwr_pacing", CTLFLAG_RD,
+ &bbr_flows_whdwr_pacing,
+ "Total number of hardware paced flows");
+ bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "software_pacing", CTLFLAG_RD,
+ &bbr_flows_nohdwr_pacing,
+ "Total number of software paced flows");
+ COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "stats", CTLFLAG_RD,
+ bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats");
+ COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "opts", CTLFLAG_RD,
+ bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats");
+ COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "lost", CTLFLAG_RD,
+ bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur");
+ COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "stateresend", CTLFLAG_RD,
+ bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend");
+ COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "statetime", CTLFLAG_RD,
+ bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states");
+ COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "outsize", CTLFLAG_RD,
+ bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls");
+ SYSCTL_ADD_PROC(&bbr_sysctl_ctx,
+ SYSCTL_CHILDREN(bbr_sysctl_root),
+ OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters");
+}
+
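+/*
+ * Check whether the connection has made no forward progress (no ACK of new
+ * data) for longer than t_maxunacktime. Logs the event and returns 1 if the
+ * caller should drop the connection, else 0.
+ */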
+static inline int32_t
+bbr_progress_timeout_check(struct tcp_bbr *bbr)
+{
+ if (bbr->rc_tp->t_maxunacktime && bbr->rc_tp->t_acktime &&
+ TSTMP_GT(ticks, bbr->rc_tp->t_acktime)) {
+ if ((((uint32_t)ticks - bbr->rc_tp->t_acktime)) >= bbr->rc_tp->t_maxunacktime) {
+ /*
+ * There is an assumption here that the caller will
+ * drop the connection, so we increment the
+ * statistics.
+ */
+ bbr_log_progress_event(bbr, bbr->rc_tp, ticks, PROGRESS_DROP, __LINE__);
+ BBR_STAT_INC(bbr_progress_drops);
+#ifdef NETFLIX_STATS
+ TCPSTAT_INC(tcps_progdrops);
+#endif
+ return (1);
+ }
+ }
+ return (0);
+}
+
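+/*
+ * Release the BBR stats/option/state counter arrays and the
+ * hardware/software pacing flow counters.
+ */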
+static void
+bbr_counter_destroy()
+{
+ COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE);
+ COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE);
+ COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE);
+ COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT);
+ COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT);
+ COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT);
+ counter_u64_free(bbr_flows_whdwr_pacing);
+ counter_u64_free(bbr_flows_nohdwr_pacing);
+
+}
+
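+/*
+ * Fill in the stack-specific portion of a black-box log record with a
+ * snapshot of the current BBR state (rates, inflight, epochs, gains, etc.).
+ */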
+static __inline void
+bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts)
+{
+ memset(l, 0, sizeof(union tcp_log_stackspecific));
+ l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate;
+ l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate);
+ l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
+ l->bw_inuse = bbr_get_bw(bbr);
+ l->inflight = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ l->applimited = bbr->r_ctl.r_app_limited_until;
+ l->delivered = bbr->r_ctl.rc_delivered;
+ l->timeStamp = cts;
+ l->lost = bbr->r_ctl.rc_lost;
+ l->bbr_state = bbr->rc_bbr_state;
+ l->bbr_substate = bbr_state_val(bbr);
+ l->epoch = bbr->r_ctl.rc_rtt_epoch;
+ l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
+ l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
+ l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
+ l->inhpts = bbr->rc_inp->inp_in_hpts;
+ l->ininput = bbr->rc_inp->inp_in_input;
+ l->use_lt_bw = bbr->rc_lt_use_bw;
+ l->pkts_out = bbr->r_ctl.rc_flight_at_input;
+ l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
+}
+
+static void
+bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = 0;
+ log.u_bbr.flex2 = 0;
+ log.u_bbr.flex5 = 0;
+ log.u_bbr.flex3 = 0;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate;
+ log.u_bbr.flex7 = reason;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt;
+ log.u_bbr.flex8 = 0;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BW_RED_EV, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = seq;
+ log.u_bbr.flex2 = count;
+ log.u_bbr.flex8 = mode;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_LOWGAIN, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+
+
+static void
+bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling,
+ uint8_t reason, uint32_t p_maxseg, int len)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = p_maxseg;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
+ log.u_bbr.flex4 = reason;
+ log.u_bbr.flex5 = bbr->rc_in_persist;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val;
+ log.u_bbr.flex7 = p_maxseg;
+ log.u_bbr.flex8 = bbr->rc_in_persist;
+ log.u_bbr.pkts_out = 0;
+ log.u_bbr.applimited = len;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_JUSTRET, 0,
+ tlen, &log, false, &bbr->rc_tv);
+ }
+}
+
+
+static void
+bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = seq;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_ENTREC, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = tso;
+ log.u_bbr.flex2 = maxseg;
+ log.u_bbr.flex3 = mtu;
+ log.u_bbr.flex4 = csum_flags;
+ TCP_LOG_EVENTP(tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_MSGSIZE, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_flowend(struct tcp_bbr *bbr)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct sockbuf *r, *s;
+ struct timeval tv;
+
+ if (bbr->rc_inp->inp_socket) {
+ r = &bbr->rc_inp->inp_socket->so_rcv;
+ s = &bbr->rc_inp->inp_socket->so_snd;
+ } else {
+ r = s = NULL;
+ }
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv));
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ r, s,
+ TCP_LOG_FLOWEND, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+static void
+bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line,
+ uint32_t lost, uint32_t del)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = lost;
+ log.u_bbr.flex2 = del;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
+ log.u_bbr.flex7 = line;
+ log.u_bbr.flex8 = 0;
+ log.u_bbr.inflight = bbr->r_ctl.r_measurement_count;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_PKT_EPOCH, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->r_ctl.rc_lost;
+ log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat;
+ log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat;
+ log.u_bbr.flex7 = line;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TIME_EPOCH, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex2 = new_tar;
+ log.u_bbr.flex3 = line;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex5 = bbr_quanta;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex7 = bbr->rc_last_options;
+ log.u_bbr.flex8 = meth;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_STATE_TARGET, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+
+}
+
+static void
+bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
+ if (bbr_state_is_pkt_epoch)
+ log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
+ else
+ log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP);
+ log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
+ log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000);
+ log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra;
+ log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_STATE, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
+ uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
+ log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt;
+ log.u_bbr.flex4 = applied;
+ log.u_bbr.flex5 = rtt;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex7 = cond;
+ log.u_bbr.flex8 = reas;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_RTT_SHRINKS, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_exit_rec(struct tcp_bbr *bbr)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_EXITREC, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg,
+ uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = prev_acked;
+ log.u_bbr.flex3 = bytes_this_ack;
+ log.u_bbr.flex4 = chg;
+ log.u_bbr.flex5 = th_ack;
+ log.u_bbr.flex6 = target;
+ log.u_bbr.flex8 = meth;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_CWND, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin)
+{
+ /*
+ * Log the rtt sample we are applying to the srtt algorithm in
+ * useconds.
+ */
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = rtt;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay;
+ log.u_bbr.flex4 = bbr->rc_tp->ts_offset;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv);
+ log.u_bbr.flex6 = tsin;
+ log.u_bbr.flex7 = 0;
+ log.u_bbr.flex8 = bbr->rc_ack_was_delayed;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ TCP_LOG_RTT, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = time_in;
+ log.u_bbr.flex2 = line;
+ log.u_bbr.flex8 = enter_exit;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_PERSIST, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+static void
+bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_ACKCLEAR, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
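+/*
+ * Log an inbound ack/segment, including any hardware (M_TSTMP) or LRO
+ * (M_TSTMP_LRO) arrival timestamps carried on the mbuf.
+ */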
+static void
+bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen,
+ uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = nsegs;
+ log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes;
+ if (m) {
+ struct timespec ts;
+
+ log.u_bbr.flex3 = m->m_flags;
+ if (m->m_flags & M_TSTMP) {
+ mbuf_tstmp2timespec(m, &ts);
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = ts.tv_nsec / 1000;
+ log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv);
+ } else {
+ log.u_bbr.lt_epoch = 0;
+ }
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+ log.u_bbr.flex5 = tcp_tv_to_usectick(&tv);
+ } else {
+ /* No arrival timestamp */
+ log.u_bbr.flex5 = 0;
+ }
+
+ log.u_bbr.pkts_out = tcp_get_usecs(&tv);
+ } else {
+ log.u_bbr.flex3 = 0;
+ log.u_bbr.flex5 = 0;
+ log.u_bbr.flex6 = 0;
+ log.u_bbr.pkts_out = 0;
+ }
+ log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex7 = bbr->r_wanted_output;
+ log.u_bbr.flex8 = bbr->rc_in_persist;
+ TCP_LOG_EVENTP(bbr->rc_tp, th,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ TCP_LOG_IN, 0,
+ tlen, &log, true, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = did_out;
+ log.u_bbr.flex2 = nxt_pkt;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes;
+ log.u_bbr.flex7 = bbr->r_wanted_output;
+ log.u_bbr.flex8 = bbr->rc_in_persist;
+ log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_DOSEG_DONE, 0,
+ 0, &log, true, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts,
+ int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = o_len;
+ log.u_bbr.flex3 = segcnt;
+ log.u_bbr.flex4 = segsiz;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_ENOBUF_JMP, ENOBUFS,
+ len, &log, true, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = timers;
+ log.u_bbr.flex2 = ret;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex5 = cts;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex8 = hpts_calling;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TO_PROCESS, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ uint64_t ar;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->bbr_timer_src;
+ log.u_bbr.flex2 = 0;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
+ ar = (uint64_t)(bbr->r_ctl.rc_resend);
+ ar >>= 32;
+ ar &= 0x00000000ffffffff;
+ log.u_bbr.flex4 = (uint32_t)ar;
+ ar = (uint64_t)bbr->r_ctl.rc_resend;
+ ar &= 0x00000000ffffffff;
+ log.u_bbr.flex5 = (uint32_t)ar;
+ log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
+ log.u_bbr.flex8 = to_num;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_RTO, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ log.u_bbr.flex4 = 0;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
+ log.u_bbr.flex8 = reason;
+ log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_REDUCE, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = diag->p_nxt_slot;
+ log.u_bbr.flex2 = diag->p_cur_slot;
+ log.u_bbr.flex3 = diag->slot_req;
+ log.u_bbr.flex4 = diag->inp_hptsslot;
+ log.u_bbr.flex5 = diag->slot_remaining;
+ log.u_bbr.flex6 = diag->need_new_to;
+ log.u_bbr.flex7 = diag->p_hpts_active;
+ log.u_bbr.flex8 = diag->p_on_min_sleep;
+ /* Hijack other fields as needed */
+ log.u_bbr.epoch = diag->have_slept;
+ log.u_bbr.lt_epoch = diag->yet_to_sleep;
+ log.u_bbr.pkts_out = diag->co_ret;
+ log.u_bbr.applimited = diag->hpts_sleep_time;
+ log.u_bbr.delivered = diag->p_prev_slot;
+ log.u_bbr.inflight = diag->p_runningtick;
+ log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.rttProp = diag->wheel_cts;
+ log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.cur_del_rate = diag->p_curtick;
+ log.u_bbr.cur_del_rate <<= 32;
+ log.u_bbr.cur_del_rate |= diag->p_lasttick;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
+ uint32_t thresh, uint32_t to)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->rc_tp->t_rttvar;
+ log.u_bbr.flex2 = time_since_sent;
+ log.u_bbr.flex3 = srtt;
+ log.u_bbr.flex4 = thresh;
+ log.u_bbr.flex5 = to;
+ log.u_bbr.flex6 = bbr->rc_tp->t_srtt;
+ log.u_bbr.flex8 = mode;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TIMERPREP, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
+ uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = usecs;
+ log.u_bbr.flex2 = len;
+ log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff);
+ if (override)
+ log.u_bbr.flex5 = (1 << 2);
+ else
+ log.u_bbr.flex5 = 0;
+ log.u_bbr.flex6 = override;
+ log.u_bbr.flex7 = gain;
+ log.u_bbr.flex8 = mod;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_HPTSI_CALC, 0,
+ len, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+
+ log.u_bbr.flex1 = bbr->bbr_timer_src;
+ log.u_bbr.flex2 = to;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex4 = slot;
+ log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
+ log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
+ log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
+ log.u_bbr.flex8 = which;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TIMERSTAR, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = thresh;
+ log.u_bbr.flex2 = lro;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts;
+ log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
+ log.u_bbr.flex6 = srtt;
+ log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift;
+ log.u_bbr.flex8 = frm;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_THRESH_CALC, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = bbr->bbr_timer_src;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex4 = bbr->rc_in_persist;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
+ log.u_bbr.flex8 = hpts_removed;
+ log.u_bbr.pkts_out = bbr->rc_pacer_started;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TIMERCANC, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+
+static void
+bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio;
+ log.u_bbr.flex2 = (peer_delta >> 32);
+ log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff);
+ log.u_bbr.flex4 = (delta >> 32);
+ log.u_bbr.flex5 = (delta & 0x00000000ffffffff);
+ log.u_bbr.flex7 = bbr->rc_ts_clock_set;
+ log.u_bbr.flex8 = bbr->rc_ts_cant_be_used;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_TSTMP_VAL, 0,
+ 0, &log, false, &bbr->rc_tv);
+
+ }
+}
+
+static void
+bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = tsosz;
+ log.u_bbr.flex2 = tls;
+ log.u_bbr.flex3 = tcp_min_hptsi_time;
+ log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min;
+ log.u_bbr.flex5 = old_val;
+ log.u_bbr.flex6 = maxseg;
+ log.u_bbr.flex7 = bbr->rc_no_pacing;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= bbr->rc_past_init_win;
+ if (hdwr)
+ log.u_bbr.flex8 = 0x80 | bbr->rc_use_google;
+ else
+ log.u_bbr.flex8 = bbr->rc_use_google;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRTSO, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm,
+ uint32_t flags, uint32_t line)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = rsm->r_start;
+ log.u_bbr.flex3 = rsm->r_end;
+ log.u_bbr.flex4 = rsm->r_delivered;
+ log.u_bbr.flex5 = rsm->r_rtr_cnt;
+ log.u_bbr.flex6 = rsm->r_dupack;
+ log.u_bbr.flex7 = rsm->r_tim_lastsent[0];
+ log.u_bbr.flex8 = rsm->r_flags;
+		/* Hijack the applimited field */
+ log.u_bbr.applimited = flags;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_RSM_CLEARED, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts,
+ uint32_t flex3, uint32_t flex2, uint32_t flex5,
+ uint32_t flex6, uint32_t pkts_out, int flex7,
+ uint32_t flex4, uint32_t flex1)
+{
+
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ log.u_bbr.flex4 = flex4;
+ log.u_bbr.flex5 = flex5;
+ log.u_bbr.flex6 = flex6;
+ log.u_bbr.flex7 = flex7;
+		/* Hijack the pkts_out field */
+ log.u_bbr.pkts_out = pkts_out;
+ log.u_bbr.flex8 = flex8;
+ if (bbr->rc_ack_was_delayed)
+ log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay;
+ else
+ log.u_bbr.epoch = 0;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRUPD, 0,
+ flex2, &log, false, &bbr->rc_tv);
+ }
+}
+
+
+static void
+bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
+ uint32_t newbw, uint32_t obw, uint32_t diff,
+ uint32_t tim)
+{
+ if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = reason;
+ log.u_bbr.flex2 = newbw;
+ log.u_bbr.flex3 = obw;
+ log.u_bbr.flex4 = diff;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del;
+ log.u_bbr.flex7 = bbr->rc_lt_is_sampling;
+ log.u_bbr.pkts_out = tim;
+ log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw;
+ if (bbr->rc_lt_use_bw == 0)
+ log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
+ else
+ log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BWSAMP, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static inline void
+bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = tick;
+ log.u_bbr.flex3 = tp->t_maxunacktime;
+ log.u_bbr.flex4 = tp->t_acktime;
+ log.u_bbr.flex8 = event;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_PROGRESS, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
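+/*
+ * Log a hardware pacing decision: the software rate we want, the rate the
+ * NIC granted, the interface involved and any error returned.
+ */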
+static void
+bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
+ uint64_t rate, uint64_t hw_rate, int line, uint32_t cts,
+ int error)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
+ log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
+ log.u_bbr.bw_inuse = rate;
+ log.u_bbr.flex5 = line;
+ log.u_bbr.flex6 = error;
+ log.u_bbr.flex8 = bbr->skip_gain;
+ log.u_bbr.flex8 <<= 1;
+ log.u_bbr.flex8 |= bbr->gain_is_limited;
+ log.u_bbr.flex8 <<= 1;
+ log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing;
+ log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_HDWR_PACE, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = slot;
+ log.u_bbr.flex2 = del_by;
+ log.u_bbr.flex3 = prev_delay;
+ log.u_bbr.flex4 = line;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay;
+ log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags);
+ log.u_bbr.flex8 = bbr->rc_in_persist;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRSND, 0,
+ len, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->r_ctl.rc_delivered;
+ log.u_bbr.flex2 = 0;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt;
+ log.u_bbr.flex4 = end;
+ log.u_bbr.flex5 = seq;
+ log.u_bbr.flex6 = t;
+ log.u_bbr.flex7 = match;
+ log.u_bbr.flex8 = flags;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRRTT, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method)
+{
+ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
+ log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
+ log.u_bbr.flex3 = bbr->r_ctl.gain_epoch;
+ log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight;
+ log.u_bbr.flex7 = 0;
+ log.u_bbr.flex8 = entry_method;
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_EXIT_GAIN, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+static void
+bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired)
+{
+ if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
+ /* R-HU */
+ log.u_bbr.flex1 = 0;
+ log.u_bbr.flex2 = 0;
+ log.u_bbr.flex3 = 0;
+ log.u_bbr.flex4 = 0;
+ log.u_bbr.flex7 = 0;
+ log.u_bbr.flex8 = settings_desired;
+
+ TCP_LOG_EVENTP(bbr->rc_tp, NULL,
+ &bbr->rc_inp->inp_socket->so_rcv,
+ &bbr->rc_inp->inp_socket->so_snd,
+ BBR_LOG_SETTINGS_CHG, 0,
+ 0, &log, false, &bbr->rc_tv);
+ }
+}
+
+/*
+ * Returns the bw from our delivery rate filter.
+ */
+static inline uint64_t
+bbr_get_full_bw(struct tcp_bbr *bbr)
+{
+ uint64_t bw;
+
+ bw = get_filter_value(&bbr->r_ctl.rc_delrate);
+
+ return (bw);
+}
+
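+/*
+ * Close out the current packet epoch: compute its loss rate
+ * (lost * 1000 / delivered), advance the epoch counters, re-enable pacing
+ * once the no-pacing grace period has passed, and log the result.
+ */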
+static inline void
+bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
+{
+ uint64_t calclr;
+ uint32_t lost, del;
+
+ if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch)
+ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch;
+ else
+ lost = 0;
+ del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del;
+ if (lost == 0) {
+ calclr = 0;
+ } else if (del) {
+ calclr = lost;
+ calclr *= (uint64_t)1000;
+ calclr /= (uint64_t)del;
+ } else {
+ /* Nothing delivered? 100.0% loss */
+ calclr = 1000;
+ }
+ bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr;
+ if (IN_RECOVERY(bbr->rc_tp->t_flags))
+ bbr->r_ctl.recovery_lr += (uint32_t)calclr;
+ bbr->r_ctl.rc_pkt_epoch++;
+ if (bbr->rc_no_pacing &&
+ (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) {
+ bbr->rc_no_pacing = 0;
+ tcp_bbr_tso_size_check(bbr, cts);
+ }
+ bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time);
+ bbr->r_ctl.rc_pkt_epoch_time = cts;
+ /* What was our loss rate */
+ bbr_log_pkt_epoch(bbr, cts, line, lost, del);
+ bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered;
+ bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost;
+}
+
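+/*
+ * Advance the RTT epoch (tick the RTT clock): log how long the previous
+ * epoch lasted and restart the epoch timer at cts.
+ */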
+static inline void
+bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
+{
+ uint32_t epoch_time;
+
+ /* Tick the RTT clock */
+ bbr->r_ctl.rc_rtt_epoch++;
+ epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start;
+ bbr_log_time_epoch(bbr, cts, line, epoch_time);
+ bbr->r_ctl.rc_rcv_epoch_start = cts;
+}
+
+
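+/*
+ * Decide if a packet epoch has completed: the newly acked segment was sent
+ * at or after the delivered mark recorded when the current epoch began, so a
+ * full epoch's worth of data has since been delivered.
+ */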
+static inline void
+bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked)
+{
+ if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) {
+ bbr->rc_is_pkt_epoch_now = 1;
+ }
+}
+
+/*
+ * Returns the bw from either the b/w filter
+ * or from the lt_bw (if the connection is being
+ * policed).
+ */
+static inline uint64_t
+__bbr_get_bw(struct tcp_bbr *bbr)
+{
+ uint64_t bw, min_bw;
+ uint64_t rtt;
+ int gm_measure_cnt = 1;
+
+ /*
+ * For startup we make, like google, a
+ * minimum b/w. This is generated from the
+ * IW and the rttProp. We do fall back to srtt
+ * if for some reason (initial handshake) we don't
+ * have a rttProp. We, in the worst case, fall back
+ * to the configured min_bw (rc_initial_hptsi_bw).
+ */
+ if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
+ /* Attempt first to use rttProp */
+ rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
+ if (rtt && (rtt < 0xffffffff)) {
+measure:
+ min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
+ ((uint64_t)1000000);
+ min_bw /= rtt;
+ if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) {
+ min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
+ }
+
+ } else if (bbr->rc_tp->t_srtt != 0) {
+ /* No rttProp, use srtt? */
+ rtt = bbr_get_rtt(bbr, BBR_SRTT);
+ goto measure;
+ } else {
+ min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
+ }
+ } else
+ min_bw = 0;
+
+ if ((bbr->rc_past_init_win == 0) &&
+ (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp)))
+ bbr->rc_past_init_win = 1;
+ if ((bbr->rc_use_google) && (bbr->r_ctl.r_measurement_count >= 1))
+ gm_measure_cnt = 0;
+ if (gm_measure_cnt &&
+ ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) ||
+ (bbr->rc_past_init_win == 0))) {
+ /* For google we use our guess rate until we get 1 measurement */
+
+use_initial_window:
+ rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
+ if (rtt && (rtt < 0xffffffff)) {
+ /*
+			 * We have an RTT measurement. Use that in
+ * combination with our initial window to calculate
+ * a b/w.
+ */
+ bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
+ ((uint64_t)1000000);
+ bw /= rtt;
+ if (bw < bbr->r_ctl.rc_initial_hptsi_bw) {
+ bw = bbr->r_ctl.rc_initial_hptsi_bw;
+ }
+ } else {
+ /* Drop back to the 40 and punt to a default */
+ bw = bbr->r_ctl.rc_initial_hptsi_bw;
+ }
+ if (bw < 1)
+ /* Probably should panic */
+ bw = 1;
+ if (bw > min_bw)
+ return (bw);
+ else
+ return (min_bw);
+ }
+ if (bbr->rc_lt_use_bw)
+ bw = bbr->r_ctl.rc_lt_bw;
+ else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0))
+ bw = bbr->r_ctl.red_bw;
+ else
+ bw = get_filter_value(&bbr->r_ctl.rc_delrate);
+ if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) {
+ /*
+ * Enforce user set rate limit, keep in mind that
+ * t_peakrate_thr is in B/s already
+ */
+ bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw);
+ }
+ if (bw == 0) {
+ /* We should not be at 0, go to the initial window then */
+ goto use_initial_window;
+ }
+ if (bw < 1)
+ /* Probably should panic */
+ bw = 1;
+ if (bw < min_bw)
+ bw = min_bw;
+ return (bw);
+}
+
+static inline uint64_t
+bbr_get_bw(struct tcp_bbr *bbr)
+{
+ uint64_t bw;
+
+ bw = __bbr_get_bw(bbr);
+ return (bw);
+}
+
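+/*
+ * Start a new long-term (lt) bandwidth sampling interval by snapshotting the
+ * current packet epoch, delivery time, and delivered/lost counts.
+ */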
+static inline void
+bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts)
+{
+ bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch;
+ bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time;
+ bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+}
+
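+/*
+ * Abandon any long-term bandwidth sample in progress: stop sampling, stop
+ * using lt_bw, and restart the interval bookkeeping.
+ */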
+static inline void
+bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts)
+{
+ bbr->rc_lt_is_sampling = 0;
+ bbr->rc_lt_use_bw = 0;
+ bbr->r_ctl.rc_lt_bw = 0;
+ bbr_reset_lt_bw_interval(bbr, cts);
+}
+
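+/*
+ * A long-term bandwidth sample has completed. If it is close to the previous
+ * sample (within bbr_lt_bw_diff absolute or 1/bbr_lt_bw_ratio relative), we
+ * conclude we are being policed and switch to the averaged lt_bw; otherwise
+ * remember this sample and start a new interval.
+ */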
+static inline void
+bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin)
+{
+ uint64_t diff;
+
+ /* Do we have a previous sample? */
+ if (bbr->r_ctl.rc_lt_bw) {
+ /* Get the diff in bytes per second */
+ if (bbr->r_ctl.rc_lt_bw > bw)
+ diff = bbr->r_ctl.rc_lt_bw - bw;
+ else
+ diff = bw - bbr->r_ctl.rc_lt_bw;
+ if ((diff <= bbr_lt_bw_diff) ||
+ (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) {
+ /* Consider us policed */
+ uint32_t saved_bw;
+
+ saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw;
+ bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2; /* average of two */
+ bbr->rc_lt_use_bw = 1;
+ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
+ /*
+			 * Use the pkt-based epoch to measure how long
+			 * we stay in the policed (lt_bw) state
+ */
+ bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch;
+ /*
+			 * reason 4 is we need to start considering
+			 * ourselves policed
+ */
+ bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin);
+ return;
+ }
+ }
+ bbr->r_ctl.rc_lt_bw = bw;
+ bbr_reset_lt_bw_interval(bbr, cts);
+ bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin);
+}
+
+/*
+ * RRS: Copied from user space!
+ * Calculate a uniformly distributed random number less than upper_bound
+ * avoiding "modulo bias".
+ *
+ * Uniformity is achieved by generating new random numbers until the one
+ * returned is outside the range [0, 2**32 % upper_bound). This
+ * guarantees the selected random number will be inside
+ * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound)
+ * after reduction modulo upper_bound.
+ */
+static uint32_t
+arc4random_uniform(uint32_t upper_bound)
+{
+ uint32_t r, min;
+
+ if (upper_bound < 2)
+ return 0;
+
+ /* 2**32 % x == (2**32 - x) % x */
+ min = -upper_bound % upper_bound;
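+	/*
+	 * For example, with upper_bound = 10: min = 2**32 % 10 = 6, so raw
+	 * values 0..5 are re-rolled and each residue 0..9 is reachable from
+	 * exactly (2**32 - 6) / 10 = 429496729 raw values.
+	 */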
+
+ /*
+ * This could theoretically loop forever but each retry has
+ * p > 0.5 (worst case, usually far better) of selecting a
+ * number inside the range we need, so it should rarely need
+ * to re-roll.
+ */
+ for (;;) {
+ r = arc4random();
+ if (r >= min)
+ break;
+ }
+
+ return r % upper_bound;
+}
+
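+/*
+ * Randomly discount the extra time carried into a probe_bw sub-state
+ * (rc_level_state_extra) by a 1/N fraction, with N drawn below bbr_rand_ot.
+ */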
+static void
+bbr_randomize_extra_state_time(struct tcp_bbr *bbr)
+{
+ uint32_t ran, deduct;
+
+ ran = arc4random_uniform(bbr_rand_ot);
+ if (ran) {
+ deduct = bbr->r_ctl.rc_level_state_extra / ran;
+ bbr->r_ctl.rc_level_state_extra -= deduct;
+ }
+}
+/*
+ * Return randomly the starting state
+ * to use in probebw.
+ */
+static uint8_t
+bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts)
+{
+ uint32_t ran;
+ uint8_t ret_val;
+
+ /* Initialize the offset to 0 */
+ bbr->r_ctl.rc_exta_time_gd = 0;
+ bbr->rc_hit_state_1 = 0;
+ bbr->r_ctl.rc_level_state_extra = 0;
+ ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1));
+ /*
+ * The math works funny here :) the return value is used to set the
+ * substate and then the state change is called which increments by
+ * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when
+ * we fully enter the state. Note that the (8 - 1 - ran) assures that
+	 * we return 1 - 7, so we don't return 0 and end up starting in
+ * state 1 (DRAIN).
+ */
+ ret_val = BBR_SUBSTATE_COUNT - 1 - ran;
+ /* Set an epoch */
+ if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP))
+ bbr_set_epoch(bbr, cts, __LINE__);
+
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ return (ret_val);
+}
+
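+/*
+ * Long-term (policer) bandwidth sampling. While lt_bw is in use, decide when
+ * to stop using it (or flag a false positive); otherwise gather loss-bounded
+ * delivery samples over a window of packet epochs and hand a completed
+ * sample to bbr_lt_bw_samp_done().
+ */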
+static void
+bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected)
+{
+ uint32_t diff, d_time;
+ uint64_t del_time, bw, lost, delivered;
+
+ if (bbr->r_use_policer == 0)
+ return;
+ if (bbr->rc_lt_use_bw) {
+ /* We are using lt bw do we stop yet? */
+ diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
+ if (diff > bbr_lt_bw_max_rtts) {
+ /* Reset it all */
+reset_all:
+ bbr_reset_lt_bw_sampling(bbr, cts);
+ if (bbr->rc_filled_pipe) {
+ bbr_set_epoch(bbr, cts, __LINE__);
+ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
+ bbr_substate_change(bbr, cts, __LINE__, 0);
+ bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ } else {
+ /*
+ * This should not happen really
+ * unless we remove the startup/drain
+ * restrictions above.
+ */
+ bbr->rc_bbr_state = BBR_STATE_STARTUP;
+ bbr_set_epoch(bbr, cts, __LINE__);
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
+ bbr_set_state_target(bbr, __LINE__);
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ }
+ /* reason 0 is to stop using lt-bw */
+ bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0);
+ return;
+ }
+ if (bbr_lt_intvl_fp == 0) {
+ /* Not doing false-postive detection */
+ return;
+ }
+ /* False positive detection */
+ if (diff == bbr_lt_intvl_fp) {
+ /* At bbr_lt_intvl_fp we record the lost */
+ bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+ } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) {
+ /* Now is our loss rate still high? */
+ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
+ delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
+ if ((delivered == 0) ||
+ (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) {
+ /* No still below our threshold */
+ bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0);
+ } else {
+ /* Yikes its still high, it must be a false positive */
+ bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0);
+ goto reset_all;
+ }
+ }
+ return;
+ }
+ /*
+ * Wait for the first loss before sampling, to let the policer
+ * exhaust its tokens and estimate the steady-state rate allowed by
+ * the policer. Starting samples earlier includes bursts that
+ * over-estimate the bw.
+ */
+ if (bbr->rc_lt_is_sampling == 0) {
+ /* reason 1 is to begin doing the sampling */
+ if (loss_detected == 0)
+ return;
+ bbr_reset_lt_bw_interval(bbr, cts);
+ bbr->rc_lt_is_sampling = 1;
+ bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0);
+ return;
+ }
+	/* Now how long were we delivering long term last? */
+ if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time))
+ d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time;
+ else
+ d_time = 0;
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (bbr->r_ctl.r_app_limited_until) {
+ /* Can not measure in app-limited state */
+ bbr_reset_lt_bw_sampling(bbr, cts);
+ /* reason 2 is to reset sampling due to app limits */
+ bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time);
+ return;
+ }
+ diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
+ if (diff < bbr_lt_intvl_min_rtts) {
+ /*
+ * need more samples (we don't
+ * start on a round like linux so
+ * we need 1 more).
+ */
+ /* 6 is not_enough time or no-loss */
+ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
+ return;
+ }
+ if (diff > (4 * bbr_lt_intvl_min_rtts)) {
+ /*
+ * For now if we wait too long, reset all sampling. We need
+		 * to do some research here; it's possible that we should
+		 * base this on how much loss has occurred, e.g. if it's
+		 * under 10% (or some thresh) reset all, otherwise don't.
+		 * That's for phase II I guess.
+ */
+ bbr_reset_lt_bw_sampling(bbr, cts);
+ /* reason 3 is to reset sampling due too long of sampling */
+ bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
+ return;
+ }
+ /*
+ * End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (loss_detected == 0) {
+ /* 6 is not_enough time or no-loss */
+ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
+ return;
+ }
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
+ delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
+ if ((delivered == 0) ||
+ (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) {
+ bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time);
+ return;
+ }
+ if (d_time < 1000) {
+ /* Not enough time, wait */
+ /* 6 is not_enough time or no-loss */
+ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
+ return;
+ }
+ if (d_time >= (0xffffffff / USECS_IN_MSEC)) {
+ /* Too long */
+ bbr_reset_lt_bw_sampling(bbr, cts);
+ /* reason 3 is to reset sampling due too long of sampling */
+ bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
+ return;
+ }
+ del_time = d_time;
+ bw = delivered;
+ bw *= (uint64_t)USECS_IN_SECOND;
+ bw /= del_time;
+ bbr_lt_bw_samp_done(bbr, bw, cts, d_time);
+}
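+
+/*
+ * Illustrative lt-bw arithmetic with assumed numbers (not taken from
+ * this change): if the sampling interval delivered 125,000 bytes over
+ * d_time = 100,000 usec, then bw = 125,000 * USECS_IN_SECOND / 100,000
+ * = 1,250,000 bytes/sec (about 10 Mb/s), which is the value handed to
+ * bbr_lt_bw_samp_done() above.
+ */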
+
+/*
+ * Allocate a sendmap from our zone.
+ */
+static struct bbr_sendmap *
+bbr_alloc(struct tcp_bbr *bbr)
+{
+ struct bbr_sendmap *rsm;
+
+ BBR_STAT_INC(bbr_to_alloc);
+ rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO));
+ if (rsm) {
+ bbr->r_ctl.rc_num_maps_alloced++;
+ return (rsm);
+ }
+ if (bbr->r_ctl.rc_free_cnt) {
+ BBR_STAT_INC(bbr_to_alloc_emerg);
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
+ TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
+ bbr->r_ctl.rc_free_cnt--;
+ return (rsm);
+ }
+ BBR_STAT_INC(bbr_to_alloc_failed);
+ return (NULL);
+}
+
+static struct bbr_sendmap *
+bbr_alloc_full_limit(struct tcp_bbr *bbr)
+{
+ if ((bbr_tcp_map_entries_limit > 0) &&
+ (bbr->r_ctl.rc_num_maps_alloced >= bbr_tcp_map_entries_limit)) {
+ BBR_STAT_INC(bbr_alloc_limited);
+ if (!bbr->alloc_limit_reported) {
+ bbr->alloc_limit_reported = 1;
+ BBR_STAT_INC(bbr_alloc_limited_conns);
+ }
+ return (NULL);
+ }
+ return (bbr_alloc(bbr));
+}
+
+
+/* wrapper to allocate a sendmap entry, subject to a specific limit */
+static struct bbr_sendmap *
+bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type)
+{
+ struct bbr_sendmap *rsm;
+
+ if (limit_type) {
+ /* currently there is only one limit type */
+ if (bbr_tcp_map_split_limit > 0 &&
+ bbr->r_ctl.rc_num_split_allocs >= bbr_tcp_map_split_limit) {
+ BBR_STAT_INC(bbr_split_limited);
+ if (!bbr->alloc_limit_reported) {
+ bbr->alloc_limit_reported = 1;
+ BBR_STAT_INC(bbr_alloc_limited_conns);
+ }
+ return (NULL);
+ }
+ }
+
+ /* allocate and mark in the limit type, if set */
+ rsm = bbr_alloc(bbr);
+ if (rsm != NULL && limit_type) {
+ rsm->r_limit_type = limit_type;
+ bbr->r_ctl.rc_num_split_allocs++;
+ }
+ return (rsm);
+}
+
+static void
+bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
+{
+ if (rsm->r_limit_type) {
+ /* currently there is only one limit type */
+ bbr->r_ctl.rc_num_split_allocs--;
+ }
+ if (rsm->r_is_smallmap)
+ bbr->r_ctl.rc_num_small_maps_alloced--;
+ if (bbr->r_ctl.rc_tlp_send == rsm)
+ bbr->r_ctl.rc_tlp_send = NULL;
+ if (bbr->r_ctl.rc_resend == rsm) {
+ bbr->r_ctl.rc_resend = NULL;
+ }
+ if (bbr->r_ctl.rc_next == rsm)
+ bbr->r_ctl.rc_next = NULL;
+ if (bbr->r_ctl.rc_sacklast == rsm)
+ bbr->r_ctl.rc_sacklast = NULL;
+ if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
+ memset(rsm, 0, sizeof(struct bbr_sendmap));
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
+ rsm->r_limit_type = 0;
+ bbr->r_ctl.rc_free_cnt++;
+ return;
+ }
+ bbr->r_ctl.rc_num_maps_alloced--;
+ uma_zfree(bbr_zone, rsm);
+}
+
+/*
+ * Returns the BDP.
+ */
+static uint64_t
+bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw) {
+ /*
+ * Calculate the bytes in flight needed given the bw (in bytes per
+ * second) and the specified rtt in useconds. We need to put out the
+ * returned value per RTT to match that rate. Gain will normally
+ * raise it up from there.
+ *
+ * This should not overflow as long as the bandwidth is below 1
+ * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller
+ * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30).
+ */
+ uint64_t usec_per_sec;
+
+ usec_per_sec = USECS_IN_SECOND;
+ return ((rtt * bw) / usec_per_sec);
+}
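+
+/*
+ * Worked example with assumed values (not from this change): with
+ * bw = 1,250,000 bytes/sec and rtt = 40,000 usec the BDP is
+ * (40,000 * 1,250,000) / 1,000,000 = 50,000 bytes, i.e. the amount
+ * that must be in flight each RTT to fill that rate before any gain
+ * is applied.
+ */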
+
+/*
+ * Return the initial cwnd.
+ */
+static uint32_t
+bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp)
+{
+ uint32_t i_cwnd;
+
+ if (bbr->rc_init_win) {
+ i_cwnd = bbr->rc_init_win * tp->t_maxseg;
+ } else if (V_tcp_initcwnd_segments)
+ i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
+ max(2 * tp->t_maxseg, 14600));
+ else if (V_tcp_do_rfc3390)
+ i_cwnd = min(4 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 4380));
+ else {
+ /* Per RFC5681 Section 3.1 */
+ if (tp->t_maxseg > 2190)
+ i_cwnd = 2 * tp->t_maxseg;
+ else if (tp->t_maxseg > 1095)
+ i_cwnd = 3 * tp->t_maxseg;
+ else
+ i_cwnd = 4 * tp->t_maxseg;
+ }
+ return (i_cwnd);
+}
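+
+/*
+ * Example with assumed tunables (not from this change): with
+ * t_maxseg = 1460 and V_tcp_initcwnd_segments = 10 the initial cwnd is
+ * min(10 * 1460, max(2 * 1460, 14600)) = 14,600 bytes; with the
+ * RFC 5681 fallback and the same maxseg it would instead be
+ * 3 * 1460 = 4,380 bytes.
+ */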
+
+/*
+ * Given a specified gain, return the target
+ * cwnd based on that gain.
+ */
+static uint32_t
+bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw)
+{
+ uint64_t bdp, rtt;
+ uint32_t cwnd;
+
+ if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) ||
+ (bbr_get_full_bw(bbr) == 0)) {
+ /* No measurements yet */
+ return (bbr_initial_cwnd(bbr, bbr->rc_tp));
+ }
+ /*
+ * Get bytes per RTT needed (rttProp is normally in
+ * bbr_cwndtarget_rtt_touse)
+ */
+ rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse);
+ /* Get the bdp from the two values */
+ bdp = bbr_get_bw_delay_prod(rtt, bw);
+ /* Now apply the gain */
+ cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT));
+
+ return (cwnd);
+}
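+
+/*
+ * Example with assumed values (not from this change): with a measured
+ * BDP of 50,000 bytes and a 2x gain (gain = 2 * BBR_UNIT) the raw
+ * target cwnd is ceil((50,000 * 2 * BBR_UNIT) / BBR_UNIT) =
+ * 100,000 bytes, independent of the actual value of BBR_UNIT.
+ */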
+
+static uint32_t
+bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain)
+{
+ uint32_t cwnd, mss;
+
+ mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
+ /* Get the base cwnd with gain rounded to a mss */
+ cwnd = roundup(bbr_get_raw_target_cwnd(bbr, gain, bw), mss);
+ /*
+ * Add in N quanta (2 by default, since we do not have an
+ * fq layer to trap packets in) per the I-D
+ * section 4.2.3.2 quanta adjustment.
+ */
+ cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs);
+ if (bbr->rc_use_google) {
+ if((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
+ (bbr_state_val(bbr) == BBR_SUB_GAIN)) {
+ /*
+ * The Linux implementation adds
+ * an extra 2 x mss in the gain cycle, which
+ * is documented nowhere except in the code,
+ * so we add more for Neal's undocumented feature.
+ */
+ cwnd += 2 * mss;
+ }
+ if ((cwnd / mss) & 0x1) {
+ /* Round up for odd num mss */
+ cwnd += mss;
+ }
+ }
+ /* Are we below the min cwnd? */
+ if (cwnd < get_min_cwnd(bbr))
+ return (get_min_cwnd(bbr));
+ return (cwnd);
+}
+
+static uint16_t
+bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain)
+{
+ if (gain < 1)
+ gain = 1;
+ return (gain);
+}
+
+static uint32_t
+bbr_get_header_oh(struct tcp_bbr *bbr)
+{
+ int seg_oh;
+
+ seg_oh = 0;
+ if (bbr->r_ctl.rc_inc_tcp_oh) {
+ /* Do we include TCP overhead? */
+ seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr));
+ }
+ if (bbr->r_ctl.rc_inc_ip_oh) {
+ /* Do we include IP overhead? */
+#ifdef INET6
+ if (bbr->r_is_v6)
+ seg_oh += sizeof(struct ip6_hdr);
+ else
+#endif
+#ifdef INET
+ seg_oh += sizeof(struct ip);
+#endif
+ }
+ if (bbr->r_ctl.rc_inc_enet_oh) {
+ /* Do we include the ethernet overhead? */
+ seg_oh += sizeof(struct ether_header);
+ }
+ return(seg_oh);
+}
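+
+/*
+ * Example with assumed sizes (not from this change): with 12 bytes of
+ * TCP options (rc_last_options), a 20-byte tcphdr, a 20-byte IPv4
+ * header and a 14-byte ether_header, enabling all three knobs adds
+ * 12 + 20 + 20 + 14 = 66 bytes of overhead per segment to the pacing
+ * length calculation below.
+ */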
+
+
+static uint32_t
+bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw)
+{
+ uint64_t divor, res, tim;
+
+ if (useconds_time == 0)
+ return (0);
+ gain = bbr_gain_adjust(bbr, gain);
+ divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT;
+ tim = useconds_time;
+ res = (tim * bw * gain) / divor;
+ if (res == 0)
+ res = 1;
+ return ((uint32_t)res);
+}
+
+/*
+ * Given a gain and a length return the delay in useconds that
+ * should be used to evenly space out packets
+ * on the connection (based on the gain factor).
+ */
+static uint32_t
+bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog)
+{
+ uint64_t bw, lentim, res;
+ uint32_t usecs, srtt, over = 0;
+ uint32_t seg_oh, num_segs, maxseg;
+
+ if (len == 0)
+ return (0);
+
+ maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
+ num_segs = (len + maxseg - 1) / maxseg;
+ if (bbr->rc_use_google == 0) {
+ seg_oh = bbr_get_header_oh(bbr);
+ len += (num_segs * seg_oh);
+ }
+ gain = bbr_gain_adjust(bbr, gain);
+ bw = bbr_get_bw(bbr);
+ if (bbr->rc_use_google) {
+ uint64_t cbw;
+
+ /*
+ * Reduce the b/w by the google discount
+ * factor 10 = 1%.
+ */
+ cbw = bw * (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount);
+ cbw /= (uint64_t)1000;
+ /* We don't apply a discount if it results in 0 */
+ if (cbw > 0)
+ bw = cbw;
+ }
+ lentim = ((uint64_t)len *
+ (uint64_t)USECS_IN_SECOND *
+ (uint64_t)BBR_UNIT);
+ res = lentim / ((uint64_t)gain * bw);
+ if (res == 0)
+ res = 1;
+ usecs = (uint32_t)res;
+ srtt = bbr_get_rtt(bbr, BBR_SRTT);
+ if (bbr_hptsi_max_mul && bbr_hptsi_max_div &&
+ (bbr->rc_use_google == 0) &&
+ (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) {
+ /*
+ * We cannot let the delay be more than 1/2 the srtt time.
+ * Otherwise we cannot pace out or send properly.
+ */
+ over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div;
+ BBR_STAT_INC(bbr_hpts_min_time);
+ }
+ if (!nolog)
+ bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1);
+ return (usecs);
+}
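+
+/*
+ * Pacing-delay example with assumed values (not from this change):
+ * sending len = 14,600 bytes at bw = 1,250,000 bytes/sec with a 1.0
+ * gain (gain = BBR_UNIT) yields 14,600 * 1,000,000 / 1,250,000 =
+ * 11,680 usec between bursts; a 2x gain halves that to 5,840 usec.
+ * The srtt-based cap above can then reduce it further.
+ */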
+
+static void
+bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack,
+ uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses)
+{
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ uint64_t bw;
+ uint32_t cwnd, target_cwnd, saved_bytes, maxseg;
+ int32_t meth;
+
+#ifdef NETFLIX_STATS
+ if ((tp->t_flags & TF_GPUTINPROG) &&
+ SEQ_GEQ(th->th_ack, tp->gput_ack)) {
+ /*
+ * Stretch acks and compressed acks will cause this to
+ * oscillate, but we are doing it the same way as the main
+ * stack so it will be comparable (though possibly not
+ * ideal).
+ */
+ int32_t cgput;
+ int64_t gput, time_stamp;
+
+ gput = (int64_t) (th->th_ack - tp->gput_seq) * 8;
+ time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000));
+ cgput = gput / time_stamp;
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
+ cgput);
+ if (tp->t_stats_gput_prev > 0)
+ stats_voi_update_abs_s32(tp->t_stats,
+ VOI_TCP_GPUT_ND,
+ ((gput - tp->t_stats_gput_prev) * 100) /
+ tp->t_stats_gput_prev);
+ tp->t_flags &= ~TF_GPUTINPROG;
+ tp->t_stats_gput_prev = cgput;
+ }
+#endif
+ if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
+ ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
+ /* We don't change anything in probe-rtt */
+ return;
+ }
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ saved_bytes = bytes_this_ack;
+ bytes_this_ack += sack_changed;
+ if (bytes_this_ack > prev_acked) {
+ bytes_this_ack -= prev_acked;
+ /*
+ * A single byte ack'd gives us a full mss,
+ * to be like Linux, i.e. they count packets.
+ */
+ if ((bytes_this_ack < maxseg) && bbr->rc_use_google)
+ bytes_this_ack = maxseg;
+ } else {
+ /* Unlikely */
+ bytes_this_ack = 0;
+ }
+ cwnd = tp->snd_cwnd;
+ bw = get_filter_value(&bbr->r_ctl.rc_delrate);
+ if (bw)
+ target_cwnd = bbr_get_target_cwnd(bbr,
+ bw,
+ (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain);
+ else
+ target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp);
+ if (IN_RECOVERY(tp->t_flags) &&
+ (bbr->bbr_prev_in_rec == 0)) {
+ /*
+ * We are entering recovery and
+ * thus packet conservation.
+ */
+ bbr->pkt_conservation = 1;
+ bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime;
+ cwnd = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
+ bytes_this_ack;
+ }
+ if (IN_RECOVERY(tp->t_flags)) {
+ uint32_t flight;
+
+ bbr->bbr_prev_in_rec = 1;
+ if (cwnd > losses) {
+ cwnd -= losses;
+ if (cwnd < maxseg)
+ cwnd = maxseg;
+ } else
+ cwnd = maxseg;
+ flight = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ bbr_log_type_cwndupd(bbr, flight, 0,
+ losses, 10, 0, 0, line);
+ if (bbr->pkt_conservation) {
+ uint32_t time_in;
+
+ if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start))
+ time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start;
+ else
+ time_in = 0;
+
+ if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
+ /* Clear packet conservation after an rttProp */
+ bbr->pkt_conservation = 0;
+ } else {
+ if ((flight + bytes_this_ack) > cwnd)
+ cwnd = flight + bytes_this_ack;
+ if (cwnd < get_min_cwnd(bbr))
+ cwnd = get_min_cwnd(bbr);
+ tp->snd_cwnd = cwnd;
+ bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed,
+ prev_acked, 1, target_cwnd, th->th_ack, line);
+ return;
+ }
+ }
+ } else
+ bbr->bbr_prev_in_rec = 0;
+ if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) {
+ bbr->r_ctl.restrict_growth--;
+ if (bytes_this_ack > maxseg)
+ bytes_this_ack = maxseg;
+ }
+ if (bbr->rc_filled_pipe) {
+ /*
+ * Here we have exited startup and filled the pipe. We will
+ * thus allow the cwnd to shrink to the target. This is the
+ * path we hit most of the time.
+ */
+ uint32_t s_cwnd;
+
+ meth = 2;
+ s_cwnd = min((cwnd + bytes_this_ack), target_cwnd);
+ if (s_cwnd > cwnd)
+ cwnd = s_cwnd;
+ else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing)
+ cwnd = s_cwnd;
+ } else {
+ /*
+ * Here we are still in startup, we increase cwnd by what
+ * has been acked.
+ */
+ if ((cwnd < target_cwnd) ||
+ (bbr->rc_past_init_win == 0)) {
+ meth = 3;
+ cwnd += bytes_this_ack;
+ } else {
+ /*
+ * Method 4 means we are at target so no gain in
+ * startup and past the initial window.
+ */
+ meth = 4;
+ }
+ }
+ tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr));
+ bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line);
+}
+
+static void
+tcp_bbr_partialack(struct tcpcb *tp)
+{
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
+ tp->snd_cwnd) {
+ bbr->r_wanted_output = 1;
+ }
+}
+
+static void
+bbr_post_recovery(struct tcpcb *tp)
+{
+ struct tcp_bbr *bbr;
+ uint32_t flight;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ /*
+ * Here we just exit recovery.
+ */
+ EXIT_RECOVERY(tp->t_flags);
+ /* Lock in our b/w reduction for the specified number of pkt-epochs */
+ bbr->r_recovery_bw = 0;
+ tp->snd_recover = tp->snd_una;
+ tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
+ bbr->pkt_conservation = 0;
+ if (bbr->rc_use_google == 0) {
+ /*
+ * For non-google mode lets
+ * go ahead and make sure we clear
+ * the recovery state so if we
+ * bounce back in to recovery we
+ * will do PC.
+ */
+ bbr->bbr_prev_in_rec = 0;
+ }
+ bbr_log_type_exit_rec(bbr);
+ if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__);
+ } else {
+ /* For probe-rtt case lets fix up its saved_cwnd */
+ if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) {
+ bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__);
+ }
+ }
+ flight = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ if ((bbr->rc_use_google == 0) &&
+ bbr_do_red) {
+ uint64_t val, lr2use;
+ uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd;
+ uint32_t *cwnd_p;
+
+ if (bbr_get_rtt(bbr, BBR_SRTT)) {
+ val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000);
+ val /= bbr_get_rtt(bbr, BBR_SRTT);
+ ratio = (uint32_t)val;
+ } else
+ ratio = 1000;
+
+ bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div,
+ bbr->r_ctl.recovery_lr, 21,
+ ratio,
+ bbr->r_ctl.rc_red_cwnd_pe,
+ __LINE__);
+ if ((ratio < bbr_do_red) || (bbr_do_red == 0))
+ goto done;
+ if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
+ bbr_prtt_slam_cwnd) ||
+ (bbr_sub_drain_slam_cwnd &&
+ (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
+ bbr->rc_hit_state_1 &&
+ (bbr_state_val(bbr) == BBR_SUB_DRAIN)) ||
+ ((bbr->rc_bbr_state == BBR_STATE_DRAIN) &&
+ bbr_slam_cwnd_in_main_drain)) {
+ /*
+ * Here we must poke at the saved cwnd
+ * as well as the cwnd.
+ */
+ cwnd = bbr->r_ctl.rc_saved_cwnd;
+ cwnd_p = &bbr->r_ctl.rc_saved_cwnd;
+ } else {
+ cwnd = tp->snd_cwnd;
+ cwnd_p = &tp->snd_cwnd;
+ }
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ /* Add the overall lr with the recovery lr */
+ if (bbr->r_ctl.rc_lost == 0)
+ lr2use = 0;
+ else if (bbr->r_ctl.rc_delivered == 0)
+ lr2use = 1000;
+ else {
+ lr2use = bbr->r_ctl.rc_lost * 1000;
+ lr2use /= bbr->r_ctl.rc_delivered;
+ }
+ lr2use += bbr->r_ctl.recovery_lr;
+ acks_inflight = (flight / (maxseg * 2));
+ if (bbr_red_scale) {
+ lr2use *= bbr_get_rtt(bbr, BBR_SRTT);
+ lr2use /= bbr_red_scale;
+ if ((bbr_red_growth_restrict) &&
+ ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1))
+ bbr->r_ctl.restrict_growth += acks_inflight;
+ }
+ if (lr2use) {
+ val = (uint64_t)cwnd * lr2use;
+ val /= 1000;
+ if (cwnd > val)
+ newcwnd = roundup((cwnd - val), maxseg);
+ else
+ newcwnd = maxseg;
+ } else {
+ val = (uint64_t)cwnd * (uint64_t)bbr_red_mul;
+ val /= (uint64_t)bbr_red_div;
+ newcwnd = roundup((uint32_t)val, maxseg);
+ }
+ /* with standard delayed acks how many acks can I expect? */
+ if (bbr_drop_limit == 0) {
+ /*
+ * Anticipate how much we will
+ * raise the cwnd based on the acks.
+ */
+ if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) {
+ /* We do enforce the min (with the acks) */
+ newcwnd = (get_min_cwnd(bbr) - acks_inflight);
+ }
+ } else {
+ /*
+ * A strict drop limit of N is in place.
+ */
+ if (newcwnd < (bbr_drop_limit * maxseg)) {
+ newcwnd = bbr_drop_limit * maxseg;
+ }
+ }
+ /* For the next N acks do we restrict the growth */
+ *cwnd_p = newcwnd;
+ if (tp->snd_cwnd > newcwnd)
+ tp->snd_cwnd = newcwnd;
+ bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22,
+ (uint32_t)lr2use,
+ bbr_get_rtt(bbr, BBR_SRTT), __LINE__);
+ bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch;
+ }
+done:
+ bbr->r_ctl.recovery_lr = 0;
+ if (flight <= tp->snd_cwnd) {
+ bbr->r_wanted_output = 1;
+ }
+ tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
+}
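+
+/*
+ * RED-style reduction example with assumed values (not from this
+ * change, and assuming bbr_red_scale is 0 so no srtt scaling): with
+ * rc_lost = 10,000, rc_delivered = 1,000,000 and recovery_lr = 40 the
+ * combined loss rate is 10 + 40 = 50 (i.e. 5.0%), so a 100,000-byte
+ * cwnd is cut by 100,000 * 50 / 1000 = 5,000 bytes and then rounded
+ * up to a 1460-byte maxseg: roundup(95,000, 1460) = 96,360 bytes.
+ */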
+
+static void
+bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts)
+{
+ bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
+ /* Limit the drop in b/w to 1/2 our current filter. */
+ if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate)
+ bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate;
+ if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2))
+ bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2;
+ tcp_bbr_tso_size_check(bbr, cts);
+}
+
+static void
+bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm)
+{
+ struct tcp_bbr *bbr;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ switch (type) {
+ case CC_NDUPACK:
+ if (!IN_RECOVERY(tp->t_flags)) {
+ tp->snd_recover = tp->snd_max;
+ /* Start a new epoch */
+ bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
+ if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) {
+ /*
+ * Move forward the lt epoch
+ * so it won't count the truncated
+ * epoch.
+ */
+ bbr->r_ctl.rc_lt_epoch++;
+ }
+ if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
+ /*
+ * Just like the policer detection code
+ * if we are in startup we must push
+ * forward the last startup epoch
+ * to hide the truncated PE.
+ */
+ bbr->r_ctl.rc_bbr_last_startup_epoch++;
+ }
+ bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd;
+ ENTER_RECOVERY(tp->t_flags);
+ bbr->rc_tlp_rtx_out = 0;
+ bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate;
+ tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
+ if (bbr->rc_inp->inp_in_hpts &&
+ ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) {
+ /*
+ * When we enter recovery, we need to restart
+ * any timers. This may mean we gain an agg
+ * early, which will be made up for at the last
+ * rxt out.
+ */
+ bbr->rc_timer_first = 1;
+ bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
+ }
+ /*
+ * Calculate a new cwnd based on the current
+ * delivery rate with no gain. We get the bdp
+ * without gaining it up like we normally would and
+ * we use the last cur_del_rate.
+ */
+ if ((bbr->rc_use_google == 0) &&
+ (bbr->r_ctl.bbr_rttprobe_gain_val ||
+ (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) {
+ tp->snd_cwnd = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
+ (tp->t_maxseg - bbr->rc_last_options);
+ if (tp->snd_cwnd < get_min_cwnd(bbr)) {
+ /* We always gate to min cwnd */
+ tp->snd_cwnd = get_min_cwnd(bbr);
+ }
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__);
+ }
+ bbr_log_type_enter_rec(bbr, rsm->r_start);
+ }
+ break;
+ case CC_RTO_ERR:
+ TCPSTAT_INC(tcps_sndrexmitbad);
+ /* RTO was unnecessary, so reset everything. */
+ bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime);
+ if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ tp->snd_recover = tp->snd_recover_prev;
+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__);
+ }
+ tp->t_badrxtwin = 0;
+ break;
+ }
+}
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * following conditions are met:
+ * - There is no delayed ack timer in progress.
+ * - Our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ * - LRO wasn't used for this segment. We make sure by checking that the
+ * segment size is not larger than the MSS.
+ * - Delayed acks are enabled or this is a half-synchronized T/TCP
+ * connection.
+ * - The data being acked is less than a full segment (a stretch ack
+ * of more than a segment should be acked immediately).
+ * - nsegs is 1 (if its more than that we received more than 1 ack).
+ */
+#define DELAY_ACK(tp, bbr, nsegs) \
+ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
+ ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) && \
+ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
+
+/*
+ * Return the lowest RSM in the map of
+ * packets still in flight that is not acked.
+ * This should normally find on the first one
+ * since we remove packets from the send
+ * map after they are marked ACKED.
+ */
+static struct bbr_sendmap *
+bbr_find_lowest_rsm(struct tcp_bbr *bbr)
+{
+ struct bbr_sendmap *rsm;
+
+ /*
+ * Walk the time-order transmitted list looking for an rsm that is
+ * not acked. This will be the one that was sent the longest time
+ * ago that is still outstanding.
+ */
+ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) {
+ if (rsm->r_flags & BBR_ACKED) {
+ continue;
+ }
+ goto finish;
+ }
+finish:
+ return (rsm);
+}
+
+static struct bbr_sendmap *
+bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
+{
+ struct bbr_sendmap *prsm;
+
+ /*
+ * Walk the sequence order list backward until we arrive at
+ * the highest seq not acked. In theory, when this is called it
+ * should be the last segment (but it was not, hence the walk).
+ */
+ prsm = rsm;
+ TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
+ if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) {
+ continue;
+ }
+ return (prsm);
+ }
+ return (NULL);
+}
+
+/*
+ * Returns to the caller the number of microseconds that
+ * the packet can be outstanding before we think we
+ * should have had an ack returned.
+ */
+static uint32_t
+bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm)
+{
+ /*
+ * lro is the flag we use to determine if we have seen reordering.
+ * If it gets set we have seen reordering. The reorder logic either
+ * works in one of two ways:
+ *
+ * If reorder-fade is configured, then we track the last time we saw
+ * re-ordering occur. If we reach the point where enough time as
+ * passed we no longer consider reordering has occuring.
+ *
+ * Or if reorder-face is 0, then once we see reordering we consider
+ * the connection to alway be subject to reordering and just set lro
+ * to 1.
+ *
+ * In the end if lro is non-zero we add the extra time for
+ * reordering in.
+ */
+ int32_t lro;
+ uint32_t thresh, t_rxtcur;
+
+ if (srtt == 0)
+ srtt = 1;
+ if (bbr->r_ctl.rc_reorder_ts) {
+ if (bbr->r_ctl.rc_reorder_fade) {
+ if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) {
+ lro = cts - bbr->r_ctl.rc_reorder_ts;
+ if (lro == 0) {
+ /*
+ * No time has passed since the last
+ * reorder; mark it as reordering.
+ */
+ lro = 1;
+ }
+ } else {
+ /* Negative time? */
+ lro = 0;
+ }
+ if (lro > bbr->r_ctl.rc_reorder_fade) {
+ /* Turn off reordering seen too */
+ bbr->r_ctl.rc_reorder_ts = 0;
+ lro = 0;
+ }
+ } else {
+ /* Reordering does not fade */
+ lro = 1;
+ }
+ } else {
+ lro = 0;
+ }
+ thresh = srtt + bbr->r_ctl.rc_pkt_delay;
+ if (lro) {
+ /* It must be set, if not you get 1/4 rtt */
+ if (bbr->r_ctl.rc_reorder_shift)
+ thresh += (srtt >> bbr->r_ctl.rc_reorder_shift);
+ else
+ thresh += (srtt >> 2);
+ } else {
+ thresh += 1000;
+ }
+ /* We don't let the rack timeout be above a RTO */
+ if ((bbr->rc_tp)->t_srtt == 0)
+ t_rxtcur = BBR_INITIAL_RTO;
+ else
+ t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
+ if (thresh > t_rxtcur) {
+ thresh = t_rxtcur;
+ }
+ /* And we don't want it above the RTO max either */
+ if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
+ thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
+ }
+ bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK);
+ return (thresh);
+}
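+
+/*
+ * Threshold example with assumed values (not from this change): with
+ * srtt = 40,000 usec, rc_pkt_delay = 0 and no reordering observed the
+ * RACK threshold is 40,000 + 1,000 = 41,000 usec; if reordering is
+ * active with rc_reorder_shift = 2 it becomes 40,000 + 40,000/4 =
+ * 50,000 usec instead, and either value is then clamped to t_rxtcur
+ * and the RTO max.
+ */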
+
+/*
+ * Return to the caller the amount of time in microseconds
+ * that should be used for the TLP timer from the last
+ * send time of this packet.
+ */
+static uint32_t
+bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
+ struct bbr_sendmap *rsm, uint32_t srtt,
+ uint32_t cts)
+{
+ uint32_t thresh, len, maxseg, t_rxtcur;
+ struct bbr_sendmap *prsm;
+
+ if (srtt == 0)
+ srtt = 1;
+ if (bbr->rc_tlp_threshold)
+ thresh = srtt + (srtt / bbr->rc_tlp_threshold);
+ else
+ thresh = (srtt * 2);
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ /* Get the previous sent packet, if any */
+ len = rsm->r_end - rsm->r_start;
+
+ /* 2.1 behavior */
+ prsm = TAILQ_PREV(rsm, bbr_head, r_tnext);
+ if (prsm && (len <= maxseg)) {
+ /*
+ * Two packets outstanding, thresh should be (2*srtt) +
+ * possible inter-packet delay (if any).
+ */
+ uint32_t inter_gap = 0;
+ int idx, nidx;
+
+ idx = rsm->r_rtr_cnt - 1;
+ nidx = prsm->r_rtr_cnt - 1;
+ if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) {
+ /* Yes it was sent later (or at the same time) */
+ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
+ }
+ thresh += inter_gap;
+ } else if (len <= maxseg) {
+ /*
+ * Possibly compensate for delayed-ack.
+ */
+ uint32_t alt_thresh;
+
+ alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time;
+ if (alt_thresh > thresh)
+ thresh = alt_thresh;
+ }
+ /* Not above the current RTO */
+ if (tp->t_srtt == 0)
+ t_rxtcur = BBR_INITIAL_RTO;
+ else
+ t_rxtcur = TICKS_2_USEC(tp->t_rxtcur);
+
+ bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP);
+ /* Not above an RTO */
+ if (thresh > t_rxtcur) {
+ thresh = t_rxtcur;
+ }
+ /* Not above a RTO max */
+ if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
+ thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
+ }
+ /* And now apply the user TLP min */
+ if (thresh < bbr_tlp_min) {
+ thresh = bbr_tlp_min;
+ }
+ return (thresh);
+}
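+
+/*
+ * TLP example with assumed values (not from this change): with
+ * srtt = 40,000 usec and rc_tlp_threshold = 0 the base threshold is
+ * 2 * 40,000 = 80,000 usec; with only a single small segment
+ * outstanding the delayed-ack alternative is 40,000 + 20,000 +
+ * bbr_delayed_ack_time, and the larger of the two is used before the
+ * t_rxtcur, RTO max and bbr_tlp_min clamps are applied.
+ */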
+
+/*
+ * Return one of three RTTs to use (in microseconds).
+ */
+static __inline uint32_t
+bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type)
+{
+ uint32_t f_rtt;
+ uint32_t srtt;
+
+ f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
+ if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) {
+ /* We have no rtt at all */
+ if (bbr->rc_tp->t_srtt == 0)
+ f_rtt = BBR_INITIAL_RTO;
+ else
+ f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
+ /*
+ * Since we don't know how good the rtt is, apply a
+ * delayed-ack minimum.
+ */
+ if (f_rtt < bbr_delayed_ack_time) {
+ f_rtt = bbr_delayed_ack_time;
+ }
+ }
+ /* Take the filter version or last measured pkt-rtt */
+ if (rtt_type == BBR_RTT_PROP) {
+ srtt = f_rtt;
+ } else if (rtt_type == BBR_RTT_PKTRTT) {
+ if (bbr->r_ctl.rc_pkt_epoch_rtt) {
+ srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
+ } else {
+ /* No pkt rtt yet */
+ srtt = f_rtt;
+ }
+ } else if (rtt_type == BBR_RTT_RACK) {
+ srtt = bbr->r_ctl.rc_last_rtt;
+ /* We need to add in any internal delay for our timer */
+ if (bbr->rc_ack_was_delayed)
+ srtt += bbr->r_ctl.rc_ack_hdwr_delay;
+ } else if (rtt_type == BBR_SRTT) {
+ srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
+ } else {
+ /* TSNH */
+ srtt = f_rtt;
+#ifdef BBR_INVARIANTS
+ panic("Unknown rtt request type %d", rtt_type);
+#endif
+ }
+ return (srtt);
+}
+
+static int
+bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts)
+{
+ uint32_t thresh;
+
+
+ thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK),
+ cts, rsm);
+ if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) {
+ /* It is lost (past time) */
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Return a sendmap if we need to retransmit something.
+ */
+static struct bbr_sendmap *
+bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ /*
+ * Check to see whether we need to fall into recovery. We will
+ * need to do so if our oldest transmission is past the time we should
+ * have had an ack.
+ */
+
+ struct bbr_sendmap *rsm;
+ int32_t idx;
+
+ if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) {
+ /* Nothing outstanding that we know of */
+ return (NULL);
+ }
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
+ if (rsm == NULL) {
+ /* Nothing in the transmit map */
+ return (NULL);
+ }
+ if (tp->t_flags & TF_SENTFIN) {
+ /* Fin restricted, don't find anything once a fin is sent */
+ return (NULL);
+ }
+ if (rsm->r_flags & BBR_ACKED) {
+ /*
+ * Ok, the first one is acked (this really should not happen
+ * since we remove them from the tmap once they are acked).
+ */
+ rsm = bbr_find_lowest_rsm(bbr);
+ if (rsm == NULL)
+ return (NULL);
+ }
+ idx = rsm->r_rtr_cnt - 1;
+ if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) {
+ /* Send timestamp is the same or less? can't be ready */
+ return (NULL);
+ }
+ /* Get our RTT time */
+ if (bbr_is_lost(bbr, rsm, cts) &&
+ ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ (rsm->r_flags & BBR_SACK_PASSED))) {
+ if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
+ rsm->r_flags |= BBR_MARKED_LOST;
+ bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
+ bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
+ }
+ bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm);
+#ifdef BBR_INVARIANTS
+ if ((rsm->r_end - rsm->r_start) == 0)
+ panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm);
+#endif
+ return (rsm);
+ }
+ return (NULL);
+}
+
+/*
+ * RACK Timer, here we simply do logging and house keeping.
+ * the normal bbr_output_wtime() function will call the
+ * appropriate thing to check if we need to do a RACK retransmit.
+ * We return 1, saying don't proceed with bbr_output_wtime only
+ * when all timers have been stopped (destroyed PCB?).
+ */
+static int
+bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ /*
+ * This timer simply provides an internal trigger to send out data.
+ * The check_recovery_mode call will see if there are needed
+ * retransmissions, if so we will enter fast-recovery. The output
+ * call may or may not do the same thing depending on sysctl
+ * settings.
+ */
+ uint32_t lost;
+
+ if (bbr->rc_all_timers_stopped) {
+ return (1);
+ }
+ if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
+ /* Its not time yet */
+ return (0);
+ }
+ BBR_STAT_INC(bbr_to_tot);
+ lost = bbr->r_ctl.rc_lost;
+ if (bbr->r_state && (bbr->r_state != tp->t_state))
+ bbr_set_state(tp, bbr, 0);
+ bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK);
+ if (bbr->r_ctl.rc_resend == NULL) {
+ /* Lets do the check here */
+ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
+ }
+ if (bbr_policer_call_from_rack_to)
+ bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
+ return (0);
+}
+
+static __inline void
+bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start)
+{
+ int idx;
+
+ nrsm->r_start = start;
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ /* We don't transfer forward the SYN flag */
+ nrsm->r_flags &= ~BBR_HAS_SYN;
+ /* We move forward the FIN flag, not that this should happen */
+ rsm->r_flags &= ~BBR_HAS_FIN;
+ nrsm->r_dupack = rsm->r_dupack;
+ nrsm->r_rtr_bytes = 0;
+ nrsm->r_is_gain = rsm->r_is_gain;
+ nrsm->r_is_drain = rsm->r_is_drain;
+ nrsm->r_delivered = rsm->r_delivered;
+ nrsm->r_ts_valid = rsm->r_ts_valid;
+ nrsm->r_del_ack_ts = rsm->r_del_ack_ts;
+ nrsm->r_del_time = rsm->r_del_time;
+ nrsm->r_app_limited = rsm->r_app_limited;
+ nrsm->r_first_sent_time = rsm->r_first_sent_time;
+ nrsm->r_flight_at_send = rsm->r_flight_at_send;
+ /* When we split a piece, the lower section loses any just_ret flag. */
+ nrsm->r_bbr_state = rsm->r_bbr_state;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+ rsm->r_end = nrsm->r_start;
+ idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
+ idx /= 8;
+ /* Check if we got too small */
+ if ((rsm->r_is_smallmap == 0) &&
+ ((rsm->r_end - rsm->r_start) <= idx)) {
+ bbr->r_ctl.rc_num_small_maps_alloced++;
+ rsm->r_is_smallmap = 1;
+ }
+ /* Check the new one as well */
+ if ((nrsm->r_end - nrsm->r_start) <= idx) {
+ bbr->r_ctl.rc_num_small_maps_alloced++;
+ nrsm->r_is_smallmap = 1;
+ }
+}
+
+static int
+bbr_sack_mergable(struct bbr_sendmap *at,
+ uint32_t start, uint32_t end)
+{
+ /*
+ * Given a sack block defined by
+ * start and end, and a current position
+ * 'at', return 1 if either side of 'at'
+ * shows that the block is mergeable
+ * to that side. To be mergeable a block
+ * must overlap with the start/end
+ * and be in the SACK'd state.
+ */
+ struct bbr_sendmap *l_rsm;
+ struct bbr_sendmap *r_rsm;
+
+ /* first get the either side blocks */
+ l_rsm = TAILQ_PREV(at, bbr_head, r_next);
+ r_rsm = TAILQ_NEXT(at, r_next);
+ if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) {
+ /* Potentially mergeable */
+ if ((l_rsm->r_end == start) ||
+ (SEQ_LT(start, l_rsm->r_end) &&
+ SEQ_GT(end, l_rsm->r_end))) {
+ /*
+ * map blk |------|
+ * sack blk |------|
+ * <or>
+ * map blk |------|
+ * sack blk |------|
+ */
+ return (1);
+ }
+ }
+ if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) {
+ /* Potentially mergeable */
+ if ((r_rsm->r_start == end) ||
+ (SEQ_LT(start, r_rsm->r_start) &&
+ SEQ_GT(end, r_rsm->r_start))) {
+ /*
+ * map blk |---------|
+ * sack blk |----|
+ * <or>
+ * map blk |---------|
+ * sack blk |-------|
+ */
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static struct bbr_sendmap *
+bbr_merge_rsm(struct tcp_bbr *bbr,
+ struct bbr_sendmap *l_rsm,
+ struct bbr_sendmap *r_rsm)
+{
+ /*
+ * We are merging two ack'd RSM's,
+ * the l_rsm is on the left (lower seq
+ * values) and the r_rsm is on the right
+ * (higher seq value). The simplest way
+ * to merge these is to move the right
+ * one into the left. I don't think there
+ * is any reason we need to try to find
+ * the oldest (or last oldest retransmitted).
+ */
+ l_rsm->r_end = r_rsm->r_end;
+ if (l_rsm->r_dupack < r_rsm->r_dupack)
+ l_rsm->r_dupack = r_rsm->r_dupack;
+ if (r_rsm->r_rtr_bytes)
+ l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
+ if (r_rsm->r_in_tmap) {
+ /* This really should not happen */
+ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext);
+ }
+ if (r_rsm->r_app_limited)
+ l_rsm->r_app_limited = r_rsm->r_app_limited;
+ /* Now the flags */
+ if (r_rsm->r_flags & BBR_HAS_FIN)
+ l_rsm->r_flags |= BBR_HAS_FIN;
+ if (r_rsm->r_flags & BBR_TLP)
+ l_rsm->r_flags |= BBR_TLP;
+ if (r_rsm->r_flags & BBR_RWND_COLLAPSED)
+ l_rsm->r_flags |= BBR_RWND_COLLAPSED;
+ if (r_rsm->r_flags & BBR_MARKED_LOST) {
+ /* This really should not happen */
+ bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start;
+ }
+ TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next);
+ if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
+ /* Transfer the split limit to the map we free */
+ r_rsm->r_limit_type = l_rsm->r_limit_type;
+ l_rsm->r_limit_type = 0;
+ }
+ bbr_free(bbr, r_rsm);
+ return(l_rsm);
+}
+
+/*
+ * TLP timer, here we simply set up what segment we want to
+ * have the TLP expire on; the normal bbr_output_wtime() will then
+ * send it out.
+ *
+ * We return 1, saying don't proceed with bbr_output_wtime only
+ * when all timers have been stopped (destroyed PCB?).
+ */
+static int
+bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ /*
+ * Tail Loss Probe.
+ */
+ struct bbr_sendmap *rsm = NULL;
+ struct socket *so;
+ uint32_t amm;
+ uint32_t out, avail;
+ uint32_t maxseg;
+ int collapsed_win = 0;
+
+ if (bbr->rc_all_timers_stopped) {
+ return (1);
+ }
+ if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
+ /* Its not time yet */
+ return (0);
+ }
+ if (bbr_progress_timeout_check(bbr)) {
+ tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
+ return (1);
+ }
+ /* Did we somehow get into persists? */
+ if (bbr->rc_in_persist) {
+ return (0);
+ }
+ if (bbr->r_state && (bbr->r_state != tp->t_state))
+ bbr_set_state(tp, bbr, 0);
+ BBR_STAT_INC(bbr_tlp_tot);
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+#ifdef KERN_TLS
+ if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
+ /*
+ * For hardware TLS we do *not* want to send
+ * new data.
+ */
+ goto need_retran;
+ }
+#endif
+ /*
+ * A TLP timer has expired. We have been idle for 2 rtts. So we now
+ * need to figure out how to force a full MSS segment out.
+ */
+ so = tp->t_inpcb->inp_socket;
+ avail = sbavail(&so->so_snd);
+ out = ctf_outstanding(tp);
+ if (out > tp->snd_wnd) {
+ /* special case, we need a retransmission */
+ collapsed_win = 1;
+ goto need_retran;
+ }
+ if (avail > out) {
+ /* New data is available */
+ amm = avail - out;
+ if (amm > maxseg) {
+ amm = maxseg;
+ } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
+ /* not enough to fill a MTU and no-delay is off */
+ goto need_retran;
+ }
+ /* Set the send-new override */
+ if ((out + amm) <= tp->snd_wnd) {
+ bbr->rc_tlp_new_data = 1;
+ } else {
+ goto need_retran;
+ }
+ bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
+ bbr->r_ctl.rc_last_tlp_seq = tp->snd_max;
+ bbr->r_ctl.rc_tlp_send = NULL;
+ /* cap any slots */
+ BBR_STAT_INC(bbr_tlp_newdata);
+ goto send;
+ }
+need_retran:
+ /*
+ * Ok we need to arrange the last un-acked segment to be re-sent, or
+ * optionally the first un-acked segment.
+ */
+ if (collapsed_win == 0) {
+ rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
+ if (rsm && (rsm->r_flags & (BBR_ACKED | BBR_HAS_FIN))) {
+ rsm = bbr_find_high_nonack(bbr, rsm);
+ }
+ if (rsm == NULL) {
+ goto restore;
+ }
+ } else {
+ /*
+ * We must find the last segment
+ * that was acceptable by the client.
+ */
+ TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
+ if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) {
+ /* Found one */
+ break;
+ }
+ }
+ if (rsm == NULL) {
+ /* None? if so send the first */
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ if (rsm == NULL)
+ goto restore;
+ }
+ }
+ if ((rsm->r_end - rsm->r_start) > maxseg) {
+ /*
+ * We need to split this, the last segment, in two.
+ */
+ struct bbr_sendmap *nrsm;
+
+ nrsm = bbr_alloc_full_limit(bbr);
+ if (nrsm == NULL) {
+ /*
+ * We can't get memory to split it, so we can either
+ * not split it or retransmit the whole piece; let's
+ * do the large send (BTLP :-) ).
+ */
+ goto go_for_it;
+ }
+ bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg));
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~BBR_HAS_FIN);
+ rsm = nrsm;
+ }
+go_for_it:
+ bbr->r_ctl.rc_tlp_send = rsm;
+ bbr->rc_tlp_rtx_out = 1;
+ if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) {
+ bbr->r_ctl.rc_tlp_seg_send_cnt++;
+ tp->t_rxtshift++;
+ } else {
+ bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
+ bbr->r_ctl.rc_tlp_seg_send_cnt = 1;
+ }
+send:
+ if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
+ /*
+ * Can't (re)transmit a segment we have retransmitted the
+ * max number of times. We need the retransmit timer to take over.
+ */
+restore:
+ bbr->rc_tlp_new_data = 0;
+ bbr->r_ctl.rc_tlp_send = NULL;
+ if (rsm)
+ rsm->r_flags &= ~BBR_TLP;
+ BBR_STAT_INC(bbr_tlp_retran_fail);
+ return (0);
+ } else if (rsm) {
+ rsm->r_flags |= BBR_TLP;
+ }
+ if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) &&
+ (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) {
+ /*
+ * We have retransmitted too many times for TLP. Switch to
+ * the regular RTO timer
+ */
+ goto restore;
+ }
+ bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP);
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
+ return (0);
+}
+
+/*
+ * Delayed ack timer, here we simply need to set the
+ * ACK_NOW flag and remove the DELACK flag. From there
+ * the output routine will send the ack out.
+ *
+ * We only return 1, saying don't proceed, if all timers
+ * are stopped (destroyed PCB?).
+ */
+static int
+bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ if (bbr->rc_all_timers_stopped) {
+ return (1);
+ }
+ bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK);
+ tp->t_flags &= ~TF_DELACK;
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_delack);
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
+ return (0);
+}
+
+/*
+ * Persists timer, here we simply need to set the
+ * FORCE-DATA flag; the output routine will then send
+ * the one-byte probe.
+ *
+ * We only return 1, saying don't proceed, if all timers
+ * are stopped (destroyed PCB?).
+ */
+static int
+bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ struct tcptemp *t_template;
+ int32_t retval = 1;
+
+ if (bbr->rc_all_timers_stopped) {
+ return (1);
+ }
+ if (bbr->rc_in_persist == 0)
+ return (0);
+ KASSERT(tp->t_inpcb != NULL,
+ ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
+ /*
+ * Persistence timer into zero window. Force a byte to be output, if
+ * possible.
+ */
+ bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST);
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
+ TCPSTAT_INC(tcps_persisttimeo);
+ /*
+ * Have we exceeded the user specified progress time?
+ */
+ if (bbr_progress_timeout_check(bbr)) {
+ tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
+ goto out;
+ }
+ /*
+ * Hack: if the peer is dead/unreachable, we do not time out if the
+ * window is closed. After a full backoff, drop the connection if
+ * the idle time (no responses to probes) reaches the maximum
+ * backoff that we would use if retransmitting.
+ */
+ if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+ (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
+ ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ TCPSTAT_INC(tcps_persistdrop);
+ tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
+ goto out;
+ }
+ if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) &&
+ tp->snd_una == tp->snd_max) {
+ bbr_exit_persist(tp, bbr, cts, __LINE__);
+ retval = 0;
+ goto out;
+ }
+ /*
+ * If the user has closed the socket then drop a persisting
+ * connection after a much reduced timeout.
+ */
+ if (tp->t_state > TCPS_CLOSE_WAIT &&
+ (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
+ TCPSTAT_INC(tcps_persistdrop);
+ tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
+ goto out;
+ }
+ t_template = tcpip_maketemplate(bbr->rc_inp);
+ if (t_template) {
+ tcp_respond(tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ tp->rcv_nxt, tp->snd_una - 1, 0);
+ /* This sends an ack */
+ if (tp->t_flags & TF_DELACK)
+ tp->t_flags &= ~TF_DELACK;
+ free(t_template, M_TEMP);
+ }
+ if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+ tp->t_rxtshift++;
+ bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0);
+out:
+ return (retval);
+}
+
+/*
+ * If a keepalive goes off, we had no other timers
+ * happening. We always return 1 here since this
+ * routine either drops the connection or sends
+ * out a segment via tcp_respond().
+ */
+static int
+bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ struct tcptemp *t_template;
+ struct inpcb *inp;
+
+ if (bbr->rc_all_timers_stopped) {
+ return (1);
+ }
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
+ inp = tp->t_inpcb;
+ bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP);
+ /*
+ * Keep-alive timer went off; send something or drop connection if
+ * idle for too long.
+ */
+ TCPSTAT_INC(tcps_keeptimeo);
+ if (tp->t_state < TCPS_ESTABLISHED)
+ goto dropit;
+ if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ tp->t_state <= TCPS_CLOSING) {
+ if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
+ goto dropit;
+ /*
+ * Send a packet designed to force a response if the peer is
+ * up and reachable: either an ACK if the connection is
+ * still alive, or an RST if the peer has closed the
+ * connection due to timeout or reboot. Using sequence
+ * number tp->snd_una-1 causes the transmitted zero-length
+ * segment to lie outside the receive window; by the
+ * protocol spec, this requires the correspondent TCP to
+ * respond.
+ */
+ TCPSTAT_INC(tcps_keepprobe);
+ t_template = tcpip_maketemplate(inp);
+ if (t_template) {
+ tcp_respond(tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ tp->rcv_nxt, tp->snd_una - 1, 0);
+ free(t_template, M_TEMP);
+ }
+ }
+ bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0);
+ return (1);
+dropit:
+ TCPSTAT_INC(tcps_keepdrops);
+ tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
+ return (1);
+}
+
+/*
+ * Retransmit helper function, clear up all the ack
+ * flags and take care of important book keeping.
+ */
+static void
+bbr_remxt_tmr(struct tcpcb *tp)
+{
+ /*
+ * The retransmit timer went off, all sack'd blocks must be
+ * un-acked.
+ */
+ struct bbr_sendmap *rsm, *trsm = NULL;
+ struct tcp_bbr *bbr;
+ uint32_t cts, lost;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ cts = tcp_get_usecs(&bbr->rc_tv);
+ lost = bbr->r_ctl.rc_lost;
+ if (bbr->r_state && (bbr->r_state != tp->t_state))
+ bbr_set_state(tp, bbr, 0);
+
+ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
+ if (rsm->r_flags & BBR_ACKED) {
+ uint32_t old_flags;
+
+ rsm->r_dupack = 0;
+ if (rsm->r_in_tmap == 0) {
+ /* We must re-add it back to the tlist */
+ if (trsm == NULL) {
+ TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ } else {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext);
+ }
+ rsm->r_in_tmap = 1;
+ }
+ old_flags = rsm->r_flags;
+ rsm->r_flags |= BBR_RXT_CLEARED;
+ rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS);
+ bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
+ } else {
+ if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
+ bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
+ bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
+ }
+ if (bbr_marks_rxt_sack_passed) {
+ /*
+ * With this option, we will rack out
+ * the rest of the packets in 1ms increments.
+ */
+ rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST;
+ rsm->r_flags &= ~BBR_WAS_SACKPASS;
+ } else {
+ /*
+ * With this option we only mark them lost
+ * and remove all sack'd markings. We will run
+ * another RXT or a TLP. This will cause
+ * us to eventually send more based on what
+ * ack's come in.
+ */
+ rsm->r_flags |= BBR_MARKED_LOST;
+ rsm->r_flags &= ~BBR_WAS_SACKPASS;
+ rsm->r_flags &= ~BBR_SACK_PASSED;
+ }
+ }
+ trsm = rsm;
+ }
+ bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ /* Clear the count (we just un-acked them) */
+ bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR);
+ bbr->rc_tlp_new_data = 0;
+ bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
+ /* zap the behindness on a rxt */
+ bbr->r_ctl.rc_hptsi_agg_delay = 0;
+ bbr->r_agg_early_set = 0;
+ bbr->r_ctl.rc_agg_early = 0;
+ bbr->rc_tlp_rtx_out = 0;
+ bbr->r_ctl.rc_sacked = 0;
+ bbr->r_ctl.rc_sacklast = NULL;
+ bbr->r_timer_override = 1;
+ bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
+}
+
+/*
+ * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
+ * we will setup to retransmit the lowest seq number outstanding.
+ */
+static int
+bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ int32_t rexmt;
+ int32_t retval = 0;
+
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
+ if (bbr->rc_all_timers_stopped) {
+ return (1);
+ }
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_una == tp->snd_max)) {
+ /* Nothing outstanding .. nothing to do */
+ return (0);
+ }
+ /*
+ * Retransmission timer went off. Message has not been acked within
+ * retransmit interval. Back off to a longer retransmit interval
+ * and retransmit one segment.
+ */
+ if (bbr_progress_timeout_check(bbr)) {
+ retval = 1;
+ tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
+ goto out;
+ }
+ bbr_remxt_tmr(tp);
+ if ((bbr->r_ctl.rc_resend == NULL) ||
+ ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) {
+ /*
+ * If the rwnd collapsed on
+ * the one we are retransmitting
+ * it does not count against the
+ * rxt count.
+ */
+ tp->t_rxtshift++;
+ }
+ if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ tp->t_rxtshift = TCP_MAXRXTSHIFT;
+ TCPSTAT_INC(tcps_timeoutdrop);
+ retval = 1;
+ tcp_set_inp_to_drop(bbr->rc_inp,
+ (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
+ goto out;
+ }
+ if (tp->t_state == TCPS_SYN_SENT) {
+ /*
+ * If the SYN was retransmitted, indicate CWND to be limited
+ * to 1 segment in cc_conn_init().
+ */
+ tp->snd_cwnd = 1;
+ } else if (tp->t_rxtshift == 1) {
+ /*
+ * first retransmit; record ssthresh and cwnd so they can be
+ * recovered if this turns out to be a "bad" retransmit. A
+ * retransmit is considered "bad" if an ACK for this segment
+ * is received within RTT/2 interval; the assumption here is
+ * that the ACK was already in flight. See "On Estimating
+ * End-to-End Network Path Properties" by Allman and Paxson
+ * for more details.
+ */
+ tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
+ if (!IN_RECOVERY(tp->t_flags)) {
+ tp->snd_cwnd_prev = tp->snd_cwnd;
+ tp->snd_ssthresh_prev = tp->snd_ssthresh;
+ tp->snd_recover_prev = tp->snd_recover;
+ tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ tp->t_flags |= TF_PREVVALID;
+ } else {
+ tp->t_flags &= ~TF_PREVVALID;
+ }
+ tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
+ } else {
+ tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
+ tp->t_flags &= ~TF_PREVVALID;
+ }
+ TCPSTAT_INC(tcps_rexmttimeo);
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ TCPT_RANGESET(tp->t_rxtcur, rexmt,
+ MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms),
+ MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
+ /*
+ * We enter the path for PLMTUD if the connection is established or
+ * in FIN_WAIT_1 state; the reason for the latter is that if the
+ * amount of data we send is very small, we could send it in a couple
+ * of packets and proceed straight to FIN. In that case we won't
+ * catch the ESTABLISHED state.
+ */
+ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
+ || (tp->t_state == TCPS_FIN_WAIT_1))) {
+#ifdef INET6
+ int32_t isipv6;
+#endif
+
+ /*
+ * Idea here is that at each stage of mtu probe (usually,
+ * 1448 -> 1188 -> 524) should be given 2 chances to recover
+ * before further clamping down. 'tp->t_rxtshift % 2 == 0'
+ * should take care of that.
+ */
+ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
+ (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
+ (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
+ tp->t_rxtshift % 2 == 0)) {
+ /*
+ * Enter Path MTU Black-hole Detection mechanism: -
+ * Disable Path MTU Discovery (IP "DF" bit). -
+ * Reduce MTU to lower value than what we negotiated
+ * with peer.
+ */
+ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
+ /*
+ * Record that we may have found a black
+ * hole.
+ */
+ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
+ /* Keep track of previous MSS. */
+ tp->t_pmtud_saved_maxseg = tp->t_maxseg;
+ }
+ /*
+ * Reduce the MSS to blackhole value or to the
+ * default in an attempt to retransmit.
+ */
+#ifdef INET6
+ isipv6 = bbr->r_is_v6;
+ if (isipv6 &&
+ tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated);
+ } else if (isipv6) {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_v6mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch
+ * to minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated);
+ } else {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch
+ * to minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
+ }
+#endif
+ } else {
+ /*
+ * If further retransmissions are still unsuccessful
+ * with a lowered MTU, maybe this isn't a blackhole
+ * and we restore the previous MSS and blackhole
+ * detection flags. The limit '6' is determined by
+ * giving each probe stage (1448, 1188, 524) 2
+ * chances to recover.
+ */
+ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
+ (tp->t_rxtshift >= 6)) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
+ tp->t_maxseg = tp->t_pmtud_saved_maxseg;
+ TCPSTAT_INC(tcps_pmtud_blackhole_failed);
+ }
+ }
+ }
+ /*
+ * Disable RFC1323 and SACK if we haven't got any response to our
+ * third SYN to work-around some broken terminal servers (most of
+ * which have hopefully been retired) that have bad VJ header
+ * compression code which trashes TCP segments containing
+ * unknown-to-them TCP options.
+ */
+ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
+ (tp->t_rxtshift == 3))
+ tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
+ /*
+ * If we backed off this far, our srtt estimate is probably bogus.
+ * Clobber it so we'll take the next rtt measurement as our srtt;
+ * move the current srtt into rttvar to keep the current retransmit
+ * times until then.
+ */
+ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
+#ifdef INET6
+ if (bbr->r_is_v6)
+ in6_losing(tp->t_inpcb);
+ else
+#endif
+ in_losing(tp->t_inpcb);
+ tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
+ tp->t_srtt = 0;
+ }
+ sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
+ tp->snd_recover = tp->snd_max;
+ tp->t_flags |= TF_ACKNOW;
+ tp->t_rtttime = 0;
+out:
+ return (retval);
+}
+
+static int
+bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling)
+{
+ int32_t ret = 0;
+ int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
+
+ if (timers == 0) {
+ return (0);
+ }
+ if (tp->t_state == TCPS_LISTEN) {
+ /* no timers on listen sockets */
+ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
+ return (0);
+ return (1);
+ }
+ if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
+ uint32_t left;
+
+ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ ret = -1;
+ bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling);
+ return (0);
+ }
+ if (hpts_calling == 0) {
+ ret = -2;
+ bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling);
+ return (0);
+ }
+ /*
+ * Ok, our timer went off early and we are not being paced;
+ * false alarm, go back to sleep.
+ */
+ left = bbr->r_ctl.rc_timer_exp - cts;
+ ret = -3;
+ bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
+ tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left));
+ return (1);
+ }
+ bbr->rc_tmr_stopped = 0;
+ bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
+ if (timers & PACE_TMR_DELACK) {
+ ret = bbr_timeout_delack(tp, bbr, cts);
+ } else if (timers & PACE_TMR_PERSIT) {
+ ret = bbr_timeout_persist(tp, bbr, cts);
+ } else if (timers & PACE_TMR_RACK) {
+ bbr->r_ctl.rc_tlp_rxt_last_time = cts;
+ ret = bbr_timeout_rack(tp, bbr, cts);
+ } else if (timers & PACE_TMR_TLP) {
+ bbr->r_ctl.rc_tlp_rxt_last_time = cts;
+ ret = bbr_timeout_tlp(tp, bbr, cts);
+ } else if (timers & PACE_TMR_RXT) {
+ bbr->r_ctl.rc_tlp_rxt_last_time = cts;
+ ret = bbr_timeout_rxt(tp, bbr, cts);
+ } else if (timers & PACE_TMR_KEEP) {
+ ret = bbr_timeout_keepalive(tp, bbr, cts);
+ }
+ bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling);
+ return (ret);
+}
+
+static void
+bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
+{
+ if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ uint8_t hpts_removed = 0;
+
+ if (bbr->rc_inp->inp_in_hpts &&
+ (bbr->rc_timer_first == 1)) {
+ /*
+ * If we are canceling timers when we have the
+ * timer ahead of the output being paced, we also
+ * must remove ourselves from the hpts.
+ */
+ hpts_removed = 1;
+ tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
+ if (bbr->r_ctl.rc_last_delay_val) {
+ /* Update the last hptsi delay too */
+ uint32_t time_since_send;
+
+ if (TSTMP_GT(cts, bbr->rc_pacer_started))
+ time_since_send = cts - bbr->rc_pacer_started;
+ else
+ time_since_send = 0;
+ if (bbr->r_ctl.rc_last_delay_val > time_since_send) {
+ /* Cut down our slot time */
+ bbr->r_ctl.rc_last_delay_val -= time_since_send;
+ } else {
+ bbr->r_ctl.rc_last_delay_val = 0;
+ }
+ bbr->rc_pacer_started = cts;
+ }
+ }
+ bbr->rc_timer_first = 0;
+ bbr_log_to_cancel(bbr, line, cts, hpts_removed);
+ bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
+ bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
+ }
+}
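
The last_delay_val adjustment above can be read as: whatever part of the programmed pacing slot we have already waited out gets subtracted before the slot is re-armed. A hedged sketch with plain variables standing in for the bbr control block fields:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Reduce the remaining pacing slot by the time we have already
     * waited since the pacer was started (mirrors the logic above).
     */
    static uint32_t remaining_delay(uint32_t last_delay_val,
        uint32_t pacer_started, uint32_t cts)
    {
        uint32_t time_since_send = 0;

        if (cts > pacer_started)
            time_since_send = cts - pacer_started;
        if (last_delay_val > time_since_send)
            return (last_delay_val - time_since_send);
        return 0;
    }

    int main(void)
    {
        /* Programmed a 400us slot, 150us have already elapsed. */
        printf("%u\n", remaining_delay(400, 1000, 1150)); /* prints 250 */
        return 0;
    }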
+
+static void
+bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+{
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ bbr->rc_all_timers_stopped = 1;
+ return;
+}
+
+/*
+ * stop all timers always returning 0.
+ */
+static int
+bbr_stopall(struct tcpcb *tp)
+{
+ return (0);
+}
+
+static void
+bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
+{
+ return;
+}
+
+/*
+ * return true if a bbr timer (rack or tlp) is active.
+ */
+static int
+bbr_timer_active(struct tcpcb *tp, uint32_t timer_type)
+{
+ return (0);
+}
+
+static uint32_t
+bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts)
+{
+ struct bbr_sendmap *rsm;
+
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
+ if ((rsm == NULL) || (u_rsm == rsm))
+ return (cts);
+ return(rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
+}
+
+static void
+bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr,
+ struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time)
+{
+ int32_t idx;
+
+ rsm->r_rtr_cnt++;
+ rsm->r_dupack = 0;
+ if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) {
+ rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS;
+ rsm->r_flags |= BBR_OVERMAX;
+ }
+ if (rsm->r_flags & BBR_RWND_COLLAPSED) {
+ /* Take off the collapsed flag at rxt */
+ rsm->r_flags &= ~BBR_RWND_COLLAPSED;
+ }
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+		/* We have retransmitted; it's no longer lost */
+ rsm->r_flags &= ~BBR_MARKED_LOST;
+ bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
+ }
+ if (rsm->r_flags & BBR_RXT_CLEARED) {
+ /*
+		 * We hit an RXT timer on it and
+		 * cleared the "acked" flag.
+		 * We now have it going back into
+		 * flight, so we can remove the cleared
+		 * flag and possibly do accounting on
+		 * this piece.
+ */
+ rsm->r_flags &= ~BBR_RXT_CLEARED;
+ }
+ if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) {
+ bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
+ rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
+ }
+ idx = rsm->r_rtr_cnt - 1;
+ rsm->r_tim_lastsent[idx] = cts;
+ rsm->r_pacing_delay = pacing_time;
+ rsm->r_delivered = bbr->r_ctl.rc_delivered;
+ rsm->r_ts_valid = bbr->rc_ts_valid;
+ if (bbr->rc_ts_valid)
+ rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
+ if (bbr->r_ctl.r_app_limited_until)
+ rsm->r_app_limited = 1;
+ else
+ rsm->r_app_limited = 0;
+ if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
+ rsm->r_bbr_state = bbr_state_val(bbr);
+ else
+ rsm->r_bbr_state = 8;
+ if (rsm->r_flags & BBR_ACKED) {
+		/* Probably MTU discovery messing with us */
+ uint32_t old_flags;
+
+ old_flags = rsm->r_flags;
+ rsm->r_flags &= ~BBR_ACKED;
+ bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
+ bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+ if (bbr->r_ctl.rc_sacked == 0)
+ bbr->r_ctl.rc_sacklast = NULL;
+ }
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ }
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ if (rsm->r_flags & BBR_SACK_PASSED) {
+ /* We have retransmitted due to the SACK pass */
+ rsm->r_flags &= ~BBR_SACK_PASSED;
+ rsm->r_flags |= BBR_WAS_SACKPASS;
+ }
+ rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
+ rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
+ if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
+ rsm->r_is_gain = 1;
+ rsm->r_is_drain = 0;
+ } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
+ rsm->r_is_drain = 1;
+ rsm->r_is_gain = 0;
+ } else {
+ rsm->r_is_drain = 0;
+ rsm->r_is_gain = 0;
+ }
+ rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */
+}
+
+/*
+ * Returns 0, or the sequence where we stopped
+ * updating. We also update the lenp to be the amount
+ * of data left.
+ */
+
+static uint32_t
+bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr,
+ struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time)
+{
+ /*
+ * We (re-)transmitted starting at rsm->r_start for some length
+	 * (possibly less than r_end).
+ */
+ struct bbr_sendmap *nrsm;
+ uint32_t c_end;
+ int32_t len;
+
+ len = *lenp;
+ c_end = rsm->r_start + len;
+ if (SEQ_GEQ(c_end, rsm->r_end)) {
+ /*
+		 * We retransmitted the whole piece, or more than the
+		 * whole piece, slopping over into the next rsm.
+ */
+ bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
+ if (c_end == rsm->r_end) {
+ *lenp = 0;
+ return (0);
+ } else {
+ int32_t act_len;
+
+			/* Hangs over the end; return what's left */
+ act_len = rsm->r_end - rsm->r_start;
+ *lenp = (len - act_len);
+ return (rsm->r_end);
+ }
+ /* We don't get out of this block. */
+ }
+ /*
+ * Here we retransmitted less than the whole thing which means we
+ * have to split this into what was transmitted and what was not.
+ */
+ nrsm = bbr_alloc_full_limit(bbr);
+ if (nrsm == NULL) {
+ *lenp = 0;
+ return (0);
+ }
+ /*
+ * So here we are going to take the original rsm and make it what we
+ * retransmitted. nrsm will be the tail portion we did not
+ * retransmit. For example say the chunk was 1, 11 (10 bytes). And
+ * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
+ * 1, 6 and the new piece will be 6, 11.
+ */
+ bbr_clone_rsm(bbr, nrsm, rsm, c_end);
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
+ nrsm->r_dupack = 0;
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~BBR_HAS_FIN);
+ bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
+ *lenp = 0;
+ return (0);
+}
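
The split described in the comment (original 1..11 becoming 1..6 and 6..11) is just slicing the map entry at the retransmit boundary, with r_end exclusive. A toy illustration (not the kernel allocator or list handling):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy send-map entry; only the sequence bounds matter here. */
    struct entry {
        uint32_t r_start;
        uint32_t r_end;     /* first sequence of the NEXT entry */
    };

    /*
     * Split 'rsm' at 'cut': the original keeps [r_start, cut) (the part
     * just retransmitted) and the new entry keeps [cut, r_end).
     */
    static struct entry *split_entry(struct entry *rsm, uint32_t cut)
    {
        struct entry *nrsm = malloc(sizeof(*nrsm));

        if (nrsm == NULL)
            return NULL;
        nrsm->r_start = cut;
        nrsm->r_end = rsm->r_end;
        rsm->r_end = cut;
        return nrsm;
    }

    int main(void)
    {
        struct entry rsm = { .r_start = 1, .r_end = 11 };   /* 10 bytes: 1..10 */
        struct entry *tail = split_entry(&rsm, 6);          /* retransmitted 5 bytes */

        printf("head [%u,%u) tail [%u,%u)\n",
            rsm.r_start, rsm.r_end, tail->r_start, tail->r_end);
        free(tail);
        return 0;
    }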
+
+static uint64_t
+bbr_get_hardware_rate(struct tcp_bbr *bbr)
+{
+ uint64_t bw;
+
+ bw = bbr_get_bw(bbr);
+ bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN];
+ bw /= (uint64_t)BBR_UNIT;
+ return(bw);
+}
+
+static void
+bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts,
+ uint64_t act_rate, uint64_t rate_wanted)
+{
+ /*
+	 * We could not get a full gain's
+	 * worth of rate.
+ */
+ if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) {
+ /* we can't even get the real rate */
+ uint64_t red;
+
+ bbr->skip_gain = 1;
+ bbr->gain_is_limited = 0;
+ red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate;
+ if (red)
+ filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts);
+ } else {
+ /* We can use a lower gain */
+ bbr->skip_gain = 0;
+ bbr->gain_is_limited = 1;
+ }
+}
+
+static void
+bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts)
+{
+ const struct tcp_hwrate_limit_table *nrte;
+ int error, rate = -1;
+
+ if (bbr->r_ctl.crte == NULL)
+ return;
+ if ((bbr->rc_inp->inp_route.ro_rt == NULL) ||
+ (bbr->rc_inp->inp_route.ro_rt->rt_ifp == NULL)) {
+ /* Lost our routes? */
+ /* Clear the way for a re-attempt */
+ bbr->bbr_attempt_hdwr_pace = 0;
+lost_rate:
+ bbr->gain_is_limited = 0;
+ bbr->skip_gain = 0;
+ bbr->bbr_hdrw_pacing = 0;
+ counter_u64_add(bbr_flows_whdwr_pacing, -1);
+ counter_u64_add(bbr_flows_nohdwr_pacing, 1);
+ tcp_bbr_tso_size_check(bbr, cts);
+ return;
+ }
+ rate = bbr_get_hardware_rate(bbr);
+ nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte,
+ bbr->rc_tp,
+ bbr->rc_inp->inp_route.ro_rt->rt_ifp,
+ rate,
+ (RS_PACING_GEQ|RS_PACING_SUB_OK),
+ &error);
+ if (nrte == NULL) {
+ goto lost_rate;
+ }
+ if (nrte != bbr->r_ctl.crte) {
+ bbr->r_ctl.crte = nrte;
+ if (error == 0) {
+ BBR_STAT_INC(bbr_hdwr_rl_mod_ok);
+ if (bbr->r_ctl.crte->rate < rate) {
+ /* We have a problem */
+ bbr_setup_less_of_rate(bbr, cts,
+ bbr->r_ctl.crte->rate, rate);
+ } else {
+ /* We are good */
+ bbr->gain_is_limited = 0;
+ bbr->skip_gain = 0;
+ }
+ } else {
+ /* A failure should release the tag */
+ BBR_STAT_INC(bbr_hdwr_rl_mod_fail);
+ bbr->gain_is_limited = 0;
+ bbr->skip_gain = 0;
+ bbr->bbr_hdrw_pacing = 0;
+ }
+ bbr_type_log_hdwr_pacing(bbr,
+ bbr->r_ctl.crte->ptbl->rs_ifp,
+ rate,
+ ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate),
+ __LINE__,
+ cts,
+ error);
+ }
+}
+
+static void
+bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts)
+{
+ /*
+ * If we have hardware pacing support
+ * we need to factor that in for our
+ * TSO size.
+ */
+ const struct tcp_hwrate_limit_table *rlp;
+ uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay;
+
+ if ((bbr->bbr_hdrw_pacing == 0) ||
+ (IN_RECOVERY(bbr->rc_tp->t_flags)) ||
+ (bbr->r_ctl.crte == NULL))
+ return;
+ if (bbr->hw_pacing_set == 0) {
+ /* Not yet by the hdwr pacing count delay */
+ return;
+ }
+ if (bbr_hdwr_pace_adjust == 0) {
+ /* No adjustment */
+ return;
+ }
+ rlp = bbr->r_ctl.crte;
+ if (bbr->rc_tp->t_maxseg > bbr->rc_last_options)
+ maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
+ else
+ maxseg = BBR_MIN_SEG - bbr->rc_last_options;
+ /*
+	 * So let's first get the
+	 * time we will take between
+	 * TSO-sized sends currently, without
+	 * hardware help.
+ */
+ cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT,
+ bbr->r_ctl.rc_pace_max_segs, cts, 1);
+ hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg;
+ hdwr_delay *= rlp->time_between;
+ if (cur_delay > hdwr_delay)
+ delta = cur_delay - hdwr_delay;
+ else
+ delta = 0;
+ bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay,
+ (bbr->r_ctl.rc_pace_max_segs / maxseg),
+ 1);
+ if (delta &&
+ (delta < (max(rlp->time_between,
+ bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) {
+ /*
+		 * Now let's divide by the pacing
+		 * time between each segment the
+		 * hardware sends, rounding up, and
+		 * derive a byte count from that. We multiply
+		 * that by bbr_hdwr_pace_adjust to get
+		 * more bang for our buck.
+ *
+ * The goal is to have the software pacer
+ * waiting no more than an additional
+ * pacing delay if we can (without the
+ * compensation i.e. x bbr_hdwr_pace_adjust).
+ */
+ seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between),
+ (bbr->r_ctl.rc_pace_max_segs/maxseg));
+ seg_sz *= bbr_hdwr_pace_adjust;
+ if (bbr_hdwr_pace_floor &&
+ (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
+ /* Currently hardware paces
+ * out rs_min_seg segments at a time.
+ * We need to make sure we always send at least
+ * a full burst of bbr_hdwr_pace_floor down.
+ */
+ seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg;
+ }
+ seg_sz *= maxseg;
+ } else if (delta == 0) {
+ /*
+ * The highest pacing rate is
+ * above our b/w gained. This means
+ * we probably are going quite fast at
+ * the hardware highest rate. Lets just multiply
+ * the calculated TSO size by the
+ * multiplier factor (its probably
+ * 4 segments in the default config for
+ * mlx).
+ */
+ seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust;
+ if (bbr_hdwr_pace_floor &&
+ (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
+ /* Currently hardware paces
+ * out rs_min_seg segments at a time.
+ * We need to make sure we always send at least
+ * a full burst of bbr_hdwr_pace_floor down.
+ */
+ seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg;
+ }
+ } else {
+ /*
+ * The pacing time difference is so
+ * big that the hardware will
+		 * pace out more rapidly than we
+		 * really want and then we
+ * will have a long delay. Lets just keep
+ * the same TSO size so its as if
+ * we were not using hdwr pacing (we
+ * just gain a bit of spacing from the
+ * hardware if seg_sz > 1).
+ */
+ seg_sz = bbr->r_ctl.rc_pace_max_segs;
+ }
+ if (seg_sz > bbr->r_ctl.rc_pace_max_segs)
+ new_tso = seg_sz;
+ else
+ new_tso = bbr->r_ctl.rc_pace_max_segs;
+ if (new_tso >= (PACE_MAX_IP_BYTES-maxseg))
+ new_tso = PACE_MAX_IP_BYTES - maxseg;
+
+ if (new_tso != bbr->r_ctl.rc_pace_max_segs) {
+ bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0);
+ bbr->r_ctl.rc_pace_max_segs = new_tso;
+ }
+}
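
A rough userspace model of the sizing decision above; pick_tso_segs() and its parameters are illustrative stand-ins for the bbr fields (cur_delay for the software pacing gap, time_between for the NIC's per-segment gap, adjust for bbr_hdwr_pace_adjust), and the branch structure mirrors the arithmetic rather than claiming to be the stack's code:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t pick_tso_segs(uint32_t cur_delay, uint32_t time_between,
        uint32_t cur_segs, uint32_t adjust, uint32_t delay_target)
    {
        uint32_t hdwr_delay, delta, segs;

        hdwr_delay = cur_segs * time_between;
        delta = (cur_delay > hdwr_delay) ? (cur_delay - hdwr_delay) : 0;
        if (delta == 0) {
            /* The NIC already spaces the burst at least as long as we
             * would; just scale the current burst size up. */
            segs = cur_segs * adjust;
        } else if (delta < (time_between > delay_target ? time_between : delay_target)) {
            /* Small gap: size the burst so the hardware roughly covers
             * one software pacing delay, then apply the multiplier. */
            segs = (cur_delay + time_between) / time_between;
            if (segs < cur_segs)
                segs = cur_segs;
            segs *= adjust;
        } else {
            /* Gap too large: keep the software TSO size unchanged. */
            segs = cur_segs;
        }
        return segs;
    }

    int main(void)
    {
        printf("%u segs\n", pick_tso_segs(500, 100, 4, 2, 250)); /* prints 12 */
        return 0;
    }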
+
+static void
+tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts)
+{
+ uint64_t bw;
+ uint32_t old_tso = 0, new_tso;
+ uint32_t maxseg, bytes;
+ uint32_t tls_seg=0;
+ /*
+ * Google/linux uses the following algorithm to determine
+ * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18):
+ *
+ * bytes = bw_in_bytes_per_second / 1000
+ * bytes = min(bytes, 64k)
+ * tso_segs = bytes / MSS
+	 * if (bw < 1.2Mbps)
+ * min_tso_segs = 1
+ * else
+ * min_tso_segs = 2
+ * tso_segs = max(tso_segs, min_tso_segs)
+ *
+	 * Note that we apply a device specific limit (we apply this in
+	 * tcp_m_copym).
+ * Note that before the initial measurement is made google bursts out
+ * a full iwnd just like new-reno/cubic.
+ *
+ * We do not use this algorithm. Instead we
+ * use a two phased approach:
+ *
+ * if ( bw <= per-tcb-cross-over)
+ * goal_tso = calculate how much with this bw we
+ * can send in goal-time seconds.
+ * if (goal_tso > mss)
+ * seg = goal_tso / mss
+ * tso = seg * mss
+ * else
+ * tso = mss
+ * if (tso > per-tcb-max)
+ * tso = per-tcb-max
+ * else if ( bw > 512Mbps)
+ * tso = max-tso (64k/mss)
+ * else
+	 * goal_tso = bw / per-tcb-divisor
+ * seg = (goal_tso + mss-1)/mss
+ * tso = seg * mss
+ *
+ * if (tso < per-tcb-floor)
+ * tso = per-tcb-floor
+ * if (tso > per-tcb-utter_max)
+ * tso = per-tcb-utter_max
+ *
+	 * Note the default per-tcb-divisor is 1000 (same as Google);
+	 * the goal cross-over is 30Mbps, however. To recreate Google's
+	 * algorithm you need to set:
+ *
+ * cross-over = 23,168,000 bps
+ * goal-time = 18000
+ * per-tcb-max = 2
+ * per-tcb-divisor = 1000
+ * per-tcb-floor = 1
+ *
+ * This will get you "google bbr" behavior with respect to tso size.
+ *
+	 * Note we do not set any TSO size until we are past the initial
+	 * window. Before that we generally use either a single MSS
+	 * or we use the full IW size (so we burst an IW at a time).
+	 * Also note that Hardware-TLS is special and does alternate
+ * things to minimize PCI Bus Bandwidth use.
+ */
+
+ if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) {
+ maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
+ } else {
+ maxseg = BBR_MIN_SEG - bbr->rc_last_options;
+ }
+#ifdef KERN_TLS
+ if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
+ tls_seg = ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, bbr->rc_tp->snd_wnd);
+ bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options);
+ }
+#endif
+ old_tso = bbr->r_ctl.rc_pace_max_segs;
+ if (bbr->rc_past_init_win == 0) {
+ /*
+ * Not enough data has been acknowledged to make a
+ * judgement unless we are hardware TLS. Set up
+		 * the initial TSO based on whether we are sending a
+ * full IW at once or not.
+ */
+ if (bbr->rc_use_google)
+ bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2);
+ else if (bbr->bbr_init_win_cheat)
+ bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp);
+ else
+ bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
+ if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg)
+ bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg;
+#ifdef KERN_TLS
+ if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && tls_seg) {
+ /*
+ * For hardware TLS we set our min to the tls_seg size.
+ */
+ bbr->r_ctl.rc_pace_max_segs = tls_seg;
+ bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options;
+ }
+#endif
+ if (bbr->r_ctl.rc_pace_max_segs == 0) {
+ bbr->r_ctl.rc_pace_max_segs = maxseg;
+ }
+ bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0);
+#ifdef KERN_TLS
+ if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0)
+#endif
+ bbr_adjust_for_hw_pacing(bbr, cts);
+ return;
+ }
+ /**
+ * Now lets set the TSO goal based on our delivery rate in
+ * bytes per second. Note we only do this if
+ * we have acked at least the initial cwnd worth of data.
+ */
+ bw = bbr_get_bw(bbr);
+ if (IN_RECOVERY(bbr->rc_tp->t_flags) &&
+ (bbr->rc_use_google == 0)) {
+ /* We clamp to one MSS in recovery */
+ new_tso = maxseg;
+ } else if (bbr->rc_use_google) {
+ int min_tso_segs;
+
+ /* Google considers the gain too */
+ if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) {
+ bw *= bbr->r_ctl.rc_bbr_hptsi_gain;
+ bw /= BBR_UNIT;
+ }
+ bytes = bw / 1024;
+ if (bytes > (64 * 1024))
+ bytes = 64 * 1024;
+ new_tso = bytes / maxseg;
+ if (bw < ONE_POINT_TWO_MEG)
+ min_tso_segs = 1;
+ else
+ min_tso_segs = 2;
+ if (new_tso < min_tso_segs)
+ new_tso = min_tso_segs;
+ new_tso *= maxseg;
+ } else if (bbr->rc_no_pacing) {
+ new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg;
+ } else if (bw <= bbr->r_ctl.bbr_cross_over) {
+ /*
+		 * Calculate the worst-case b/w TSO if we are inserting no
+ * more than a delay_target number of TSO's.
+ */
+ uint32_t tso_len, min_tso;
+
+ tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw);
+ if (tso_len > maxseg) {
+ new_tso = tso_len / maxseg;
+ if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max)
+ new_tso = bbr->r_ctl.bbr_hptsi_segments_max;
+ new_tso *= maxseg;
+ } else {
+ /*
+ * less than a full sized frame yikes.. long rtt or
+ * low bw?
+ */
+ min_tso = bbr_minseg(bbr);
+ if ((tso_len > min_tso) && (bbr_all_get_min == 0))
+ new_tso = rounddown(tso_len, min_tso);
+ else
+ new_tso = min_tso;
+ }
+ } else if (bw > FIVETWELVE_MBPS) {
+ /*
+ * This guy is so fast b/w wise that we can TSO as large as
+ * possible of segments that the NIC will allow.
+ */
+ new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
+ } else {
+ /*
+ * This formula is based on attempting to send a segment or
+ * more every bbr_hptsi_per_second. The default is 1000
+ * which means you are targeting what you can send every 1ms
+ * based on the peers bw.
+ *
+ * If the number drops to say 500, then you are looking more
+ * at 2ms and you will raise how much we send in a single
+ * TSO thus saving CPU (less bbr_output_wtime() calls). The
+ * trade off of course is you will send more at once and
+ * thus tend to clump up the sends into larger "bursts"
+ * building a queue.
+ */
+ bw /= bbr->r_ctl.bbr_hptsi_per_second;
+ new_tso = roundup(bw, (uint64_t)maxseg);
+ /*
+ * Gate the floor to match what our lower than 48Mbps
+ * algorithm does. The ceiling (bbr_hptsi_segments_max) thus
+ * becomes the floor for this calculation.
+ */
+ if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg))
+ new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg);
+ }
+ if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor)))
+ new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor;
+ if (new_tso > PACE_MAX_IP_BYTES)
+ new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
+ /* Enforce an utter maximum if we are not HW-TLS */
+#ifdef KERN_TLS
+ if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0)
+#endif
+ if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) {
+ new_tso = bbr->r_ctl.bbr_utter_max * maxseg;
+ }
+#ifdef KERN_TLS
+ if (tls_seg) {
+ /*
+ * Lets move the output size
+ * up to 1 or more TLS record sizes.
+ */
+ uint32_t temp;
+
+ temp = roundup(new_tso, tls_seg);
+ new_tso = temp;
+ /* Back down if needed to under a full frame */
+ while (new_tso > PACE_MAX_IP_BYTES)
+ new_tso -= tls_seg;
+ }
+#endif
+ if (old_tso != new_tso) {
+ /* Only log changes */
+ bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0);
+ bbr->r_ctl.rc_pace_max_segs = new_tso;
+ }
+#ifdef KERN_TLS
+ if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) &&
+ tls_seg) {
+ bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options;
+ } else
+#endif
+ /* We have hardware pacing and not hardware TLS! */
+ bbr_adjust_for_hw_pacing(bbr, cts);
+}
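
For comparison, the "Google/linux" sizing quoted in the comment at the top of tcp_bbr_tso_size_check() can be written as a standalone helper; MSS and the 1.2Mbps cutoff below are illustrative constants, and this is the quoted formula, not the stack's own two-phase path:

    #include <stdint.h>
    #include <stdio.h>

    #define MSS 1448u                      /* illustrative segment size */
    #define ONE_POINT_TWO_MBPS 150000ull   /* ~1.2Mbps expressed in bytes/sec */

    /*
     * Target roughly 1ms worth of data, capped at 64KB, with a 1- or
     * 2-segment floor depending on the rate (sketch only).
     */
    static uint32_t google_tso_bytes(uint64_t bw_bytes_per_sec)
    {
        uint64_t bytes;
        uint32_t segs, min_segs;

        bytes = bw_bytes_per_sec / 1000;    /* ~1ms of data */
        if (bytes > 64 * 1024)
            bytes = 64 * 1024;
        segs = (uint32_t)(bytes / MSS);
        min_segs = (bw_bytes_per_sec < ONE_POINT_TWO_MBPS) ? 1 : 2;
        if (segs < min_segs)
            segs = min_segs;
        return segs * MSS;
    }

    int main(void)
    {
        printf("10Mbps -> %u bytes\n", google_tso_bytes(10ull * 1000 * 1000 / 8));
        printf("1Gbps  -> %u bytes\n", google_tso_bytes(1000ull * 1000 * 1000 / 8));
        return 0;
    }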
+
+static void
+bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len,
+ uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t cts,
+ struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc,
+ struct sockbuf *sb)
+{
+
+ struct bbr_sendmap *rsm, *nrsm;
+ register uint32_t snd_max, snd_una;
+ uint32_t pacing_time;
+ /*
+ * Add to the RACK log of packets in flight or retransmitted. If
+ * there is a TS option we will use the TS echoed, if not we will
+ * grab a TS.
+ *
+ * Retransmissions will increment the count and move the ts to its
+ * proper place. Note that if options do not include TS's then we
+ * won't be able to effectively use the ACK for an RTT on a retran.
+ *
+ * Notes about r_start and r_end. Lets consider a send starting at
+ * sequence 1 for 10 bytes. In such an example the r_start would be
+ * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
+ * This means that r_end is actually the first sequence for the next
+ * slot (11).
+ *
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (err) {
+ /*
+ * We don't log errors -- we could but snd_max does not
+ * advance in this case either.
+ */
+ return;
+ }
+ if (th_flags & TH_RST) {
+ /*
+ * We don't log resets and we return immediately from
+ * sending
+ */
+ *abandon = 1;
+ return;
+ }
+ snd_una = tp->snd_una;
+ if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) {
+ /*
+ * The call to bbr_log_output is made before bumping
+ * snd_max. This means we can record one extra byte on a SYN
+ * or FIN if seq_out is adding more on and a FIN is present
+ * (and we are not resending).
+ */
+ if (th_flags & TH_SYN)
+ len++;
+ if (th_flags & TH_FIN)
+ len++;
+ }
+ if (SEQ_LEQ((seq_out + len), snd_una)) {
+		/* Are we sending an old segment to induce an ack (keep-alive)? */
+ return;
+ }
+ if (SEQ_LT(seq_out, snd_una)) {
+ /* huh? should we panic? */
+ uint32_t end;
+
+ end = seq_out + len;
+ seq_out = snd_una;
+ len = end - seq_out;
+ }
+ snd_max = tp->snd_max;
+ if (len == 0) {
+ /* We don't log zero window probes */
+ return;
+ }
+ pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1);
+ /* First question is it a retransmission? */
+ if (seq_out == snd_max) {
+again:
+ rsm = bbr_alloc(bbr);
+ if (rsm == NULL) {
+ return;
+ }
+ rsm->r_flags = 0;
+ if (th_flags & TH_SYN)
+ rsm->r_flags |= BBR_HAS_SYN;
+ if (th_flags & TH_FIN)
+ rsm->r_flags |= BBR_HAS_FIN;
+ rsm->r_tim_lastsent[0] = cts;
+ rsm->r_rtr_cnt = 1;
+ rsm->r_rtr_bytes = 0;
+ rsm->r_start = seq_out;
+ rsm->r_end = rsm->r_start + len;
+ rsm->r_dupack = 0;
+ rsm->r_delivered = bbr->r_ctl.rc_delivered;
+ rsm->r_pacing_delay = pacing_time;
+ rsm->r_ts_valid = bbr->rc_ts_valid;
+ if (bbr->rc_ts_valid)
+ rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
+ rsm->r_del_time = bbr->r_ctl.rc_del_time;
+ if (bbr->r_ctl.r_app_limited_until)
+ rsm->r_app_limited = 1;
+ else
+ rsm->r_app_limited = 0;
+ rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
+ rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ /*
+ * Here we must also add in this rsm since snd_max
+ * is updated after we return from a new send.
+ */
+ rsm->r_flight_at_send += len;
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
+ rsm->r_bbr_state = bbr_state_val(bbr);
+ else
+ rsm->r_bbr_state = 8;
+ if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
+ rsm->r_is_gain = 1;
+ rsm->r_is_drain = 0;
+ } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
+ rsm->r_is_drain = 1;
+ rsm->r_is_gain = 0;
+ } else {
+ rsm->r_is_drain = 0;
+ rsm->r_is_gain = 0;
+ }
+ return;
+ }
+ /*
+ * If we reach here its a retransmission and we need to find it.
+ */
+more:
+ if (hintrsm && (hintrsm->r_start == seq_out)) {
+ rsm = hintrsm;
+ hintrsm = NULL;
+ } else if (bbr->r_ctl.rc_next) {
+ /* We have a hint from a previous run */
+ rsm = bbr->r_ctl.rc_next;
+ } else {
+ /* No hints sorry */
+ rsm = NULL;
+ }
+ if ((rsm) && (rsm->r_start == seq_out)) {
+ /*
+ * We used rc_next or hintrsm to retransmit, hopefully the
+ * likely case.
+ */
+ seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
+ if (len == 0) {
+ return;
+ } else {
+ goto more;
+ }
+ }
+ /* Ok it was not the last pointer go through it the hard way. */
+ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
+ if (rsm->r_start == seq_out) {
+ seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
+ bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
+ if (len == 0) {
+ return;
+ } else {
+ continue;
+ }
+ }
+ if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
+ /* Transmitted within this piece */
+ /*
+ * Ok we must split off the front and then let the
+ * update do the rest
+ */
+ nrsm = bbr_alloc_full_limit(bbr);
+ if (nrsm == NULL) {
+ bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
+ return;
+ }
+ /*
+ * copy rsm to nrsm and then trim the front of rsm
+ * to not include this part.
+ */
+ bbr_clone_rsm(bbr, nrsm, rsm, seq_out);
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~BBR_HAS_FIN);
+ seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time);
+ if (len == 0) {
+ return;
+ }
+ }
+ }
+ /*
+	 * Hmm, not found in the map; did they retransmit both old data and
+	 * on into the new?
+ */
+ if (seq_out == tp->snd_max) {
+ goto again;
+ } else if (SEQ_LT(seq_out, tp->snd_max)) {
+#ifdef BBR_INVARIANTS
+ printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
+ seq_out, len, tp->snd_una, tp->snd_max);
+ printf("Starting Dump of all rack entries\n");
+ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
+ printf("rsm:%p start:%u end:%u\n",
+ rsm, rsm->r_start, rsm->r_end);
+ }
+ printf("Dump complete\n");
+ panic("seq_out not found rack:%p tp:%p",
+ bbr, tp);
+#endif
+ } else {
+#ifdef BBR_INVARIANTS
+ /*
+ * Hmm beyond sndmax? (only if we are using the new rtt-pack
+ * flag)
+ */
+ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
+ seq_out, len, tp->snd_max, tp);
+#endif
+ }
+}
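
The r_start/r_end convention noted above (a 10-byte send at sequence 1 gives r_start 1 and r_end 11, with r_end exclusive) is easy to check with a small containment helper; the struct below is a toy, not the kernel bbr_sendmap:

    #include <stdint.h>
    #include <stdio.h>

    /* r_end is exclusive: it is the first sequence of the next entry. */
    struct entry {
        uint32_t r_start;
        uint32_t r_end;
    };

    static int entry_contains(const struct entry *e, uint32_t seq)
    {
        /* Wrap-safe "r_start <= seq < r_end" using sequence arithmetic. */
        return ((int32_t)(seq - e->r_start) >= 0 &&
                (int32_t)(seq - e->r_end) < 0);
    }

    int main(void)
    {
        /* A send starting at sequence 1 for 10 bytes. */
        struct entry e = { .r_start = 1, .r_end = 1 + 10 };

        printf("contains 10? %d\n", entry_contains(&e, 10)); /* 1: last byte */
        printf("contains 11? %d\n", entry_contains(&e, 11)); /* 0: next entry */
        return 0;
    }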
+
+static void
+bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt)
+{
+ /*
+	 * Collapse the timeout back; the cum-ack moved.
+ */
+ tp->t_rxtshift = 0;
+ tp->t_softerror = 0;
+}
+
+
+static void
+tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin)
+{
+ bbr->rtt_valid = 1;
+ bbr->r_ctl.cur_rtt = rtt_usecs;
+ bbr->r_ctl.ts_in = tsin;
+ if (rsm_send_time)
+ bbr->r_ctl.cur_rtt_send_time = rsm_send_time;
+}
+
+static void
+bbr_make_timestamp_determination(struct tcp_bbr *bbr)
+{
+ /**
+ * We have in our bbr control:
+ * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp).
+ * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.rsm->bbr_ts_check_our_cts).
+ * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts)
+ * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time)
+ *
+ * Now we can calculate the time between the sends by doing:
+ *
+ * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts
+ *
+ * And the peer's time between receiving them by doing:
+ *
+ * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp
+ *
+ * We want to figure out if the timestamp values are in msec, 10msec or usec.
+ * We also may find that we can't use the timestamps if say we see
+ * that the peer_delta indicates that though we may have taken 10ms to
+ * pace out the data, it only saw 1ms between the two packets. This would
+ * indicate that somewhere on the path is a batching entity that is giving
+ * out time-slices of the actual b/w. This would mean we could not use
+ * reliably the peers timestamps.
+ *
+ * We expect delta > peer_delta initially. Until we figure out the
+ * timestamp difference which we will store in bbr->r_ctl.bbr_peer_tsratio.
+	 * If we place 1000 there then it's ms vs our usec. If we place 10000 there
+	 * then it's 10ms vs our usec. If the peer is running a usec clock we would
+	 * put a 1 there. If the value is faster than ours, we will disable the
+	 * use of timestamps (though we could revisit this later if we find it to be not
+	 * just an isolated one or two flows).
+ *
+ * To detect the batching middle boxes we will come up with our compensation and
+	 * if, with it in place, we find the peer is drastically off (by some margin) in
+ * the smaller direction, then we will assume the worst case and disable use of timestamps.
+ *
+ */
+ uint64_t delta, peer_delta, delta_up;
+
+ delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts;
+ if (delta < bbr_min_usec_delta) {
+ /*
+		 * We have not seen a minimum amount of time
+		 * between our send times, so we cannot
+		 * make a determination of the timestamp
+		 * granularity yet.
+ */
+ return;
+ }
+ peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp;
+ if (peer_delta < bbr_min_peer_delta) {
+ /*
+ * We may have enough in the form of
+ * our delta but the peers number
+		 * our delta but the peer's number
+		 * has not changed that much. It could
+		 * be that its clock ratio is such that
+ * there may be other compression scenarios
+ * going on. In any event we need the
+ * spread to be larger.
+ */
+ return;
+ }
+ /* Ok lets first see which way our delta is going */
+ if (peer_delta > delta) {
+ /* Very unlikely, the peer without
+ * compensation shows that it saw
+ * the two sends arrive further apart
+		 * than we saw them in microseconds.
+ */
+ if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) {
+			/* Well, it looks like the peer is using a microsecond clock. */
+ bbr->rc_ts_clock_set = 1;
+ bbr->r_ctl.bbr_peer_tsratio = 1;
+ } else {
+ bbr->rc_ts_cant_be_used = 1;
+ bbr->rc_ts_clock_set = 1;
+ }
+ return;
+ }
+ /* Ok we know that the peer_delta is smaller than our send distance */
+ bbr->rc_ts_clock_set = 1;
+ /* First question is it within the percentage that they are using usec time? */
+ delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent;
+ if ((peer_delta + delta_up) >= delta) {
+ /* Its a usec clock */
+ bbr->r_ctl.bbr_peer_tsratio = 1;
+ bbr_log_tstmp_validation(bbr, peer_delta, delta);
+ return;
+ }
+ /* Ok if not usec, what about 10usec (though unlikely)? */
+ delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent;
+ if (((peer_delta * 10) + delta_up) >= delta) {
+ bbr->r_ctl.bbr_peer_tsratio = 10;
+ bbr_log_tstmp_validation(bbr, peer_delta, delta);
+ return;
+ }
+ /* And what about 100usec (though again unlikely)? */
+ delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent;
+ if (((peer_delta * 100) + delta_up) >= delta) {
+ bbr->r_ctl.bbr_peer_tsratio = 100;
+ bbr_log_tstmp_validation(bbr, peer_delta, delta);
+ return;
+ }
+ /* And how about 1 msec (the most likely one)? */
+ delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent;
+ if (((peer_delta * 1000) + delta_up) >= delta) {
+ bbr->r_ctl.bbr_peer_tsratio = 1000;
+ bbr_log_tstmp_validation(bbr, peer_delta, delta);
+ return;
+ }
+ /* Ok if not msec could it be 10 msec? */
+ delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent;
+ if (((peer_delta * 10000) + delta_up) >= delta) {
+ bbr->r_ctl.bbr_peer_tsratio = 10000;
+ return;
+ }
+	/* If we fall down to here, the clock ticks so slowly that we can't use it */
+ bbr->rc_ts_cant_be_used = 1;
+ bbr->r_ctl.bbr_peer_tsratio = 0;
+ bbr_log_tstmp_validation(bbr, peer_delta, delta);
+}
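
The ratio search above can be condensed into one loop over the candidate tick sizes, accepting the first whose scaled peer delta (plus a percentage of slack) covers our own send spacing. A simplified sketch, with DELTA_PERCENT standing in for the stack's bbr_delta_percent tolerance:

    #include <stdint.h>
    #include <stdio.h>

    #define DELTA_PERCENT 20    /* illustrative tolerance, percent */

    /*
     * Given how far apart we sent two packets (delta, in usec) and how far
     * apart the peer's timestamps say it saw them (peer_delta, in its own
     * ticks), guess the peer's tick size in usec. Returns 0 if no ratio
     * fits (timestamps unusable).
     */
    static uint32_t guess_ts_ratio(uint64_t delta, uint64_t peer_delta)
    {
        static const uint32_t ratios[] = { 1, 10, 100, 1000, 10000 };
        int i;

        for (i = 0; i < 5; i++) {
            uint64_t scaled = peer_delta * ratios[i];
            uint64_t slack = (scaled * DELTA_PERCENT) / 100;

            if (scaled + slack >= delta)
                return ratios[i];
        }
        return 0;
    }

    int main(void)
    {
        /* Sent 20ms apart; peer timestamps moved by 20 -> ms clock (1000). */
        printf("ratio=%u\n", guess_ts_ratio(20000, 20));
        /* Sent 20ms apart; peer timestamps moved by 19800 -> usec clock (1). */
        printf("ratio=%u\n", guess_ts_ratio(20000, 19800));
        return 0;
    }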
+
+/*
+ * Collect new round-trip time estimate
+ * and update averages and current timeout.
+ */
+static void
+tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
+{
+ int32_t delta;
+ uint32_t rtt, tsin;
+ int32_t rtt_ticks;
+
+
+ if (bbr->rtt_valid == 0)
+ /* No valid sample */
+ return;
+
+ rtt = bbr->r_ctl.cur_rtt;
+ tsin = bbr->r_ctl.ts_in;
+ if (bbr->rc_prtt_set_ts) {
+ /*
+ * We are to force feed the rttProp filter due
+ * to an entry into PROBE_RTT. This assures
+ * that the times are sync'd between when we
+ * go into PROBE_RTT and the filter expiration.
+ *
+ * Google does not use a true filter, so they do
+ * this implicitly since they only keep one value
+ * and when they enter probe-rtt they update the
+ * value to the newest rtt.
+ */
+ uint32_t rtt_prop;
+
+ bbr->rc_prtt_set_ts = 0;
+ rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
+ if (rtt > rtt_prop)
+ filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts);
+ else
+ apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
+ }
+ if (bbr->rc_ack_was_delayed)
+ rtt += bbr->r_ctl.rc_ack_hdwr_delay;
+
+ if (rtt < bbr->r_ctl.rc_lowest_rtt)
+ bbr->r_ctl.rc_lowest_rtt = rtt;
+ bbr_log_rtt_sample(bbr, rtt, tsin);
+ if (bbr->r_init_rtt) {
+ /*
+		 * The initial rtt is not trusted; nuke it and let's get
+ * our first valid measurement in.
+ */
+ bbr->r_init_rtt = 0;
+ tp->t_srtt = 0;
+ }
+ if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) {
+ /*
+ * So we have not yet figured out
+		 * what the peer's TSTMP value is
+ * in (most likely ms). We need a
+ * series of cum-ack's to determine
+ * this reliably.
+ */
+ if (bbr->rc_ack_is_cumack) {
+ if (bbr->rc_ts_data_set) {
+ /* Lets attempt to determine the timestamp granularity. */
+ bbr_make_timestamp_determination(bbr);
+ } else {
+ bbr->rc_ts_data_set = 1;
+ bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts;
+ bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time;
+ }
+ } else {
+ /*
+			 * We have to have consecutive acks;
+			 * reset any "filled" state to none.
+ */
+ bbr->rc_ts_data_set = 0;
+ }
+ }
+ /* Round it up */
+ rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1)));
+ if (rtt_ticks == 0)
+ rtt_ticks = 1;
+ if (tp->t_srtt != 0) {
+ /*
+ * srtt is stored as fixed point with 5 bits after the
+		 * binary point (i.e., scaled by 32). The following magic is
+ * equivalent to the smoothing algorithm in rfc793 with an
+ * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
+ * Adjust rtt to origin 0.
+ */
+
+ delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT)
+ - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+
+ tp->t_srtt += delta;
+ if (tp->t_srtt <= 0)
+ tp->t_srtt = 1;
+
+ /*
+ * We accumulate a smoothed rtt variance (actually, a
+ * smoothed mean difference), then set the retransmit timer
+ * to smoothed rtt + 4 times the smoothed variance. rttvar
+ * is stored as fixed point with 4 bits after the binary
+ * point (scaled by 16). The following is equivalent to
+ * rfc793 smoothing with an alpha of .75 (rttvar =
+ * rttvar*3/4 + |delta| / 4). This replaces rfc793's
+ * wired-in beta.
+ */
+ if (delta < 0)
+ delta = -delta;
+ delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
+ tp->t_rttvar += delta;
+ if (tp->t_rttvar <= 0)
+ tp->t_rttvar = 1;
+ if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+ } else {
+ /*
+ * No rtt measurement yet - use the unsmoothed rtt. Set the
+ * variance to half the rtt (so our first retransmit happens
+ * at 3*rtt).
+ */
+ tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT;
+ tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+ }
+ TCPSTAT_INC(tcps_rttupdated);
+ tp->t_rttupdated++;
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks));
+#endif
+ /*
+ * the retransmit should happen at rtt + 4 * rttvar. Because of the
+ * way we do the smoothing, srtt and rttvar will each average +1/2
+ * tick of bias. When we compute the retransmit timer, we want 1/2
+ * tick of rounding and 1 extra tick because of +-1/2 tick
+ * uncertainty in the firing of the timer. The bias will give us
+ * exactly the 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below the minimum
+ * feasible timer (which is 2 ticks).
+ */
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+ max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2),
+ MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
+
+ /*
+ * We received an ack for a packet that wasn't retransmitted; it is
+ * probably safe to discard any error indications we've received
+ * recently. This isn't quite right, but close enough for now (a
+ * route might have failed after we sent a segment, and the return
+ * path might not be symmetrical).
+ */
+ tp->t_softerror = 0;
+ rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
+ if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt)
+ bbr->r_ctl.bbr_smallest_srtt_this_state = rtt;
+}
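
The fixed-point smoothing above is the classic srtt/rttvar update; a self-contained sketch using the same shift constants (values assumed to match the stack's tcp_var.h definitions) shows how a few samples settle:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed fixed-point shifts: srtt scaled by 32, rttvar by 16. */
    #define TCP_RTT_SHIFT     5
    #define TCP_RTTVAR_SHIFT  4
    #define TCP_DELTA_SHIFT   2

    struct srtt_state {
        int32_t srtt;
        int32_t rttvar;
    };

    /* One smoothing step, mirroring the fixed-point math above. */
    static void rtt_sample(struct srtt_state *s, int32_t rtt_ticks)
    {
        int32_t delta;

        if (s->srtt != 0) {
            delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT)
                - (s->srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
            s->srtt += delta;                 /* srtt = 7/8 srtt + 1/8 rtt */
            if (s->srtt <= 0)
                s->srtt = 1;
            if (delta < 0)
                delta = -delta;
            delta -= s->rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
            s->rttvar += delta;               /* rttvar = 3/4 rttvar + 1/4 |delta| */
            if (s->rttvar <= 0)
                s->rttvar = 1;
        } else {
            /* First sample: seed srtt, set rttvar to rtt/2. */
            s->srtt = rtt_ticks << TCP_RTT_SHIFT;
            s->rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1);
        }
    }

    int main(void)
    {
        struct srtt_state s = { 0, 0 };
        int32_t samples[] = { 100, 110, 90, 95 };

        for (int i = 0; i < 4; i++) {
            rtt_sample(&s, samples[i]);
            printf("srtt=%d (ticks*32) rttvar=%d (ticks*16)\n", s.srtt, s.rttvar);
        }
        return 0;
    }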
+
+static void
+bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm,
+ uint32_t t, uint32_t cts, int ack_type)
+{
+ /*
+ * For this RSM, we acknowledged the data from a previous
+ * transmission, not the last one we made. This means we did a false
+ * retransmit.
+ */
+ if (rsm->r_flags & BBR_HAS_FIN) {
+ /*
+		 * The FIN is often sent multiple times when we
+		 * have everything outstanding ack'd. We ignore this case
+		 * since it's over now.
+ */
+ return;
+ }
+ if (rsm->r_flags & BBR_TLP) {
+ /*
+ * We expect TLP's to have this occur often
+ */
+ bbr->rc_tlp_rtx_out = 0;
+ return;
+ }
+ if (ack_type != BBR_CUM_ACKED) {
+ /*
+ * If it was not a cum-ack we
+ * don't really know for sure since
+ * the timestamp could be from some
+ * other transmission.
+ */
+ return;
+ }
+
+ if (rsm->r_flags & BBR_WAS_SACKPASS) {
+ /*
+ * We retransmitted based on a sack and the earlier
+		 * transmission was the one ack'd - reordering is occurring.
+ */
+ BBR_STAT_INC(bbr_reorder_seen);
+ bbr->r_ctl.rc_reorder_ts = cts;
+ }
+ /* Back down the loss count */
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
+ bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
+ rsm->r_flags &= ~BBR_MARKED_LOST;
+ if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
+ /* LT sampling also needs adjustment */
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+ }
+ /***** RRS HERE ************************/
+ /* Do we need to do this??? */
+ /* bbr_reset_lt_bw_sampling(bbr, cts); */
+ /***** RRS HERE ************************/
+ BBR_STAT_INC(bbr_badfr);
+ BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start));
+}
+
+
+static void
+bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
+{
+ bbr->r_ctl.rc_rtt_shrinks = cts;
+ if (bbr_can_force_probertt &&
+ (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
+ ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
+ /*
+ * We should enter probe-rtt its been too long
+ * since we have been there.
+ */
+ bbr_enter_probe_rtt(bbr, cts, __LINE__);
+ } else
+ bbr_check_probe_rtt_limits(bbr, cts);
+}
+
+static void
+tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts)
+{
+ uint64_t orig_bw;
+
+ if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) {
+		/* We never apply a zero measurement */
+ bbr_log_type_bbrupd(bbr, 20, cts, 0, 0,
+ 0, 0, 0, 0, 0, 0);
+ return;
+ }
+ if (bbr->r_ctl.r_measurement_count < 0xffffffff)
+ bbr->r_ctl.r_measurement_count++;
+ orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
+ apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch);
+ bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw,
+ (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate),
+ 0, 0, 0, 0, 0, 0);
+ if (orig_bw &&
+ (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) {
+ if (bbr->bbr_hdrw_pacing) {
+ /*
+ * Apply a new rate to the hardware
+ * possibly.
+ */
+ bbr_update_hardware_pacing_rate(bbr, cts);
+ }
+ bbr_set_state_target(bbr, __LINE__);
+ tcp_bbr_tso_size_check(bbr, cts);
+ if (bbr->r_recovery_bw) {
+ bbr_setup_red_bw(bbr, cts);
+ bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW);
+ }
+ } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate))
+ tcp_bbr_tso_size_check(bbr, cts);
+}
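
The delivery rate is pushed into a windowed max-filter so that the estimate decays once the best sample ages out. The sketch below is a deliberately simplified single-slot filter, not the stack's time-filter implementation, but it shows the idea of keeping the max until its window expires:

    #include <stdint.h>
    #include <stdio.h>

    struct maxfilt {
        uint64_t value;
        uint32_t stamp;
        uint32_t window;
    };

    /* Keep the best-so-far sample; replace it once it is too old. */
    static void maxfilt_apply(struct maxfilt *f, uint64_t sample, uint32_t now)
    {
        if (sample >= f->value || (now - f->stamp) > f->window) {
            f->value = sample;
            f->stamp = now;
        }
    }

    int main(void)
    {
        struct maxfilt f = { .value = 0, .stamp = 0, .window = 2 };
        uint64_t samples[] = { 100, 150, 120, 90, 80 };

        for (uint32_t t = 0; t < 5; t++) {
            maxfilt_apply(&f, samples[t], t);
            printf("t=%u sample=%llu max=%llu\n", t,
                (unsigned long long)samples[t], (unsigned long long)f.value);
        }
        return 0;
    }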
+
+static void
+bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
+{
+ if (bbr->rc_in_persist == 0) {
+ /* We log only when not in persist */
+ /* Translate to a Bytes Per Second */
+ uint64_t tim, bw, ts_diff, ts_bw;
+ uint32_t upper, lower, delivered;
+
+ if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
+ tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
+ else
+ tim = 1;
+ /*
+ * Now that we have processed the tim (skipping the sample
+		 * or possibly updating the time), go ahead and
+ * calculate the cdr.
+ */
+ delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
+ bw = (uint64_t)delivered;
+ bw *= (uint64_t)USECS_IN_SECOND;
+ bw /= tim;
+ if (bw == 0) {
+			/* We must have a calculable amount */
+ return;
+ }
+ upper = (bw >> 32) & 0x00000000ffffffff;
+ lower = bw & 0x00000000ffffffff;
+ /*
+ * If we are using this b/w shove it in now so we
+ * can see in the trace viewer if it gets over-ridden.
+ */
+ if (rsm->r_ts_valid &&
+ bbr->rc_ts_valid &&
+ bbr->rc_ts_clock_set &&
+ (bbr->rc_ts_cant_be_used == 0) &&
+ bbr->rc_use_ts_limit) {
+ ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1);
+ ts_diff *= bbr->r_ctl.bbr_peer_tsratio;
+ if ((delivered == 0) ||
+ (rtt < 1000)) {
+ /* Can't use the ts */
+ bbr_log_type_bbrupd(bbr, 61, cts,
+ ts_diff,
+ bbr->r_ctl.last_inbound_ts,
+ rsm->r_del_ack_ts, 0,
+ 0, 0, 0, delivered);
+ } else {
+ ts_bw = (uint64_t)delivered;
+ ts_bw *= (uint64_t)USECS_IN_SECOND;
+ ts_bw /= ts_diff;
+ bbr_log_type_bbrupd(bbr, 62, cts,
+ (ts_bw >> 32),
+ (ts_bw & 0xffffffff), 0, 0,
+ 0, 0, ts_diff, delivered);
+ if ((bbr->ts_can_raise) &&
+ (ts_bw > bw)) {
+ bbr_log_type_bbrupd(bbr, 8, cts,
+ delivered,
+ ts_diff,
+ (bw >> 32),
+ (bw & 0x00000000ffffffff),
+ 0, 0, 0, 0);
+ bw = ts_bw;
+ } else if (ts_bw && (ts_bw < bw)) {
+ bbr_log_type_bbrupd(bbr, 7, cts,
+ delivered,
+ ts_diff,
+ (bw >> 32),
+ (bw & 0x00000000ffffffff),
+ 0, 0, 0, 0);
+ bw = ts_bw;
+ }
+ }
+ }
+ if (rsm->r_first_sent_time &&
+ TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
+ uint64_t sbw, sti;
+ /*
+ * We use what was in flight at the time of our
+ * send and the size of this send to figure
+ * out what we have been sending at (amount).
+ * For the time we take from the time of
+ * the send of the first send outstanding
+ * until this send plus this sends pacing
+ * time. This gives us a good calculation
+ * as to the rate we have been sending at.
+ */
+
+ sbw = (uint64_t)(rsm->r_flight_at_send);
+ sbw *= (uint64_t)USECS_IN_SECOND;
+ sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
+ sti += rsm->r_pacing_delay;
+ sbw /= sti;
+ if (sbw < bw) {
+ bbr_log_type_bbrupd(bbr, 6, cts,
+ delivered,
+ (uint32_t)sti,
+ (bw >> 32),
+ (uint32_t)bw,
+ rsm->r_first_sent_time, 0, (sbw >> 32),
+ (uint32_t)sbw);
+ bw = sbw;
+ }
+ }
+ /* Use the google algorithm for b/w measurements */
+ bbr->r_ctl.rc_bbr_cur_del_rate = bw;
+ if ((rsm->r_app_limited == 0) ||
+ (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) {
+ tcp_bbr_commit_bw(bbr, cts);
+ bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
+ 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time);
+ }
+ }
+}
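
The core of the measurement above is: bytes delivered between this packet's send and its ack, divided by the elapsed delivery time, capped by the rate we actually sent at. A hedged standalone sketch, with plain parameters instead of the rsm/bbr fields:

    #include <stdint.h>
    #include <stdio.h>

    #define USECS_IN_SECOND 1000000ull

    static uint64_t delivery_rate(uint32_t delivered_bytes, uint32_t del_time_us,
        uint32_t flight_at_send, uint32_t send_interval_us)
    {
        uint64_t bw, sbw;

        if (del_time_us == 0)
            del_time_us = 1;
        bw = (uint64_t)delivered_bytes * USECS_IN_SECOND / del_time_us;
        if (send_interval_us) {
            sbw = (uint64_t)flight_at_send * USECS_IN_SECOND / send_interval_us;
            if (sbw < bw)
                bw = sbw;      /* never claim more than we sent */
        }
        return bw;             /* bytes per second */
    }

    int main(void)
    {
        /* 100KB delivered over 10ms, 64KB in flight sent over 8ms. */
        printf("%llu bytes/sec\n",
            (unsigned long long)delivery_rate(100 * 1024, 10000, 64 * 1024, 8000));
        return 0;
    }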
+
+static void
+bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
+{
+ if (bbr->rc_in_persist == 0) {
+ /* We log only when not in persist */
+ /* Translate to a Bytes Per Second */
+ uint64_t tim, bw;
+ uint32_t upper, lower, delivered;
+ int no_apply = 0;
+
+ if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
+ tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
+ else
+ tim = 1;
+ /*
+ * Now that we have processed the tim (skipping the sample
+		 * or possibly updating the time), go ahead and
+ * calculate the cdr.
+ */
+ delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
+ bw = (uint64_t)delivered;
+ bw *= (uint64_t)USECS_IN_SECOND;
+ bw /= tim;
+ if (tim < bbr->r_ctl.rc_lowest_rtt) {
+ bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered,
+ tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
+
+ no_apply = 1;
+ }
+ upper = (bw >> 32) & 0x00000000ffffffff;
+ lower = bw & 0x00000000ffffffff;
+ /*
+ * If we are using this b/w shove it in now so we
+ * can see in the trace viewer if it gets over-ridden.
+ */
+ bbr->r_ctl.rc_bbr_cur_del_rate = bw;
+ /* Gate by the sending rate */
+ if (rsm->r_first_sent_time &&
+ TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
+ uint64_t sbw, sti;
+ /*
+ * We use what was in flight at the time of our
+ * send and the size of this send to figure
+ * out what we have been sending at (amount).
+ * For the time we take from the time of
+ * the send of the first send outstanding
+ * until this send plus this sends pacing
+ * time. This gives us a good calculation
+ * as to the rate we have been sending at.
+ */
+
+ sbw = (uint64_t)(rsm->r_flight_at_send);
+ sbw *= (uint64_t)USECS_IN_SECOND;
+ sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
+ sti += rsm->r_pacing_delay;
+ sbw /= sti;
+ if (sbw < bw) {
+ bbr_log_type_bbrupd(bbr, 6, cts,
+ delivered,
+ (uint32_t)sti,
+ (bw >> 32),
+ (uint32_t)bw,
+ rsm->r_first_sent_time, 0, (sbw >> 32),
+ (uint32_t)sbw);
+ bw = sbw;
+ }
+ if ((sti > tim) &&
+ (sti < bbr->r_ctl.rc_lowest_rtt)) {
+ bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered,
+ (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
+ no_apply = 1;
+ } else
+ no_apply = 0;
+ }
+ bbr->r_ctl.rc_bbr_cur_del_rate = bw;
+ if ((no_apply == 0) &&
+ ((rsm->r_app_limited == 0) ||
+ (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) {
+ tcp_bbr_commit_bw(bbr, cts);
+ bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
+ 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time);
+ }
+ }
+}
+
+
+static void
+bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin,
+ uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to)
+{
+ uint64_t old_rttprop;
+
+ /* Update our delivery time and amount */
+ bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start);
+ bbr->r_ctl.rc_del_time = cts;
+ if (rtt == 0) {
+ /*
+ * 0 means its a retransmit, for now we don't use these for
+ * the rest of BBR.
+ */
+ return;
+ }
+ if ((bbr->rc_use_google == 0) &&
+ (match != BBR_RTT_BY_EXACTMATCH) &&
+ (match != BBR_RTT_BY_TIMESTAMP)){
+ /*
+ * We get a lot of rtt updates, lets not pay attention to
+ * any that are not an exact match. That way we don't have
+ * to worry about timestamps and the whole nonsense of
+		 * being unsure whether it's a retransmission, etc. (if we ever had
+		 * the timestamp fixed to always have the last thing sent this
+		 * would not be an issue).
+ */
+ return;
+ }
+ if ((bbr_no_retran && bbr->rc_use_google) &&
+ (match != BBR_RTT_BY_EXACTMATCH) &&
+ (match != BBR_RTT_BY_TIMESTAMP)){
+ /*
+ * We only do measurements in google mode
+ * with bbr_no_retran on for sure things.
+ */
+ return;
+ }
+ /* Only update srtt if we know by exact match */
+ tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin);
+ if (ack_type == BBR_CUM_ACKED)
+ bbr->rc_ack_is_cumack = 1;
+ else
+ bbr->rc_ack_is_cumack = 0;
+ old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP);
+ /*
+	 * Note the following code differs from the original
+	 * BBR spec. It calls for <= not <. However, after a
+	 * long discussion in email with Neal, he acknowledged
+	 * that it should be < so that we will have flows
+ * going into probe-rtt (we were seeing cases where that
+ * did not happen and caused ugly things to occur). We
+ * have added this agreed upon fix to our code base.
+ */
+ if (rtt < old_rttprop) {
+ /* Update when we last saw a rtt drop */
+ bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0);
+ bbr_set_reduced_rtt(bbr, cts, __LINE__);
+ }
+ bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts,
+ match, rsm->r_start, rsm->r_flags);
+ apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
+ if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) {
+ /*
+ * The RTT-prop moved, reset the target (may be a
+ * nop for some states).
+ */
+ bbr_set_state_target(bbr, __LINE__);
+ if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0,
+ __LINE__, BBR_RTTS_NEW_TARGET, 0);
+ else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP))
+ /* It went up */
+ bbr_check_probe_rtt_limits(bbr, cts);
+ }
+ if ((bbr->rc_use_google == 0) &&
+ (match == BBR_RTT_BY_TIMESTAMP)) {
+ /*
+ * We don't do b/w update with
+ * these since they are not really
+ * reliable.
+ */
+ return;
+ }
+ if (bbr->r_ctl.r_app_limited_until &&
+ (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) {
+ /* We are no longer app-limited */
+ bbr->r_ctl.r_app_limited_until = 0;
+ }
+ if (bbr->rc_use_google) {
+ bbr_google_measurement(bbr, rsm, rtt, cts);
+ } else {
+ bbr_nf_measurement(bbr, rsm, rtt, cts);
+ }
+}
+
+/*
+ * Convert a timestamp that the main stack
+ * uses (milliseconds) into one that bbr uses
+ * (microseconds). Return that converted timestamp.
+ */
+static uint32_t
+bbr_ts_convert(uint32_t cts) {
+ uint32_t sec, msec;
+
+ sec = cts / MS_IN_USEC;
+ msec = cts - (MS_IN_USEC * sec);
+ return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC));
+}
+
+/*
+ * Return 0 if we did not update the RTT time, return
+ * 1 if we did.
+ */
+static int
+bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
+ struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack)
+{
+ int32_t i;
+ uint32_t t, uts = 0;
+
+ if ((rsm->r_flags & BBR_ACKED) ||
+ (rsm->r_flags & BBR_WAS_RENEGED) ||
+ (rsm->r_flags & BBR_RXT_CLEARED)) {
+ /* Already done */
+ return (0);
+ }
+ if (rsm->r_rtr_cnt == 1) {
+ /*
+ * Only one transmit. Hopefully the normal case.
+ */
+ if (TSTMP_GT(cts, rsm->r_tim_lastsent[0]))
+ t = cts - rsm->r_tim_lastsent[0];
+ else
+ t = 1;
+ if ((int)t <= 0)
+ t = 1;
+ bbr->r_ctl.rc_last_rtt = t;
+ bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
+ BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to);
+ return (1);
+ }
+ /* Convert to usecs */
+ if ((bbr_can_use_ts_for_rtt == 1) &&
+ (bbr->rc_use_google == 1) &&
+ (ack_type == BBR_CUM_ACKED) &&
+ (to->to_flags & TOF_TS) &&
+ (to->to_tsecr != 0)) {
+
+ t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr;
+ if (t < 1)
+ t = 1;
+ t *= MS_IN_USEC;
+ bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
+ BBR_RTT_BY_TIMESTAMP,
+ rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)],
+ ack_type, to);
+ return (1);
+ }
+ uts = bbr_ts_convert(to->to_tsecr);
+ if ((to->to_flags & TOF_TS) &&
+ (to->to_tsecr != 0) &&
+ (ack_type == BBR_CUM_ACKED) &&
+ ((rsm->r_flags & BBR_OVERMAX) == 0)) {
+ /*
+ * Now which timestamp does it match? In this block the ACK
+ * may be coming from a previous transmission.
+ */
+ uint32_t fudge;
+
+ fudge = BBR_TIMER_FUDGE;
+ for (i = 0; i < rsm->r_rtr_cnt; i++) {
+ if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) &&
+ (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) {
+ if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
+ t = cts - rsm->r_tim_lastsent[i];
+ else
+ t = 1;
+ if ((int)t <= 0)
+ t = 1;
+ bbr->r_ctl.rc_last_rtt = t;
+ bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING,
+ rsm->r_tim_lastsent[i], ack_type, to);
+ if ((i + 1) < rsm->r_rtr_cnt) {
+ /* Likely */
+ bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
+ } else if (rsm->r_flags & BBR_TLP) {
+ bbr->rc_tlp_rtx_out = 0;
+ }
+ return (1);
+ }
+ }
+ /* Fall through if we can't find a matching timestamp */
+ }
+ /*
+	 * Ok, it's a SACK block that we retransmitted, or a Windows
+	 * machine without timestamps. We can tell nothing from the
+	 * timestamp since it's not there, or it reflects the time the peer
+	 * last received a segment that moved forward its cum-ack point.
+	 *
+	 * Let's look at the last retransmit and see what we can tell
+	 * (for space BBR only keeps 2; note we have to keep
+	 * at least 2 so the map can not be condensed more).
+ */
+ i = rsm->r_rtr_cnt - 1;
+ if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
+ t = cts - rsm->r_tim_lastsent[i];
+ else
+ goto not_sure;
+ if (t < bbr->r_ctl.rc_lowest_rtt) {
+ /*
+ * We retransmitted and the ack came back in less
+ * than the smallest rtt we have observed in the
+		 * windowed rtt. We most likely did an improper
+ * retransmit as outlined in 4.2 Step 3 point 2 in
+ * the rack-draft.
+ *
+ * Use the prior transmission to update all the
+ * information as long as there is only one prior
+ * transmission.
+ */
+ if ((rsm->r_flags & BBR_OVERMAX) == 0) {
+#ifdef BBR_INVARIANTS
+ if (rsm->r_rtr_cnt == 1)
+ panic("rsm:%p bbr:%p rsm has overmax and only 1 retranmit flags:%x?", rsm, bbr, rsm->r_flags);
+#endif
+ i = rsm->r_rtr_cnt - 2;
+ if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
+ t = cts - rsm->r_tim_lastsent[i];
+ else
+ t = 1;
+ bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
+ rsm->r_tim_lastsent[i], ack_type, to);
+ bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
+ } else {
+ /*
+ * Too many prior transmissions, just
+			 * update BBR delivered
+ */
+not_sure:
+ bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
+ BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
+ }
+ } else {
+ /*
+ * We retransmitted it and the retransmit did the
+ * job.
+ */
+ if (rsm->r_flags & BBR_TLP)
+ bbr->rc_tlp_rtx_out = 0;
+ if ((rsm->r_flags & BBR_OVERMAX) == 0)
+ bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts,
+ BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to);
+ else
+ bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
+ BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
+ return (1);
+ }
+ return (0);
+}
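
The timestamp-matching loop above pairs the echoed value with one of the recorded transmit times, allowing a little slop either way. A small sketch, with TIMER_FUDGE as an illustrative stand-in for BBR_TIMER_FUDGE:

    #include <stdint.h>
    #include <stdio.h>

    #define TIMER_FUDGE 250u   /* illustrative slop, usec */

    /*
     * Find which of up to 'cnt' transmission times a timestamp echo
     * matches, allowing +/- TIMER_FUDGE of slop. Returns the index or
     * -1 if nothing matches.
     */
    static int match_tsecr(uint32_t uts, const uint32_t *lastsent, int cnt)
    {
        for (int i = 0; i < cnt; i++) {
            if ((int32_t)(uts - (lastsent[i] - TIMER_FUDGE)) >= 0 &&
                (int32_t)(uts - (lastsent[i] + TIMER_FUDGE)) <= 0)
                return i;
        }
        return -1;
    }

    int main(void)
    {
        uint32_t sent[2] = { 10000, 35000 };   /* first send, one retransmit */

        printf("%d\n", match_tsecr(10100, sent, 2));  /* 0: echoed the first send */
        printf("%d\n", match_tsecr(35020, sent, 2));  /* 1: echoed the retransmit */
        printf("%d\n", match_tsecr(20000, sent, 2));  /* -1: no match */
        return 0;
    }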
+
+/*
+ * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
+ */
+static void
+bbr_log_sack_passed(struct tcpcb *tp,
+ struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
+{
+ struct bbr_sendmap *nrsm;
+
+ nrsm = rsm;
+ TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap,
+ bbr_head, r_tnext) {
+ if (nrsm == rsm) {
+			/* Skip the original segment; it is acked */
+ continue;
+ }
+ if (nrsm->r_flags & BBR_ACKED) {
+ /* Skip ack'd segments */
+ continue;
+ }
+ if (nrsm->r_flags & BBR_SACK_PASSED) {
+ /*
+ * We found one that is already marked
+ * passed, we have been here before and
+ * so all others below this are marked.
+ */
+ break;
+ }
+ BBR_STAT_INC(bbr_sack_passed);
+ nrsm->r_flags |= BBR_SACK_PASSED;
+ if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) &&
+ bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) {
+ bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start;
+ bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start;
+ nrsm->r_flags |= BBR_MARKED_LOST;
+ }
+ nrsm->r_flags &= ~BBR_WAS_SACKPASS;
+ }
+}
+
+/*
+ * Returns the number of bytes that were
+ * newly ack'd by sack blocks.
+ */
+static uint32_t
+bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack,
+ struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts)
+{
+ int32_t times = 0;
+ uint32_t start, end, maxseg, changed = 0;
+ struct bbr_sendmap *rsm, *nrsm;
+ int32_t used_ref = 1;
+ uint8_t went_back = 0, went_fwd = 0;
+
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ start = sack->start;
+ end = sack->end;
+ rsm = *prsm;
+ if (rsm == NULL)
+ used_ref = 0;
+
+ /* Do we locate the block behind where we last were? */
+ if (rsm && SEQ_LT(start, rsm->r_start)) {
+ went_back = 1;
+ TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
+ if (SEQ_GEQ(start, rsm->r_start) &&
+ SEQ_LT(start, rsm->r_end)) {
+ goto do_rest_ofb;
+ }
+ }
+ }
+start_at_beginning:
+ went_fwd = 1;
+ /*
+ * Ok lets locate the block where this guy is fwd from rsm (if its
+ * set)
+ */
+ TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) {
+ if (SEQ_GEQ(start, rsm->r_start) &&
+ SEQ_LT(start, rsm->r_end)) {
+ break;
+ }
+ }
+do_rest_ofb:
+ if (rsm == NULL) {
+ /*
+ * This happens when we get duplicate sack blocks with the
+		 * same end. For example SACK 4: 100 SACK 3: 100. The sort
+		 * will not change their location so we would just start at
+ * the end of the first one and get lost.
+ */
+ if (tp->t_flags & TF_SENTFIN) {
+ /*
+ * Check to see if we have not logged the FIN that
+ * went out.
+ */
+ nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
+ if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
+ /*
+ * Ok we did not get the FIN logged.
+ */
+ nrsm->r_end++;
+ rsm = nrsm;
+ goto do_rest_ofb;
+ }
+ }
+ if (times == 1) {
+#ifdef BBR_INVARIANTS
+ panic("tp:%p bbr:%p sack:%p to:%p prsm:%p",
+ tp, bbr, sack, to, prsm);
+#else
+ goto out;
+#endif
+ }
+ times++;
+ BBR_STAT_INC(bbr_sack_proc_restart);
+ rsm = NULL;
+ goto start_at_beginning;
+ }
+ /* Ok we have an ACK for some piece of rsm */
+ if (rsm->r_start != start) {
+ /*
+ * Need to split this in two pieces the before and after.
+ */
+ if (bbr_sack_mergable(rsm, start, end))
+ nrsm = bbr_alloc_full_limit(bbr);
+ else
+ nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
+ if (nrsm == NULL) {
+			/* We could not allocate; ignore the sack */
+ struct sackblk blk;
+
+ blk.start = start;
+ blk.end = end;
+ sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
+ goto out;
+ }
+ bbr_clone_rsm(bbr, nrsm, rsm, start);
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~BBR_HAS_FIN);
+ rsm = nrsm;
+ }
+ if (SEQ_GEQ(end, rsm->r_end)) {
+ /*
+ * The end of this block is either beyond this guy or right
+ * at this guy.
+ */
+ if ((rsm->r_flags & BBR_ACKED) == 0) {
+ bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
+ changed += (rsm->r_end - rsm->r_start);
+ bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+ bbr_log_sack_passed(tp, bbr, rsm);
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
+ }
+			/* Is reordering occurring? */
+ if (rsm->r_flags & BBR_SACK_PASSED) {
+ BBR_STAT_INC(bbr_reorder_seen);
+ bbr->r_ctl.rc_reorder_ts = cts;
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
+ if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
+ /* LT sampling also needs adjustment */
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+ }
+ }
+ rsm->r_flags |= BBR_ACKED;
+ rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ }
+ bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
+ if (end == rsm->r_end) {
+ /* This block only - done */
+ goto out;
+ }
+		/* There is more not covered by this rsm, move on */
+ start = rsm->r_end;
+ nrsm = TAILQ_NEXT(rsm, r_next);
+ rsm = nrsm;
+ times = 0;
+ goto do_rest_ofb;
+ }
+ if (rsm->r_flags & BBR_ACKED) {
+ /* Been here done that */
+ goto out;
+ }
+ /* Ok we need to split off this one at the tail */
+ if (bbr_sack_mergable(rsm, start, end))
+ nrsm = bbr_alloc_full_limit(bbr);
+ else
+ nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
+ if (nrsm == NULL) {
+		/* failed XXXrrs what can we do but lose the sack info? */
+ struct sackblk blk;
+
+ blk.start = start;
+ blk.end = end;
+ sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
+ goto out;
+ }
+ /* Clone it */
+ bbr_clone_rsm(bbr, nrsm, rsm, end);
+ /* The sack block does not cover this guy fully */
+ rsm->r_flags &= (~BBR_HAS_FIN);
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ nrsm->r_dupack = 0;
+ bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
+ bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
+ changed += (rsm->r_end - rsm->r_start);
+ bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+ bbr_log_sack_passed(tp, bbr, rsm);
+	/* Is reordering occurring? */
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
+ }
+ if (rsm->r_flags & BBR_SACK_PASSED) {
+ BBR_STAT_INC(bbr_reorder_seen);
+ bbr->r_ctl.rc_reorder_ts = cts;
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
+ if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
+ /* LT sampling also needs adjustment */
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+ }
+ }
+ rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
+ rsm->r_flags |= BBR_ACKED;
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+out:
+ if (rsm && (rsm->r_flags & BBR_ACKED)) {
+ /*
+ * Now can we merge this newly acked
+ * block with either the previous or
+ * next block?
+ */
+ nrsm = TAILQ_NEXT(rsm, r_next);
+ if (nrsm &&
+ (nrsm->r_flags & BBR_ACKED)) {
+ /* yep this and next can be merged */
+ rsm = bbr_merge_rsm(bbr, rsm, nrsm);
+ }
+ /* Now what about the previous? */
+ nrsm = TAILQ_PREV(rsm, bbr_head, r_next);
+ if (nrsm &&
+ (nrsm->r_flags & BBR_ACKED)) {
+ /* yep the previous and this can be merged */
+ rsm = bbr_merge_rsm(bbr, nrsm, rsm);
+ }
+ }
+ if (used_ref == 0) {
+ BBR_STAT_INC(bbr_sack_proc_all);
+ } else {
+ BBR_STAT_INC(bbr_sack_proc_short);
+ }
+ if (went_fwd && went_back) {
+ BBR_STAT_INC(bbr_sack_search_both);
+ } else if (went_fwd) {
+ BBR_STAT_INC(bbr_sack_search_fwd);
+ } else if (went_back) {
+ BBR_STAT_INC(bbr_sack_search_back);
+ }
+ /* Save off where the next seq is */
+ if (rsm)
+ bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
+ else
+ bbr->r_ctl.rc_sacklast = NULL;
+ *prsm = rsm;
+ return (changed);
+}
+
+
+static void inline
+bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack)
+{
+ struct bbr_sendmap *tmap;
+
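+	/*
+	 * The peer cum-acked only up to data it had previously SACKed,
+	 * i.e. it reneged. Strip the BBR_ACKED markings from the affected
+	 * segments and rebuild them, in order, into the time-ordered map
+	 * so they are again treated as outstanding.
+	 */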
+ BBR_STAT_INC(bbr_reneges_seen);
+ tmap = NULL;
+ while (rsm && (rsm->r_flags & BBR_ACKED)) {
+ /* Its no longer sacked, mark it so */
+ uint32_t oflags;
+ bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+#ifdef BBR_INVARIANTS
+ if (rsm->r_in_tmap) {
+ panic("bbr:%p rsm:%p flags:0x%x in tmap?",
+ bbr, rsm, rsm->r_flags);
+ }
+#endif
+ oflags = rsm->r_flags;
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
+ bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
+ if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
+ /* LT sampling also needs adjustment */
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+ }
+ rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST);
+ rsm->r_flags |= BBR_WAS_RENEGED;
+ rsm->r_flags |= BBR_RXT_CLEARED;
+ bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__);
+ /* Rebuild it into our tmap */
+ if (tmap == NULL) {
+ TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ tmap = rsm;
+ } else {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext);
+ tmap = rsm;
+ }
+ tmap->r_in_tmap = 1;
+ /*
+ * XXXrrs Delivered? Should we do anything here?
+ *
+		 * Of course we don't on an rxt timeout, so maybe it's ok
+		 * that we don't?
+		 *
+		 * For now let's not.
+ */
+ rsm = TAILQ_NEXT(rsm, r_next);
+ }
+ /*
+	 * Now let's possibly clear the sack filter so we start recognizing
+ * sacks that cover this area.
+ */
+ sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack);
+}
+
+static void
+bbr_log_syn(struct tcpcb *tp, struct tcpopt *to)
+{
+ struct tcp_bbr *bbr;
+ struct bbr_sendmap *rsm;
+ uint32_t cts;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ cts = bbr->r_ctl.rc_rcvtime;
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ if (rsm && (rsm->r_flags & BBR_HAS_SYN)) {
+ if ((rsm->r_end - rsm->r_start) <= 1) {
+ /* Log out the SYN completely */
+ bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
+ rsm->r_rtr_bytes = 0;
+ TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ if (bbr->r_ctl.rc_next == rsm) {
+ /* scoot along the marker */
+ bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ }
+ if (to != NULL)
+ bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0);
+ bbr_free(bbr, rsm);
+ } else {
+			/* There is more (Fast Open); strip out the SYN. */
+ rsm->r_flags &= ~BBR_HAS_SYN;
+ rsm->r_start++;
+ }
+ }
+}
+
+/*
+ * Returns the number of bytes that were
+ * acknowledged by SACK blocks.
+ */
+
+static uint32_t
+bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
+ uint32_t *prev_acked)
+{
+ uint32_t changed, last_seq, entered_recovery = 0;
+ struct tcp_bbr *bbr;
+ struct bbr_sendmap *rsm;
+ struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
+ register uint32_t th_ack;
+ int32_t i, j, k, new_sb, num_sack_blks = 0;
+ uint32_t cts, acked, ack_point, sack_changed = 0;
+ uint32_t p_maxseg, maxseg, p_acked = 0;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (th->th_flags & TH_RST) {
+ /* We don't log resets */
+ return (0);
+ }
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ cts = bbr->r_ctl.rc_rcvtime;
+
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ changed = 0;
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg);
+ th_ack = th->th_ack;
+ if (SEQ_GT(th_ack, tp->snd_una)) {
+ acked = th_ack - tp->snd_una;
+ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__);
+ bbr->rc_tp->t_acktime = ticks;
+ } else
+ acked = 0;
+ if (SEQ_LEQ(th_ack, tp->snd_una)) {
+ /* Only sent here for sack processing */
+ goto proc_sack;
+ }
+ if (rsm && SEQ_GT(th_ack, rsm->r_start)) {
+ changed = th_ack - rsm->r_start;
+ } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) {
+ /*
+ * For the SYN incoming case we will not have called
+ * tcp_output for the sending of the SYN, so there will be
+ * no map. All other cases should probably be a panic.
+ */
+ if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) {
+ /*
+ * We have a timestamp that can be used to generate
+ * an initial RTT.
+ */
+ uint32_t ts, now, rtt;
+
+ ts = bbr_ts_convert(to->to_tsecr);
+ now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv));
+ rtt = now - ts;
+ if (rtt < 1)
+ rtt = 1;
+ bbr_log_type_bbrrttprop(bbr, rtt,
+ tp->iss, 0, cts,
+ BBR_RTT_BY_TIMESTAMP, tp->iss, 0);
+ apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
+ changed = 1;
+ bbr->r_wanted_output = 1;
+ goto out;
+ }
+ goto proc_sack;
+ } else if (rsm == NULL) {
+ goto out;
+ }
+ if (changed) {
+ /*
+ * The ACK point is advancing to th_ack, we must drop off
+		 * the packets in the rack log and calculate any eligible
+		 * RTTs.
+ */
+ bbr->r_wanted_output = 1;
+more:
+ if (rsm == NULL) {
+
+ if (tp->t_flags & TF_SENTFIN) {
+				/* if we sent a FIN we will not have a map */
+ goto proc_sack;
+ }
+#ifdef BBR_INVARIANTS
+ panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n",
+ tp,
+ th, tp->t_state, bbr,
+ tp->snd_una, tp->snd_max, changed);
+#endif
+ goto proc_sack;
+ }
+ }
+ if (SEQ_LT(th_ack, rsm->r_start)) {
+ /* Huh map is missing this */
+#ifdef BBR_INVARIANTS
+ printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d bbr:%p\n",
+ rsm->r_start,
+ th_ack, tp->t_state,
+ bbr->r_state, bbr);
+ panic("th-ack is bad bbr:%p tp:%p", bbr, tp);
+#endif
+ goto proc_sack;
+ } else if (th_ack == rsm->r_start) {
+ /* None here to ack */
+ goto proc_sack;
+ }
+ /*
+ * Clear the dup ack counter, it will
+ * either be freed or if there is some
+ * remaining we need to start it at zero.
+ */
+ rsm->r_dupack = 0;
+ /* Now do we consume the whole thing? */
+ if (SEQ_GEQ(th_ack, rsm->r_end)) {
+		/* It's all consumed. */
+ uint32_t left;
+
+ if (rsm->r_flags & BBR_ACKED) {
+ /*
+ * It was acked on the scoreboard -- remove it from
+ * total
+ */
+ p_acked += (rsm->r_end - rsm->r_start);
+ bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+ if (bbr->r_ctl.rc_sacked == 0)
+ bbr->r_ctl.rc_sacklast = NULL;
+ } else {
+ bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack);
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
+ }
+ if (rsm->r_flags & BBR_SACK_PASSED) {
+ /*
+ * There are acked segments ACKED on the
+ * scoreboard further up. We are seeing
+ * reordering.
+ */
+ BBR_STAT_INC(bbr_reorder_seen);
+ bbr->r_ctl.rc_reorder_ts = cts;
+ if (rsm->r_flags & BBR_MARKED_LOST) {
+ bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
+ if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
+ /* LT sampling also needs adjustment */
+ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
+ }
+ }
+ rsm->r_flags &= ~BBR_MARKED_LOST;
+ }
+ bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
+ rsm->r_rtr_bytes = 0;
+ TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ if (bbr->r_ctl.rc_next == rsm) {
+ /* scoot along the marker */
+ bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ }
+ bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
+ /* Adjust the packet counts */
+ left = th_ack - rsm->r_end;
+ /* Free back to zone */
+ bbr_free(bbr, rsm);
+ if (left) {
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ goto more;
+ }
+ goto proc_sack;
+ }
+ if (rsm->r_flags & BBR_ACKED) {
+ /*
+ * It was acked on the scoreboard -- remove it from total
+ * for the part being cum-acked.
+ */
+ p_acked += (rsm->r_end - rsm->r_start);
+ bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
+ if (bbr->r_ctl.rc_sacked == 0)
+ bbr->r_ctl.rc_sacklast = NULL;
+ } else {
+ /*
+ * It was acked up to th_ack point for the first time
+ */
+ struct bbr_sendmap lrsm;
+
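+		/*
+		 * Use a stack copy of the rsm trimmed to th_ack so the
+		 * RTT update only covers the portion being cum-acked.
+		 */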
+ memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap));
+ lrsm.r_end = th_ack;
+ bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack);
+ }
+ if ((rsm->r_flags & BBR_MARKED_LOST) &&
+ ((rsm->r_flags & BBR_ACKED) == 0)) {
+ /*
+ * It was marked lost and partly ack'd now
+ * for the first time. We lower the rc_lost_bytes
+ * and still leave it MARKED.
+ */
+ bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start;
+ }
+ bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
+ bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
+ rsm->r_rtr_bytes = 0;
+ /* adjust packet count */
+ rsm->r_start = th_ack;
+proc_sack:
+ /* Check for reneging */
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) {
+ /*
+ * The peer has moved snd_una up to the edge of this send,
+ * i.e. one that it had previously acked. The only way that
+	 * can be true is if the peer threw away data (space issues)
+ * that it had previously sacked (else it would have given
+ * us snd_una up to (rsm->r_end). We need to undo the acked
+ * markings here.
+ *
+ * Note we have to look to make sure th_ack is our
+ * rsm->r_start in case we get an old ack where th_ack is
+ * behind snd_una.
+ */
+ bbr_peer_reneges(bbr, rsm, th->th_ack);
+ }
+ if ((to->to_flags & TOF_SACK) == 0) {
+		/* We are done, nothing left to log */
+ goto out;
+ }
+ rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
+ if (rsm) {
+ last_seq = rsm->r_end;
+ } else {
+ last_seq = tp->snd_max;
+ }
+ /* Sack block processing */
+ if (SEQ_GT(th_ack, tp->snd_una))
+ ack_point = th_ack;
+ else
+ ack_point = tp->snd_una;
+ for (i = 0; i < to->to_nsacks; i++) {
+ bcopy((to->to_sacks + i * TCPOLEN_SACK),
+ &sack, sizeof(sack));
+ sack.start = ntohl(sack.start);
+ sack.end = ntohl(sack.end);
+ if (SEQ_GT(sack.end, sack.start) &&
+ SEQ_GT(sack.start, ack_point) &&
+ SEQ_LT(sack.start, tp->snd_max) &&
+ SEQ_GT(sack.end, ack_point) &&
+ SEQ_LEQ(sack.end, tp->snd_max)) {
+ if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) &&
+ (SEQ_LT(sack.end, last_seq)) &&
+ ((sack.end - sack.start) < (p_maxseg / 8))) {
+ /*
+				 * Not the last piece and it's smaller than
+ * 1/8th of a p_maxseg. We ignore this.
+ */
+ BBR_STAT_INC(bbr_runt_sacks);
+ continue;
+ }
+ sack_blocks[num_sack_blks] = sack;
+ num_sack_blks++;
+#ifdef NETFLIX_STATS
+ } else if (SEQ_LEQ(sack.start, th_ack) &&
+ SEQ_LEQ(sack.end, th_ack)) {
+ /*
+			 * It's a D-SACK block.
+ */
+ tcp_record_dsack(sack.start, sack.end);
+#endif
+ }
+ }
+ if (num_sack_blks == 0)
+ goto out;
+ /*
+ * Sort the SACK blocks so we can update the rack scoreboard with
+ * just one pass.
+ */
+ new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks,
+ num_sack_blks, th->th_ack);
+ ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks);
+ BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks);
+ BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb));
+ num_sack_blks = new_sb;
+ if (num_sack_blks < 2) {
+ goto do_sack_work;
+ }
+ /* Sort the sacks */
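+	/*
+	 * A simple O(n^2) exchange sort on the block end is fine here,
+	 * num_sack_blks is bounded by TCP_MAX_SACK + 1.
+	 */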
+ for (i = 0; i < num_sack_blks; i++) {
+ for (j = i + 1; j < num_sack_blks; j++) {
+ if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
+ sack = sack_blocks[i];
+ sack_blocks[i] = sack_blocks[j];
+ sack_blocks[j] = sack;
+ }
+ }
+ }
+ /*
+ * Now are any of the sack block ends the same (yes some
+	 * implementations send these)?
+ */
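+	/*
+	 * E.g. given the (sorted) blocks 5:10 and 3:10 we keep a single
+	 * 3:10 block; same end, and the smaller start covers more data.
+	 */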
+again:
+ if (num_sack_blks > 1) {
+ for (i = 0; i < num_sack_blks; i++) {
+ for (j = i + 1; j < num_sack_blks; j++) {
+ if (sack_blocks[i].end == sack_blocks[j].end) {
+				/*
+				 * Ok these two have the same end; keep
+				 * the one with the smaller start (it
+				 * covers more), throw away the other
+				 * and start again.
+				 */
+ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
+ /*
+ * The second block covers
+					 * more area, use that
+ */
+ sack_blocks[i].start = sack_blocks[j].start;
+ }
+ /*
+ * Now collapse out the dup-sack and
+ * lower the count
+ */
+ for (k = (j + 1); k < num_sack_blks; k++) {
+ sack_blocks[j].start = sack_blocks[k].start;
+ sack_blocks[j].end = sack_blocks[k].end;
+ j++;
+ }
+ num_sack_blks--;
+ goto again;
+ }
+ }
+ }
+ }
+do_sack_work:
+ rsm = bbr->r_ctl.rc_sacklast;
+ for (i = 0; i < num_sack_blks; i++) {
+ acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts);
+ if (acked) {
+ bbr->r_wanted_output = 1;
+ changed += acked;
+ sack_changed += acked;
+ }
+ }
+out:
+ *prev_acked = p_acked;
+ if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
+ /*
+		 * Ok we have a high probability that we need to go into
+ * recovery since we have data sack'd
+ */
+ struct bbr_sendmap *rsm;
+
+ rsm = bbr_check_recovery_mode(tp, bbr, cts);
+ if (rsm) {
+ /* Enter recovery */
+ entered_recovery = 1;
+ bbr->r_wanted_output = 1;
+ /*
+ * When we enter recovery we need to assure we send
+ * one packet.
+ */
+ if (bbr->r_ctl.rc_resend == NULL) {
+ bbr->r_ctl.rc_resend = rsm;
+ }
+ }
+ }
+ if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
+ /*
+		 * See if we need to rack-retransmit anything; if so, set it
+		 * up as the thing to resend, assuming something else is not
+ * already in that position.
+ */
+ if (bbr->r_ctl.rc_resend == NULL) {
+ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
+ }
+ }
+ /*
+ * We return the amount that changed via sack, this is used by the
+ * ack-received code to augment what was changed between th_ack <->
+ * snd_una.
+ */
+ return (sack_changed);
+}
+
+static void
+bbr_strike_dupack(struct tcp_bbr *bbr)
+{
+ struct bbr_sendmap *rsm;
+
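+	/*
+	 * Charge the duplicate ack against the oldest outstanding segment
+	 * (head of the time-ordered map); once it has seen
+	 * DUP_ACK_THRESHOLD dupacks we ask for output so a retransmit can
+	 * be considered.
+	 */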
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
+ if (rsm && (rsm->r_dupack < 0xff)) {
+ rsm->r_dupack++;
+ if (rsm->r_dupack >= DUP_ACK_THRESHOLD)
+ bbr->r_wanted_output = 1;
+ }
+}
+
+/*
+ * Return value of 1, we do not need to call bbr_process_data().
+ * Return value of 0, bbr_process_data() can be called.
+ * For ret_val, if it's 0 the TCB is locked and valid; if it's non-zero
+ * it's unlocked and probably unsafe to touch the TCB.
+ */
+static int
+bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to,
+ uint32_t tiwin, int32_t tlen,
+ int32_t * ofia, int32_t thflags, int32_t * ret_val)
+{
+ int32_t ourfinisacked = 0;
+ int32_t acked_amount;
+ uint16_t nsegs;
+ int32_t acked;
+ uint32_t lost, sack_changed = 0;
+ struct mbuf *mfree;
+ struct tcp_bbr *bbr;
+ uint32_t prev_acked = 0;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ lost = bbr->r_ctl.rc_lost;
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ bbr->r_wanted_output = 1;
+ return (1);
+ }
+ if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
+ /* Process the ack */
+ if (bbr->rc_in_persist)
+ tp->t_rxtshift = 0;
+ if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
+ bbr_strike_dupack(bbr);
+ sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
+ }
+ bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost));
+ if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+ /*
+ * Old ack, behind the last one rcv'd or a duplicate ack
+ * with SACK info.
+ */
+ if (th->th_ack == tp->snd_una) {
+ bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0);
+ if (bbr->r_state == TCPS_SYN_SENT) {
+ /*
+ * Special case on where we sent SYN. When
+ * the SYN-ACK is processed in syn_sent
+ * state it bumps the snd_una. This causes
+ * us to hit here even though we did ack 1
+ * byte.
+ *
+ * Go through the nothing left case so we
+ * send data.
+ */
+ goto nothing_left;
+ }
+ }
+ return (0);
+ }
+ /*
+ * If we reach this point, ACK is not a duplicate, i.e., it ACKs
+ * something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our SYN has
+ * been ACK'd (so connection is now fully synchronized). Go
+ * to non-starred state, increment snd_una for ACK of SYN,
+ * and check if we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ /* Send window already scaled. */
+ }
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ acked = BYTES_THIS_ACK(tp, th);
+ TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+
+ /*
+ * If we just performed our first retransmit, and the ACK arrives
+ * within our recovery window, then it was a mistake to do the
+ * retransmit in the first place. Recover our original cwnd and
+ * ssthresh, and proceed to transmit where we left off.
+ */
+ if (tp->t_flags & TF_PREVVALID) {
+ tp->t_flags &= ~TF_PREVVALID;
+ if (tp->t_rxtshift == 1 &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
+ }
+ SOCKBUF_LOCK(&so->so_snd);
+ acked_amount = min(acked, (int)sbavail(&so->so_snd));
+ tp->snd_wnd -= acked_amount;
+ mfree = sbcut_locked(&so->so_snd, acked_amount);
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
+ m_freem(mfree);
+ if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
+ }
+ tp->snd_una = th->th_ack;
+ bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost));
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (SEQ_LT(th->th_ack, tp->snd_max))) {
+ tcp_bbr_partialack(tp);
+ } else {
+ bbr_post_recovery(tp);
+ }
+ }
+ if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
+ tp->snd_recover = tp->snd_una;
+ }
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ tp->snd_nxt = tp->snd_max;
+ }
+ if (tp->snd_una == tp->snd_max) {
+ /* Nothing left outstanding */
+nothing_left:
+ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
+ if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
+ bbr->rc_tp->t_acktime = 0;
+ if ((sbused(&so->so_snd) == 0) &&
+ (tp->t_flags & TF_SENTFIN)) {
+ ourfinisacked = 1;
+ }
+ bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
+ if (bbr->rc_in_persist == 0) {
+ bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
+ }
+ sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
+ bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
+ /*
+ * We invalidate the last ack here since we
+ * don't want to transfer forward the time
+ * for our sum's calculations.
+ */
+ if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
+ (sbavail(&so->so_snd) == 0) &&
+ (tp->t_flags2 & TF2_DROP_AF_DATA)) {
+ /*
+ * The socket was gone and the peer sent data, time
+ * to reset him.
+ */
+ *ret_val = 1;
+ tp = tcp_close(tp);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
+ BBR_STAT_INC(bbr_dropped_af_data);
+ return (1);
+ }
+ /* Set need output so persist might get set */
+ bbr->r_wanted_output = 1;
+ }
+ if (ofia)
+ *ofia = ourfinisacked;
+ return (0);
+}
+
+static void
+bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
+{
+ if (bbr->rc_in_persist == 0) {
+ bbr_timer_cancel(bbr, __LINE__, cts);
+ bbr->r_ctl.rc_last_delay_val = 0;
+ tp->t_rxtshift = 0;
+ bbr->rc_in_persist = 1;
+ bbr->r_ctl.rc_went_idle_time = cts;
+		/* We should be capped when rwnd went to 0 but just in case */
+ bbr_log_type_pesist(bbr, cts, 0, line, 1);
+ /* Time freezes for the state, so do the accounting now */
+ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+ uint32_t time_in;
+
+ time_in = cts - bbr->r_ctl.rc_bbr_state_time;
+ if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
+ int32_t idx;
+
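+				/*
+				 * Probe_bw substate time appears to be kept
+				 * in its own counter slots, offset past the
+				 * main BBR states (hence the idx + 5 below).
+				 */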
+ idx = bbr_state_val(bbr);
+ counter_u64_add(bbr_state_time[(idx + 5)], time_in);
+ } else {
+ counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
+ }
+ }
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ }
+}
+
+static void
+bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time)
+{
+ /*
+ * Note that if idle time does not exceed our
+	 * threshold, we do nothing, continuing the state
+ * transitions we were last walking through.
+ */
+ if (idle_time >= bbr_idle_restart_threshold) {
+ if (bbr->rc_use_idle_restart) {
+ bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT;
+ /*
+ * Set our target using BBR_UNIT, so
+ * we increase at a dramatic rate but
+ * we stop when we get the pipe
+ * full again for our current b/w estimate.
+ */
+ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
+ bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
+ bbr_set_state_target(bbr, __LINE__);
+ /* Now setup our gains to ramp up */
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ } else {
+ bbr_substate_change(bbr, cts, __LINE__, 1);
+ }
+ }
+}
+
+static void
+bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
+{
+ uint32_t idle_time;
+
+ if (bbr->rc_in_persist == 0)
+ return;
+ idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time);
+ bbr->rc_in_persist = 0;
+ bbr->rc_hit_state_1 = 0;
+ tp->t_flags &= ~TF_FORCEDATA;
+ bbr->r_ctl.rc_del_time = cts;
+ /*
+ * We invalidate the last ack here since we
+ * don't want to transfer forward the time
+ * for our sum's calculations.
+ */
+ if (bbr->rc_inp->inp_in_hpts) {
+ tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
+ bbr->rc_timer_first = 0;
+ bbr->r_ctl.rc_hpts_flags = 0;
+ bbr->r_ctl.rc_last_delay_val = 0;
+ bbr->r_ctl.rc_hptsi_agg_delay = 0;
+ bbr->r_agg_early_set = 0;
+ bbr->r_ctl.rc_agg_early = 0;
+ }
+ bbr_log_type_pesist(bbr, cts, idle_time, line, 0);
+ if (idle_time >= bbr_rtt_probe_time) {
+ /*
+ * This qualifies as a RTT_PROBE session since we drop the
+ * data outstanding to nothing and waited more than
+ * bbr_rtt_probe_time.
+ */
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0);
+ bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts;
+ }
+ tp->t_rxtshift = 0;
+ /*
+ * If in probeBW and we have persisted more than an RTT lets do
+ * special handling.
+ */
+ /* Force a time based epoch */
+ bbr_set_epoch(bbr, cts, __LINE__);
+ /*
+ * Setup the lost so we don't count anything against the guy
+ * we have been stuck with during persists.
+ */
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ /* Time un-freezes for the state */
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) ||
+ (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) {
+ /*
+ * If we are going back to probe-bw
+ * or probe_rtt, we may need to possibly
+ * do a fast restart.
+ */
+ bbr_restart_after_idle(bbr, cts, idle_time);
+ }
+}
+
+static void
+bbr_collapsed_window(struct tcp_bbr *bbr)
+{
+ /*
+	 * Now we must walk the send map and divide the ones left
+	 * stranded. These guys can't cause us to abort the connection
+	 * and are really "unsent". However, if a buggy client actually
+	 * did keep some of the data, i.e. collapsed the window and
+	 * refused to ack, and then opened the window and acked that
+	 * data, we would get into an ack war; so the simpler method of
+	 * just pretending we did not send those segments won't work.
+ */
+ struct bbr_sendmap *rsm, *nrsm;
+ tcp_seq max_seq;
+ uint32_t maxseg;
+ int can_split = 0;
+ int fnd = 0;
+
+ maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
+ max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd;
+ bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0);
+ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
+ /* Find the first seq past or at maxseq */
+ if (rsm->r_flags & BBR_RWND_COLLAPSED)
+ rsm->r_flags &= ~BBR_RWND_COLLAPSED;
+ if (SEQ_GEQ(max_seq, rsm->r_start) &&
+ SEQ_GEQ(rsm->r_end, max_seq)) {
+ fnd = 1;
+ break;
+ }
+ }
+ bbr->rc_has_collapsed = 0;
+ if (!fnd) {
+		/* Nothing to do, strange */
+ return;
+ }
+ /*
+ * Now can we split?
+ *
+ * We don't want to split if splitting
+ * would generate too many small segments
+	 * lest we let an attacker fragment our
+ * send_map and leave us out of memory.
+ */
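+	/*
+	 * E.g. with a 1448 byte maxseg we only split if both resulting
+	 * pieces would be at least 181 bytes, unless we are still under
+	 * the small-map allocation limit.
+	 */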
+ if ((max_seq != rsm->r_start) &&
+ (max_seq != rsm->r_end)){
+ /* can we split? */
+ int res1, res2;
+
+ res1 = max_seq - rsm->r_start;
+ res2 = rsm->r_end - max_seq;
+ if ((res1 >= (maxseg/8)) &&
+ (res2 >= (maxseg/8))) {
+ /* No small pieces here */
+ can_split = 1;
+ } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) {
+ /* We are under the limit */
+ can_split = 1;
+ }
+ }
+ /* Ok do we need to split this rsm? */
+ if (max_seq == rsm->r_start) {
+		/* It's this guy, no split required */
+ nrsm = rsm;
+ } else if (max_seq == rsm->r_end) {
+		/* It's the next one, no split required. */
+ nrsm = TAILQ_NEXT(rsm, r_next);
+ if (nrsm == NULL) {
+ /* Huh? */
+ return;
+ }
+ } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) {
+ /* yep we need to split it */
+ nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
+ if (nrsm == NULL) {
+			/* failed XXXrrs what can we do? Mark the whole rsm. */
+ nrsm = rsm;
+ goto no_split;
+ }
+ /* Clone it */
+ bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0);
+ bbr_clone_rsm(bbr, nrsm, rsm, max_seq);
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ } else {
+ /*
+		 * Split not allowed, just start here and
+		 * use this guy.
+ */
+ nrsm = rsm;
+ }
+no_split:
+ BBR_STAT_INC(bbr_collapsed_win);
+ /* reuse fnd as a count */
+ fnd = 0;
+ TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) {
+ nrsm->r_flags |= BBR_RWND_COLLAPSED;
+ fnd++;
+ bbr->rc_has_collapsed = 1;
+ }
+ bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd);
+}
+
+static void
+bbr_un_collapse_window(struct tcp_bbr *bbr)
+{
+ struct bbr_sendmap *rsm;
+ int cleared = 0;
+
+ TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
+ if (rsm->r_flags & BBR_RWND_COLLAPSED) {
+ /* Clear the flag */
+ rsm->r_flags &= ~BBR_RWND_COLLAPSED;
+ cleared++;
+ } else
+ break;
+ }
+ bbr_log_type_rwnd_collapse(bbr,
+ (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared);
+ bbr->rc_has_collapsed = 0;
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ /*
+ * Update window information. Don't look at window if no ACK: TAC's
+ * send garbage on first SYN.
+ */
+ uint16_t nsegs;
+ int32_t tfo_syn;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ TCPSTAT_INC(tcps_rcvwinupd);
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ bbr->r_wanted_output = 1;
+ } else if (thflags & TH_ACK) {
+ if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ }
+ }
+ if (tp->snd_wnd < ctf_outstanding(tp))
+ /* The peer collapsed its window on us */
+ bbr_collapsed_window(bbr);
+ else if (bbr->rc_has_collapsed)
+ bbr_un_collapse_window(bbr);
+ /* Was persist timer active and now we have window space? */
+ if ((bbr->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
+ bbr_minseg(bbr)))) {
+ /*
+ * Make the rate persist at end of persist mode if idle long
+ * enough
+ */
+ bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
+
+ /* Make sure we output to start the timer */
+ bbr->r_wanted_output = 1;
+ }
+ /* Do we need to enter persist? */
+ if ((bbr->rc_in_persist == 0) &&
+ (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_max == tp->snd_una) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
+ (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
+ /* No send window.. we must enter persist */
+ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
+ }
+ if (tp->t_flags2 & TF2_DROP_AF_DATA) {
+ m_freem(m);
+ return (0);
+ }
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept random
+ * urgent pointers, we'll crash in soreceive. It's hard to
+ * imagine someone actually wanting to send this much urgent
+ * data.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer, then
+ * mark the data stream. This should not happen in
+ * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
+ * FIN has been received from the remote side. In these
+ * states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols), the urgent
+ * pointer points to the last octet of urgent data. We
+ * continue, however, to consider it to indicate the first
+ * octet of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = sbavail(&so->so_rcv) +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * Remove out of band data so doesn't get presented to user.
+ * This can happen independent of advancing the URG pointer,
+ * but if two URG's are pending at once, some out-of-band
+ * data may creep in... ick.
+ */
+ if (th->th_urp <= (uint32_t)tlen &&
+ !(so->so_options & SO_OOBINLINE)) {
+ /* hdr drop is delayed */
+ tcp_pulloutofband(so, th, m, drop_hdrlen);
+ }
+ } else {
+ /*
+ * If no out of band data is expected, pull receive urgent
+ * pointer along with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+ }
+dodata: /* XXX */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Process the segment text, merging it into the TCP sequencing
+ * queue, and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data is
+ * presented to the user (this happens in tcp_usrreq.c, case
+ * PRU_RCVD). If a FIN has already been received on this connection
+ * then we just ignore the text.
+ */
+ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
+ IS_FASTOPEN(tp->t_flags));
+ if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ tcp_seq save_start = th->th_seq;
+ tcp_seq save_rnxt = tp->rcv_nxt;
+ int save_tlen = tlen;
+
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+ * Insert segment which includes th into TCP reassembly
+ * queue with control block tp. Set thflags to whether
+ * reassembly now includes a segment with FIN. This handles
+ * the common case inline (segment is the next to be
+ * received on an established connection, and the queue is
+ * empty), avoiding linkage into and removal from the queue
+ * and repetition of various conversions. Set DELACK for
+ * segments received in order, but ack immediately when
+ * segments are out of order (so fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ SEGQ_EMPTY(tp) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state) ||
+ tfo_syn)) {
+#ifdef NETFLIX_SB_LIMITS
+ u_int mcnt, appended;
+
+ if (so->so_rcv.sb_shlim) {
+ mcnt = m_memcnt(m);
+ appended = 0;
+ if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
+ CFO_NOSLEEP, NULL) == false) {
+ counter_u64_add(tcp_sb_shlim_fails, 1);
+ m_freem(m);
+ return (0);
+ }
+ }
+#endif
+ if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) {
+ bbr->bbr_segs_rcvd += max(1, nsegs);
+ tp->t_flags |= TF_DELACK;
+ bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
+ } else {
+ bbr->r_wanted_output = 1;
+ tp->t_flags |= TF_ACKNOW;
+ }
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ m_freem(m);
+ else
+#ifdef NETFLIX_SB_LIMITS
+ appended =
+#endif
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+#ifdef NETFLIX_SB_LIMITS
+ if (so->so_rcv.sb_shlim && appended != mcnt)
+ counter_fo_release(so->so_rcv.sb_shlim,
+ mcnt - appended);
+#endif
+ } else {
+ /*
+ * XXX: Due to the header drop above "th" is
+ * theoretically invalid by now. Fortunately
+			 * m_adj() doesn't actually free any mbufs when
+ * trimming from the head.
+ */
+ tcp_seq temp = save_start;
+ thflags = tcp_reass(tp, th, &temp, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+ if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
+ if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
+ /*
+ * DSACK actually handled in the fastpath
+ * above.
+ */
+ tcp_update_sack_list(tp, save_start,
+ save_start + save_tlen);
+ } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
+ if ((tp->rcv_numsacks >= 1) &&
+ (tp->sackblks[0].end == save_start)) {
+ /*
+ * Partial overlap, recorded at todrop
+ * above.
+ */
+ tcp_update_sack_list(tp,
+ tp->sackblks[0].start,
+ tp->sackblks[0].end);
+ } else {
+ tcp_update_dsack_list(tp, save_start,
+ save_start + save_tlen);
+ }
+ } else if (tlen >= save_tlen) {
+ /* Update of sackblks. */
+ tcp_update_dsack_list(tp, save_start,
+ save_start + save_tlen);
+ } else if (tlen > 0) {
+ tcp_update_dsack_list(tp, save_start,
+ save_start + tlen);
+ }
+ }
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know that the
+ * connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized (ie NEEDSYN
+ * flag on) then delay ACK, so it may be piggybacked
+ * when SYN is sent. Otherwise, since we received a
+ * FIN then no more input can be expected, send ACK
+ * now.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ tp->t_flags |= TF_DELACK;
+ bbr_timer_cancel(bbr,
+ __LINE__, bbr->r_ctl.rc_rcvtime);
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ }
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES enter the
+ * CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ tcp_state_change(tp, TCPS_CLOSE_WAIT);
+ break;
+
+ /*
+ * If still in FIN_WAIT_1 STATE FIN has not been
+ * acked so enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ tcp_state_change(tp, TCPS_CLOSING);
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the
+ * other standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ bbr->rc_timer_first = 1;
+ bbr_timer_cancel(bbr,
+ __LINE__, bbr->r_ctl.rc_rcvtime);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tcp_twstart(tp);
+ return (1);
+ }
+ }
+ /*
+ * Return any desired output.
+ */
+ if ((tp->t_flags & TF_ACKNOW) ||
+ (sbavail(&so->so_snd) > ctf_outstanding(tp))) {
+ bbr->r_wanted_output = 1;
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return (0);
+}
+
+/*
+ * Here nothing is really faster, it's just that we
+ * have broken out the fast-data path also, just like
+ * the fast-ack. Return 1 if we processed the packet;
+ * return 0 if you need to take the "slow-path".
+ */
+static int
+bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t nxt_pkt)
+{
+ uint16_t nsegs;
+ int32_t newsize = 0; /* automatic sockbuf scaling */
+ struct tcp_bbr *bbr;
+#ifdef NETFLIX_SB_LIMITS
+ u_int mcnt, appended;
+#endif
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+
+#endif
+ /* On the hpts and we would have called output */
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * the timestamp. NOTE that the test is modified according to the
+ * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
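+	/*
+	 * To stay on this fast path we require: no pending retransmit,
+	 * no window change, no SYN/FIN pending, no PAWS failure, an ack
+	 * exactly at snd_una and room in the receive buffer; otherwise
+	 * fall back to the slow path.
+	 */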
+ if (bbr->r_ctl.rc_resend != NULL) {
+ return (0);
+ }
+ if (tiwin && tiwin != tp->snd_wnd) {
+ return (0);
+ }
+ if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
+ return (0);
+ }
+ if (__predict_false((to->to_flags & TOF_TS) &&
+ (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
+ return (0);
+ }
+ if (__predict_false((th->th_ack != tp->snd_una))) {
+ return (0);
+ }
+ if (__predict_false(tlen > sbspace(&so->so_rcv))) {
+ return (0);
+ }
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure, in-sequence data packet with nothing on the
+ * reassembly queue and we have enough buffer space to take it.
+ */
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+#ifdef NETFLIX_SB_LIMITS
+ if (so->so_rcv.sb_shlim) {
+ mcnt = m_memcnt(m);
+ appended = 0;
+ if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
+ CFO_NOSLEEP, NULL) == false) {
+ counter_u64_add(tcp_sb_shlim_fails, 1);
+ m_freem(m);
+ return (1);
+ }
+ }
+#endif
+ /* Clean receiver SACK report if present */
+ if (tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+ TCPSTAT_INC(tcps_preddat);
+ tp->rcv_nxt += tlen;
+ /*
+ * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
+ */
+ tp->snd_wl1 = th->th_seq;
+ /*
+ * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
+ */
+ tp->rcv_up = tp->rcv_nxt;
+ TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+ newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
+
+ /* Add data to socket buffer. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ m_freem(m);
+ } else {
+ /*
+ * Set new socket buffer size. Give up when limit is
+ * reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+#ifdef NETFLIX_SB_LIMITS
+ appended =
+#endif
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ ctf_calc_rwin(so, tp);
+ }
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+#ifdef NETFLIX_SB_LIMITS
+ if (so->so_rcv.sb_shlim && mcnt != appended)
+ counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
+#endif
+ if (DELAY_ACK(tp, bbr, nsegs)) {
+ bbr->bbr_segs_rcvd += max(1, nsegs);
+ tp->t_flags |= TF_DELACK;
+ bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
+ } else {
+ bbr->r_wanted_output = 1;
+ tp->t_flags |= TF_ACKNOW;
+ }
+ return (1);
+}
+
+/*
+ * This subfunction is used to try to highly optimize the
+ * fast path. We again allow window updates that are
+ * in sequence to remain in the fast-path. We also add
+ * in the __predict's to attempt to help the compiler.
+ * Note that if we return a 0, then we can *not* process
+ * it and the caller should push the packet into the
+ * slow-path. If we return 1, then all is well and
+ * the packet is fully processed.
+ */
+static int
+bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t nxt_pkt)
+{
+ int32_t acked;
+ uint16_t nsegs;
+ uint32_t sack_changed;
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+
+#endif
+ uint32_t prev_acked = 0;
+ struct tcp_bbr *bbr;
+
+ if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+ /* Old ack, behind (or duplicate to) the last one rcv'd */
+ return (0);
+ }
+ if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
+ /* Above what we have sent? */
+ return (0);
+ }
+ if (__predict_false(tiwin == 0)) {
+ /* zero window */
+ return (0);
+ }
+ if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
+ /* We need a SYN or a FIN, unlikely.. */
+ return (0);
+ }
+ if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
+ /* Timestamp is behind .. old ack with seq wrap? */
+ return (0);
+ }
+ if (__predict_false(IN_RECOVERY(tp->t_flags))) {
+ /* Still recovering */
+ return (0);
+ }
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ if (__predict_false(bbr->r_ctl.rc_resend != NULL)) {
+ /* We are retransmitting */
+ return (0);
+ }
+ if (__predict_false(bbr->rc_in_persist != 0)) {
+ /* In persist mode */
+ return (0);
+ }
+ if (bbr->r_ctl.rc_sacked) {
+ /* We have sack holes on our scoreboard */
+ return (0);
+ }
+ /* Ok if we reach here, we can process a fast-ack */
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
+ /*
+ * We never detect loss in fast ack [we can't
+ * have a sack and can't be in recovery so
+ * we always pass 0 (nothing detected)].
+ */
+ bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0);
+ /* Did the window get updated? */
+ if (tiwin != tp->snd_wnd) {
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+ /* Do we need to exit persists? */
+ if ((bbr->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
+ bbr_minseg(bbr)))) {
+ bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
+ bbr->r_wanted_output = 1;
+ }
+ /* Do we need to enter persists? */
+ if ((bbr->rc_in_persist == 0) &&
+ (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_max == tp->snd_una) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
+ (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
+ /* No send window.. we must enter persist */
+ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * the timestamp. NOTE that the test is modified according to the
+ * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = bbr->r_ctl.rc_rcvtime;
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_flags & TF_PREVVALID) {
+ tp->t_flags &= ~TF_PREVVALID;
+ if (tp->t_rxtshift == 1 &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
+ }
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies during the SYN+ACK
+ * phase, ignore timestamps of 0 or we could calculate a huge RTT
+ * and blow up the retransmit timer.
+ */
+ acked = BYTES_THIS_ACK(tp, th);
+
+#ifdef TCP_HHOOK
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+#endif
+
+ TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+
+ if (SEQ_GT(th->th_ack, tp->snd_una))
+ bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
+ tp->snd_una = th->th_ack;
+ if (tp->snd_wnd < ctf_outstanding(tp))
+ /* The peer collapsed its window on us */
+ bbr_collapsed_window(bbr);
+ else if (bbr->rc_has_collapsed)
+ bbr_un_collapse_window(bbr);
+
+ if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
+ tp->snd_recover = tp->snd_una;
+ }
+ bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0);
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ m_freem(m);
+ /*
+ * If all outstanding data are acked, stop retransmit timer,
+ * otherwise restart timer using current (possibly backed-off)
+ * value. If process is waiting for space, wakeup/selwakeup/signal.
+ * If data are ready to send, let tcp_output decide between more
+ * output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ /* Wake up the socket if we have room to write more */
+ sowwakeup(so);
+ if (tp->snd_una == tp->snd_max) {
+ /* Nothing left outstanding */
+ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
+ if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
+ bbr->rc_tp->t_acktime = 0;
+ bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
+ if (bbr->rc_in_persist == 0) {
+ bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
+ }
+ sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
+ bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
+ /*
+ * We invalidate the last ack here since we
+ * don't want to transfer forward the time
+ * for our sum's calculations.
+ */
+ bbr->r_wanted_output = 1;
+ }
+ if (sbavail(&so->so_snd)) {
+ bbr->r_wanted_output = 1;
+ }
+ return (1);
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t todrop;
+ int32_t ourfinisacked = 0;
+ struct tcp_bbr *bbr;
+ int32_t ret_val = 0;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ /*
+ * If the state is SYN_SENT: if seg contains an ACK, but not for our
+ * SYN, drop the input. if seg contains a RST, then drop the
+ * connection. if seg does not contain SYN, then drop it. Otherwise
+ * this is an acceptable SYN segment initialize tp->rcv_nxt and
+	 * tp->irs; if seg contains ack then advance tp->snd_una. BBR does
+ * not support ECN so we will not say we are capable. if SYN has
+ * been acked change to ESTABLISHED else SYN_RCVD state arrange for
+ * segment to be acked (eventually) continue processing rest of
+ * data/controls, beginning with URG
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
+ TCP_PROBE5(connect__refused, NULL, tp,
+ mtod(m, const char *), tp, th);
+ tp = tcp_drop(tp, ECONNREFUSED);
+ ctf_do_drop(m, tp);
+ return (1);
+ }
+ if (thflags & TH_RST) {
+ ctf_do_drop(m, tp);
+ return (1);
+ }
+ if (!(thflags & TH_SYN)) {
+ ctf_do_drop(m, tp);
+ return (1);
+ }
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ int tfo_partial = 0;
+
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+#ifdef MAC
+ mac_socketpeer_set_from_mbuf(m, so);
+#endif
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ tp->rcv_adv += min(tp->rcv_wnd,
+ TCP_MAXWIN << tp->rcv_scale);
+ /*
+ * If not all the data that was sent in the TFO SYN
+ * has been acked, resend the remainder right away.
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ (tp->snd_una != tp->snd_max)) {
+ tp->snd_nxt = th->th_ack;
+ tfo_partial = 1;
+ }
+ /*
+ * If there's data, delay ACK; if there's also a FIN ACKNOW
+ * will be turned on later.
+ */
+ if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && (tfo_partial == 0)) {
+ bbr->bbr_segs_rcvd += 1;
+ tp->t_flags |= TF_DELACK;
+ bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
+ } else {
+ bbr->r_wanted_output = 1;
+ tp->t_flags |= TF_ACKNOW;
+ }
+ if (SEQ_GT(th->th_ack, tp->iss)) {
+ /*
+ * The SYN is acked
+ * handle it specially.
+ */
+ bbr_log_syn(tp, to);
+ }
+ if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ /*
+ * We advance snd_una for the
+ * fast open case. If th_ack is
+ * acknowledging data beyond
+ * snd_una we can't just call
+ * ack-processing since the
+ * data stream in our send-map
+ * will start at snd_una + 1 (one
+			 * beyond the SYN). If it's just
+ * equal we don't need to do that
+ * and there is no send_map.
+ */
+ tp->snd_una++;
+ }
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
+ * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(connect__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ cc_conn_init(tp);
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state => simultaneous
+ * open. If segment contains CC option and there is a
+ * cached CC, apply TAO test. If it succeeds, connection is *
+ * half-synchronized. Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
+ * there was no CC option, clear cached CC value.
+ */
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Advance th->th_seq to correspond to first data byte. If data,
+ * trim to stay within window, dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data. If the
+ * remote host used T/TCP to validate the SYN, our data will be
+ * ACK'd; if so, enter normal data segment processing in the middle
+ * of step 5, ack processing. Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK) {
+ if ((to->to_flags & TOF_TS) != 0) {
+ uint32_t t, rtt;
+
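+			/*
+			 * The timestamps are in millisecond ticks; scaling
+			 * the difference by MS_IN_USEC converts the RTT to
+			 * the microsecond units used elsewhere in this
+			 * stack.
+			 */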
+ t = tcp_tv_to_mssectick(&bbr->rc_tv);
+ if (TSTMP_GEQ(t, to->to_tsecr)) {
+ rtt = t - to->to_tsecr;
+ if (rtt == 0) {
+ rtt = 1;
+ }
+ rtt *= MS_IN_USEC;
+ tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
+ apply_filter_min_small(&bbr->r_ctl.rc_rttprop,
+ rtt, bbr->r_ctl.rc_rcvtime);
+ }
+ }
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
+ return (ret_val);
+ /* We may have changed to FIN_WAIT_1 above */
+ if (tp->t_state == TCPS_FIN_WAIT_1) {
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now
+ * acknowledged then enter FIN_WAIT_2.
+ */
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more data, then
+ * closing user can proceed. Starting the
+ * timer is contrary to the specification,
+ * but if we don't get a FIN we'll hang
+ * forever.
+ *
+ * XXXjl: we should release the tp also, and
+ * use a compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ }
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ourfinisacked = 0;
+ int32_t ret_val;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ if (IS_FASTOPEN(tp->t_flags)) {
+ /*
+ * When a TFO connection is in SYN_RECEIVED, the only valid
+ * packets are the initial SYN, a retransmit/copy of the
+ * initial SYN (possibly with a subset of the original
+ * data), a valid ACK, a FIN, or a RST.
+ */
+ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ } else if (thflags & TH_SYN) {
+ /* non-initial SYN is ignored */
+ if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
+ (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
+ (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know the
+ * sequence numbers haven't wrapped. This is a partial fix for the
+ * "LAND" DoS attack.
+ */
+ if (SEQ_LT(th->th_seq, tp->irs)) {
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len. This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ tp->snd_wnd = tiwin;
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (IS_FASTOPEN(tp->t_flags)) {
+ cc_conn_init(tp);
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ }
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ /*
+ * OK, for the first time in, let's see if we can use the timestamp to
+ * figure out what the initial RTT was.
+ */
+ if ((to->to_flags & TOF_TS) != 0) {
+ uint32_t t, rtt;
+
+ t = tcp_tv_to_mssectick(&bbr->rc_tv);
+ if (TSTMP_GEQ(t, to->to_tsecr)) {
+ rtt = t - to->to_tsecr;
+ if (rtt == 0) {
+ rtt = 1;
+ }
+ rtt *= MS_IN_USEC;
+ tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
+ apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime);
+ }
+ }
+ /* Drop off any SYN in the send map (probably not there) */
+ if (thflags & TH_ACK)
+ bbr_log_syn(tp, to);
+ if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
+
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+ /*
+ * Account for the ACK of our SYN prior to regular
+ * ACK processing below.
+ */
+ tp->snd_una++;
+ }
+ /*
+ * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
+ * FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(accept__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ /*
+ * TFO connections call cc_conn_init() during SYN
+ * processing. Calling it again here for such connections
+ * is not harmless as it would undo the snd_cwnd reduction
+ * that occurs when a TFO SYN|ACK is retransmitted.
+ */
+ if (!IS_FASTOPEN(tp->t_flags))
+ cc_conn_init(tp);
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass() later; if
+ * not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (tp->t_state == TCPS_FIN_WAIT_1) {
+ /* We could have gone to FIN_WAIT_1 (or EST) above */
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing for the
+ * ESTABLISHED state if our FIN is now acknowledged then
+ * enter FIN_WAIT_2.
+ */
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more data, then closing
+ * user can proceed. Starting the timer is contrary
+ * to the specification, but if we don't get a FIN
+ * we'll hang forever.
+ *
+ * XXXjl: we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
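+
+/*
+ * Illustrative sketch only (not part of the stack above): how an RTT
+ * sample is derived from a TCP timestamp echo, mirroring the logic in
+ * bbr_do_syn_sent()/bbr_do_syn_recv().  The function name and parameters
+ * are hypothetical; 1000 stands in for MS_IN_USEC and the wrap-aware
+ * compare stands in for TSTMP_GEQ().
+ */
+#include <stdint.h>
+
+static uint32_t
+example_rtt_usec_from_tsecr(uint32_t now_msec, uint32_t tsecr)
+{
+	uint32_t rtt;
+
+	if ((int32_t)(now_msec - tsecr) < 0)
+		return (0);	/* echo is "newer" than now; no valid sample */
+	rtt = now_msec - tsecr;
+	if (rtt == 0)
+		rtt = 1;	/* never report a zero sample */
+	return (rtt * 1000);	/* millisecond ticks -> microseconds */
+}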
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ struct tcp_bbr *bbr;
+ int32_t ret_val;
+
+ /*
+ * Header prediction: check for the two common cases of a
+ * uni-directional data xfer. If the packet has no control flags,
+ * is in-sequence, the window didn't change and we're not
+ * retransmitting, it's a candidate. If the length is zero and the
+ * ack moved forward, we're the sender side of the xfer. Just free
+ * the data acked & wake any higher level process that was blocked
+ * waiting for space. If the length is non-zero and the ack didn't
+ * move, we're the receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to the socket
+ * buffer and note that we need a delayed ack. Make sure that the
+ * hidden state-flags are also off. Since we check for
+ * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
+ */
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) {
+ /*
+ * If we have delivered under 4 segments, update our record of the
+ * initial window if the peer has raised it. We use this to determine
+ * dynamic and static rwnds at the end of a connection.
+ */
+ bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd);
+ }
+ if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
+ __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
+ __predict_true(SEGQ_EMPTY(tp)) &&
+ __predict_true(th->th_seq == tp->rcv_nxt)) {
+ if (tlen == 0) {
+ if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
+ tiwin, nxt_pkt)) {
+ return (0);
+ }
+ } else {
+ if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
+ tiwin, nxt_pkt)) {
+ return (0);
+ }
+ }
+ }
+ ctf_calc_rwin(so, tp);
+
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ ctf_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len. This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ bbr->r_wanted_output = 1;
+ return (ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (bbr_progress_timeout_check(bbr)) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ /* State changes only happen in bbr_process_data() */
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
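+
+/*
+ * Illustrative sketch only (an assumption, not this stack's code): the
+ * modified Last.ACK.Sent <= SEG.SEQ + SEG.LEN test described in the
+ * comments above, used to decide whether ts_recent may be recorded.
+ * example_seq_leq() stands in for the kernel's SEQ_LEQ() macro and the
+ * other names are hypothetical.
+ */
+#include <stdbool.h>
+#include <stdint.h>
+
+static bool
+example_seq_leq(uint32_t a, uint32_t b)
+{
+	return ((int32_t)(a - b) <= 0);		/* wrap-aware compare */
+}
+
+static bool
+example_should_record_ts(uint32_t seg_seq, uint32_t seg_len, int syn_or_fin,
+    uint32_t last_ack_sent)
+{
+	/* last_ack_sent must fall within [seg_seq, seg_seq + seg_len (+1)] */
+	return (example_seq_leq(seg_seq, last_ack_sent) &&
+	    example_seq_leq(last_ack_sent,
+	    seg_seq + seg_len + (syn_or_fin ? 1 : 0)));
+}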
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ struct tcp_bbr *bbr;
+ int32_t ret_val;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ ctf_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len. This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ bbr->r_wanted_output = 1;
+ return (ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (bbr_progress_timeout_check(bbr)) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
+
+static int
+bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr,
+ struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so)
+{
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ if (bbr->rc_allow_data_af_clo == 0) {
+close_now:
+ tp = tcp_close(tp);
+ TCPSTAT_INC(tcps_rcvafterclose);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
+ return (1);
+ }
+ if (sbavail(&so->so_snd) == 0)
+ goto close_now;
+ /* OK, we allow data that is ignored and a follow-up reset */
+ tp->rcv_nxt = th->th_seq + *tlen;
+ tp->t_flags2 |= TF2_DROP_AF_DATA;
+ bbr->r_wanted_output = 1;
+ *tlen = 0;
+ return (0);
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ourfinisacked = 0;
+ int32_t ret_val;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ ctf_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) && tlen) {
+ /*
+ * We call a new function now so we might continue and setup
+ * to reset at all data being ack'd.
+ */
+ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len. This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ bbr->r_wanted_output = 1;
+ return (ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more data, then closing user can
+ * proceed. Starting the timer is contrary to the
+ * specification, but if we don't get a FIN we'll hang
+ * forever.
+ *
+ * XXXjl: we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (bbr_progress_timeout_check(bbr)) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ourfinisacked = 0;
+ int32_t ret_val;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ ctf_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) && tlen) {
+ /*
+ * We call a new function now so we might continue and setup
+ * to reset at all data being ack'd.
+ */
+ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len, This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ bbr->r_wanted_output = 1;
+ return (ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tcp_twstart(tp);
+ m_freem(m);
+ return (1);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (bbr_progress_timeout_check(bbr)) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ourfinisacked = 0;
+ int32_t ret_val;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ ctf_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) && tlen) {
+ /*
+ * We call a new function now so we might continue and setup
+ * to reset at all data being ack'd.
+ */
+ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
+ return (1);
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len. This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ bbr->r_wanted_output = 1;
+ return (ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ /*
+ * case TCPS_LAST_ACK: Ack processing.
+ */
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tp = tcp_close(tp);
+ ctf_do_drop(m, tp);
+ return (1);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (bbr_progress_timeout_check(bbr)) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
+
+
+/*
+ * Return value of 1, the TCB is unlocked and most
+ * likely gone, return value of 0, the TCB is still
+ * locked.
+ */
+static int
+bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
+ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+{
+ int32_t ourfinisacked = 0;
+ int32_t ret_val;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ ctf_calc_rwin(so, tp);
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
+
+ /*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ ctf_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment and
+ * it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ return (ret_val);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ return (ret_val);
+ }
+ /*
+ * If new data are received on a connection after the user processes
+ * are gone, then we may RST the other end depending on the outcome
+ * of bbr_check_data_after_close.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tlen) {
+ /*
+ * We call a new function now so we might continue and setup
+ * to reset at all data being ack'd.
+ */
+ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
+ return (1);
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * If last ACK falls within this segment's sequence numbers, record
+ * its timestamp. NOTE: 1) That the test incorporates suggestions
+ * from the latest proposal of the tcplw@cray.com list (Braden
+ * 1993/04/26). 2) That updating only on newer timestamps interferes
+ * with our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment. 3) That we
+ * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
+ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
+ * SEG.Len. This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated Vol. 2
+ * p.869. In such cases, we can still calculate the RTT correctly
+ * when RCV.NXT == Last.ACK.Sent.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN | TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
+ * is on (half-synchronized state), then queue data for later
+ * processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_flags & TF_NEEDSYN) {
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+ } else if (tp->t_flags & TF_ACKNOW) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ bbr->r_wanted_output = 1;
+ return (ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ return (0);
+ }
+ }
+ /*
+ * Ack processing.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ return (ret_val);
+ }
+ if (sbavail(&so->so_snd)) {
+ if (bbr_progress_timeout_check(bbr)) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
+ tiwin, thflags, nxt_pkt));
+}
+
+static void
+bbr_stop_all_timers(struct tcpcb *tp)
+{
+ struct tcp_bbr *bbr;
+
+ /*
+ * Ensure no timers are running.
+ */
+ if (tcp_timer_active(tp, TT_PERSIST)) {
+ /* We are entering persist mode; set the flag appropriately */
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ bbr->rc_in_persist = 1;
+ }
+ tcp_timer_suspend(tp, TT_PERSIST);
+ tcp_timer_suspend(tp, TT_REXMT);
+ tcp_timer_suspend(tp, TT_KEEP);
+ tcp_timer_suspend(tp, TT_DELACK);
+}
+
+static void
+bbr_google_mode_on(struct tcp_bbr *bbr)
+{
+ bbr->rc_use_google = 1;
+ bbr->rc_no_pacing = 0;
+ bbr->r_ctl.bbr_google_discount = bbr_google_discount;
+ bbr->r_use_policer = bbr_policer_detection_enabled;
+ bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
+ bbr->bbr_use_rack_cheat = 0;
+ bbr->r_ctl.rc_incr_tmrs = 0;
+ bbr->r_ctl.rc_inc_tcp_oh = 0;
+ bbr->r_ctl.rc_inc_ip_oh = 0;
+ bbr->r_ctl.rc_inc_enet_oh = 0;
+ reset_time(&bbr->r_ctl.rc_delrate,
+ BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
+ reset_time_small(&bbr->r_ctl.rc_rttprop,
+ (11 * USECS_IN_SECOND));
+ tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
+}
+
+static void
+bbr_google_mode_off(struct tcp_bbr *bbr)
+{
+ bbr->rc_use_google = 0;
+ bbr->r_ctl.bbr_google_discount = 0;
+ bbr->no_pacing_until = bbr_no_pacing_until;
+ bbr->r_use_policer = 0;
+ if (bbr->no_pacing_until)
+ bbr->rc_no_pacing = 1;
+ else
+ bbr->rc_no_pacing = 0;
+ if (bbr_use_rack_resend_cheat)
+ bbr->bbr_use_rack_cheat = 1;
+ else
+ bbr->bbr_use_rack_cheat = 0;
+ if (bbr_incr_timers)
+ bbr->r_ctl.rc_incr_tmrs = 1;
+ else
+ bbr->r_ctl.rc_incr_tmrs = 0;
+ if (bbr_include_tcp_oh)
+ bbr->r_ctl.rc_inc_tcp_oh = 1;
+ else
+ bbr->r_ctl.rc_inc_tcp_oh = 0;
+ if (bbr_include_ip_oh)
+ bbr->r_ctl.rc_inc_ip_oh = 1;
+ else
+ bbr->r_ctl.rc_inc_ip_oh = 0;
+ if (bbr_include_enet_oh)
+ bbr->r_ctl.rc_inc_enet_oh = 1;
+ else
+ bbr->r_ctl.rc_inc_enet_oh = 0;
+ bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
+ reset_time(&bbr->r_ctl.rc_delrate,
+ bbr_num_pktepo_for_del_limit);
+ reset_time_small(&bbr->r_ctl.rc_rttprop,
+ (bbr_filter_len_sec * USECS_IN_SECOND));
+ tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
+}
+/*
+ * Return 0 on success, non-zero on failure
+ * which indicates the error (usually no memory).
+ */
+static int
+bbr_init(struct tcpcb *tp)
+{
+ struct tcp_bbr *bbr = NULL;
+ struct inpcb *inp;
+ uint32_t cts;
+
+ tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
+ if (tp->t_fb_ptr == NULL) {
+ /*
+ * We need to allocate memory but can't. The INP and INP_INFO
+ * locks are held and they are recursive (this happens during setup),
+ * so a scheme to drop the locks and retry fails :(
+ *
+ */
+ return (ENOMEM);
+ }
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ bbr->rtt_valid = 0;
+ inp = tp->t_inpcb;
+ inp->inp_flags2 |= INP_CANNOT_DO_ECN;
+ inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ TAILQ_INIT(&bbr->r_ctl.rc_map);
+ TAILQ_INIT(&bbr->r_ctl.rc_free);
+ TAILQ_INIT(&bbr->r_ctl.rc_tmap);
+ bbr->rc_tp = tp;
+ if (tp->t_inpcb) {
+ bbr->rc_inp = tp->t_inpcb;
+ }
+ cts = tcp_get_usecs(&bbr->rc_tv);
+ tp->t_acktime = 0;
+ bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close;
+ bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade;
+ bbr->rc_tlp_threshold = bbr_tlp_thresh;
+ bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh;
+ bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay;
+ bbr->r_ctl.rc_min_to = bbr_min_to;
+ bbr->rc_bbr_state = BBR_STATE_STARTUP;
+ bbr->r_ctl.bbr_lost_at_state = 0;
+ bbr->r_ctl.rc_lost_at_startup = 0;
+ bbr->rc_all_timers_stopped = 0;
+ bbr->r_ctl.rc_bbr_lastbtlbw = 0;
+ bbr->r_ctl.rc_pkt_epoch_del = 0;
+ bbr->r_ctl.rc_pkt_epoch = 0;
+ bbr->r_ctl.rc_lowest_rtt = 0xffffffff;
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain;
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
+ bbr->r_ctl.rc_went_idle_time = cts;
+ bbr->rc_pacer_started = cts;
+ bbr->r_ctl.rc_pkt_epoch_time = cts;
+ bbr->r_ctl.rc_rcvtime = cts;
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ bbr->r_ctl.rc_del_time = cts;
+ bbr->r_ctl.rc_tlp_rxt_last_time = cts;
+ bbr->r_ctl.last_in_probertt = cts;
+ bbr->skip_gain = 0;
+ bbr->gain_is_limited = 0;
+ bbr->no_pacing_until = bbr_no_pacing_until;
+ if (bbr->no_pacing_until)
+ bbr->rc_no_pacing = 1;
+ if (bbr_use_google_algo) {
+ bbr->rc_no_pacing = 0;
+ bbr->rc_use_google = 1;
+ bbr->r_ctl.bbr_google_discount = bbr_google_discount;
+ bbr->r_use_policer = bbr_policer_detection_enabled;
+ } else {
+ bbr->rc_use_google = 0;
+ bbr->r_ctl.bbr_google_discount = 0;
+ bbr->r_use_policer = 0;
+ }
+ if (bbr_ts_limiting)
+ bbr->rc_use_ts_limit = 1;
+ else
+ bbr->rc_use_ts_limit = 0;
+ if (bbr_ts_can_raise)
+ bbr->ts_can_raise = 1;
+ else
+ bbr->ts_can_raise = 0;
+ if (V_tcp_delack_enabled == 1)
+ tp->t_delayed_ack = 2;
+ else if (V_tcp_delack_enabled == 0)
+ tp->t_delayed_ack = 0;
+ else if (V_tcp_delack_enabled < 100)
+ tp->t_delayed_ack = V_tcp_delack_enabled;
+ else
+ tp->t_delayed_ack = 2;
+ if (bbr->rc_use_google == 0)
+ bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
+ else
+ bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
+ bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms;
+ bbr->rc_max_rto_sec = bbr_rto_max_sec;
+ bbr->rc_init_win = bbr_def_init_win;
+ if (tp->t_flags & TF_REQ_TSTMP)
+ bbr->rc_last_options = TCP_TS_OVERHEAD;
+ bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options;
+ bbr->r_ctl.rc_high_rwnd = tp->snd_wnd;
+ bbr->r_init_rtt = 1;
+
+ counter_u64_add(bbr_flows_nohdwr_pacing, 1);
+ if (bbr_allow_hdwr_pacing)
+ bbr->bbr_hdw_pace_ena = 1;
+ else
+ bbr->bbr_hdw_pace_ena = 0;
+ if (bbr_sends_full_iwnd)
+ bbr->bbr_init_win_cheat = 1;
+ else
+ bbr->bbr_init_win_cheat = 0;
+ bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max;
+ bbr->r_ctl.rc_drain_pg = bbr_drain_gain;
+ bbr->r_ctl.rc_startup_pg = bbr_high_gain;
+ bbr->rc_loss_exit = bbr_exit_startup_at_loss;
+ bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain;
+ bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second;
+ bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar;
+ bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max;
+ bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor;
+ bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min;
+ bbr->r_ctl.bbr_cross_over = bbr_cross_over;
+ bbr->r_ctl.rc_rtt_shrinks = cts;
+ if (bbr->rc_use_google) {
+ setup_time_filter(&bbr->r_ctl.rc_delrate,
+ FILTER_TYPE_MAX,
+ BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
+ setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
+ FILTER_TYPE_MIN, (11 * USECS_IN_SECOND));
+ } else {
+ setup_time_filter(&bbr->r_ctl.rc_delrate,
+ FILTER_TYPE_MAX,
+ bbr_num_pktepo_for_del_limit);
+ setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
+ FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND));
+ }
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0);
+ if (bbr_uses_idle_restart)
+ bbr->rc_use_idle_restart = 1;
+ else
+ bbr->rc_use_idle_restart = 0;
+ bbr->r_ctl.rc_bbr_cur_del_rate = 0;
+ bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps;
+ if (bbr_resends_use_tso)
+ bbr->rc_resends_use_tso = 1;
+#ifdef NETFLIX_PEAKRATE
+ tp->t_peakrate_thr = tp->t_maxpeakrate;
+#endif
+ if (tp->snd_una != tp->snd_max) {
+ /* Create a send map for the current outstanding data */
+ struct bbr_sendmap *rsm;
+
+ rsm = bbr_alloc(bbr);
+ if (rsm == NULL) {
+ uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
+ tp->t_fb_ptr = NULL;
+ return (ENOMEM);
+ }
+ rsm->r_flags = BBR_OVERMAX;
+ rsm->r_tim_lastsent[0] = cts;
+ rsm->r_rtr_cnt = 1;
+ rsm->r_rtr_bytes = 0;
+ rsm->r_start = tp->snd_una;
+ rsm->r_end = tp->snd_max;
+ rsm->r_dupack = 0;
+ rsm->r_delivered = bbr->r_ctl.rc_delivered;
+ rsm->r_ts_valid = 0;
+ rsm->r_del_ack_ts = tp->ts_recent;
+ rsm->r_del_time = cts;
+ if (bbr->r_ctl.r_app_limited_until)
+ rsm->r_app_limited = 1;
+ else
+ rsm->r_app_limited = 0;
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
+ rsm->r_bbr_state = bbr_state_val(bbr);
+ else
+ rsm->r_bbr_state = 8;
+ }
+ if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0))
+ bbr->bbr_use_rack_cheat = 1;
+ if (bbr_incr_timers && (bbr->rc_use_google == 0))
+ bbr->r_ctl.rc_incr_tmrs = 1;
+ if (bbr_include_tcp_oh && (bbr->rc_use_google == 0))
+ bbr->r_ctl.rc_inc_tcp_oh = 1;
+ if (bbr_include_ip_oh && (bbr->rc_use_google == 0))
+ bbr->r_ctl.rc_inc_ip_oh = 1;
+ if (bbr_include_enet_oh && (bbr->rc_use_google == 0))
+ bbr->r_ctl.rc_inc_enet_oh = 1;
+
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->t_srtt)) {
+ uint32_t rtt;
+
+ rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
+ apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
+ }
+ /* announce the settings and state */
+ bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT);
+ tcp_bbr_tso_size_check(bbr, cts);
+ /*
+ * Now call the generic function to start a timer. This will place
+ * the TCB on the hptsi wheel if a timer is needed with appropriate
+ * flags.
+ */
+ bbr_stop_all_timers(tp);
+ bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
+ return (0);
+}
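+
+/*
+ * Illustrative sketch only (an assumption, not this stack's code): how the
+ * base stack's fixed-point srtt (ticks scaled up by TCP_RTT_SHIFT) is
+ * converted to plain microseconds before seeding the rtt-prop min filter,
+ * as bbr_init() does above.  The names and the exact TICKS_2_USEC()
+ * definition are assumptions; hz is assumed to divide 1000000 evenly.
+ */
+#include <stdint.h>
+
+static uint32_t
+example_srtt_ticks_to_usec(uint32_t t_srtt, uint32_t hz, uint32_t rtt_shift)
+{
+	uint64_t usec_per_tick = 1000000 / hz;
+
+	/* t_srtt holds (srtt in ticks) << rtt_shift; scale then shift down */
+	return ((uint32_t)(((uint64_t)t_srtt * usec_per_tick) >> rtt_shift));
+}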
+
+/*
+ * Return 0 if we can accept the connection. Return
+ * non-zero if we can't handle the connection. An EAGAIN
+ * means you need to wait until the connection is up.
+ * An EINVAL means we can never handle the connection
+ * (no SACK).
+ */
+static int
+bbr_handoff_ok(struct tcpcb *tp)
+{
+ if ((tp->t_state == TCPS_CLOSED) ||
+ (tp->t_state == TCPS_LISTEN)) {
+ /* Sure no problem though it may not stick */
+ return (0);
+ }
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED)) {
+ /*
+ * We really don't know; you have to get to ESTAB or beyond
+ * to tell.
+ */
+ return (EAGAIN);
+ }
+ if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) {
+ return (0);
+ }
+ /*
+ * If we reach here we don't do SACK on this connection, so we can
+ * never use this stack.
+ */
+ return (EINVAL);
+}
+
+static void
+bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
+{
+ if (tp->t_fb_ptr) {
+ uint32_t calc;
+ struct tcp_bbr *bbr;
+ struct bbr_sendmap *rsm;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ if (bbr->r_ctl.crte)
+ tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
+ bbr_log_flowend(bbr);
+ bbr->rc_tp = NULL;
+ if (tp->t_inpcb) {
+ /* Back out any flags2 we applied */
+ tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN;
+ tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
+ tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ }
+ if (bbr->bbr_hdrw_pacing)
+ counter_u64_add(bbr_flows_whdwr_pacing, -1);
+ else
+ counter_u64_add(bbr_flows_nohdwr_pacing, -1);
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ while (rsm) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
+ uma_zfree(bbr_zone, rsm);
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ }
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
+ while (rsm) {
+ TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
+ uma_zfree(bbr_zone, rsm);
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
+ }
+ calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd;
+ if (calc > (bbr->r_ctl.rc_init_rwnd / 10))
+ BBR_STAT_INC(bbr_dynamic_rwnd);
+ else
+ BBR_STAT_INC(bbr_static_rwnd);
+ bbr->r_ctl.rc_free_cnt = 0;
+ uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
+ tp->t_fb_ptr = NULL;
+ }
+ /* Make sure snd_nxt is correctly set */
+ tp->snd_nxt = tp->snd_max;
+}
+
+static void
+bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win)
+{
+ switch (tp->t_state) {
+ case TCPS_SYN_SENT:
+ bbr->r_state = TCPS_SYN_SENT;
+ bbr->r_substate = bbr_do_syn_sent;
+ break;
+ case TCPS_SYN_RECEIVED:
+ bbr->r_state = TCPS_SYN_RECEIVED;
+ bbr->r_substate = bbr_do_syn_recv;
+ break;
+ case TCPS_ESTABLISHED:
+ bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd);
+ bbr->r_state = TCPS_ESTABLISHED;
+ bbr->r_substate = bbr_do_established;
+ break;
+ case TCPS_CLOSE_WAIT:
+ bbr->r_state = TCPS_CLOSE_WAIT;
+ bbr->r_substate = bbr_do_close_wait;
+ break;
+ case TCPS_FIN_WAIT_1:
+ bbr->r_state = TCPS_FIN_WAIT_1;
+ bbr->r_substate = bbr_do_fin_wait_1;
+ break;
+ case TCPS_CLOSING:
+ bbr->r_state = TCPS_CLOSING;
+ bbr->r_substate = bbr_do_closing;
+ break;
+ case TCPS_LAST_ACK:
+ bbr->r_state = TCPS_LAST_ACK;
+ bbr->r_substate = bbr_do_lastack;
+ break;
+ case TCPS_FIN_WAIT_2:
+ bbr->r_state = TCPS_FIN_WAIT_2;
+ bbr->r_substate = bbr_do_fin_wait_2;
+ break;
+ case TCPS_LISTEN:
+ case TCPS_CLOSED:
+ case TCPS_TIME_WAIT:
+ default:
+ break;
+ };
+}
+
+static void
+bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog)
+{
+ /*
+ * What state are we going into now? Are there adjustments
+ * needed?
+ */
+ int32_t old_state, old_gain;
+
+
+ old_state = bbr_state_val(bbr);
+ old_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
+ if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) {
+ /* Save the lowest srtt we saw at the end of this sub-state */
+ bbr->rc_hit_state_1 = 0;
+ if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff)
+ bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state;
+ }
+ bbr->rc_bbr_substate++;
+ if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) {
+ /* Cycle back to first state-> gain */
+ bbr->rc_bbr_substate = 0;
+ }
+ if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
+ /*
+ * We enter the gain(5/4) cycle (possibly less if
+ * shallow buffer detection is enabled)
+ */
+ if (bbr->skip_gain) {
+ /*
+ * Hardware pacing has set our rate to
+ * the max and limited our b/w, so just
+ * run level, i.e. no gain.
+ */
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1];
+ } else if (bbr->gain_is_limited &&
+ bbr->bbr_hdrw_pacing &&
+ bbr->r_ctl.crte) {
+ /*
+ * We can't gain above the hardware pacing
+ * rate, which is less than our rate + the gain;
+ * calculate the gain needed to reach the hardware
+ * pacing rate.
+ */
+ uint64_t bw, rate, gain_calc;
+
+ bw = bbr_get_bw(bbr);
+ rate = bbr->r_ctl.crte->rate;
+ if ((rate > bw) &&
+ (((bw * (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) {
+ gain_calc = (rate * BBR_UNIT) / bw;
+ if (gain_calc < BBR_UNIT)
+ gain_calc = BBR_UNIT;
+ bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc;
+ } else {
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
+ }
+ } else
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
+ if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) {
+ bbr->r_ctl.rc_bbr_state_atflight = cts;
+ } else
+ bbr->r_ctl.rc_bbr_state_atflight = 0;
+ } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
+ bbr->rc_hit_state_1 = 1;
+ bbr->r_ctl.rc_exta_time_gd = 0;
+ bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ if (bbr_state_drain_2_tar) {
+ bbr->r_ctl.rc_bbr_state_atflight = 0;
+ } else
+ bbr->r_ctl.rc_bbr_state_atflight = cts;
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN];
+ } else {
+ /* All other cycles hit here 2-7 */
+ if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) {
+ if (bbr_sub_drain_slam_cwnd &&
+ (bbr->rc_use_google == 0) &&
+ (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP))
+ bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) -
+ bbr_get_rtt(bbr, BBR_RTT_PROP));
+ else
+ bbr->r_ctl.rc_exta_time_gd = 0;
+ if (bbr->r_ctl.rc_exta_time_gd) {
+ bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd;
+ /* Now chop up the time for each state (div by 7) */
+ bbr->r_ctl.rc_level_state_extra /= 7;
+ if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) {
+ /* Add a randomization */
+ bbr_randomize_extra_state_time(bbr);
+ }
+ }
+ }
+ bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)];
+ }
+ if (bbr->rc_use_google) {
+ bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
+ }
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
+ if (dolog)
+ bbr_log_type_statechange(bbr, cts, line);
+
+ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+ uint32_t time_in;
+
+ time_in = cts - bbr->r_ctl.rc_bbr_state_time;
+ if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
+ counter_u64_add(bbr_state_time[(old_state + 5)], time_in);
+ } else {
+ counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
+ }
+ }
+ bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
+ bbr_set_state_target(bbr, __LINE__);
+ if (bbr_sub_drain_slam_cwnd &&
+ (bbr->rc_use_google == 0) &&
+ (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
+ /* Slam down the cwnd */
+ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
+ if (bbr_sub_drain_app_limit) {
+ /* Go app limited if we are on a long drain */
+ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered +
+ ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)));
+ }
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ if (bbr->rc_lt_use_bw) {
+ /* In policed mode we clamp pacing_gain to BBR_UNIT */
+ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
+ }
+ /* Google changes TSO size every cycle */
+ if (bbr->rc_use_google)
+ tcp_bbr_tso_size_check(bbr, cts);
+ bbr->r_ctl.gain_epoch = cts;
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch;
+}
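+
+/*
+ * Illustrative sketch only (an assumption, not this stack's code): capping
+ * the probe-bw gain when hardware pacing limits the achievable rate, as in
+ * the BBR_SUB_GAIN branch of bbr_substate_change() above.  The fixed-point
+ * unit (256 == 1.0) and the names are assumptions; full_gain corresponds
+ * to bbr_hptsi_gain[BBR_SUB_GAIN].
+ */
+#include <stdint.h>
+
+#define	EXAMPLE_BBR_UNIT	256	/* assumed fixed-point 1.0 */
+
+static uint16_t
+example_cap_gain_to_hw_rate(uint64_t bw, uint64_t hw_rate, uint16_t full_gain)
+{
+	uint64_t gain_calc;
+
+	if (hw_rate <= bw)
+		return (full_gain);	/* hardware is not the limiter */
+	if (((bw * (uint64_t)full_gain) / EXAMPLE_BBR_UNIT) <= hw_rate)
+		return (full_gain);	/* full gain still fits under hw rate */
+	gain_calc = (hw_rate * EXAMPLE_BBR_UNIT) / bw;
+	if (gain_calc < EXAMPLE_BBR_UNIT)
+		gain_calc = EXAMPLE_BBR_UNIT;	/* never gain below 1.0 */
+	return ((uint16_t)gain_calc);
+}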
+
+static void
+bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
+{
+ if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) &&
+ (google_allow_early_out == 1) &&
+ (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) {
+ /* We have reached our target flight size, possibly early */
+ goto change_state;
+ }
+ if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+ return;
+ }
+ if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) {
+ /*
+ * A full rttProp interval must pass before
+ * we can change states.
+ */
+ return;
+ }
+ if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
+ /*
+ * The needed time has passed, but for
+ * the gain cycle extra rules apply:
+ * 1) If we have seen loss, we exit.
+ * 2) If we have not reached the target,
+ * we stay in GAIN (gain-to-target).
+ */
+ if (google_consider_lost && losses)
+ goto change_state;
+ if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) {
+ return;
+ }
+ }
+change_state:
+ /* For gain we must reach our target, all others last 1 rttProp */
+ bbr_substate_change(bbr, cts, __LINE__, 1);
+}
+
+static void
+bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
+{
+ uint32_t flight, bbr_cur_cycle_time;
+
+ if (bbr->rc_use_google) {
+ bbr_set_probebw_google_gains(bbr, cts, losses);
+ return;
+ }
+ if (cts == 0) {
+ /*
+ * Never allow cts to be 0; we
+ * do this so we can judge whether
+ * we have set a timestamp.
+ */
+ cts = 1;
+ }
+ if (bbr_state_is_pkt_epoch)
+ bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
+ else
+ bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP);
+
+ if (bbr->r_ctl.rc_bbr_state_atflight == 0) {
+ if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
+ flight = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) {
+ /* Keep it slam down */
+ if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) {
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ if (bbr_sub_drain_app_limit) {
+ /* Go app limited if we are on a long drain */
+ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight);
+ }
+ }
+ if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) &&
+ (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) ||
+ (flight >= bbr->r_ctl.flightsize_at_drain))) {
+ /*
+ * Still here after the same amount of time as
+ * the gain cycle. We need to drain harder
+ * for the next srtt. Reduce the gain by a set amount;
+ * the drop is capped at the drain floor
+ * value (88).
+ */
+ bbr->r_ctl.flightsize_at_drain = flight;
+ if (bbr_drain_drop_mul &&
+ bbr_drain_drop_div &&
+ (bbr_drain_drop_mul < bbr_drain_drop_div)) {
+ /* Use the configured drop value (default 4/5 = 20%) */
+ bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul;
+ bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div;
+ } else {
+ /* Default drop of 20% */
+ bbr->r_ctl.rc_bbr_hptsi_gain *= 4;
+ bbr->r_ctl.rc_bbr_hptsi_gain /= 5;
+ }
+ if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) {
+ /* Reduce our gain again to the bottom */
+ bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
+ }
+ bbr_log_exit_gain(bbr, cts, 4);
+ /*
+ * Extend out so we wait another
+ * epoch before dropping again.
+ */
+ bbr->r_ctl.gain_epoch = cts;
+ }
+ if (flight <= bbr->r_ctl.rc_target_at_state) {
+ if (bbr_sub_drain_slam_cwnd &&
+ (bbr->rc_use_google == 0) &&
+ (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
+ bbr_log_exit_gain(bbr, cts, 3);
+ }
+ } else {
+ /* It's a gain cycle */
+ if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) {
+ bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
+ goto change_state;
+ }
+ if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) ||
+ ((ctf_outstanding(bbr->rc_tp) + bbr->rc_tp->t_maxseg - 1) >=
+ bbr->rc_tp->snd_wnd)) {
+ bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
+ bbr_log_exit_gain(bbr, cts, 2);
+ }
+ }
+ /**
+ * We fall through and return; one of two things has
+ * occurred:
+ * 1) We are still not at the target
+ * <or>
+ * 2) We reached the target and set rc_bbr_state_atflight,
+ * which means we will no longer hit this block
+ * the next time we are called.
+ */
+ return;
+ }
+change_state:
+ if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time))
+ return;
+ if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) {
+ /* Less than a full time-period has passed */
+ return;
+ }
+ if (bbr->r_ctl.rc_level_state_extra &&
+ (bbr_state_val(bbr) > BBR_SUB_DRAIN) &&
+ ((cts - bbr->r_ctl.rc_bbr_state_time) <
+ (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
+ /* Less than a full time-period + extra has passed */
+ return;
+ }
+ if (bbr_gain_gets_extra_too &&
+ bbr->r_ctl.rc_level_state_extra &&
+ (bbr_state_val(bbr) == BBR_SUB_GAIN) &&
+ ((cts - bbr->r_ctl.rc_bbr_state_time) <
+ (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
+ /* Less than a full time-period + extra has passed */
+ return;
+ }
+ bbr_substate_change(bbr, cts, __LINE__, 1);
+}
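+
+/*
+ * Illustrative sketch only (an assumption, not this stack's code): the
+ * multiplicative "drain harder" reduction applied above when we linger in
+ * the DRAIN sub-state past a gain period.  num/den default to 4/5 (a 20%
+ * cut) and gain_floor plays the role of bbr_drain_floor.
+ */
+#include <stdint.h>
+
+static uint16_t
+example_drain_harder(uint16_t gain, uint16_t num, uint16_t den,
+    uint16_t gain_floor)
+{
+	uint32_t g = gain;
+
+	if (num && den && (num < den))
+		g = (g * num) / den;	/* configured reduction */
+	else
+		g = (g * 4) / 5;	/* default: drop 20% */
+	if (g <= gain_floor)
+		g = (gain_floor > 1) ? gain_floor : 1;	/* respect the floor */
+	return ((uint16_t)g);
+}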
+
+static uint32_t
+bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain)
+{
+ uint32_t mss, tar;
+
+ if (bbr->rc_use_google) {
+ /* Google just uses the cwnd target */
+ tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain);
+ } else {
+ mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options),
+ bbr->r_ctl.rc_pace_max_segs);
+ /* Get the base cwnd with gain rounded to a mss */
+ tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr),
+ gain), mss);
+ /* Make sure it is within our min */
+ if (tar < get_min_cwnd(bbr))
+ return (get_min_cwnd(bbr));
+ }
+ return (tar);
+}
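+
+/*
+ * Illustrative sketch only (an assumption, not this stack's code): rounding
+ * the gained cwnd target up to a whole mss and enforcing a minimum, as the
+ * non-google path of bbr_get_a_state_target() does above.  mss is assumed
+ * to be non-zero.
+ */
+#include <stdint.h>
+
+static uint32_t
+example_round_target(uint32_t raw_target, uint32_t mss, uint32_t min_cwnd)
+{
+	uint32_t tar;
+
+	tar = ((raw_target + mss - 1) / mss) * mss;	/* roundup to a mss */
+	if (tar < min_cwnd)
+		tar = min_cwnd;
+	return (tar);
+}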
+
+static void
+bbr_set_state_target(struct tcp_bbr *bbr, int line)
+{
+ uint32_t tar, meth;
+
+ if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
+ ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
+ /* Special case using old probe-rtt method */
+ tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
+ meth = 1;
+ } else {
+ /* Non-probe-rtt case and reduced probe-rtt */
+ if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
+ (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) {
+ /* For gain cycle we use the hptsi gain */
+ tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
+ meth = 2;
+ } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) {
+ /*
+ * If configured, or for google all other states
+ * get BBR_UNIT.
+ */
+ tar = bbr_get_a_state_target(bbr, BBR_UNIT);
+ meth = 3;
+ } else {
+ /*
+ * Or we set a target based on the pacing gain
+ * for non-google mode and default (non-configured).
+ * Note we don't set a target goal below drain (192).
+ */
+ if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN]) {
+ tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]);
+ meth = 4;
+ } else {
+ tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
+ meth = 5;
+ }
+ }
+ }
+ bbr_log_set_of_state_target(bbr, tar, line, meth);
+ bbr->r_ctl.rc_target_at_state = tar;
+}
+
+static void
+bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
+{
+ /* Change to probe_rtt */
+ uint32_t time_in;
+
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain
+ + bbr->r_ctl.rc_delivered);
+ /* Setup so we force feed the filter */
+ if (bbr->rc_use_google || bbr_probertt_sets_rtt)
+ bbr->rc_prtt_set_ts = 1;
+ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+ time_in = cts - bbr->r_ctl.rc_bbr_state_time;
+ counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
+ }
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0);
+ bbr->r_ctl.rc_rtt_shrinks = cts;
+ bbr->r_ctl.last_in_probertt = cts;
+ bbr->r_ctl.rc_probertt_srttchktim = cts;
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ bbr->rc_bbr_state = BBR_STATE_PROBE_RTT;
+ /* We need to force the filter to update */
+
+ if ((bbr_sub_drain_slam_cwnd) &&
+ bbr->rc_hit_state_1 &&
+ (bbr->rc_use_google == 0) &&
+ (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
+ if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd)
+ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
+ } else
+ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
+ /* Update the lost */
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){
+ /* Set to the non-configurable default of 4 (PROBE_RTT_MIN) */
+ bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
+ bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
+ bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6);
+ bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd;
+ } else {
+ /*
+ * We bring it down slowly by using a hptsi gain that is
+ * probably 75%. This will slowly float down the amount outstanding
+ * without tampering with the cwnd.
+ */
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
+ bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
+ bbr_set_state_target(bbr, __LINE__);
+ if (bbr_prtt_slam_cwnd &&
+ (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ }
+ if (ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
+ bbr->r_ctl.rc_target_at_state) {
+ /* We are at target */
+ bbr->r_ctl.rc_bbr_enters_probertt = cts;
+ } else {
+ /* We need to come down to reach target before our time begins */
+ bbr->r_ctl.rc_bbr_enters_probertt = 0;
+ }
+ bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch;
+ BBR_STAT_INC(bbr_enter_probertt);
+ bbr_log_exit_gain(bbr, cts, 0);
+ bbr_log_type_statechange(bbr, cts, line);
+}
+
+static void
+bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts)
+{
+ /*
+ * Sanity check on probe-rtt intervals.
+ * In extreme situations where we are competing
+ * against new-reno flows with huge buffers,
+ * our rtt-prop interval could come to dominate
+ * things; if we can't get through a full set
+ * of cycles, we need to adjust it.
+ */
+ if (bbr_can_adjust_probertt &&
+ (bbr->rc_use_google == 0)) {
+ uint16_t val = 0;
+ uint32_t cur_rttp, fval, newval, baseval;
+
+		/* Are we too small and going into probe-rtt too often? */
+ baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1));
+ cur_rttp = roundup(baseval, USECS_IN_SECOND);
+ fval = bbr_filter_len_sec * USECS_IN_SECOND;
+ if (bbr_is_ratio == 0) {
+ if (fval > bbr_rtt_probe_limit)
+ newval = cur_rttp + (fval - bbr_rtt_probe_limit);
+ else
+ newval = cur_rttp;
+ } else {
+ int mul;
+
+ mul = fval / bbr_rtt_probe_limit;
+ newval = cur_rttp * mul;
+ }
+ if (cur_rttp > bbr->r_ctl.rc_probertt_int) {
+ bbr->r_ctl.rc_probertt_int = cur_rttp;
+ reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
+ val = 1;
+ } else {
+ /*
+ * No adjustments were made
+ * do we need to shrink it?
+ */
+ if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) {
+ if (cur_rttp <= bbr_rtt_probe_limit) {
+ /*
+					 * Things have calmed down, let's
+					 * shrink all the way to the default
+ */
+ bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
+ reset_time_small(&bbr->r_ctl.rc_rttprop,
+ (bbr_filter_len_sec * USECS_IN_SECOND));
+ cur_rttp = bbr_rtt_probe_limit;
+ newval = (bbr_filter_len_sec * USECS_IN_SECOND);
+ val = 2;
+ } else {
+ /*
+ * Well does some adjustment make sense?
+ */
+ if (cur_rttp < bbr->r_ctl.rc_probertt_int) {
+ /* We can reduce interval time some */
+ bbr->r_ctl.rc_probertt_int = cur_rttp;
+ reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
+ val = 3;
+ }
+ }
+ }
+ }
+ if (val)
+ bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val);
+ }
+}
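
A minimal userland sketch of the interval arithmetic above (the non-ratio
branch). The substate count of 8 and the sample inputs are assumptions,
chosen only to show how the interval is rounded up to whole seconds and
then stretched by however much the filter length exceeds the probe limit.

#include <stdint.h>
#include <stdio.h>

#define USECS_IN_SECOND	1000000u
#define SUBSTATE_COUNT	8u	/* assumed stand-in for BBR_SUBSTATE_COUNT */

static uint32_t
probertt_interval(uint32_t rtt_prop, uint32_t probe_limit, uint32_t filter_len_sec)
{
	uint32_t baseval, cur_rttp, fval;

	/* One pass through every sub-state plus one, rounded up to seconds. */
	baseval = rtt_prop * (SUBSTATE_COUNT + 1);
	cur_rttp = ((baseval + USECS_IN_SECOND - 1) / USECS_IN_SECOND) *
	    USECS_IN_SECOND;
	fval = filter_len_sec * USECS_IN_SECOND;
	/* Stretch the interval by the amount the filter exceeds the limit. */
	if (fval > probe_limit)
		return (cur_rttp + (fval - probe_limit));
	return (cur_rttp);
}

int
main(void)
{
	/* 100 ms prop RTT, 4 s limit, 5 s filter -> prints 2000000 (usec). */
	printf("%u\n", probertt_interval(100000, 4000000, 5));
	return (0);
}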
+
+static void
+bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
+{
+ /* Exit probe-rtt */
+
+ if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) {
+ tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ bbr_log_exit_gain(bbr, cts, 1);
+ bbr->rc_hit_state_1 = 0;
+ bbr->r_ctl.rc_rtt_shrinks = cts;
+ bbr->r_ctl.last_in_probertt = cts;
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0);
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
+ bbr->r_ctl.rc_delivered);
+ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+ uint32_t time_in;
+
+ time_in = cts - bbr->r_ctl.rc_bbr_state_time;
+ counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
+ }
+ if (bbr->rc_filled_pipe) {
+ /* Switch to probe_bw */
+ bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
+ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
+ bbr_substate_change(bbr, cts, __LINE__, 0);
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ } else {
+ /* Back to startup */
+ bbr->rc_bbr_state = BBR_STATE_STARTUP;
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ /*
+		 * We don't want to give a completely free 3
+		 * measurements until we exit, so we add the
+		 * number of pkt-epochs we spent in probe-rtt
+		 * to the startup_epoch. That way
+ * we will still retain the old state.
+ */
+ bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt);
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ /* Make sure to use the lower pg when shifting back in */
+ if (bbr->r_ctl.rc_lost &&
+ bbr_use_lower_gain_in_startup &&
+ (bbr->rc_use_google == 0))
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
+ else
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
+ /* Probably not needed but set it anyway */
+ bbr_set_state_target(bbr, __LINE__);
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0);
+ }
+ bbr_check_probe_rtt_limits(bbr, cts);
+}
+
+static int32_t inline
+bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts)
+{
+ if ((bbr->rc_past_init_win == 1) &&
+ (bbr->rc_in_persist == 0) &&
+ (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) {
+ return (1);
+ }
+ if (bbr_can_force_probertt &&
+ (bbr->rc_in_persist == 0) &&
+ (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
+ ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
+ return (1);
+ }
+ return (0);
+}
+
+
+static int32_t
+bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t pkt_epoch)
+{
+ uint64_t btlbw, gain;
+ if (pkt_epoch == 0) {
+ /*
+ * Need to be on a pkt-epoch to continue.
+ */
+ return (0);
+ }
+ btlbw = bbr_get_full_bw(bbr);
+ gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
+ (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
+ if (btlbw >= gain) {
+ bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
+ bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
+ }
+ if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)
+ return (1);
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
+ return(0);
+}
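
A small self-contained sketch of the growth test above, assuming the 25%
threshold and the 3-epoch window described by the startup comments; the
names and constants here are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdint.h>

#define START_EXIT_PCT	25	/* assumed growth threshold, percent */
#define STARTUP_EPOCHS	3	/* assumed epochs without growth before exit */

struct startup_state {
	uint64_t last_btlbw;		/* best bottleneck b/w seen (bytes/sec) */
	uint32_t last_growth_epoch;	/* pkt-epoch of the last 25% gain */
};

/* Returns true once the flow should leave STARTUP. */
static bool
startup_should_exit(struct startup_state *s, uint64_t btlbw, uint32_t pkt_epoch)
{
	uint64_t gain;

	gain = s->last_btlbw + (s->last_btlbw * START_EXIT_PCT) / 100;
	if (btlbw >= gain) {
		/* Still growing: remember when, and raise the bar. */
		s->last_growth_epoch = pkt_epoch;
		s->last_btlbw = btlbw;
	}
	return ((pkt_epoch - s->last_growth_epoch) >= STARTUP_EPOCHS);
}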
+
+static int32_t inline
+bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch)
+{
+	/* Have we gained 25% in the last 3 packet-based epochs? */
+ uint64_t btlbw, gain;
+ int do_exit;
+ int delta, rtt_gain;
+
+ if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
+ (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
+ /*
+		 * This qualifies as an RTT_PROBE session since we dropped the
+		 * outstanding data to nothing and waited more than
+ * bbr_rtt_probe_time.
+ */
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
+ bbr_set_reduced_rtt(bbr, cts, __LINE__);
+ }
+ if (bbr_should_enter_probe_rtt(bbr, cts)) {
+ bbr_enter_probe_rtt(bbr, cts, __LINE__);
+ return (0);
+ }
+ if (bbr->rc_use_google)
+ return (bbr_google_startup(bbr, cts, pkt_epoch));
+
+ if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
+ (bbr_use_lower_gain_in_startup)) {
+ /* Drop to a lower gain 1.5 x since we saw loss */
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
+ }
+ if (pkt_epoch == 0) {
+ /*
+ * Need to be on a pkt-epoch to continue.
+ */
+ return (0);
+ }
+ if (bbr_rtt_gain_thresh) {
+ /*
+ * Do we allow a flow to stay
+ * in startup with no loss and no
+ * gain in rtt over a set threshold?
+ */
+ if (bbr->r_ctl.rc_pkt_epoch_rtt &&
+ bbr->r_ctl.startup_last_srtt &&
+ (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) {
+ delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt;
+ rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt;
+ } else
+ rtt_gain = 0;
+ if ((bbr->r_ctl.startup_last_srtt == 0) ||
+ (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt))
+ /* First time or new lower value */
+ bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
+
+ if ((bbr->r_ctl.rc_lost == 0) &&
+ (rtt_gain < bbr_rtt_gain_thresh)) {
+ /*
+ * No loss, and we are under
+			 * our gain threshold for
+ * increasing RTT.
+ */
+ if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
+ bbr->r_ctl.rc_bbr_last_startup_epoch++;
+ bbr_log_startup_event(bbr, cts, rtt_gain,
+ delta, bbr->r_ctl.startup_last_srtt, 10);
+ return (0);
+ }
+ }
+ if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) &&
+ (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) &&
+ (!IN_RECOVERY(bbr->rc_tp->t_flags))) {
+ /*
+		 * We only assess if we have a new measurement when
+		 * we have no loss and are not in recovery.
+		 * Drag our last_startup epoch up by one so we will hold
+		 * the number of non-gain epochs we have already accumulated.
+ */
+ if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
+ bbr->r_ctl.rc_bbr_last_startup_epoch++;
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9);
+ return (0);
+ }
+ /* Case where we reduced the lost (bad retransmit) */
+ if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost)
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count;
+ btlbw = bbr_get_full_bw(bbr);
+ if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower)
+ gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
+ (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
+ else
+ gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
+ (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
+ do_exit = 0;
+ if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw)
+ bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
+ if (btlbw >= gain) {
+ bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
+ /* Update the lost so we won't exit in next set of tests */
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
+ }
+ if ((bbr->rc_loss_exit &&
+ (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
+ (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) &&
+ ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) {
+ /*
+		 * If we had no gain, we had loss, that loss was above
+		 * our threshold, the rwnd is not constrained, and we have
+		 * had at least 3 packet epochs, then exit. Note that this is
+ * switched off by sysctl. Google does not do this by the
+ * way.
+ */
+ if ((ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
+ (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) {
+ do_exit = 1;
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4);
+ } else {
+ /* Just record an updated loss value */
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5);
+ }
+ } else
+ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
+ if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) ||
+ do_exit) {
+ /* Return 1 to exit the startup state. */
+ return (1);
+ }
+ /* Stay in startup */
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
+ return (0);
+}
+
+static void
+bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses)
+{
+ /*
+	 * A tick occurred in the rtt epoch; do we need to do anything?
+ */
+#ifdef BBR_INVARIANTS
+ if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
+ (bbr->rc_bbr_state != BBR_STATE_DRAIN) &&
+ (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) &&
+ (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
+ (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) {
+ /* Debug code? */
+ panic("Unknown BBR state %d?\n", bbr->rc_bbr_state);
+ }
+#endif
+ if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
+ /* Do we exit the startup state? */
+ if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) {
+ uint32_t time_in;
+
+ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
+ bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6);
+ bbr->rc_filled_pipe = 1;
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+
+ time_in = cts - bbr->r_ctl.rc_bbr_state_time;
+ counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
+ } else
+ time_in = 0;
+ if (bbr->rc_no_pacing)
+ bbr->rc_no_pacing = 0;
+ bbr->r_ctl.rc_bbr_state_time = cts;
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg;
+ bbr->rc_bbr_state = BBR_STATE_DRAIN;
+ bbr_set_state_target(bbr, __LINE__);
+ if ((bbr->rc_use_google == 0) &&
+ bbr_slam_cwnd_in_main_drain) {
+ /* Here we don't have to worry about probe-rtt */
+ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ if (ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
+ bbr->r_ctl.rc_target_at_state) {
+ /*
+ * Switch to probe_bw if we are already
+ * there
+ */
+ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
+ bbr_substate_change(bbr, cts, __LINE__, 0);
+ bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ }
+ }
+ } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) {
+ uint32_t inflight;
+ struct tcpcb *tp;
+
+ tp = bbr->rc_tp;
+ inflight = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ if (inflight >= bbr->r_ctl.rc_target_at_state) {
+ /* We have reached a flight of the cwnd target */
+ bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
+ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
+ bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
+ bbr_set_state_target(bbr, __LINE__);
+ /*
+ * Rig it so we don't do anything crazy and
+ * start fresh with a new randomization.
+ */
+ bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
+ bbr->rc_bbr_substate = BBR_SUB_LEVEL6;
+ bbr_substate_change(bbr, cts, __LINE__, 1);
+ }
+ } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) {
+ /* Has in-flight reached the bdp (or less)? */
+ uint32_t inflight;
+ struct tcpcb *tp;
+
+ tp = bbr->rc_tp;
+ inflight = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ if ((bbr->rc_use_google == 0) &&
+ bbr_slam_cwnd_in_main_drain &&
+ (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
+ /*
+			 * Here we don't have to worry about probe-rtt;
+			 * re-slam it, but keep it slammed down.
+ */
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ if (inflight <= bbr->r_ctl.rc_target_at_state) {
+ /* We have drained */
+ bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
+ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
+ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
+ uint32_t time_in;
+
+ time_in = cts - bbr->r_ctl.rc_bbr_state_time;
+ counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
+ }
+ if ((bbr->rc_use_google == 0) &&
+ bbr_slam_cwnd_in_main_drain &&
+ (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
+ /* Restore the cwnd */
+ tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+			/* Setup as if probe-rtt has been done now RRS-HERE */
+ bbr->r_ctl.rc_rtt_shrinks = cts;
+ bbr->r_ctl.last_in_probertt = cts;
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0);
+ /* Randomly pick a sub-state */
+ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
+ bbr_substate_change(bbr, cts, __LINE__, 0);
+ bbr_log_type_statechange(bbr, cts, __LINE__);
+ }
+ } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) {
+ uint32_t flight;
+
+ flight = ctf_flight_size(bbr->rc_tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered);
+ if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) &&
+ (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
+ /*
+ * We must keep cwnd at the desired MSS.
+ */
+ bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ } else if ((bbr_prtt_slam_cwnd) &&
+ (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
+ /* Re-slam it */
+ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
+ }
+ if (bbr->r_ctl.rc_bbr_enters_probertt == 0) {
+ /* Has outstanding reached our target? */
+ if (flight <= bbr->r_ctl.rc_target_at_state) {
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0);
+ bbr->r_ctl.rc_bbr_enters_probertt = cts;
+ /* If time is exactly 0, be 1usec off */
+ if (bbr->r_ctl.rc_bbr_enters_probertt == 0)
+ bbr->r_ctl.rc_bbr_enters_probertt = 1;
+ if (bbr->rc_use_google == 0) {
+ /*
+					 * Restore any lowering that has occurred to
+					 * reach here
+ */
+ if (bbr->r_ctl.bbr_rttprobe_gain_val)
+ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
+ else
+ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
+ }
+ }
+ if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) &&
+ (bbr->rc_use_google == 0) &&
+ bbr->r_ctl.bbr_rttprobe_gain_val &&
+ (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) ||
+ (flight >= bbr->r_ctl.flightsize_at_drain))) {
+ /*
+				 * We have dawdled with our current hptsi
+				 * gain for an srtt and have still not made it
+				 * to target, or we have increased our flight.
+				 * Let's reduce the gain by xx%,
+				 * flooring the reduction at DRAIN (based on
+				 * mul/div).
+ */
+ int red;
+
+ bbr->r_ctl.flightsize_at_drain = flight;
+ bbr->r_ctl.rc_probertt_srttchktim = cts;
+ red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1);
+ if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) {
+ /* Reduce our gain again */
+ bbr->r_ctl.rc_bbr_hptsi_gain -= red;
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0);
+ } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) {
+ /* one more chance before we give up */
+ bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0);
+ } else {
+ /* At the very bottom */
+ bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1);
+ }
+ }
+ }
+ if (bbr->r_ctl.rc_bbr_enters_probertt &&
+ (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) &&
+ ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) {
+ /* Time to exit probe RTT normally */
+ bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts);
+ }
+ } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
+ if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
+ (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
+ /*
+			 * This qualifies as an RTT_PROBE session since we
+			 * dropped the outstanding data to nothing and waited
+ * more than bbr_rtt_probe_time.
+ */
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
+ bbr_set_reduced_rtt(bbr, cts, __LINE__);
+ }
+ if (bbr_should_enter_probe_rtt(bbr, cts)) {
+ bbr_enter_probe_rtt(bbr, cts, __LINE__);
+ } else {
+ bbr_set_probebw_gains(bbr, cts, losses);
+ }
+ }
+}
+
+static void
+bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses)
+{
+ int32_t epoch = 0;
+
+ if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
+ bbr_set_epoch(bbr, cts, line);
+		/* At each epoch do lt bw sampling */
+ epoch = 1;
+ }
+ bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses);
+}
+
+static int
+bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
+ int32_t nxt_pkt, struct timeval *tv)
+{
+ int32_t thflags, retval;
+ uint32_t cts, lcts;
+ uint32_t tiwin;
+ struct tcpopt to;
+ struct tcp_bbr *bbr;
+ struct bbr_sendmap *rsm;
+ struct timeval ltv;
+ int32_t did_out = 0;
+ int32_t in_recovery;
+ uint16_t nsegs;
+ int32_t prev_state;
+ uint32_t lost;
+
+ nsegs = max(1, m->m_pkthdr.lro_nsegs);
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ /* add in our stats */
+ kern_prefetch(bbr, &prev_state);
+ prev_state = 0;
+ thflags = th->th_flags;
+ /*
+ * If this is either a state-changing packet or current state isn't
+ * established, we require a write lock on tcbinfo. Otherwise, we
+	 * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a write lock due to a
+ * race.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+
+ tp->t_rcvtime = ticks;
+ /*
+ * Unscale the window into a 32-bit value. For the SYN_SENT state
+ * the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+#endif
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ if (m->m_flags & M_TSTMP) {
+ /* Prefer the hardware timestamp if present */
+ struct timespec ts;
+
+ mbuf_tstmp2timespec(m, &ts);
+ bbr->rc_tv.tv_sec = ts.tv_sec;
+ bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
+ bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
+ } else if (m->m_flags & M_TSTMP_LRO) {
+ /* Next the arrival timestamp */
+ struct timespec ts;
+
+ mbuf_tstmp2timespec(m, &ts);
+ bbr->rc_tv.tv_sec = ts.tv_sec;
+ bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
+ bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
+ } else {
+ /*
+ * Ok just get the current time.
+ */
+ bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv);
+ }
+ /*
+ * If echoed timestamp is later than the current time, fall back to
+ * non RFC1323 RTT calculation. Normalize timestamp if syncookies
+ * were used when this connection was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv)))
+ to.to_tsecr = 0;
+ }
+ /*
+	 * If it's the first time in, we need to take care of options and
+ * verify we can do SACK for rack!
+ */
+ if (bbr->r_state == 0) {
+ /*
+ * Process options only when we get SYN/ACK back. The SYN
+ * case for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a
+ * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
+ * this is traditional behavior, may need to be cleaned up.
+ */
+ if (bbr->rc_inp == NULL) {
+ bbr->rc_inp = tp->t_inpcb;
+ }
+ /*
+		 * We need to init rc_inp here since it's not init'd when
+ * bbr_init is called
+ */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with the
+ * next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ if (IS_FASTOPEN(tp->t_flags)) {
+ if (to.to_flags & TOF_FASTOPEN) {
+ uint16_t mss;
+
+ if (to.to_flags & TOF_MSS)
+ mss = to.to_mss;
+ else
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
+ mss = TCP6_MSS;
+ else
+ mss = TCP_MSS;
+ tcp_fastopen_update_cache(tp, mss,
+ to.to_tfo_len, to.to_tfo_cookie);
+ } else
+ tcp_fastopen_disable_path(tp);
+ }
+ }
+ /*
+ * At this point we are at the initial call. Here we decide
+ * if we are doing RACK or not. We do this by seeing if
+ * TF_SACK_PERMIT is set, if not rack is *not* possible and
+ * we switch to the default code.
+ */
+ if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
+ /* Bail */
+ tcp_switch_back_to_default(tp);
+ (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
+ tlen, iptos);
+ return (1);
+ }
+ /* Set the flag */
+ bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+ tcp_set_hpts(tp->t_inpcb);
+ sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack);
+ }
+ if (thflags & TH_ACK) {
+ /* Track ack types */
+ if (to.to_flags & TOF_SACK)
+ BBR_STAT_INC(bbr_acks_with_sacks);
+ else
+ BBR_STAT_INC(bbr_plain_acks);
+ }
+ /*
+ * This is the one exception case where we set the rack state
+ * always. All other times (timers etc) we must have a rack-state
+ * set (so we assure we have done the checks above for SACK).
+ */
+ if (bbr->r_state != tp->t_state)
+ bbr_set_state(tp, bbr, tiwin);
+
+ if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL)
+ kern_prefetch(rsm, &prev_state);
+ prev_state = bbr->r_state;
+ bbr->rc_ack_was_delayed = 0;
+ lost = bbr->r_ctl.rc_lost;
+ bbr->rc_is_pkt_epoch_now = 0;
+ if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) {
+ /* Get the real time into lcts and figure the real delay */
+ lcts = tcp_get_usecs(&ltv);
+ if (TSTMP_GT(lcts, cts)) {
+ bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts;
+ bbr->rc_ack_was_delayed = 1;
+ if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay,
+ bbr->r_ctl.highest_hdwr_delay))
+ bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay;
+ } else {
+ bbr->r_ctl.rc_ack_hdwr_delay = 0;
+ bbr->rc_ack_was_delayed = 0;
+ }
+ } else {
+ bbr->r_ctl.rc_ack_hdwr_delay = 0;
+ bbr->rc_ack_was_delayed = 0;
+ }
+ bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m);
+ if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
+ retval = 0;
+ m_freem(m);
+ goto done_with_input;
+ }
+ /*
+ * If a segment with the ACK-bit set arrives in the SYN-SENT state
+ * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
+ */
+ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return (1);
+ }
+ in_recovery = IN_RECOVERY(tp->t_flags);
+ if (tiwin > bbr->r_ctl.rc_high_rwnd)
+ bbr->r_ctl.rc_high_rwnd = tiwin;
+#ifdef BBR_INVARIANTS
+ if ((tp->t_inpcb->inp_flags & INP_DROPPED) ||
+ (tp->t_inpcb->inp_flags2 & INP_FREED)) {
+ panic("tp:%p bbr:%p given a dropped inp:%p",
+ tp, bbr, tp->t_inpcb);
+ }
+#endif
+ bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp,
+ (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ bbr->rtt_valid = 0;
+ if (to.to_flags & TOF_TS) {
+ bbr->rc_ts_valid = 1;
+ bbr->r_ctl.last_inbound_ts = to.to_tsval;
+ } else {
+ bbr->rc_ts_valid = 0;
+ bbr->r_ctl.last_inbound_ts = 0;
+ }
+ retval = (*bbr->r_substate) (m, th, so,
+ tp, &to, drop_hdrlen,
+ tlen, tiwin, thflags, nxt_pkt);
+#ifdef BBR_INVARIANTS
+ if ((retval == 0) &&
+ (tp->t_inpcb == NULL)) {
+ panic("retval:%d tp:%p t_inpcb:NULL state:%d",
+ retval, tp, prev_state);
+ }
+#endif
+ if (nxt_pkt == 0)
+ BBR_STAT_INC(bbr_rlock_left_ret0);
+ else
+ BBR_STAT_INC(bbr_rlock_left_ret1);
+ if (retval == 0) {
+ /*
+ * If retval is 1 the tcb is unlocked and most likely the tp
+ * is gone.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tcp_bbr_xmit_timer_commit(bbr, tp, cts);
+ if (bbr->rc_is_pkt_epoch_now)
+ bbr_set_pktepoch(bbr, cts, __LINE__);
+ bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
+ if (nxt_pkt == 0) {
+ if (bbr->r_wanted_output != 0) {
+ bbr->rc_output_starts_timer = 0;
+ did_out = 1;
+ (void)tp->t_fb->tfb_tcp_output(tp);
+ } else
+ bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0);
+ }
+ if ((nxt_pkt == 0) &&
+ ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
+ (SEQ_GT(tp->snd_max, tp->snd_una) ||
+ (tp->t_flags & TF_DELACK) ||
+ ((tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)))) {
+ /*
+ * We could not send (probably in the hpts but
+ * stopped the timer)?
+ */
+ if ((tp->snd_max == tp->snd_una) &&
+ ((tp->t_flags & TF_DELACK) == 0) &&
+ (bbr->rc_inp->inp_in_hpts) &&
+ (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
+ /*
+				 * keep alive not needed if we are awaiting
+				 * hptsi output
+ */
+ ;
+ } else {
+ if (bbr->rc_inp->inp_in_hpts) {
+ tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
+ if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
+ (TSTMP_GT(lcts, bbr->rc_pacer_started))) {
+ uint32_t del;
+
+ del = lcts - bbr->rc_pacer_started;
+ if (del > bbr->r_ctl.rc_last_delay_val) {
+ BBR_STAT_INC(bbr_force_timer_start);
+ bbr->r_ctl.rc_last_delay_val -= del;
+ bbr->rc_pacer_started = lcts;
+ } else {
+ /* We are late */
+ BBR_STAT_INC(bbr_force_output);
+ (void)tp->t_fb->tfb_tcp_output(tp);
+ }
+ }
+ }
+ bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val,
+ 0);
+ }
+ } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) {
+ /* Do we have the correct timer running? */
+ bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
+ }
+ /* Do we have a new state */
+ if (bbr->r_state != tp->t_state)
+ bbr_set_state(tp, bbr, tiwin);
+done_with_input:
+ bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out);
+ if (did_out)
+ bbr->r_wanted_output = 0;
+#ifdef BBR_INVARIANTS
+ if (tp->t_inpcb == NULL) {
+ panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
+ did_out,
+ retval, tp, prev_state);
+ }
+#endif
+ }
+ return (retval);
+}
+
+static void
+bbr_log_type_hrdwtso(struct tcpcb *tp, struct tcp_bbr *bbr, int len, int mod, int what_we_can_send)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+ uint32_t cts;
+
+ cts = tcp_get_usecs(&tv);
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ log.u_bbr.flex1 = bbr->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex2 = what_we_can_send;
+ log.u_bbr.flex3 = bbr->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex4 = len;
+ log.u_bbr.flex5 = 0;
+ log.u_bbr.flex7 = mod;
+ log.u_bbr.flex8 = 1;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ TCP_HDWR_TLS, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+static void
+bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
+{
+ struct timeval tv;
+ int retval;
+
+	/* First let's see if we have old packets */
+ if (tp->t_in_pkt) {
+ if (ctf_do_queued_segments(so, tp, 1)) {
+ m_freem(m);
+ return;
+ }
+ }
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
+ } else {
+		/* Should not happen; should we kassert instead? */
+ tcp_get_usecs(&tv);
+ }
+ retval = bbr_do_segment_nounlock(m, th, so, tp,
+ drop_hdrlen, tlen, iptos, 0, &tv);
+ if (retval == 0)
+ INP_WUNLOCK(tp->t_inpcb);
+}
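
A tiny sketch of the nanosecond-to-timeval conversion used above for the
LRO arrival stamp (the same divide/modulo arithmetic, lifted out of the
kernel context):

#include <stdint.h>
#include <sys/time.h>

static void
nsec_to_timeval(uint64_t tstmp_ns, struct timeval *tv)
{
	/* Whole seconds, then the remainder scaled down to microseconds. */
	tv->tv_sec = tstmp_ns / 1000000000ULL;
	tv->tv_usec = (tstmp_ns % 1000000000ULL) / 1000;
}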
+
+/*
+ * Return how much data can be sent without violating the
+ * cwnd or rwnd.
+ */
+
+static inline uint32_t
+bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin,
+ uint32_t avail, int32_t sb_offset, uint32_t cts)
+{
+ uint32_t len;
+
+ if (ctf_outstanding(tp) >= tp->snd_wnd) {
+		/* We never want to go over our peer's rcv-window */
+ len = 0;
+ } else {
+ uint32_t flight;
+
+ flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
+ if (flight >= sendwin) {
+ /*
+ * We have in flight what we are allowed by cwnd (if
+			 * it were rwnd blocking it would have hit the check
+			 * above of >= tp->snd_wnd).
+ */
+ return (0);
+ }
+ len = sendwin - flight;
+ if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
+ /* We would send too much (beyond the rwnd) */
+ len = tp->snd_wnd - ctf_outstanding(tp);
+ }
+ if ((len + sb_offset) > avail) {
+ /*
+ * We don't have that much in the SB, how much is
+ * there?
+ */
+ len = avail - sb_offset;
+ }
+ }
+ return (len);
+}
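
The same clamping, written as a pure function for illustration (the
parameter names are hypothetical): the new-data length is limited first
by the pacing/cwnd window, then by the peer's receive window, and
finally by what is actually present in the socket buffer.

#include <stdint.h>

static uint32_t
clamp_send_len(uint32_t outstanding, uint32_t flight, uint32_t sendwin,
    uint32_t rwnd, uint32_t avail, uint32_t sb_offset)
{
	uint32_t len;

	if (outstanding >= rwnd || flight >= sendwin)
		return (0);		/* rwnd- or cwnd-blocked */
	len = sendwin - flight;
	if (len + outstanding > rwnd)	/* never exceed the peer's rwnd */
		len = rwnd - outstanding;
	if (len + sb_offset > avail)	/* never send more than the sb holds */
		len = avail - sb_offset;
	return (len);
}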
+
+static inline void
+bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
+{
+#ifdef NETFLIX_STATS
+ TCPSTAT_INC(tcps_sndpack_error);
+ TCPSTAT_ADD(tcps_sndbyte_error, len);
+#endif
+}
+
+static inline void
+bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
+{
+ if (error) {
+ bbr_do_error_accounting(tp, bbr, rsm, len, error);
+ return;
+ }
+ if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
+ /* Window probe */
+ TCPSTAT_INC(tcps_sndprobe);
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u32(tp->t_stats,
+ VOI_TCP_RETXPB, len);
+#endif
+ } else if (rsm) {
+ if (rsm->r_flags & BBR_TLP) {
+ /*
+ * TLP should not count in retran count, but in its
+ * own bin
+ */
+#ifdef NETFLIX_STATS
+ tp->t_sndtlppack++;
+ tp->t_sndtlpbyte += len;
+ TCPSTAT_INC(tcps_tlpresends);
+ TCPSTAT_ADD(tcps_tlpresend_bytes, len);
+#endif
+ } else {
+ /* Retransmit */
+ tp->t_sndrexmitpack++;
+ TCPSTAT_INC(tcps_sndrexmitpack);
+ TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
+ len);
+#endif
+ }
+ /*
+		 * Logs in 0 - 8; 8 is all non probe_bw states, 0-7 is the
+		 * sub-state
+ */
+ counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len);
+ if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) {
+ /* Non probe_bw log in 1, 2, or 4. */
+ counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len);
+ } else {
+ /*
+ * Log our probe state 3, and log also 5-13 to show
+ * us the recovery sub-state for the send. This
+ * means that 3 == (5+6+7+8+9+10+11+12+13)
+ */
+ counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len);
+ counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len);
+ }
+ /* Place in both 16's the totals of retransmitted */
+ counter_u64_add(bbr_state_lost[16], len);
+ counter_u64_add(bbr_state_resend[16], len);
+ /* Place in 17's the total sent */
+ counter_u64_add(bbr_state_resend[17], len);
+ counter_u64_add(bbr_state_lost[17], len);
+
+ } else {
+ /* New sends */
+ TCPSTAT_INC(tcps_sndpack);
+ TCPSTAT_ADD(tcps_sndbyte, len);
+ /* Place in 17's the total sent */
+ counter_u64_add(bbr_state_resend[17], len);
+ counter_u64_add(bbr_state_lost[17], len);
+#ifdef NETFLIX_STATS
+ stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
+ len);
+#endif
+ }
+}
+
+static void
+bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level)
+{
+ if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) {
+ /*
+		 * Limit the cwnd to not be above N x the target plus what
+ * is outstanding. The target is based on the current b/w
+ * estimate.
+ */
+ uint32_t target;
+
+ target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT);
+ target += ctf_outstanding(tp);
+ target *= bbr_target_cwnd_mult_limit;
+ if (tp->snd_cwnd > target)
+ tp->snd_cwnd = target;
+ bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__);
+ }
+}
+
+static int
+bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg)
+{
+ /*
+ * "adv" is the amount we could increase the window, taking into
+ * account that we are limited by TCP_MAXWIN << tp->rcv_scale.
+ */
+ uint32_t adv;
+ int32_t oldwin;
+
+ adv = min(recwin, TCP_MAXWIN << tp->rcv_scale);
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
+ oldwin = (tp->rcv_adv - tp->rcv_nxt);
+ adv -= oldwin;
+ } else
+ oldwin = 0;
+
+ /*
+ * If the new window size ends up being the same as the old size
+ * when it is scaled, then don't force a window update.
+ */
+ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
+ return (0);
+
+ if (adv >= (2 * maxseg) &&
+ (adv >= (so->so_rcv.sb_hiwat / 4) ||
+ recwin <= (so->so_rcv.sb_hiwat / 8) ||
+ so->so_rcv.sb_hiwat <= 8 * maxseg)) {
+ return (1);
+ }
+ if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
+ return (1);
+ return (0);
+}
+
+/*
+ * Return 0 on success and an errno on failure to send.
+ * Note that a 0 return may not mean we sent anything
+ * if the TCB was on the hpts. A non-zero return
+ * does indicate the error we got from ip[6]_output.
+ */
+static int
+bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
+{
+ struct socket *so;
+ int32_t len;
+ uint32_t cts;
+ uint32_t recwin, sendwin;
+ int32_t sb_offset;
+ int32_t flags, abandon, error = 0;
+ struct tcp_log_buffer *lgb = NULL;
+ struct mbuf *m;
+ struct mbuf *mb;
+ uint32_t if_hw_tsomaxsegcount = 0;
+ uint32_t if_hw_tsomaxsegsize = 0;
+ uint32_t if_hw_tsomax = 0;
+ struct ip *ip = NULL;
+#ifdef TCPDEBUG
+ struct ipovly *ipov = NULL;
+#endif
+ struct tcp_bbr *bbr;
+ struct tcphdr *th;
+#ifdef NETFLIX_TCPOUDP
+ struct udphdr *udp = NULL;
+#endif
+ u_char opt[TCP_MAXOLEN];
+ unsigned ipoptlen, optlen, hdrlen;
+#ifdef NETFLIX_TCPOUDP
+ unsigned ulen;
+#endif
+ uint32_t bbr_seq;
+ uint32_t delay_calc=0;
+ uint8_t doing_tlp = 0;
+ uint8_t local_options;
+#ifdef BBR_INVARIANTS
+ uint8_t doing_retran_from = 0;
+ uint8_t picked_up_retran = 0;
+#endif
+ uint8_t wanted_cookie = 0;
+ uint8_t more_to_rxt=0;
+ int32_t prefetch_so_done = 0;
+ int32_t prefetch_rsm = 0;
+ uint32_t what_we_can = 0;
+ uint32_t tot_len = 0;
+ uint32_t rtr_cnt = 0;
+ uint32_t maxseg, pace_max_segs, p_maxseg;
+ int32_t csum_flags;
+ int32_t hw_tls;
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ unsigned ipsec_optlen = 0;
+
+#endif
+ volatile int32_t sack_rxmit;
+ struct bbr_sendmap *rsm = NULL;
+ int32_t tso, mtu;
+ int force_tso = 0;
+ struct tcpopt to;
+ int32_t slot = 0;
+ struct inpcb *inp;
+ struct sockbuf *sb;
+ uint32_t hpts_calling;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int32_t isipv6;
+#endif
+ uint8_t app_limited = BBR_JR_SENT_DATA;
+ uint8_t filled_all = 0;
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ /* We take a cache hit here */
+ memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
+ cts = tcp_tv_to_usectick(&bbr->rc_tv);
+ inp = bbr->rc_inp;
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+#ifdef KERN_TLS
+ if (sb->sb_flags & SB_TLS_IFNET)
+ hw_tls = 1;
+ else
+#endif
+ hw_tls = 0;
+ kern_prefetch(sb, &maxseg);
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ if (bbr_minseg(bbr) < maxseg) {
+ tcp_bbr_tso_size_check(bbr, cts);
+ }
+ /* Remove any flags that indicate we are pacing on the inp */
+ pace_max_segs = bbr->r_ctl.rc_pace_max_segs;
+ p_maxseg = min(maxseg, pace_max_segs);
+ INP_WLOCK_ASSERT(inp);
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE)
+ return (tcp_offload_output(tp));
+#endif
+
+#ifdef INET6
+ if (bbr->r_state) {
+ /* Use the cache line loaded if possible */
+ isipv6 = bbr->r_is_v6;
+ } else {
+ isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+ }
+#endif
+ if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
+ inp->inp_in_hpts) {
+ /*
+ * We are on the hpts for some timer but not hptsi output.
+ * Possibly remove from the hpts so we can send/recv etc.
+ */
+ if ((tp->t_flags & TF_ACKNOW) == 0) {
+ /*
+ * No immediate demand right now to send an ack, but
+ * the user may have read, making room for new data
+ * (a window update). If so we may want to cancel
+ * whatever timer is running (KEEP/DEL-ACK?) and
+ * continue to send out a window update. Or we may
+ * have gotten more data into the socket buffer to
+ * send.
+ */
+ recwin = min(max(sbspace(&so->so_rcv), 0),
+ TCP_MAXWIN << tp->rcv_scale);
+ if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) &&
+ ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <=
+ (tp->snd_max - tp->snd_una))) {
+ /*
+ * Nothing new to send and no window update
+				 * is needed. Let's just return and
+				 * let the timer run off.
+ */
+ return (0);
+ }
+ }
+ tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ bbr_timer_cancel(bbr, __LINE__, cts);
+ }
+ if (bbr->r_ctl.rc_last_delay_val) {
+ /* Calculate a rough delay for early escape to sending */
+ if (SEQ_GT(cts, bbr->rc_pacer_started))
+ delay_calc = cts - bbr->rc_pacer_started;
+ if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
+ delay_calc -= bbr->r_ctl.rc_last_delay_val;
+ else
+ delay_calc = 0;
+ }
+ /* Mark that we have called bbr_output(). */
+ if ((bbr->r_timer_override) ||
+ (tp->t_flags & TF_FORCEDATA) ||
+ (tp->t_state < TCPS_ESTABLISHED)) {
+ /* Timeouts or early states are exempt */
+ if (inp->inp_in_hpts)
+ tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ } else if (inp->inp_in_hpts) {
+ if ((bbr->r_ctl.rc_last_delay_val) &&
+ (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
+ delay_calc) {
+ /*
+ * We were being paced for output and the delay has
+			 * already exceeded the time when we were supposed to be
+			 * called; let's go ahead and pull out of the hpts
+ * and call output.
+ */
+ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
+ bbr->r_ctl.rc_last_delay_val = 0;
+ tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ } else if (tp->t_state == TCPS_CLOSED) {
+ bbr->r_ctl.rc_last_delay_val = 0;
+ tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ } else {
+ /*
+			 * On the hpts, you shall not pass! Even if ACKNOW
+			 * is on, we will send when the hpts fires, unless of
+ * course we are overdue.
+ */
+ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1);
+ return (0);
+ }
+ }
+ bbr->rc_cwnd_limited = 0;
+ if (bbr->r_ctl.rc_last_delay_val) {
+ /* recalculate the real delay and deal with over/under */
+ if (SEQ_GT(cts, bbr->rc_pacer_started))
+ delay_calc = cts - bbr->rc_pacer_started;
+ else
+ delay_calc = 0;
+ if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
+ /* Setup the delay which will be added in */
+ delay_calc -= bbr->r_ctl.rc_last_delay_val;
+ else {
+ /*
+			 * We are early; set up to adjust
+			 * our slot time.
+ */
+ bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc);
+ bbr->r_ctl.rc_last_delay_val = 0;
+ bbr->r_agg_early_set = 1;
+ if (bbr->r_ctl.rc_hptsi_agg_delay) {
+ if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) {
+ /* Nope our previous late cancels out the early */
+ bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early;
+ bbr->r_agg_early_set = 0;
+ bbr->r_ctl.rc_agg_early = 0;
+ } else {
+ bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay;
+ bbr->r_ctl.rc_hptsi_agg_delay = 0;
+ }
+ }
+ bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls,
+ bbr->r_ctl.rc_agg_early, cts, 3, 0,
+ bbr->r_agg_early_set, 3);
+ BBR_STAT_INC(bbr_early);
+ delay_calc = 0;
+ }
+ } else {
+ /* We were not delayed due to hptsi */
+ if (bbr->r_agg_early_set)
+ bbr->r_ctl.rc_agg_early = 0;
+ bbr->r_agg_early_set = 0;
+ delay_calc = 0;
+ }
+ if (delay_calc) {
+ /*
+ * We had a hptsi delay which means we are falling behind on
+ * sending at the expected rate. Calculate an extra amount
+ * of data we can send, if any, to put us back on track.
+ */
+ if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay)
+ bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff;
+ else
+ bbr->r_ctl.rc_hptsi_agg_delay += delay_calc;
+ }
+ sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+ if ((tp->snd_una == tp->snd_max) &&
+ (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
+ (sbavail(sb))) {
+ /*
+		 * Ok, we have been idle with nothing outstanding;
+		 * we possibly need to start fresh with either a new
+		 * suite of states or a fast ramp-up.
+ */
+ bbr_restart_after_idle(bbr,
+ cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time));
+ }
+ /*
+ * Now was there a hptsi delay where we are behind? We only count
+ * being behind if: a) We are not in recovery. b) There was a delay.
+ * <and> c) We had room to send something.
+ *
+ */
+ hpts_calling = inp->inp_hpts_calls;
+ inp->inp_hpts_calls = 0;
+ if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ if (bbr_process_timers(tp, bbr, cts, hpts_calling)) {
+ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1);
+ return (0);
+ }
+ }
+ bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ if (hpts_calling &&
+ (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
+ bbr->r_ctl.rc_last_delay_val = 0;
+ }
+ bbr->r_timer_override = 0;
+ bbr->r_wanted_output = 0;
+ /*
+ * For TFO connections in SYN_RECEIVED, only allow the initial
+ * SYN|ACK and those sent by the retransmit timer.
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ ((tp->t_state == TCPS_SYN_RECEIVED) ||
+ (tp->t_state == TCPS_SYN_SENT)) &&
+	    SEQ_GT(tp->snd_max, tp->snd_una) &&	/* initial SYN or SYN|ACK sent */
+ (tp->t_rxtshift == 0)) { /* not a retransmit */
+ return (0);
+ }
+ /*
+ * Before sending anything check for a state update. For hpts
+ * calling without input this is important. If its input calling
+ * then this was already done.
+ */
+ if (bbr->rc_use_google == 0)
+ bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
+again:
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+	 * snd_nxt. BBR in general does not pay much attention to snd_nxt;
+	 * for historic reasons the persist timer still uses it. This means
+	 * we have to look at it. All retransmissions that are not persists
+	 * use the rsm that needs to be sent so snd_nxt is ignored. At the
+	 * end of this routine we always pull snd_nxt up to snd_max.
+ */
+ doing_tlp = 0;
+#ifdef BBR_INVARIANTS
+ doing_retran_from = picked_up_retran = 0;
+#endif
+ error = 0;
+ tso = 0;
+ slot = 0;
+ mtu = 0;
+ sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+ sb_offset = tp->snd_max - tp->snd_una;
+ flags = tcp_outflags[tp->t_state];
+ sack_rxmit = 0;
+ len = 0;
+ rsm = NULL;
+ if (flags & TH_RST) {
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+recheck_resend:
+ while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
+ /* We need to always have one in reserve */
+ rsm = bbr_alloc(bbr);
+ if (rsm == NULL) {
+ error = ENOMEM;
+ /* Lie to get on the hpts */
+ tot_len = tp->t_maxseg;
+ if (hpts_calling)
+ /* Retry in a ms */
+ slot = 1001;
+ goto just_return_nolock;
+ }
+ TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
+ bbr->r_ctl.rc_free_cnt++;
+ rsm = NULL;
+ }
+ /* What do we send, a resend? */
+ if (bbr->r_ctl.rc_resend == NULL) {
+ /* Check for rack timeout */
+ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
+ if (bbr->r_ctl.rc_resend) {
+#ifdef BBR_INVARIANTS
+ picked_up_retran = 1;
+#endif
+ bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend);
+ }
+ }
+ if (bbr->r_ctl.rc_resend) {
+ rsm = bbr->r_ctl.rc_resend;
+#ifdef BBR_INVARIANTS
+ doing_retran_from = 1;
+#endif
+		/* Remove any TLP flags; it's a RACK or T-O */
+ rsm->r_flags &= ~BBR_TLP;
+ bbr->r_ctl.rc_resend = NULL;
+ if (SEQ_LT(rsm->r_start, tp->snd_una)) {
+#ifdef BBR_INVARIANTS
+ panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n",
+ tp, bbr, rsm, rsm->r_start, tp->snd_una);
+ goto recheck_resend;
+#else
+ /* TSNH */
+ rsm = NULL;
+ goto recheck_resend;
+#endif
+ }
+ rtr_cnt++;
+ if (rsm->r_flags & BBR_HAS_SYN) {
+ /* Only retransmit a SYN by itself */
+ len = 0;
+ if ((flags & TH_SYN) == 0) {
+ /* Huh something is wrong */
+ rsm->r_start++;
+ if (rsm->r_start == rsm->r_end) {
+ /* Clean it up, somehow we missed the ack? */
+ bbr_log_syn(tp, NULL);
+ } else {
+ /* TFO with data? */
+ rsm->r_flags &= ~BBR_HAS_SYN;
+ len = rsm->r_end - rsm->r_start;
+ }
+ } else {
+ /* Retransmitting SYN */
+ rsm = NULL;
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+ } else
+ len = rsm->r_end - rsm->r_start;
+ if ((bbr->rc_resends_use_tso == 0) &&
+#ifdef KERN_TLS
+ ((sb->sb_flags & SB_TLS_IFNET) == 0) &&
+#endif
+ (len > maxseg)) {
+ len = maxseg;
+ more_to_rxt = 1;
+ }
+ sb_offset = rsm->r_start - tp->snd_una;
+ if (len > 0) {
+ sack_rxmit = 1;
+ TCPSTAT_INC(tcps_sack_rexmits);
+ TCPSTAT_ADD(tcps_sack_rexmit_bytes,
+ min(len, maxseg));
+ } else {
+			/* I don't think this can happen */
+ rsm = NULL;
+ goto recheck_resend;
+ }
+ BBR_STAT_INC(bbr_resends_set);
+ } else if (bbr->r_ctl.rc_tlp_send) {
+ /*
+ * Tail loss probe
+ */
+ doing_tlp = 1;
+ rsm = bbr->r_ctl.rc_tlp_send;
+ bbr->r_ctl.rc_tlp_send = NULL;
+ sack_rxmit = 1;
+ len = rsm->r_end - rsm->r_start;
+ rtr_cnt++;
+ if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
+ len = maxseg;
+
+ if (SEQ_GT(tp->snd_una, rsm->r_start)) {
+#ifdef BBR_INVARIANTS
+ panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u",
+ tp, bbr, tp->snd_una, rsm, rsm->r_start);
+#else
+ /* TSNH */
+ rsm = NULL;
+ goto recheck_resend;
+#endif
+ }
+ sb_offset = rsm->r_start - tp->snd_una;
+ BBR_STAT_INC(bbr_tlp_set);
+ }
+ /*
+ * Enforce a connection sendmap count limit if set
+	 * as long as we are not retransmitting.
+ */
+ if ((rsm == NULL) &&
+ (bbr_tcp_map_entries_limit > 0) &&
+ (bbr->r_ctl.rc_num_maps_alloced >= bbr_tcp_map_entries_limit)) {
+ BBR_STAT_INC(bbr_alloc_limited);
+ if (!bbr->alloc_limit_reported) {
+ bbr->alloc_limit_reported = 1;
+ BBR_STAT_INC(bbr_alloc_limited_conns);
+ }
+ goto just_return_nolock;
+ }
+#ifdef BBR_INVARIANTS
+ if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) {
+ panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u",
+ tp, bbr, rsm, sb_offset, len);
+ }
+#endif
+ /*
+ * Get standard flags, and add SYN or FIN if requested by 'hidden'
+ * state flags.
+ */
+ if (tp->t_flags & TF_NEEDFIN && (rsm == NULL))
+ flags |= TH_FIN;
+ if (tp->t_flags & TF_NEEDSYN)
+ flags |= TH_SYN;
+
+ if (rsm && (rsm->r_flags & BBR_HAS_FIN)) {
+ /* we are retransmitting the fin */
+ len--;
+ if (len) {
+ /*
+ * When retransmitting data do *not* include the
+ * FIN. This could happen from a TLP probe if we
+ * allowed data with a FIN.
+ */
+ flags &= ~TH_FIN;
+ }
+ } else if (rsm) {
+ if (flags & TH_FIN)
+ flags &= ~TH_FIN;
+ }
+ if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
+ void *end_rsm;
+
+ end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
+ if (end_rsm)
+ kern_prefetch(end_rsm, &prefetch_rsm);
+ prefetch_rsm = 1;
+ }
+ SOCKBUF_LOCK(sb);
+ /*
+ * If in persist timeout with window of 0, send 1 byte. Otherwise,
+ * if window is small but nonzero and time TF_SENTFIN expired, we
+ * will send what we can and go to transmit state.
+ */
+ if (tp->t_flags & TF_FORCEDATA) {
+ if ((sendwin == 0) || (sendwin <= (tp->snd_max - tp->snd_una))) {
+ /*
+ * If we still have some data to send, then clear
+ * the FIN bit. Usually this would happen below
+ * when it realizes that we aren't sending all the
+ * data. However, if we have exactly 1 byte of
+ * unsent data, then it won't clear the FIN bit
+ * below, and if we are in persist state, we wind up
+ * sending the packet without recording that we sent
+ * the FIN bit.
+ *
+ * We can't just blindly clear the FIN bit, because
+ * if we don't have any more data to send then the
+ * probe will be the FIN itself.
+ */
+ if (sb_offset < sbused(sb))
+ flags &= ~TH_FIN;
+ sendwin = 1;
+ } else {
+ if ((bbr->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
+ bbr_minseg(bbr)))) {
+ /* Exit persists if there is space */
+ bbr_exit_persist(tp, bbr, cts, __LINE__);
+ }
+ if (rsm == NULL) {
+ /*
+ * If we are dropping persist mode then we
+ * need to correct sb_offset if not a
+ * retransmit.
+ */
+ sb_offset = tp->snd_max - tp->snd_una;
+ }
+ }
+ }
+ /*
+ * If snd_nxt == snd_max and we have transmitted a FIN, the
+ * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
+ * negative length. This can also occur when TCP opens up its
+ * congestion window while receiving additional duplicate acks after
+ * fast-retransmit because TCP will reset snd_nxt to snd_max after
+ * the fast-retransmit.
+ *
+ * In the normal retransmit-FIN-only case, however, snd_nxt will be
+ * set to snd_una, the sb_offset will be 0, and the length may wind
+ * up 0.
+ *
+ * If sack_rxmit is true we are retransmitting from the scoreboard
+ * in which case len is already set.
+ */
+ if (sack_rxmit == 0) {
+ uint32_t avail;
+
+ avail = sbavail(sb);
+ if (SEQ_GT(tp->snd_max, tp->snd_una))
+ sb_offset = tp->snd_max - tp->snd_una;
+ else
+ sb_offset = 0;
+ if (bbr->rc_tlp_new_data) {
+ /* TLP is forcing out new data */
+ uint32_t tlplen;
+
+ doing_tlp = 1;
+ tlplen = maxseg;
+
+ if (tlplen > (uint32_t)(avail - sb_offset)) {
+ tlplen = (uint32_t)(avail - sb_offset);
+ }
+ if (tlplen > tp->snd_wnd) {
+ len = tp->snd_wnd;
+ } else {
+ len = tlplen;
+ }
+ bbr->rc_tlp_new_data = 0;
+ } else {
+ what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts);
+ if ((len < p_maxseg) &&
+ (bbr->rc_in_persist == 0) &&
+ (ctf_outstanding(tp) >= (2 * p_maxseg)) &&
+ ((avail - sb_offset) >= p_maxseg)) {
+ /*
+				 * We are not completing what's in the socket
+				 * buffer (i.e. there is at least a segment
+				 * waiting to send) and we have 2 or more
+				 * segments outstanding. There is no sense
+				 * in sending a little piece. Let's defer
+				 * and wait until we can send a whole
+ * segment.
+ */
+ len = 0;
+ }
+ if ((tp->t_flags & TF_FORCEDATA) && (bbr->rc_in_persist)) {
+ /*
+ * We are in persists, figure out if
+ * a retransmit is available (maybe the previous
+ * persists we sent) or if we have to send new
+ * data.
+ */
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ if (rsm) {
+ len = rsm->r_end - rsm->r_start;
+ if (rsm->r_flags & BBR_HAS_FIN)
+ len--;
+ if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
+ len = maxseg;
+ if (len > 1)
+ BBR_STAT_INC(bbr_persist_reneg);
+ /*
+ * XXXrrs we could force the len to
+ * 1 byte here to cause the chunk to
+ * split apart.. but that would then
+ * mean we always retransmit it as
+ * one byte even after the window
+ * opens.
+ */
+ sack_rxmit = 1;
+ sb_offset = rsm->r_start - tp->snd_una;
+ } else {
+ /*
+ * First time through in persists or peer
+ * acked our one byte. Though we do have
+ * to have something in the sb.
+ */
+ len = 1;
+ sb_offset = 0;
+ if (avail == 0)
+ len = 0;
+ }
+ }
+ }
+ }
+ if (prefetch_so_done == 0) {
+ kern_prefetch(so, &prefetch_so_done);
+ prefetch_so_done = 1;
+ }
+ /*
+ * Lop off SYN bit if it has already been sent. However, if this is
+ * SYN-SENT state and if segment contains data and if we don't know
+ * that foreign host supports TAO, suppress sending segment.
+ */
+ if ((flags & TH_SYN) && (rsm == NULL) &&
+ SEQ_GT(tp->snd_max, tp->snd_una)) {
+ if (tp->t_state != TCPS_SYN_RECEIVED)
+ flags &= ~TH_SYN;
+ /*
+ * When sending additional segments following a TFO SYN|ACK,
+ * do not include the SYN bit.
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ flags &= ~TH_SYN;
+ sb_offset--, len++;
+ if (sbavail(sb) == 0)
+ len = 0;
+ } else if ((flags & TH_SYN) && rsm) {
+ /*
+ * Subtract one from the len for the SYN being
+ * retransmitted.
+ */
+ len--;
+ }
+ /*
+ * Be careful not to send data and/or FIN on SYN segments. This
+ * measure is needed to prevent interoperability problems with not
+ * fully conformant TCP implementations.
+ */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
+ len = 0;
+ flags &= ~TH_FIN;
+ }
+ /*
+ * On TFO sockets, ensure no data is sent in the following cases:
+ *
+ * - When retransmitting SYN|ACK on a passively-created socket
+ * - When retransmitting SYN on an actively created socket
+ * - When sending a zero-length cookie (cookie request) on an
+ * actively created socket
+ * - When the socket is in the CLOSED state (RST is being sent)
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
+ ((tp->t_state == TCPS_SYN_SENT) &&
+ (tp->t_tfo_client_cookie_len == 0)) ||
+ (flags & TH_RST))) {
+ len = 0;
+ sack_rxmit = 0;
+ rsm = NULL;
+ }
+ /* Without fast-open there should never be data sent on a SYN */
+ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
+ len = 0;
+ if (len <= 0) {
+ /*
+ * If FIN has been sent but not acked, but we haven't been
+ * called to retransmit, len will be < 0. Otherwise, window
+ * shrank after we sent into it. If window shrank to 0,
+ * cancel pending retransmit, pull snd_nxt back to (closed)
+ * window, and set the persist timer if it isn't already
+ * going. If the window didn't close completely, just wait
+ * for an ACK.
+ *
+ * We also do a general check here to ensure that we will
+ * set the persist timer when we have data to send, but a
+ * 0-byte window. This makes sure the persist timer is set
+ * even if the packet hits one of the "goto send" lines
+ * below.
+ */
+ len = 0;
+ if ((tp->snd_wnd == 0) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ (tp->snd_una == tp->snd_max) &&
+ (sb_offset < (int)sbavail(sb))) {
+ /*
+ * Not enough room in the rwnd to send
+ * a paced segment out.
+ */
+ bbr_enter_persist(tp, bbr, cts, __LINE__);
+ }
+ } else if ((rsm == NULL) &&
+ (doing_tlp == 0) &&
+ (len < bbr->r_ctl.rc_pace_max_segs)) {
+ /*
+ * We are not sending a full segment for
+ * some reason. Should we not send anything (think
+ * sws or persists)?
+ */
+ if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ (len < (int)(sbavail(sb) - sb_offset))) {
+ /*
+ * Here the rwnd is less than
+ * the pacing size, this is not a retransmit,
+			 * we are established, and
+			 * the send is not the last in the socket buffer;
+			 * let's not send, and possibly enter persists.
+ */
+ len = 0;
+ if (tp->snd_max == tp->snd_una)
+ bbr_enter_persist(tp, bbr, cts, __LINE__);
+ } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) &&
+ (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
+ (len < (int)(sbavail(sb) - sb_offset)) &&
+ (len < bbr_minseg(bbr))) {
+ /*
+ * Here we are not retransmitting, and
+ * the cwnd is not so small that we could
+ * not send at least a min size (rxt timer
+			 * not having gone off). We have 2 segments or
+			 * more already in flight, it's not the tail end
+			 * of the socket buffer, and the cwnd is blocking
+			 * us from sending out a minimum pacing segment.
+			 * Let's not send anything.
+ */
+ bbr->rc_cwnd_limited = 1;
+ len = 0;
+ } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
+ min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
+ (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
+ (len < (int)(sbavail(sb) - sb_offset)) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state))) {
+ /*
+ * Here we have a send window but we have
+ * filled it up and we can't send another pacing segment.
+ * We also have in flight more than 2 segments
+			 * and we are not completing the sb, i.e. we allow
+			 * the last bytes of the sb to go out even if
+			 * it's not a full pacing segment.
+ */
+ len = 0;
+ }
+ }
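+ /*
+ * In short: when this is not a retransmit and the candidate send is
+ * smaller than a pacing segment, we hold off if (a) the peer's rwnd
+ * cannot take even a minimum pacing segment and more data remains
+ * (possibly entering persist), (b) the cwnd is what blocks a minimum
+ * pacing segment while two or more segments are already in flight,
+ * or (c) the send window is filled to within a pacing segment. Only
+ * the tail of the socket buffer is allowed out undersized.
+ */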
+ /* len will be >= 0 after this point. */
+ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
+ tcp_sndbuf_autoscale(tp, so, sendwin);
+ if (bbr->rc_in_persist &&
+ len &&
+ (rsm == NULL) &&
+ (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) {
+ /*
+ * We are in persist, not doing a retransmit, and don't have
+ * enough data yet to send a full pacing/TSO segment. If this
+ * send reaches the end of the sb we need to let it out;
+ * otherwise nuke len to 0 and don't send.
+ */
+ int sbleft;
+ if (sbavail(sb) > sb_offset)
+ sbleft = sbavail(sb) - sb_offset;
+ else
+ sbleft = 0;
+ if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) {
+ /* Not at the end of the sb, let's not send */
+ len = 0;
+ }
+ }
+ /*
+ * Decide if we can use TCP Segmentation Offloading (if supported by
+ * hardware).
+ *
+ * TSO may only be used if we are in a pure bulk sending state. The
+ * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
+ * options prevent using TSO. With TSO the TCP header is the same
+ * (except for the sequence number) for all generated packets. This
+ * makes it impossible to transmit any options which vary per
+ * generated segment or packet.
+ *
+ * IPv4 handling has a clear separation of ip options and ip header
+ * flags while IPv6 combines both in in6p_outputopts. ip6_optlen()
+ * does the right thing below to provide length of just ip options
+ * and thus checking for ipoptlen is enough to decide if ip options
+ * are present.
+ */
+#ifdef INET6
+ if (isipv6)
+ ipoptlen = ip6_optlen(inp);
+ else
+#endif
+ if (inp->inp_options)
+ ipoptlen = inp->inp_options->m_len -
+ offsetof(struct ipoption, ipopt_list);
+ else
+ ipoptlen = 0;
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ /*
+ * Pre-calculate here as we save another lookup into the darknesses
+ * of IPsec that way and can actually decide if TSO is ok.
+ */
+#ifdef INET6
+ if (isipv6 && IPSEC_ENABLED(ipv6))
+ ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
+#ifdef INET
+ else
+#endif
+#endif /* INET6 */
+#ifdef INET
+ if (IPSEC_ENABLED(ipv4))
+ ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
+#endif /* INET */
+#endif /* IPSEC */
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ ipoptlen += ipsec_optlen;
+#endif
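+ /*
+ * Note that ipsec_optlen has been folded into ipoptlen above, so the
+ * TSO eligibility test below (which requires ipoptlen == 0) also
+ * disables TSO whenever IPsec will be adding headers.
+ */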
+ if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
+ (len > maxseg) &&
+ (tp->t_port == 0) &&
+ ((tp->t_flags & TF_SIGNATURE) == 0) &&
+ tp->rcv_numsacks == 0 &&
+ ipoptlen == 0)
+ tso = 1;
+
+ recwin = min(max(sbspace(&so->so_rcv), 0),
+ TCP_MAXWIN << tp->rcv_scale);
+ /*
+ * Sender silly window avoidance. We transmit under the following
+ * conditions when len is non-zero:
+ *
+ * - We have a full segment (or more with TSO)
+ * - This is the last buffer in a write()/send() and we are either
+ * idle or running NODELAY
+ * - We've timed out (e.g. persist timer)
+ * - We have more than 1/2 the maximum send window's worth of data
+ * (receiver may be limiting the window size)
+ * - We need to retransmit
+ */
+ if (rsm)
+ goto send;
+ if (len) {
+ if (sack_rxmit)
+ goto send;
+ if (len >= p_maxseg)
+ goto send;
+ /*
+ * NOTE! on localhost connections an 'ack' from the remote
+ * end may occur synchronously with the output and cause us
+ * to flush a buffer queued with moretocome. XXX
+ *
+ */
+ if (((tp->t_flags & TF_MORETOCOME) == 0) && /* normal case */
+ ((tp->t_flags & TF_NODELAY) ||
+ ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) &&
+ (tp->t_flags & TF_NOPUSH) == 0) {
+ goto send;
+ }
+ if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */
+ goto send;
+ }
+ if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */
+ goto send;
+ }
+ if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
+ goto send;
+ }
+ }
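+ /*
+ * For example (illustrative numbers): with a 1448-byte maxseg and
+ * only 500 bytes of new data pending, the 500 bytes still go out
+ * right away when nothing is outstanding (snd_una == snd_max), or
+ * when they are the last bytes of the write (or TF_NODELAY is set)
+ * with TF_NOPUSH and TF_MORETOCOME clear; otherwise we wait to
+ * accumulate a fuller send.
+ */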
+ /*
+ * Sending of standalone window updates.
+ *
+ * Window updates are important when we close our window due to a
+ * full socket buffer and are opening it again after the application
+ * reads data from it. Once the window has opened again and the
+ * remote end starts to send again the ACK clock takes over and
+ * provides the most current window information.
+ *
+ * We must avoid the silly window syndrome whereby every read from
+ * the receive buffer, no matter how small, causes a window update
+ * to be sent. We also should avoid sending a flurry of window
+ * updates when the socket buffer had queued a lot of data and the
+ * application is doing small reads.
+ *
+ * Prevent a flurry of pointless window updates by only sending an
+ * update when we can increase the advertised window by more than
+ * 1/4th of the socket buffer capacity. When the buffer is getting
+ * full or is very small be more aggressive and send an update
+ * whenever we can increase by two mss sized segments. In all other
+ * situations the ACK's to new incoming data will carry further
+ * window increases.
+ *
+ * Don't send an independent window update if a delayed ACK is
+ * pending (it will get piggy-backed on it) or the remote side
+ * already has done a half-close and won't send more data. Skip
+ * this if the connection is in T/TCP half-open state.
+ */
+ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
+ !(tp->t_flags & TF_DELACK) &&
+ !TCPS_HAVERCVDFIN(tp->t_state)) {
+ /* Check to see if we should do a window update */
+ if (bbr_window_update_needed(tp, so, recwin, maxseg))
+ goto send;
+ }
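+ /*
+ * As an illustration of the policy described above (checked in
+ * bbr_window_update_needed()): with a 64 KB receive buffer a
+ * standalone update is worth sending once the advertised window can
+ * grow by more than 16 KB (one quarter of the buffer), while a
+ * nearly-full or very small buffer falls back to the 2 * maxseg
+ * threshold.
+ */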
+ /*
+ * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
+ * is also a catch-all for the retransmit timer timeout case.
+ */
+ if (tp->t_flags & TF_ACKNOW) {
+ goto send;
+ }
+ if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
+ goto send;
+ }
+ if (SEQ_GT(tp->snd_up, tp->snd_una)) {
+ goto send;
+ }
+ /*
+ * If our state indicates that FIN should be sent and we have not
+ * yet done so, then we need to send.
+ */
+ if (flags & TH_FIN &&
+ ((tp->t_flags & TF_SENTFIN) == 0)) {
+ goto send;
+ }
+ /*
+ * No reason to send a segment, just return.
+ */
+just_return:
+ SOCKBUF_UNLOCK(sb);
+just_return_nolock:
+ if (tot_len)
+ slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
+ if (bbr->rc_no_pacing)
+ slot = 0;
+ if (tot_len == 0) {
+ if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=
+ tp->snd_wnd) {
+ BBR_STAT_INC(bbr_rwnd_limited);
+ app_limited = BBR_JR_RWND_LIMITED;
+ bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
+ if ((bbr->rc_in_persist == 0) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_max == tp->snd_una) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
+ /* No send window.. we must enter persist */
+ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
+ }
+ } else if (ctf_outstanding(tp) >= sbavail(sb)) {
+ BBR_STAT_INC(bbr_app_limited);
+ app_limited = BBR_JR_APP_LIMITED;
+ bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
+ } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) {
+ BBR_STAT_INC(bbr_cwnd_limited);
+ app_limited = BBR_JR_CWND_LIMITED;
+ bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)));
+ bbr->rc_cwnd_limited = 1;
+ } else {
+ BBR_STAT_INC(bbr_app_limited);
+ app_limited = BBR_JR_APP_LIMITED;
+ bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
+ }
+ bbr->r_ctl.rc_hptsi_agg_delay = 0;
+ bbr->r_agg_early_set = 0;
+ bbr->r_ctl.rc_agg_early = 0;
+ bbr->r_ctl.rc_last_delay_val = 0;
+ } else if (bbr->rc_use_google == 0)
+ bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
+ /* Are we app limited? */
+ if ((app_limited == BBR_JR_APP_LIMITED) ||
+ (app_limited == BBR_JR_RWND_LIMITED)) {
+ /**
+ * We are application limited.
+ */
+ bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered);
+ }
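+ /*
+ * r_app_limited_until marks the delivery point (bytes delivered so
+ * far plus what is currently in flight); bandwidth samples taken
+ * before delivery catches up to that point can then be treated as
+ * application limited rather than as evidence of path capacity.
+ */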
+ if (tot_len == 0)
+ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1);
+ tp->t_flags &= ~TF_FORCEDATA;
+ /* Don't update the time if we did not send */
+ bbr->r_ctl.rc_last_delay_val = 0;
+ bbr->rc_output_starts_timer = 1;
+ bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len);
+ bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ /* Make sure snd_nxt is drug up */
+ tp->snd_nxt = tp->snd_max;
+ }
+ return (error);
+
+send:
+ if (doing_tlp == 0) {
+ /*
+ * Data not a TLP, and it's not the rxt firing. If it is the
+ * rxt firing, we want to leave the tlp_in_progress flag on
+ * so we don't send another TLP. It has to be a rack timer
+ * or normal send (response to acked data) to clear the tlp
+ * in progress flag.
+ */
+ bbr->rc_tlp_in_progress = 0;
+ bbr->rc_tlp_rtx_out = 0;
+ } else {
+ /*
+ * It's a TLP.
+ */
+ bbr->rc_tlp_in_progress = 1;
+ }
+ bbr_timer_cancel(bbr, __LINE__, cts);
+ if (rsm == NULL) {
+ if (sbused(sb) > 0) {
+ /*
+ * This is sub-optimal. We only send a standalone
+ * FIN on its own segment.
+ */
+ if (flags & TH_FIN) {
+ flags &= ~TH_FIN;
+ if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {
+ /* Let's not send this */
+ slot = 0;
+ goto just_return;
+ }
+ }
+ }
+ } else {
+ /*
+ * We do *not* send a FIN on a retransmit if it has data.
+ * The if clause here (len > 0 with TH_FIN set) should never come true.
+ */
+ if ((len > 0) &&
+ (((rsm->r_flags & BBR_HAS_FIN) == 0) &&
+ (flags & TH_FIN))) {
+ flags &= ~TH_FIN;
+ len--;
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(sb);
+ if (len > 0) {
+ if ((tp->snd_una == tp->snd_max) &&
+ (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
+ /*
+ * This qualifies as an RTT_PROBE session since we
+ * dropped the outstanding data to nothing and waited
+ * more than bbr_rtt_probe_time.
+ */
+ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
+ bbr_set_reduced_rtt(bbr, cts, __LINE__);
+ }
+ if (len >= maxseg)
+ tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
+ }
+ /*
+ * Before ESTABLISHED, force sending of initial options unless TCP
+ * set not to do any options. NOTE: we assume that the IP/TCP header
+ * plus TCP options always fit in a single mbuf, leaving room for a
+ * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
+ * + optlen <= MCLBYTES
+ */
+ optlen = 0;
+#ifdef INET6
+ if (isipv6)
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ else
+#endif
+ hdrlen = sizeof(struct tcpiphdr);
+
+ /*
+ * Compute options for segment. We only have to care about SYN and
+ * established connection segments. Options for SYN-ACK segments
+ * are handled in TCP syncache.
+ */
+ to.to_flags = 0;
+ local_options = 0;
+ if ((tp->t_flags & TF_NOOPT) == 0) {
+ /* Maximum segment size. */
+ if (flags & TH_SYN) {
+ to.to_mss = tcp_mssopt(&inp->inp_inc);
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port)
+ to.to_mss -= V_tcp_udp_tunneling_overhead;
+#endif
+ to.to_flags |= TOF_MSS;
+ /*
+ * On SYN or SYN|ACK transmits on TFO connections,
+ * only include the TFO option if it is not a
+ * retransmit, as the presence of the TFO option may
+ * have caused the original SYN or SYN|ACK to have
+ * been dropped by a middlebox.
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ (tp->t_rxtshift == 0)) {
+ if (tp->t_state == TCPS_SYN_RECEIVED) {
+ to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
+ to.to_tfo_cookie =
+ (u_int8_t *)&tp->t_tfo_cookie.server;
+ to.to_flags |= TOF_FASTOPEN;
+ wanted_cookie = 1;
+ } else if (tp->t_state == TCPS_SYN_SENT) {
+ to.to_tfo_len =
+ tp->t_tfo_client_cookie_len;
+ to.to_tfo_cookie =
+ tp->t_tfo_cookie.client;
+ to.to_flags |= TOF_FASTOPEN;
+ wanted_cookie = 1;
+ }
+ }
+ }
+ /* Window scaling. */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
+ to.to_wscale = tp->request_r_scale;
+ to.to_flags |= TOF_SCALE;
+ }
+ /* Timestamps. */
+ if ((tp->t_flags & TF_RCVD_TSTMP) ||
+ ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
+ to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset;
+ to.to_tsecr = tp->ts_recent;
+ to.to_flags |= TOF_TS;
+ local_options += TCPOLEN_TIMESTAMP + 2;
+ }
+ /* Set receive buffer autosizing timestamp. */
+ if (tp->rfbuf_ts == 0 &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE))
+ tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv);
+ /* Selective ACK's. */
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_SACKPERM;
+ else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ tp->rcv_numsacks > 0) {
+ to.to_flags |= TOF_SACK;
+ to.to_nsacks = tp->rcv_numsacks;
+ to.to_sacks = (u_char *)tp->sackblks;
+ }
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ /* TCP-MD5 (RFC2385). */
+ if (tp->t_flags & TF_SIGNATURE)
+ to.to_flags |= TOF_SIGNATURE;
+#endif /* TCP_SIGNATURE */
+
+ /* Processing the options. */
+ hdrlen += (optlen = tcp_addoptions(&to, opt));
+ /*
+ * If we wanted a TFO option to be added, but it was unable
+ * to fit, ensure no data is sent.
+ */
+ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
+ !(to.to_flags & TOF_FASTOPEN))
+ len = 0;
+ }
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ if (V_tcp_udp_tunneling_port == 0) {
+ /* The port was removed?? */
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (EHOSTUNREACH);
+ }
+
+ hdrlen += sizeof(struct udphdr);
+ }
+#endif
+#ifdef INET6
+ if (isipv6)
+ ipoptlen = ip6_optlen(tp->t_inpcb);
+ else
+#endif
+ if (tp->t_inpcb->inp_options)
+ ipoptlen = tp->t_inpcb->inp_options->m_len -
+ offsetof(struct ipoption, ipopt_list);
+ else
+ ipoptlen = 0;
+ ipoptlen = 0;
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ ipoptlen += ipsec_optlen;
+#endif
+ if (bbr->rc_last_options != local_options) {
+ /*
+ * Cache the options length; it generally does not change
+ * on a connection. We use it when calculating TSO.
+ */
+ bbr->rc_last_options = local_options;
+ }
+ maxseg = tp->t_maxseg - (ipoptlen + optlen);
+ p_maxseg = min(maxseg, pace_max_segs);
+ /*
+ * Adjust data length if insertion of options will bump the packet
+ * length beyond the t_maxseg length. Clear the FIN bit because we
+ * cut off the tail of the segment.
+ */
+#ifdef KERN_TLS
+ /* Force TSO so that TLS offload can get the MSS */
+ if (sb->sb_flags & SB_TLS_IFNET) {
+ force_tso = 1;
+ }
+#endif
+
+ if (len > maxseg) {
+ if (len != 0 && (flags & TH_FIN)) {
+ flags &= ~TH_FIN;
+ }
+ if (tso) {
+ uint32_t moff;
+ int32_t max_len;
+
+ /* extract TSO information */
+ if_hw_tsomax = tp->t_tsomax;
+ if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
+ if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
+ KASSERT(ipoptlen == 0,
+ ("%s: TSO can't do IP options", __func__));
+
+ /*
+ * Check if we should limit by maximum payload
+ * length:
+ */
+ if (if_hw_tsomax != 0) {
+ /* compute maximum TSO length */
+ max_len = (if_hw_tsomax - hdrlen -
+ max_linkhdr);
+ if (max_len <= 0) {
+ len = 0;
+ } else if (len > max_len) {
+ len = max_len;
+ }
+ }
+ /*
+ * Prevent the last segment from being fractional
+ * unless the send sockbuf can be emptied:
+ */
+ if (((sb_offset + len) < sbavail(sb)) &&
+ (hw_tls == 0)) {
+ moff = len % (uint32_t)maxseg;
+ if (moff != 0) {
+ len -= moff;
+ }
+ }
+ /*
+ * In case there are too many small fragments don't
+ * use TSO:
+ */
+ if (len <= maxseg) {
+ len = maxseg;
+ tso = 0;
+ }
+ } else {
+ /* Not doing TSO */
+ if (optlen + ipoptlen > tp->t_maxseg) {
+ /*
+ * Since we don't have enough space to put
+ * the IP header chain and the TCP header in
+ * one packet as required by RFC 7112, don't
+ * send it.
+ */
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EMSGSIZE;
+ sack_rxmit = 0;
+ goto out;
+ }
+ len = maxseg;
+ }
+ } else {
+ /* Not doing TSO */
+ if_hw_tsomaxsegcount = 0;
+ tso = 0;
+ }
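+ /*
+ * To illustrate the clamping above with made-up numbers: with
+ * if_hw_tsomax = 65535, hdrlen = 52 and max_linkhdr = 16, max_len is
+ * 65467; if that does not empty the socket buffer the length is then
+ * trimmed to a multiple of maxseg, e.g. 65160 (45 * 1448) for a
+ * 1448-byte maxseg, so only the final burst may carry a partial
+ * segment.
+ */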
+ KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
+ ("%s: len > IP_MAXPACKET", __func__));
+#ifdef DIAGNOSTIC
+#ifdef INET6
+ if (max_linkhdr + hdrlen > MCLBYTES)
+#else
+ if (max_linkhdr + hdrlen > MHLEN)
+#endif
+ panic("tcphdr too big");
+#endif
+ /*
+ * This KASSERT is here to catch edge cases at a well defined place.
+ * Before, those had triggered (random) panic conditions further
+ * down.
+ */
+#ifdef BBR_INVARIANTS
+ if (sack_rxmit) {
+ if (SEQ_LT(rsm->r_start, tp->snd_una)) {
+ panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u",
+ rsm, tp, bbr, rsm->r_start, tp->snd_una);
+ }
+ }
+#endif
+ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
+ if ((len == 0) &&
+ (flags & TH_FIN) &&
+ (sbused(sb))) {
+ /*
+ * We have outstanding data; don't send a FIN by itself!
+ */
+ slot = 0;
+ goto just_return;
+ }
+ /*
+ * Grab a header mbuf, attaching a copy of data to be transmitted,
+ * and initialize the header from the template for sends on this
+ * connection.
+ */
+ if (len) {
+ uint32_t moff;
+ uint32_t orig_len;
+
+ /*
+ * We place a limit on sending with hptsi.
+ */
+ if ((rsm == NULL) && len > pace_max_segs)
+ len = pace_max_segs;
+ if (len <= maxseg)
+ tso = 0;
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr)
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ else
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (m == NULL) {
+ BBR_STAT_INC(bbr_failed_mbuf_aloc);
+ bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
+ SOCKBUF_UNLOCK(sb);
+ error = ENOBUFS;
+ sack_rxmit = 0;
+ goto out;
+ }
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ /*
+ * Start the m_copy functions from the closest mbuf to the
+ * sb_offset in the socket buffer chain.
+ */
+ if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) {
+#ifdef BBR_INVARIANTS
+ if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0)))
+ panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u",
+ tp, bbr, len, sb_offset, sbavail(sb), rsm,
+ doing_retran_from,
+ picked_up_retran,
+ doing_tlp);
+
+#endif
+ /*
+ * In this messed up situation we have three choices:
+ * a) pretend the send worked, and just start timers
+ * and what not (not good since that may lead us
+ * back here a lot); <or> b) send the lowest segment
+ * in the map; <or> c) drop the connection. Let's do
+ * <b>, which, if it continues to happen, will lead to
+ * <c> via timeouts.
+ */
+ BBR_STAT_INC(bbr_offset_recovery);
+ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
+ sb_offset = 0;
+ if (rsm == NULL) {
+ sack_rxmit = 0;
+ len = sbavail(sb);
+ } else {
+ sack_rxmit = 1;
+ if (rsm->r_start != tp->snd_una) {
+ /*
+ * Things are really messed up, <c>
+ * is the only thing to do.
+ */
+ BBR_STAT_INC(bbr_offset_drop);
+ tcp_set_inp_to_drop(inp, EFAULT);
+ return (0);
+ }
+ len = rsm->r_end - rsm->r_start;
+ }
+ if (len > sbavail(sb))
+ len = sbavail(sb);
+ if (len > maxseg)
+ len = maxseg;
+ }
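+ /*
+ * Two copy strategies follow: a small payload that fits in the
+ * remaining space of the header mbuf is copied straight in with
+ * m_copydata() (and the send pointer advanced for new data), while
+ * anything larger is chained on via tcp_m_copym(), which also
+ * respects the TSO segment-count/size limits and the TLS path.
+ */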
+ mb = sbsndptr_noadv(sb, sb_offset, &moff);
+ if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
+ m_copydata(mb, moff, (int)len,
+ mtod(m, caddr_t)+hdrlen);
+ if (rsm == NULL)
+ sbsndptr_adv(sb, mb, len);
+ m->m_len += len;
+ } else {
+ struct sockbuf *msb;
+
+ if (rsm)
+ msb = NULL;
+ else
+ msb = sb;
+#ifdef BBR_INVARIANTS
+ if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) {
+ if (rsm) {
+ panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ",
+ tp, bbr, len, moff,
+ sbavail(sb), rsm,
+ tp->snd_una, rsm->r_flags, rsm->r_start,
+ doing_retran_from,
+ picked_up_retran,
+ doing_tlp, sack_rxmit);
+ } else {
+ panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u",
+ tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una);
+ }
+ }
+#endif
+ orig_len = len;
+ m->m_next = tcp_m_copym(
+#ifdef NETFLIX_COPY_ARGS
+ tp,
+#endif
+ mb, moff, &len,
+ if_hw_tsomaxsegcount,
+ if_hw_tsomaxsegsize, msb,
+ ((rsm == NULL) ? hw_tls : 0)
+#ifdef NETFLIX_COPY_ARGS
+ , &filled_all
+#endif
+ );
+ if (len <= maxseg && !force_tso) {
+ /*
+ * We must have run out of mbufs for the copy;
+ * shorten it so it no longer needs TSO. Let's
+ * not set sendalot since we are low on
+ * mbufs.
+ */
+ tso = 0;
+ }
+ if (m->m_next == NULL) {
+ SOCKBUF_UNLOCK(sb);
+ (void)m_free(m);
+ error = ENOBUFS;
+ sack_rxmit = 0;
+ goto out;
+ }
+ }
+#ifdef BBR_INVARIANTS
+ if (tso && len < maxseg) {
+ panic("tp:%p tso on, but len:%d < maxseg:%d",
+ tp, len, maxseg);
+ }
+ if (tso && if_hw_tsomaxsegcount) {
+ int32_t seg_cnt = 0;
+ struct mbuf *foo;
+
+ foo = m;
+ while (foo) {
+ seg_cnt++;
+ foo = foo->m_next;
+ }
+ if (seg_cnt > if_hw_tsomaxsegcount) {
+ panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount);
+ }
+ }
+#endif
+ /*
+ * If we're sending everything we've got, set PUSH. (This
+ * will keep happy those implementations which only give
+ * data to the user when a buffer fills or a PUSH comes in.)
+ */
+ if (sb_offset + len == sbused(sb) &&
+ sbused(sb) &&
+ !(flags & TH_SYN)) {
+ flags |= TH_PUSH;
+ }
+ SOCKBUF_UNLOCK(sb);
+ } else {
+ SOCKBUF_UNLOCK(sb);
+ if (tp->t_flags & TF_ACKNOW)
+ TCPSTAT_INC(tcps_sndacks);
+ else if (flags & (TH_SYN | TH_FIN | TH_RST))
+ TCPSTAT_INC(tcps_sndctrl);
+ else if (SEQ_GT(tp->snd_up, tp->snd_una))
+ TCPSTAT_INC(tcps_sndurg);
+ else
+ TCPSTAT_INC(tcps_sndwinup);
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ BBR_STAT_INC(bbr_failed_mbuf_aloc);
+ bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
+ error = ENOBUFS;
+ /* Fudge the send time since we could not send */
+ sack_rxmit = 0;
+ goto out;
+ }
+#ifdef INET6
+ if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
+ MHLEN >= hdrlen) {
+ M_ALIGN(m, hdrlen);
+ } else
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ }
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+#ifdef MAC
+ mac_inpcb_create_mbuf(inp, m);
+#endif
+#ifdef INET6
+ if (isipv6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else {
+#endif
+ th = (struct tcphdr *)(ip6 + 1);
+
+#ifdef NETFLIX_TCPOUDP
+ }
+#endif
+ tcpip_fillheaders(inp,
+#ifdef NETFLIX_TCPOUDP
+ tp->t_port,
+#endif
+ ip6, th);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+#ifdef TCPDEBUG
+ ipov = (struct ipovly *)ip;
+#endif
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else
+#endif
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(inp,
+#ifdef NETFLIX_TCPOUDP
+ tp->t_port,
+#endif
+ ip, th);
+ }
+ /*
+ * If we are doing retransmissions, then snd_nxt will not reflect
+ * the first unsent octet. For ACK only packets, we do not want the
+ * sequence number of the retransmitted packet, we want the sequence
+ * number of the next unsent octet. So, if there is no data (and no
+ * SYN or FIN), use snd_max instead of snd_nxt when filling in
+ * ti_seq. But if we are in persist state, snd_max might reflect
+ * one byte beyond the right edge of the window, so use snd_nxt in
+ * that case, since we know we aren't doing a retransmission.
+ * (retransmit and persist are mutually exclusive...)
+ */
+ if (sack_rxmit == 0) {
+ if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) {
+ /* New data (including new persists) */
+ th->th_seq = htonl(tp->snd_max);
+ bbr_seq = tp->snd_max;
+ } else if (flags & TH_SYN) {
+ /* SYNs always send from iss */
+ th->th_seq = htonl(tp->iss);
+ bbr_seq = tp->iss;
+ } else if (flags & TH_FIN) {
+ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) {
+ /*
+ * If we already sent the FIN, use
+ * snd_max - 1.
+ */
+ th->th_seq = (htonl(tp->snd_max - 1));
+ bbr_seq = (tp->snd_max - 1);
+ } else {
+ /* First time FIN use snd_max */
+ th->th_seq = htonl(tp->snd_max);
+ bbr_seq = tp->snd_max;
+ }
+ } else if (flags & TH_RST) {
+ /*
+ * For a Reset send the last cumulative ack in sequence
+ * (this, like any other choice, may still generate a
+ * challenge ack if an ack-update packet is in
+ * flight).
+ */
+ th->th_seq = htonl(tp->snd_una);
+ bbr_seq = tp->snd_una;
+ } else {
+ /*
+ * len == 0 and not persist: we use snd_max, sending
+ * an ack, unless we have sent the FIN, in which case
+ * it is snd_max - 1.
+ */
+ /*
+ * XXXRRS Question if we are in persists and we have
+ * nothing outstanding to send and we have not sent
+ * a FIN, we will send an ACK. In such a case it
+ * might be better to send (tp->snd_una - 1) which
+ * would force the peer to ack.
+ */
+ if (tp->t_flags & TF_SENTFIN) {
+ th->th_seq = htonl(tp->snd_max - 1);
+ bbr_seq = (tp->snd_max - 1);
+ } else {
+ th->th_seq = htonl(tp->snd_max);
+ bbr_seq = tp->snd_max;
+ }
+ }
+ } else {
+ /* All retransmits use the rsm to guide the send */
+ th->th_seq = htonl(rsm->r_start);
+ bbr_seq = rsm->r_start;
+ }
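+ /*
+ * bbr_seq is the host-order twin of th_seq; it is what gets handed
+ * to bbr_log_output() below and, when goodput measurement starts,
+ * seeds tp->gput_seq.
+ */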
+ th->th_ack = htonl(tp->rcv_nxt);
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ }
+ th->th_flags = flags;
+ /*
+ * Calculate receive window. Don't shrink window, but avoid silly
+ * window syndrome.
+ */
+ if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) &&
+ recwin < maxseg)))
+ recwin = 0;
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+ recwin < (tp->rcv_adv - tp->rcv_nxt))
+ recwin = (tp->rcv_adv - tp->rcv_nxt);
+ if (recwin > TCP_MAXWIN << tp->rcv_scale)
+ recwin = TCP_MAXWIN << tp->rcv_scale;
+
+ /*
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
+ * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is
+ * handled in syncache.
+ */
+ if (flags & TH_SYN)
+ th->th_win = htons((u_short)
+ (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
+ else
+ th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
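+ /*
+ * For example, with rcv_scale = 5 a recwin of 1,000,000 bytes goes
+ * on the wire as 31250 (31250 << 5 == 1,000,000), well under the
+ * 16-bit limit thanks to the TCP_MAXWIN << rcv_scale clamp above.
+ */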
+ /*
+ * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
+ * window. This may cause the remote transmitter to stall. This
+ * flag tells soreceive() to disable delayed acknowledgements when
+ * draining the buffer. This can occur if the receiver is
+ * attempting to read more data than can be buffered prior to
+ * transmitting on the connection.
+ */
+ if (th->th_win == 0) {
+ tp->t_sndzerowin++;
+ tp->t_flags |= TF_RXWIN0SENT;
+ } else
+ tp->t_flags &= ~TF_RXWIN0SENT;
+ if (SEQ_GT(tp->snd_up, tp->snd_max)) {
+ th->th_urp = htons((u_short)(tp->snd_up - tp->snd_max));
+ th->th_flags |= TH_URG;
+ } else
+ /*
+ * If no urgent pointer to send, then we pull the urgent
+ * pointer to the left edge of the send window so that it
+ * doesn't drift into the send window on sequence number
+ * wraparound.
+ */
+ tp->snd_up = tp->snd_una; /* drag it along */
+
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (to.to_flags & TOF_SIGNATURE) {
+ /*
+ * Calculate MD5 signature and put it into the place
+ * determined before. NOTE: since TCP options buffer doesn't
+ * point into mbuf's data, calculate offset and use it.
+ */
+ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
+ (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
+ /*
+ * Do not send segment if the calculation of MD5
+ * digest has failed.
+ */
+ goto out;
+ }
+ }
+#endif
+
+ /*
+ * Put TCP length in extended header, and then checksum extended
+ * header and data.
+ */
+ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
+#ifdef INET6
+ if (isipv6) {
+ /*
+ * ip6_plen does not need to be filled in now; it will be
+ * filled in by ip6_output.
+ */
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+#endif
+ csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
+ optlen + len, IPPROTO_TCP, 0);
+#ifdef NETFLIX_TCPOUDP
+ }
+#endif
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ {
+#ifdef NETFLIX_TCPOUDP
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+#endif
+ csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+ IPPROTO_TCP + len + optlen));
+#ifdef NETFLIX_TCPOUDP
+ }
+#endif
+ /* IP version must be set here for ipv4/ipv6 checking later */
+ KASSERT(ip->ip_v == IPVERSION,
+ ("%s: IP version incorrect: %d", __func__, ip->ip_v));
+ }
+#endif
+
+ /*
+ * Enable TSO and specify the size of the segments. The TCP pseudo
+ * header checksum is always provided. XXX: Fixme: This is currently
+ * not the case for IPv6.
+ */
+ if (tso || force_tso) {
+ KASSERT(force_tso || len > maxseg,
+ ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg));
+ m->m_pkthdr.csum_flags |= CSUM_TSO;
+ csum_flags |= CSUM_TSO;
+ m->m_pkthdr.tso_segsz = maxseg;
+ }
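+ /*
+ * tso_segsz is the per-segment payload the NIC (or software TSO)
+ * will slice the chain into; it is the option-adjusted maxseg
+ * computed earlier, not the raw t_maxseg.
+ */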
+ KASSERT(len + hdrlen == m_length(m, NULL),
+ ("%s: mbuf chain different than expected: %d + %u != %u",
+ __func__, len, hdrlen, m_length(m, NULL)));
+
+#ifdef TCP_HHOOK
+ /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */
+ hhook_run_tcp_est_out(tp, th, &to, len, tso);
+#endif
+#ifdef TCPDEBUG
+ /*
+ * Trace.
+ */
+ if (so->so_options & SO_DEBUG) {
+ u_short save = 0;
+
+#ifdef INET6
+ if (!isipv6)
+#endif
+ {
+ save = ipov->ih_len;
+ ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen +
+ * (th->th_off << 2) */ );
+ }
+ tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
+#ifdef INET6
+ if (!isipv6)
+#endif
+ ipov->ih_len = save;
+ }
+#endif /* TCPDEBUG */
+
+ /* Log to the black box */
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
+ /* Record info on type of transmission */
+ log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay;
+ log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
+ log.u_bbr.flex3 = maxseg;
+ log.u_bbr.flex4 = delay_calc;
+ /* Encode filled_all into the upper flex5 bit */
+ log.u_bbr.flex5 = bbr->rc_past_init_win;
+ log.u_bbr.flex5 <<= 1;
+ log.u_bbr.flex5 |= bbr->rc_no_pacing;
+ log.u_bbr.flex5 <<= 29;
+ if (filled_all)
+ log.u_bbr.flex5 |= 0x80000000;
+ log.u_bbr.flex5 |= tp->t_maxseg;
+ log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
+ /* let's poke in the low and the high here for debugging */
+ log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
+ if (rsm || sack_rxmit) {
+ if (doing_tlp)
+ log.u_bbr.flex8 = 2;
+ else
+ log.u_bbr.flex8 = 1;
+ } else {
+ log.u_bbr.flex8 = 0;
+ }
+ lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
+ len, &log, false, NULL, NULL, 0, tv);
+ } else {
+ lgb = NULL;
+ }
+ /*
+ * Fill in IP length and desired time to live and send to IP level.
+ * There should be a better way to handle ttl and tos; we could keep
+ * them in the template, but need a way to checksum without them.
+ */
+ /*
+ * m->m_pkthdr.len should have been set before the checksum
+ * calculation, because in6_cksum() needs it.
+ */
+#ifdef INET6
+ if (isipv6) {
+ /*
+ * We separately set the hop limit for every segment, since the
+ * user might want to change the value via setsockopt. Also, the
+ * desired default hop limit might be changed via Neighbor
+ * Discovery.
+ */
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+
+ /*
+ * Set the packet size here for the benefit of DTrace
+ * probes. ip6_output() will set it properly; it's supposed
+ * to include the option header lengths as well.
+ */
+ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
+
+ if (V_path_mtu_discovery && maxseg > V_tcp_minmss)
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+
+ if (tp->t_state == TCPS_SYN_SENT)
+ TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
+
+ TCP_PROBE5(send, NULL, tp, ip6, tp, th);
+ /* TODO: IPv6 IP6TOS_ECT bit on */
+ error = ip6_output(m, inp->in6p_outputopts,
+ &inp->inp_route6,
+ ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
+ NULL, NULL, inp);
+
+ if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
+ mtu = inp->inp_route6.ro_rt->rt_mtu;
+ }
+#endif /* INET6 */
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ ip->ip_len = htons(m->m_pkthdr.len);
+#ifdef INET6
+ if (isipv6)
+ ip->ip_ttl = in6_selecthlim(inp, NULL);
+#endif /* INET6 */
+ /*
+ * If we do path MTU discovery, then we set DF on every
+ * packet. This might not be the best thing to do according
+ * to RFC3390 Section 2. However, the TCP hostcache mitigates
+ * the problem so it affects only the first TCP connection
+ * with a host.
+ *
+ * NB: Don't set DF on small MTU/MSS to have a safe
+ * fallback.
+ */
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ if (tp->t_port == 0 || len < V_tcp_minmss) {
+ ip->ip_off |= htons(IP_DF);
+ }
+ } else {
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+
+ if (tp->t_state == TCPS_SYN_SENT)
+ TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
+
+ TCP_PROBE5(send, NULL, tp, ip, tp, th);
+
+ error = ip_output(m, inp->inp_options, &inp->inp_route,
+ ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
+ inp);
+ if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
+ mtu = inp->inp_route.ro_rt->rt_mtu;
+ }
+#endif /* INET */
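+ /*
+ * Note that retransmissions (rsm or sack_rxmit set) pass
+ * IP_NO_SND_TAG_RL above, presumably so the rate-limit send tag is
+ * not stamped on retransmitted segments; only freshly paced data is
+ * handed to hardware pacing.
+ */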
+out:
+
+ if (lgb) {
+ lgb->tlb_errno = error;
+ lgb = NULL;
+ }
+ /*
+ * In transmit state, time the transmission and arrange for the
+ * retransmit. In persist state, just set snd_max.
+ */
+ if (error == 0) {
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->t_flags & TF_SACK_PERMIT) &&
+ tp->rcv_numsacks > 0)
+ tcp_clean_dsack_blocks(tp);
+ /* We sent an ack clear the bbr_segs_rcvd count */
+ bbr->output_error_seen = 0;
+ bbr->oerror_cnt = 0;
+ bbr->bbr_segs_rcvd = 0;
+ if (len == 0)
+ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1);
+ else if (hw_tls) {
+ if (filled_all ||
+ (len >= bbr->r_ctl.rc_pace_max_segs))
+ BBR_STAT_INC(bbr_meets_tso_thresh);
+ else {
+ if (doing_tlp) {
+ BBR_STAT_INC(bbr_miss_tlp);
+ bbr_log_type_hrdwtso(tp, bbr, len, 1, what_we_can);
+ } else if (rsm) {
+ BBR_STAT_INC(bbr_miss_retran);
+ bbr_log_type_hrdwtso(tp, bbr, len, 2, what_we_can);
+ } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > sbavail(sb)) {
+ BBR_STAT_INC(bbr_miss_tso_app);
+ bbr_log_type_hrdwtso(tp, bbr, len, 3, what_we_can);
+ } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_cwnd) {
+ BBR_STAT_INC(bbr_miss_tso_cwnd);
+ bbr_log_type_hrdwtso(tp, bbr, len, 4, what_we_can);
+ } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_wnd) {
+ BBR_STAT_INC(bbr_miss_tso_rwnd);
+ bbr_log_type_hrdwtso(tp, bbr, len, 5, what_we_can);
+ } else {
+ BBR_STAT_INC(bbr_miss_unknown);
+ bbr_log_type_hrdwtso(tp, bbr, len, 6, what_we_can);
+ }
+ }
+ }
+ /* Do accounting for new sends */
+ if ((len > 0) && (rsm == NULL)) {
+ int idx;
+ if (tp->snd_una == tp->snd_max) {
+ /*
+ * Special case to match Google: when
+ * nothing is in flight, the delivered
+ * time gets updated to the current
+ * time (see tcp_rate_bsd.c).
+ */
+ bbr->r_ctl.rc_del_time = cts;
+ }
+ if (len >= maxseg) {
+ idx = (len / maxseg) + 3;
+ if (idx >= TCP_MSS_ACCT_ATIMER)
+ counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1);
+ else
+ counter_u64_add(bbr_out_size[idx], 1);
+ } else {
+ /* smaller than a MSS */
+ idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options);
+ if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV)
+ idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1);
+ counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1);
+ }
+ }
+ }
+ abandon = 0;
+ /*
+ * We must do the send accounting before we log the output,
+ * otherwise the state of the rsm could change and we account to the
+ * wrong bucket.
+ */
+ if (len > 0) {
+ bbr_do_send_accounting(tp, bbr, rsm, len, error);
+ if (error == 0) {
+ if (tp->snd_una == tp->snd_max)
+ bbr->r_ctl.rc_tlp_rxt_last_time = cts;
+ }
+ }
+ bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error,
+ cts, mb, &abandon, rsm, 0, sb);
+ if (abandon) {
+ /*
+ * If bbr_log_output destroys the TCB or sees a TH_RST being
+ * sent we should hit this condition.
+ */
+ return (0);
+ }
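+ /*
+ * Two accounting paths follow: the normal path advances snd_max by
+ * len (plus one for a SYN and for a first FIN), while the persist
+ * path additionally folds the TCP/IP option bytes into tot_len for
+ * pacing purposes.
+ */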
+ if (((tp->t_flags & TF_FORCEDATA) == 0) ||
+ (bbr->rc_in_persist == 0)) {
+ /*
+ * Advance snd_nxt over sequence space of this segment.
+ */
+ if (error)
+ /* We don't log or do anything with errors */
+ goto skip_upd;
+
+ if (tp->snd_una == tp->snd_max &&
+ (len || (flags & (TH_SYN | TH_FIN)))) {
+ /*
+ * Update the time we just added data since none was
+ * outstanding.
+ */
+ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
+ bbr->rc_tp->t_acktime = ticks;
+ }
+ if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) {
+ if (flags & TH_SYN) {
+ tp->snd_max++;
+ }
+ if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
+ tp->snd_max++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ if (sack_rxmit == 0)
+ tp->snd_max += len;
+skip_upd:
+ if ((error == 0) && len)
+ tot_len += len;
+ } else {
+ /* Persists case */
+ int32_t xlen = len;
+
+ if (error)
+ goto nomore;
+
+ if (flags & TH_SYN)
+ ++xlen;
+ if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
+ ++xlen;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ if (xlen && (tp->snd_una == tp->snd_max)) {
+ /*
+ * Update the time we just added data since none was
+ * outstanding.
+ */
+ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
+ bbr->rc_tp->t_acktime = ticks;
+ }
+ if (sack_rxmit == 0)
+ tp->snd_max += xlen;
+ tot_len += (len + optlen + ipoptlen);
+ }
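+ /*
+ * Error handling below: ENOBUFS backs the cwnd down toward the
+ * current flight and retries on a paced timer, EMSGSIZE refreshes
+ * the MSS from the reported MTU and re-arms the pacer, and the
+ * unreachable/down errors record a soft error and fall back to the
+ * retransmit timer.
+ */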
+nomore:
+ if (error) {
+ /*
+ * Failures do not advance the seq counter above. For the
+ * case of ENOBUFS we will fall out and become ack-clocked,
+ * capping the cwnd at the current flight.
+ * Everything else will just have to retransmit with the timer
+ * (no pacer).
+ */
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ BBR_STAT_INC(bbr_saw_oerr);
+ /* Clear all delay/early tracks */
+ bbr->r_ctl.rc_hptsi_agg_delay = 0;
+ bbr->r_ctl.rc_agg_early = 0;
+ bbr->r_agg_early_set = 0;
+ bbr->output_error_seen = 1;
+ if (bbr->oerror_cnt < 0xf)
+ bbr->oerror_cnt++;
+ if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) {
+ /* drop the session */
+ tcp_set_inp_to_drop(inp, ENETDOWN);
+ }
+ switch (error) {
+ case ENOBUFS:
+ /*
+ * Make this guy have to get ACKs to send
+ * more, but let's make sure we don't
+ * slam him below a T-O (1 MSS).
+ */
+ if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
+ tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)) - maxseg;
+ if (tp->snd_cwnd < maxseg)
+ tp->snd_cwnd = maxseg;
+ }
+ slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
+ BBR_STAT_INC(bbr_saw_enobuf);
+ if (bbr->bbr_hdrw_pacing)
+ counter_u64_add(bbr_hdwr_pacing_enobuf, 1);
+ else
+ counter_u64_add(bbr_nohdwr_pacing_enobuf, 1);
+ /*
+ * Even in the ENOBUFS case we want to do our
+ * state update, the reason being that we may have
+ * been called by the input function. If so, things
+ * may have changed.
+ */
+ error = 0;
+ goto enobufs;
+ case EMSGSIZE:
+ /*
+ * For some reason the interface we used initially
+ * to send segments changed to another or lowered
+ * its MTU. If TSO was active we either got an
+ * interface without TSO capabilities or TSO was
+ * turned off. If we obtained mtu from ip_output()
+ * then update it and try again.
+ */
+ /* Turn on tracing (or try to) */
+ {
+ int old_maxseg;
+
+ old_maxseg = tp->t_maxseg;
+ BBR_STAT_INC(bbr_saw_emsgsiz);
+ bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts);
+ if (mtu != 0)
+ tcp_mss_update(tp, -1, mtu, NULL, NULL);
+ if (old_maxseg <= tp->t_maxseg) {
+ /* Huh it did not shrink? */
+ tp->t_maxseg = old_maxseg - 40;
+ bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts);
+ }
+ tp->t_flags &= ~TF_FORCEDATA;
+ /*
+ * Nuke all other things that can interfere
+ * with slot
+ */
+ if ((tot_len + len) && (len >= tp->t_maxseg)) {
+ slot = bbr_get_pacing_delay(bbr,
+ bbr->r_ctl.rc_bbr_hptsi_gain,
+ (tot_len + len), cts, 0);
+ if (slot < bbr_error_base_paceout)
+ slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
+ } else
+ slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
+ bbr->rc_output_starts_timer = 1;
+ bbr_start_hpts_timer(bbr, tp, cts, 10, slot,
+ tot_len);
+ return (error);
+ }
+ case EPERM:
+ tp->t_softerror = error;
+ /* Fall through */
+ case EHOSTDOWN:
+ case EHOSTUNREACH:
+ case ENETDOWN:
+ case ENETUNREACH:
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_softerror = error;
+ }
+ /* FALLTHROUGH */
+ default:
+ tp->t_flags &= ~TF_FORCEDATA;
+ slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
+ bbr->rc_output_starts_timer = 1;
+ bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
+ return (error);
+ }
+#ifdef NETFLIX_STATS
+ } else if (((tp->t_flags & TF_GPUTINPROG) == 0) &&
+ len &&
+ (rsm == NULL) &&
+ (bbr->rc_in_persist == 0)) {
+ tp->gput_seq = bbr_seq;
+ tp->gput_ack = bbr_seq +
+ min(sbavail(&so->so_snd) - sb_offset, sendwin);
+ tp->gput_ts = cts;
+ tp->t_flags |= TF_GPUTINPROG;
+#endif
+ }
+ TCPSTAT_INC(tcps_sndtotal);
+ if ((bbr->bbr_hdw_pace_ena) &&
+ (bbr->bbr_attempt_hdwr_pace == 0) &&
+ (bbr->rc_past_init_win) &&
+ (bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
+ (get_filter_value(&bbr->r_ctl.rc_delrate)) &&
+ (inp->inp_route.ro_rt &&
+ inp->inp_route.ro_rt->rt_ifp)) {
+ /*
+ * We are past the initial window and
+ * have at least one measurement, so we
+ * could use hardware pacing if it is available.
+ * We have an interface and we have not attempted
+ * to set up hardware pacing, so let's try now.
+ */
+ uint64_t rate_wanted;
+ int err = 0;
+
+ rate_wanted = bbr_get_hardware_rate(bbr);
+ bbr->bbr_attempt_hdwr_pace = 1;
+ bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp,
+ inp->inp_route.ro_rt->rt_ifp,
+ rate_wanted,
+ (RS_PACING_GEQ|RS_PACING_SUB_OK),
+ &err);
+ if (bbr->r_ctl.crte) {
+ bbr_type_log_hdwr_pacing(bbr,
+ bbr->r_ctl.crte->ptbl->rs_ifp,
+ rate_wanted,
+ bbr->r_ctl.crte->rate,
+ __LINE__, cts, err);
+ BBR_STAT_INC(bbr_hdwr_rl_add_ok);
+ counter_u64_add(bbr_flows_nohdwr_pacing, -1);
+ counter_u64_add(bbr_flows_whdwr_pacing, 1);
+ bbr->bbr_hdrw_pacing = 1;
+ /* Now what is our gain status? */
+ if (bbr->r_ctl.crte->rate < rate_wanted) {
+ /* We have a problem */
+ bbr_setup_less_of_rate(bbr, cts,
+ bbr->r_ctl.crte->rate, rate_wanted);
+ } else {
+ /* We are good */
+ bbr->gain_is_limited = 0;
+ bbr->skip_gain = 0;
+ }
+ tcp_bbr_tso_size_check(bbr, cts);
+ } else {
+ bbr_type_log_hdwr_pacing(bbr,
+ inp->inp_route.ro_rt->rt_ifp,
+ rate_wanted,
+ 0,
+ __LINE__, cts, err);
+ BBR_STAT_INC(bbr_hdwr_rl_add_fail);
+ }
+ }
+ if (bbr->bbr_hdrw_pacing) {
+ /*
+ * Worry about cases where the route
+ * changed or something else happened that
+ * lost our hardware pacing, possibly during
+ * the last ip_output call.
+ */
+ if (inp->inp_snd_tag == NULL) {
+ /* A change during ip output disabled hw pacing? */
+ bbr->bbr_hdrw_pacing = 0;
+ } else if ((inp->inp_route.ro_rt == NULL) ||
+ (inp->inp_route.ro_rt->rt_ifp != inp->inp_snd_tag->ifp)) {
+ /*
+ * We had an interface or route change,
+ * detach from the current hdwr pacing
+ * and setup to re-attempt next go
+ * round.
+ */
+ bbr->bbr_hdrw_pacing = 0;
+ bbr->bbr_attempt_hdwr_pace = 0;
+ tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
+ tcp_bbr_tso_size_check(bbr, cts);
+ }
+ }
+ /*
+ * Data sent (as far as we can tell). If this advertises a larger
+ * window than any other segment, then remember the size of the
+ * advertised window. Any pending ACK has now been sent.
+ */
+ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
+ tp->rcv_adv = tp->rcv_nxt + recwin;
+
+ tp->last_ack_sent = tp->rcv_nxt;
+ if ((error == 0) &&
+ (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) &&
+ (doing_tlp == 0) &&
+ (tso == 0) &&
+ (hw_tls == 0) &&
+ (len > 0) &&
+ ((flags & TH_RST) == 0) &&
+ (IN_RECOVERY(tp->t_flags) == 0) &&
+ (bbr->rc_in_persist == 0) &&
+ ((tp->t_flags & TF_FORCEDATA) == 0) &&
+ (tot_len < bbr->r_ctl.rc_pace_max_segs)) {
+ /*
+ * For non-TSO we need to goto again until we have sent out
+ * enough data to match what we pace out (hptsi) every hptsi
+ * interval.
+ */
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ /* Make sure snd_nxt is drug up */
+ tp->snd_nxt = tp->snd_max;
+ }
+ if (rsm != NULL) {
+ rsm = NULL;
+ goto skip_again;
+ }
+ rsm = NULL;
+ sack_rxmit = 0;
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA);
+ goto again;
+ }
+skip_again:
+ if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
+ /*
+ * Calculate/Re-Calculate the hptsi slot in usecs based on
+ * what we have sent so far
+ */
+ slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
+ if (bbr->rc_no_pacing)
+ slot = 0;
+ }
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA);
+enobufs:
+ if (bbr->rc_use_google == 0)
+ bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
+ bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
+ bbr->r_ctl.rc_lost_bytes)));
+ bbr->rc_output_starts_timer = 1;
+ if (bbr->bbr_use_rack_cheat &&
+ (more_to_rxt ||
+ ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {
+ /* Rack cheats and shotguns out all rxt's 1ms apart */
+ if (slot > 1000)
+ slot = 1000;
+ }
+ if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {
+ /*
+ * We don't change the TSO size until some number of sends
+ * have occurred, to give the hardware commands time to get
+ * down to the interface.
+ */
+ bbr->r_ctl.bbr_hdwr_cnt_noset_snt++;
+ if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) {
+ bbr->hw_pacing_set = 1;
+ tcp_bbr_tso_size_check(bbr, cts);
+ }
+ }
+ bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len);
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ /* Make sure snd_nxt is drug up */
+ tp->snd_nxt = tp->snd_max;
+ }
+ return (error);
+
+}
+
+/*
+ * See bbr_output_wtime() for return values.
+ */
+static int
+bbr_output(struct tcpcb *tp)
+{
+ int32_t ret;
+ struct timeval tv;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ (void)tcp_get_usecs(&tv);
+ ret = bbr_output_wtime(tp, &tv);
+ return (ret);
+}
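+
+/*
+ * Illustrative sketch (not part of this file): a userland application
+ * would typically switch a socket onto this stack with the
+ * TCP_FUNCTION_BLK socket option, assuming the module is loaded and
+ * registers under the name "bbr":
+ *
+ *	struct tcp_function_set tfs;
+ *	int s = socket(AF_INET, SOCK_STREAM, 0);
+ *
+ *	memset(&tfs, 0, sizeof(tfs));
+ *	strlcpy(tfs.function_set_name, "bbr", sizeof(tfs.function_set_name));
+ *	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
+ *	    &tfs, sizeof(tfs)) == -1)
+ *		err(1, "TCP_FUNCTION_BLK");
+ */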
+
+static void
+bbr_mtu_chg(struct tcpcb *tp)
+{
+ struct tcp_bbr *bbr;
+ struct bbr_sendmap *rsm, *frsm = NULL;
+ uint32_t maxseg;
+
+ /*
+ * The MTU has changed. a) Clear the sack filter. b) Mark everything
+ * over the current size as SACK_PASS so a retransmit will occur.
+ */
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ maxseg = tp->t_maxseg - bbr->rc_last_options;
+ sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
+ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
+ /* Don't mess with ones acked (by sack?) */
+ if (rsm->r_flags & BBR_ACKED)
+ continue;
+ if ((rsm->r_end - rsm->r_start) > maxseg) {
+ /*
+ * We mark sack-passed on all the previous large
+ * sends we did. This will force them to be retransmitted.
+ */
+ rsm->r_flags |= BBR_SACK_PASSED;
+ if (((rsm->r_flags & BBR_MARKED_LOST) == 0) &&
+ bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) {
+ bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
+ bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
+ rsm->r_flags |= BBR_MARKED_LOST;
+ }
+ if (frsm == NULL)
+ frsm = rsm;
+ }
+ }
+ if (frsm) {
+ bbr->r_ctl.rc_resend = frsm;
+ }
+}
+
+/*
+ * bbr_ctloutput() must drop the inpcb lock before performing copyin on
+ * socket option arguments. When it re-acquires the lock after the copy, it
+ * has to revalidate that the connection is still valid for the socket
+ * option.
+ */
+static int
+bbr_set_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr)
+{
+ int32_t error = 0, optval;
+
+ switch (sopt->sopt_name) {
+ case TCP_RACK_PACE_MAX_SEG:
+ case TCP_RACK_MIN_TO:
+ case TCP_RACK_REORD_THRESH:
+ case TCP_RACK_REORD_FADE:
+ case TCP_RACK_TLP_THRESH:
+ case TCP_RACK_PKT_DELAY:
+ case TCP_BBR_ALGORITHM:
+ case TCP_BBR_TSLIMITS:
+ case TCP_BBR_IWINTSO:
+ case TCP_BBR_RECFORCE:
+ case TCP_BBR_STARTUP_PG:
+ case TCP_BBR_DRAIN_PG:
+ case TCP_BBR_RWND_IS_APP:
+ case TCP_BBR_PROBE_RTT_INT:
+ case TCP_BBR_PROBE_RTT_GAIN:
+ case TCP_BBR_PROBE_RTT_LEN:
+ case TCP_BBR_STARTUP_LOSS_EXIT:
+ case TCP_BBR_USEDEL_RATE:
+ case TCP_BBR_MIN_RTO:
+ case TCP_BBR_MAX_RTO:
+ case TCP_BBR_PACE_PER_SEC:
+ case TCP_DELACK:
+ case TCP_BBR_PACE_DEL_TAR:
+ case TCP_BBR_SEND_IWND_IN_TSO:
+ case TCP_BBR_EXTRA_STATE:
+ case TCP_BBR_UTTER_MAX_TSO:
+ case TCP_BBR_MIN_TOPACEOUT:
+ case TCP_BBR_FLOOR_MIN_TSO:
+ case TCP_BBR_TSTMP_RAISES:
+ case TCP_BBR_POLICER_DETECT:
+ case TCP_BBR_USE_RACK_CHEAT:
+ case TCP_DATA_AFTER_CLOSE:
+ case TCP_BBR_HDWR_PACE:
+ case TCP_BBR_PACE_SEG_MAX:
+ case TCP_BBR_PACE_SEG_MIN:
+ case TCP_BBR_PACE_CROSS:
+ case TCP_BBR_PACE_OH:
+#ifdef NETFLIX_PEAKRATE
+ case TCP_MAXPEAKRATE:
+#endif
+ case TCP_BBR_TMR_PACE_OH:
+ case TCP_BBR_RACK_RTT_USE:
+ case TCP_BBR_RETRAN_WTSO:
+ break;
+ default:
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ if (error)
+ return (error);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
+ }
+ tp = intotcpcb(inp);
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ switch (sopt->sopt_name) {
+ case TCP_BBR_PACE_PER_SEC:
+ BBR_OPTS_INC(tcp_bbr_pace_per_sec);
+ bbr->r_ctl.bbr_hptsi_per_second = optval;
+ break;
+ case TCP_BBR_PACE_DEL_TAR:
+ BBR_OPTS_INC(tcp_bbr_pace_del_tar);
+ bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval;
+ break;
+ case TCP_BBR_PACE_SEG_MAX:
+ BBR_OPTS_INC(tcp_bbr_pace_seg_max);
+ bbr->r_ctl.bbr_hptsi_segments_max = optval;
+ break;
+ case TCP_BBR_PACE_SEG_MIN:
+ BBR_OPTS_INC(tcp_bbr_pace_seg_min);
+ bbr->r_ctl.bbr_hptsi_bytes_min = optval;
+ break;
+ case TCP_BBR_PACE_CROSS:
+ BBR_OPTS_INC(tcp_bbr_pace_cross);
+ bbr->r_ctl.bbr_cross_over = optval;
+ break;
+ case TCP_BBR_ALGORITHM:
+ BBR_OPTS_INC(tcp_bbr_algorithm);
+ if (optval && (bbr->rc_use_google == 0)) {
+ /* Turn on the google mode */
+ bbr_google_mode_on(bbr);
+ if ((optval > 3) && (optval < 500)) {
+ /*
+ * Must be at least greater than .3%
+ * and must be less than 50.0%.
+ */
+ bbr->r_ctl.bbr_google_discount = optval;
+ }
+ } else if ((optval == 0) && (bbr->rc_use_google == 1)) {
+ /* Turn off the google mode */
+ bbr_google_mode_off(bbr);
+ }
+ break;
+ case TCP_BBR_TSLIMITS:
+ BBR_OPTS_INC(tcp_bbr_tslimits);
+ if (optval == 1)
+ bbr->rc_use_ts_limit = 1;
+ else if (optval == 0)
+ bbr->rc_use_ts_limit = 0;
+ else
+ error = EINVAL;
+ break;
+
+ case TCP_BBR_IWINTSO:
+ BBR_OPTS_INC(tcp_bbr_iwintso);
+ if ((optval >= 0) && (optval < 128)) {
+ uint32_t twin;
+
+ bbr->rc_init_win = optval;
+ twin = bbr_initial_cwnd(bbr, tp);
+ if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd))
+ tp->snd_cwnd = twin;
+ else
+ error = EBUSY;
+ } else
+ error = EINVAL;
+ break;
+ case TCP_BBR_STARTUP_PG:
+ BBR_OPTS_INC(tcp_bbr_startup_pg);
+ if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) {
+ bbr->r_ctl.rc_startup_pg = optval;
+ if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
+ bbr->r_ctl.rc_bbr_hptsi_gain = optval;
+ }
+ } else
+ error = EINVAL;
+ break;
+ case TCP_BBR_DRAIN_PG:
+ BBR_OPTS_INC(tcp_bbr_drain_pg);
+ if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE))
+ bbr->r_ctl.rc_drain_pg = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_BBR_PROBE_RTT_LEN:
+ BBR_OPTS_INC(tcp_bbr_probertt_len);
+ if (optval <= 1)
+ reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND));
+ else
+ error = EINVAL;
+ break;
+ case TCP_BBR_PROBE_RTT_GAIN:
+ BBR_OPTS_INC(tcp_bbr_probertt_gain);
+ if (optval <= BBR_UNIT)
+ bbr->r_ctl.bbr_rttprobe_gain_val = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_BBR_PROBE_RTT_INT:
+ BBR_OPTS_INC(tcp_bbr_probe_rtt_int);
+ if (optval > 1000)
+ bbr->r_ctl.rc_probertt_int = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_BBR_MIN_TOPACEOUT:
+ BBR_OPTS_INC(tcp_bbr_topaceout);
+ if (optval == 0) {
+ bbr->no_pacing_until = 0;
+ bbr->rc_no_pacing = 0;
+ } else if (optval <= 0x00ff) {
+ bbr->no_pacing_until = optval;
+ if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) &&
+ (bbr->rc_bbr_state == BBR_STATE_STARTUP)){
+ /* Turn on no pacing */
+ bbr->rc_no_pacing = 1;
+ }
+ } else
+ error = EINVAL;
+ break;
+ case TCP_BBR_STARTUP_LOSS_EXIT:
+ BBR_OPTS_INC(tcp_bbr_startup_loss_exit);
+ bbr->rc_loss_exit = optval;
+ break;
+ case TCP_BBR_USEDEL_RATE:
+ error = EINVAL;
+ break;
+ case TCP_BBR_MIN_RTO:
+ BBR_OPTS_INC(tcp_bbr_min_rto);
+ bbr->r_ctl.rc_min_rto_ms = optval;
+ break;
+ case TCP_BBR_MAX_RTO:
+ BBR_OPTS_INC(tcp_bbr_max_rto);
+ bbr->rc_max_rto_sec = optval;
+ break;
+ case TCP_RACK_MIN_TO:
+ /* Minimum time between rack t-o's in ms */
+ BBR_OPTS_INC(tcp_rack_min_to);
+ bbr->r_ctl.rc_min_to = optval;
+ break;
+ case TCP_RACK_REORD_THRESH:
+ /* RACK reorder threshold (shift amount) */
+ BBR_OPTS_INC(tcp_rack_reord_thresh);
+ if ((optval > 0) && (optval < 31))
+ bbr->r_ctl.rc_reorder_shift = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_RACK_REORD_FADE:
+ /* Does reordering fade after ms time */
+ BBR_OPTS_INC(tcp_rack_reord_fade);
+ bbr->r_ctl.rc_reorder_fade = optval;
+ break;
+ case TCP_RACK_TLP_THRESH:
+ /* RACK TLP threshold i.e. srtt+(srtt/N) */
+ BBR_OPTS_INC(tcp_rack_tlp_thresh);
+ if (optval)
+ bbr->rc_tlp_threshold = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_BBR_USE_RACK_CHEAT:
+ BBR_OPTS_INC(tcp_use_rackcheat);
+ if (bbr->rc_use_google) {
+ error = EINVAL;
+ break;
+ }
+ BBR_OPTS_INC(tcp_rack_cheat);
+ if (optval)
+ bbr->bbr_use_rack_cheat = 1;
+ else
+ bbr->bbr_use_rack_cheat = 0;
+ break;
+ case TCP_BBR_FLOOR_MIN_TSO:
+ BBR_OPTS_INC(tcp_utter_max_tso);
+ if ((optval >= 0) && (optval < 40))
+ bbr->r_ctl.bbr_hptsi_segments_floor = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_BBR_UTTER_MAX_TSO:
+ BBR_OPTS_INC(tcp_utter_max_tso);
+ if ((optval >= 0) && (optval < 0xffff))
+ bbr->r_ctl.bbr_utter_max = optval;
+ else
+ error = EINVAL;
+ break;
+
+ case TCP_BBR_EXTRA_STATE:
+ BBR_OPTS_INC(tcp_extra_state);
+ if (optval)
+ bbr->rc_use_idle_restart = 1;
+ else
+ bbr->rc_use_idle_restart = 0;
+ break;
+ case TCP_BBR_SEND_IWND_IN_TSO:
+ BBR_OPTS_INC(tcp_iwnd_tso);
+ if (optval) {
+ bbr->bbr_init_win_cheat = 1;
+ if (bbr->rc_past_init_win == 0) {
+ uint32_t cts;
+ cts = tcp_get_usecs(&bbr->rc_tv);
+ tcp_bbr_tso_size_check(bbr, cts);
+ }
+ } else
+ bbr->bbr_init_win_cheat = 0;
+ break;
+ case TCP_BBR_HDWR_PACE:
+ BBR_OPTS_INC(tcp_hdwr_pacing);
+ if (optval){
+ bbr->bbr_hdw_pace_ena = 1;
+ bbr->bbr_attempt_hdwr_pace = 0;
+ } else {
+ bbr->bbr_hdw_pace_ena = 0;
+ if (bbr->bbr_hdrw_pacing) {
+ bbr->bbr_hdrw_pacing = 0;
+ in_pcbdetach_txrtlmt(bbr->rc_inp);
+ }
+ }
+ break;
+
+ case TCP_DELACK:
+ BBR_OPTS_INC(tcp_delack);
+ if (optval < 100) {
+ if (optval == 0) /* off */
+ tp->t_delayed_ack = 0;
+ else if (optval == 1) /* on which is 2 */
+ tp->t_delayed_ack = 2;
+ else /* higher than 2 and less than 100 */
+ tp->t_delayed_ack = optval;
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tp->t_flags |= TF_ACKNOW;
+ bbr_output(tp);
+ }
+ } else
+ error = EINVAL;
+ break;
+ case TCP_RACK_PKT_DELAY:
+ /* RACK added ms i.e. rack-rtt + reord + N */
+ BBR_OPTS_INC(tcp_rack_pkt_delay);
+ bbr->r_ctl.rc_pkt_delay = optval;
+ break;
+#ifdef NETFLIX_PEAKRATE
+ case TCP_MAXPEAKRATE:
+ BBR_OPTS_INC(tcp_maxpeak);
+ error = tcp_set_maxpeakrate(tp, optval);
+ if (!error)
+ tp->t_peakrate_thr = tp->t_maxpeakrate;
+ break;
+#endif
+ case TCP_BBR_RETRAN_WTSO:
+ BBR_OPTS_INC(tcp_retran_wtso);
+ if (optval)
+ bbr->rc_resends_use_tso = 1;
+ else
+ bbr->rc_resends_use_tso = 0;
+ break;
+ case TCP_DATA_AFTER_CLOSE:
+ BBR_OPTS_INC(tcp_data_ac);
+ if (optval)
+ bbr->rc_allow_data_af_clo = 1;
+ else
+ bbr->rc_allow_data_af_clo = 0;
+ break;
+ case TCP_BBR_POLICER_DETECT:
+ BBR_OPTS_INC(tcp_policer_det);
+ if (bbr->rc_use_google == 0)
+ error = EINVAL;
+ else if (optval)
+ bbr->r_use_policer = 1;
+ else
+ bbr->r_use_policer = 0;
+ break;
+
+ case TCP_BBR_TSTMP_RAISES:
+ BBR_OPTS_INC(tcp_ts_raises);
+ if (optval)
+ bbr->ts_can_raise = 1;
+ else
+ bbr->ts_can_raise = 0;
+ break;
+ case TCP_BBR_TMR_PACE_OH:
+ BBR_OPTS_INC(tcp_pacing_oh_tmr);
+ if (bbr->rc_use_google) {
+ error = EINVAL;
+ } else {
+ if (optval)
+ bbr->r_ctl.rc_incr_tmrs = 1;
+ else
+ bbr->r_ctl.rc_incr_tmrs = 0;
+ }
+ break;
+ case TCP_BBR_PACE_OH:
+ BBR_OPTS_INC(tcp_pacing_oh);
+ if (bbr->rc_use_google) {
+ error = EINVAL;
+ } else {
+ if (optval > (BBR_INCL_TCP_OH|
+ BBR_INCL_IP_OH|
+ BBR_INCL_ENET_OH)) {
+ error = EINVAL;
+ break;
+ }
+ if (optval & BBR_INCL_TCP_OH)
+ bbr->r_ctl.rc_inc_tcp_oh = 1;
+ else
+ bbr->r_ctl.rc_inc_tcp_oh = 0;
+ if (optval & BBR_INCL_IP_OH)
+ bbr->r_ctl.rc_inc_ip_oh = 1;
+ else
+ bbr->r_ctl.rc_inc_ip_oh = 0;
+ if (optval & BBR_INCL_ENET_OH)
+ bbr->r_ctl.rc_inc_enet_oh = 1;
+ else
+ bbr->r_ctl.rc_inc_enet_oh = 0;
+ }
+ break;
+ default:
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+#ifdef NETFLIX_STATS
+ tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
+#endif
+ INP_WUNLOCK(inp);
+ return (error);
+}
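For reference, a minimal userland sketch of exercising this set path. It assumes the TCP_BBR_* option numbers are visible through <netinet/tcp.h> and that the connection has already been handed to this stack (see the note after the module declaration below); any of the boolean knobs above would look the same.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <err.h>

static void
example_set_bbr_option(int s)
{
	/* Hypothetical helper; TCP_BBR_RETRAN_WTSO is assumed exported to userland. */
	int optval = 1;		/* enable TSO on resends, per the case above */

	if (setsockopt(s, IPPROTO_TCP, TCP_BBR_RETRAN_WTSO,
	    &optval, sizeof(optval)) == -1)
		err(1, "setsockopt(TCP_BBR_RETRAN_WTSO)");
}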
+
+/*
+ * return 0 on success, error-num on failure
+ */
+static int
+bbr_get_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr)
+{
+ int32_t error, optval;
+
+ /*
+ * Because all our options are either boolean or an int, we can just
+ * pull everything into optval and then unlock and copy. If we ever
+ * add an option that is not an int, then this will have quite an
+ * impact on this routine.
+ */
+ switch (sopt->sopt_name) {
+ case TCP_BBR_PACE_PER_SEC:
+ optval = bbr->r_ctl.bbr_hptsi_per_second;
+ break;
+ case TCP_BBR_PACE_DEL_TAR:
+ optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar;
+ break;
+ case TCP_BBR_PACE_SEG_MAX:
+ optval = bbr->r_ctl.bbr_hptsi_segments_max;
+ break;
+ case TCP_BBR_MIN_TOPACEOUT:
+ optval = bbr->no_pacing_until;
+ break;
+ case TCP_BBR_PACE_SEG_MIN:
+ optval = bbr->r_ctl.bbr_hptsi_bytes_min;
+ break;
+ case TCP_BBR_PACE_CROSS:
+ optval = bbr->r_ctl.bbr_cross_over;
+ break;
+ case TCP_BBR_ALGORITHM:
+ optval = bbr->rc_use_google;
+ break;
+ case TCP_BBR_TSLIMITS:
+ optval = bbr->rc_use_ts_limit;
+ break;
+ case TCP_BBR_IWINTSO:
+ optval = bbr->rc_init_win;
+ break;
+ case TCP_BBR_STARTUP_PG:
+ optval = bbr->r_ctl.rc_startup_pg;
+ break;
+ case TCP_BBR_DRAIN_PG:
+ optval = bbr->r_ctl.rc_drain_pg;
+ break;
+ case TCP_BBR_PROBE_RTT_INT:
+ optval = bbr->r_ctl.rc_probertt_int;
+ break;
+ case TCP_BBR_PROBE_RTT_LEN:
+ optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND);
+ break;
+ case TCP_BBR_PROBE_RTT_GAIN:
+ optval = bbr->r_ctl.bbr_rttprobe_gain_val;
+ break;
+ case TCP_BBR_STARTUP_LOSS_EXIT:
+ optval = bbr->rc_loss_exit;
+ break;
+ case TCP_BBR_USEDEL_RATE:
+ error = EINVAL;
+ break;
+ case TCP_BBR_MIN_RTO:
+ optval = bbr->r_ctl.rc_min_rto_ms;
+ break;
+ case TCP_BBR_MAX_RTO:
+ optval = bbr->rc_max_rto_sec;
+ break;
+ case TCP_RACK_PACE_MAX_SEG:
+ /* Max segments in a pace */
+ optval = bbr->r_ctl.rc_pace_max_segs;
+ break;
+ case TCP_RACK_MIN_TO:
+ /* Minimum time between rack t-o's in ms */
+ optval = bbr->r_ctl.rc_min_to;
+ break;
+ case TCP_RACK_REORD_THRESH:
+ /* RACK reorder threshold (shift amount) */
+ optval = bbr->r_ctl.rc_reorder_shift;
+ break;
+ case TCP_RACK_REORD_FADE:
+ /* Does reordering fade after ms time */
+ optval = bbr->r_ctl.rc_reorder_fade;
+ break;
+ case TCP_BBR_USE_RACK_CHEAT:
+ /* Do we use the rack cheat for rxt */
+ optval = bbr->bbr_use_rack_cheat;
+ break;
+ case TCP_BBR_FLOOR_MIN_TSO:
+ optval = bbr->r_ctl.bbr_hptsi_segments_floor;
+ break;
+ case TCP_BBR_UTTER_MAX_TSO:
+ optval = bbr->r_ctl.bbr_utter_max;
+ break;
+ case TCP_BBR_SEND_IWND_IN_TSO:
+ /* Do we send TSO size segments initially */
+ optval = bbr->bbr_init_win_cheat;
+ break;
+ case TCP_BBR_EXTRA_STATE:
+ optval = bbr->rc_use_idle_restart;
+ break;
+ case TCP_RACK_TLP_THRESH:
+ /* RACK TLP threshold i.e. srtt+(srtt/N) */
+ optval = bbr->rc_tlp_threshold;
+ break;
+ case TCP_RACK_PKT_DELAY:
+ /* RACK added ms i.e. rack-rtt + reord + N */
+ optval = bbr->r_ctl.rc_pkt_delay;
+ break;
+ case TCP_BBR_RETRAN_WTSO:
+ optval = bbr->rc_resends_use_tso;
+ break;
+ case TCP_DATA_AFTER_CLOSE:
+ optval = bbr->rc_allow_data_af_clo;
+ break;
+ case TCP_DELACK:
+ optval = tp->t_delayed_ack;
+ break;
+ case TCP_BBR_HDWR_PACE:
+ optval = bbr->bbr_hdw_pace_ena;
+ break;
+ case TCP_BBR_POLICER_DETECT:
+ optval = bbr->r_use_policer;
+ break;
+ case TCP_BBR_TSTMP_RAISES:
+ optval = bbr->ts_can_raise;
+ break;
+ case TCP_BBR_TMR_PACE_OH:
+ optval = bbr->r_ctl.rc_incr_tmrs;
+ break;
+ case TCP_BBR_PACE_OH:
+ optval = 0;
+ if (bbr->r_ctl.rc_inc_tcp_oh)
+ optval |= BBR_INCL_TCP_OH;
+ if (bbr->r_ctl.rc_inc_ip_oh)
+ optval |= BBR_INCL_IP_OH;
+ if (bbr->r_ctl.rc_inc_enet_oh)
+ optval |= BBR_INCL_ENET_OH;
+ break;
+ default:
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ return (error);
+}
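The read-back side is just as uniform: every case above hands back an int via sooptcopyout(), so one hypothetical helper (same userland assumptions as the set-side sketch) covers them all.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <err.h>

static int
example_get_bbr_option(int s, int optname)
{
	int optval;
	socklen_t optlen = sizeof(optval);

	/* optname is any of the TCP_BBR_ or TCP_RACK_ options handled above. */
	if (getsockopt(s, IPPROTO_TCP, optname, &optval, &optlen) == -1)
		err(1, "getsockopt");
	return (optval);
}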
+
+/*
+ * return 0 on success, error-num on failure
+ */
+static int
+bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+ int32_t error = EINVAL;
+ struct tcp_bbr *bbr;
+
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ if (bbr == NULL) {
+ /* Huh? */
+ goto out;
+ }
+ if (sopt->sopt_dir == SOPT_SET) {
+ return (bbr_set_sockopt(so, sopt, inp, tp, bbr));
+ } else if (sopt->sopt_dir == SOPT_GET) {
+ return (bbr_get_sockopt(so, sopt, inp, tp, bbr));
+ }
+out:
+ INP_WUNLOCK(inp);
+ return (error);
+}
+
+
+struct tcp_function_block __tcp_bbr = {
+ .tfb_tcp_block_name = __XSTRING(STACKNAME),
+ .tfb_tcp_output = bbr_output,
+ .tfb_do_queued_segments = ctf_do_queued_segments,
+ .tfb_do_segment_nounlock = bbr_do_segment_nounlock,
+ .tfb_tcp_do_segment = bbr_do_segment,
+ .tfb_tcp_ctloutput = bbr_ctloutput,
+ .tfb_tcp_fb_init = bbr_init,
+ .tfb_tcp_fb_fini = bbr_fini,
+ .tfb_tcp_timer_stop_all = bbr_stopall,
+ .tfb_tcp_timer_activate = bbr_timer_activate,
+ .tfb_tcp_timer_active = bbr_timer_active,
+ .tfb_tcp_timer_stop = bbr_timer_stop,
+ .tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
+ .tfb_tcp_handoff_ok = bbr_handoff_ok,
+ .tfb_tcp_mtu_chg = bbr_mtu_chg
+};
+
+static const char *bbr_stack_names[] = {
+ __XSTRING(STACKNAME),
+#ifdef STACKALIAS
+ __XSTRING(STACKALIAS),
+#endif
+};
+
+static bool bbr_mod_inited = false;
+
+static int
+tcp_addbbr(module_t mod, int32_t type, void *data)
+{
+ int32_t err = 0;
+ int num_stacks;
+
+ switch (type) {
+ case MOD_LOAD:
+ printf("Attempting to load " __XSTRING(MODNAME) "\n");
+ bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
+ sizeof(struct bbr_sendmap),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
+ sizeof(struct tcp_bbr),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
+ sysctl_ctx_init(&bbr_sysctl_ctx);
+ bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
+ OID_AUTO,
+#ifdef STACKALIAS
+ __XSTRING(STACKALIAS),
+#else
+ __XSTRING(STACKNAME),
+#endif
+ CTLFLAG_RW, 0,
+ "");
+ if (bbr_sysctl_root == NULL) {
+ printf("Failed to add sysctl node\n");
+ err = EFAULT;
+ goto free_uma;
+ }
+ bbr_init_sysctls();
+ num_stacks = nitems(bbr_stack_names);
+ err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK,
+ bbr_stack_names, &num_stacks);
+ if (err) {
+ printf("Failed to register %s stack name for "
+ "%s module\n", bbr_stack_names[num_stacks],
+ __XSTRING(MODNAME));
+ sysctl_ctx_free(&bbr_sysctl_ctx);
+ free_uma:
+ uma_zdestroy(bbr_zone);
+ uma_zdestroy(bbr_pcb_zone);
+ bbr_counter_destroy();
+ printf("Failed to register " __XSTRING(MODNAME)
+ " module err:%d\n", err);
+ return (err);
+ }
+ tcp_lro_reg_mbufq();
+ bbr_mod_inited = true;
+ printf(__XSTRING(MODNAME) " is now available\n");
+ break;
+ case MOD_QUIESCE:
+ err = deregister_tcp_functions(&__tcp_bbr, true, false);
+ break;
+ case MOD_UNLOAD:
+ err = deregister_tcp_functions(&__tcp_bbr, false, true);
+ if (err == EBUSY)
+ break;
+ if (bbr_mod_inited) {
+ uma_zdestroy(bbr_zone);
+ uma_zdestroy(bbr_pcb_zone);
+ sysctl_ctx_free(&bbr_sysctl_ctx);
+ bbr_counter_destroy();
+ printf(__XSTRING(MODNAME)
+ " is now no longer available\n");
+ bbr_mod_inited = false;
+ }
+ tcp_lro_dereg_mbufq();
+ err = 0;
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (err);
+}
+
+static moduledata_t tcp_bbr = {
+ .name = __XSTRING(MODNAME),
+ .evhand = tcp_addbbr,
+ .priv = 0
+};
+
+MODULE_VERSION(MODNAME, 1);
+DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
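Usage sketch (illustrative, not part of the diff): once the module declared above is loaded, a socket opts into the stack through the pre-existing TCP_FUNCTION_BLK option, naming the block registered via __XSTRING(STACKNAME) ("bbr" in the default build); alternatively net.inet.tcp.functions_default can be pointed at that name to make it the default for new connections.

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <err.h>

static void
example_switch_stack(int s, const char *name)	/* e.g. "bbr" */
{
	struct tcp_function_set fs;

	memset(&fs, 0, sizeof(fs));
	strlcpy(fs.function_set_name, name, sizeof(fs.function_set_name));
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
	    &fs, sizeof(fs)) == -1)
		err(1, "setsockopt(TCP_FUNCTION_BLK)");
}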
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 7ef1f3cc7832..f4a17e4dfc4b 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -1,6 +1,5 @@
/*-
- * Copyright (c) 2016-2019
- * Netflix Inc. All rights reserved.
+ * Copyright (c) 2016-2019 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -32,7 +31,8 @@ __FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
-
+#include "opt_ratelimit.h"
+#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
@@ -45,18 +45,20 @@ __FBSDID("$FreeBSD$");
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
-#ifdef NETFLIX_STATS
-#include <sys/qmath.h>
-#endif
#include <sys/socket.h>
#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/ktls.h>
+#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
-#include <sys/tree.h>
#ifdef NETFLIX_STATS
+#include <sys/qmath.h>
+#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
+#include <sys/tree.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
@@ -79,8 +81,8 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
-#define TCPOUTFLAGS
#include <netinet/tcp.h>
+#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
@@ -90,6 +92,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
+#include <netinet/tcp_lro.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
@@ -163,25 +166,41 @@ struct sysctl_oid *rack_sysctl_root;
* must maintain the new rack scoreboard.
*
*/
-static int32_t rack_precache = 1;
static int32_t rack_tlp_thresh = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000
* - 60 seconds */
+/* Attack threshold detections */
+static uint32_t rack_highest_sack_thresh_seen = 0;
+static uint32_t rack_highest_move_thresh_seen = 0;
+
static int32_t rack_pkt_delay = 1;
-static int32_t rack_inc_var = 0;/* For TLP */
-static int32_t rack_reduce_largest_on_idle = 0;
static int32_t rack_min_pace_time = 0;
-static int32_t rack_min_pace_time_seg_req=6;
static int32_t rack_early_recovery = 1;
-static int32_t rack_early_recovery_max_seg = 6;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
-static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
-static int32_t rack_map_entries_limit = 1024;
-static int32_t rack_map_split_limit = 256;
+static int32_t use_rack_cheat = 1;
+static int32_t rack_persist_min = 250; /* 250ms */
+static int32_t rack_persist_max = 1000; /* 1 Second */
+static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */
+static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */
+
+/* Sack attack detection thresholds and such */
+static int32_t tcp_force_detection = 0;
+
+#ifdef NETFLIX_EXP_DETECTION
+static int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */
+static int32_t tcp_sack_to_move_thresh = 600; /* 60 % */
+static int32_t tcp_restoral_thresh = 650; /* 65 % (sack:2:ack -5%) */
+static int32_t tcp_attack_on_turns_on_logging = 0;
+static int32_t tcp_map_minimum = 500;
+#endif
+static int32_t tcp_sad_decay_val = 800;
+static int32_t tcp_sad_pacing_interval = 2000;
+static int32_t tcp_sad_low_pps = 100;
+
/*
* Currently regular tcp has a rto_min of 30ms
@@ -191,11 +210,11 @@ static int32_t rack_map_split_limit = 256;
*/
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */
-static int32_t rack_rto_max = 30000; /* 30 seconds */
+static int32_t rack_rto_max = 4000; /* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
-static int32_t rack_pace_every_seg = 1;
+static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200; /* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_lower_cwnd_at_tlp = 0;
@@ -204,9 +223,12 @@ static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
-static int32_t rack_sack_block_limit = 128;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
+static int32_t rack_per_of_gp = 50;
+static int32_t rack_tcp_map_entries_limit = 1500;
+static int32_t rack_tcp_map_split_limit = 256;
+
/* Rack specific counters */
counter_u64_t rack_badfr;
@@ -217,8 +239,11 @@ counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
+counter_u64_t rack_calc_zero;
+counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;
+counter_u64_t rack_per_timer_hole;
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
@@ -239,13 +264,34 @@ counter_u64_t rack_split_limited;
counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
-counter_u64_t rack_runt_sacks;
+counter_u64_t rack_sack_attacks_detected;
+counter_u64_t rack_sack_attacks_reversed;
+counter_u64_t rack_sack_used_next_merge;
+counter_u64_t rack_sack_splits;
+counter_u64_t rack_sack_used_prev_merge;
+counter_u64_t rack_sack_skipped_acked;
+counter_u64_t rack_ack_total;
+counter_u64_t rack_express_sack;
+counter_u64_t rack_sack_total;
+counter_u64_t rack_move_none;
+counter_u64_t rack_move_some;
+
counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
+counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
+/* Counters for HW TLS */
+counter_u64_t rack_tls_rwnd;
+counter_u64_t rack_tls_cwnd;
+counter_u64_t rack_tls_app;
+counter_u64_t rack_tls_other;
+counter_u64_t rack_tls_filled;
+counter_u64_t rack_tls_rxt;
+counter_u64_t rack_tls_tlp;
+
/* Temp CPU counters */
counter_u64_t rack_find_high;
@@ -253,21 +299,12 @@ counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
-/*
- * This was originally defined in tcp_timer.c, but is now reproduced here given
- * the unification of the SYN and non-SYN retransmit timer exponents combined
- * with wanting to retain previous behaviour for previously deployed stack
- * versions.
- */
-int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
- { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
-
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
- struct socket *so, struct tcpcb *tp, struct tcpopt *to,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
@@ -320,17 +357,13 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm);
-static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
+static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num);
static int32_t rack_output(struct tcpcb *tp);
-static void
-rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
- struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
- uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
- uint32_t cts);
+ uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
@@ -354,9 +387,6 @@ static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
-static void
-rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
- struct tcpcb *tp, int32_t * ret_val);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -365,13 +395,6 @@ static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
-static void rack_do_drop(struct mbuf *m, struct tcpcb *tp);
-static void
-rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
- struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
-static void
-rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
- struct tcphdr *th, int32_t rstreason, int32_t tlen);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -400,13 +423,6 @@ static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
-static int
-rack_drop_checks(struct tcpopt *to, struct mbuf *m,
- struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
- int32_t * drop_hdrlen, int32_t * ret_val);
-static int
-rack_process_rst(struct mbuf *m, struct tcphdr *th,
- struct socket *so, struct tcpcb *tp);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
uint32_t tsused);
@@ -414,10 +430,6 @@ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
-static int
-rack_ts_check(struct mbuf *m, struct tcphdr *th,
- struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
-
int32_t rack_clear_counter=0;
@@ -453,9 +465,12 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_to_arm_rack);
counter_u64_zero(rack_to_arm_tlp);
counter_u64_zero(rack_paced_segments);
+ counter_u64_zero(rack_calc_zero);
+ counter_u64_zero(rack_calc_nonzero);
counter_u64_zero(rack_unpaced_segments);
counter_u64_zero(rack_saw_enobuf);
counter_u64_zero(rack_saw_enetunreach);
+ counter_u64_zero(rack_per_timer_hole);
counter_u64_zero(rack_to_alloc_hard);
counter_u64_zero(rack_to_alloc_emerg);
counter_u64_zero(rack_sack_proc_all);
@@ -466,12 +481,31 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_alloc_limited_conns);
counter_u64_zero(rack_split_limited);
counter_u64_zero(rack_find_high);
- counter_u64_zero(rack_runt_sacks);
+ counter_u64_zero(rack_tls_rwnd);
+ counter_u64_zero(rack_tls_cwnd);
+ counter_u64_zero(rack_tls_app);
+ counter_u64_zero(rack_tls_other);
+ counter_u64_zero(rack_tls_filled);
+ counter_u64_zero(rack_tls_rxt);
+ counter_u64_zero(rack_tls_tlp);
+ counter_u64_zero(rack_sack_attacks_detected);
+ counter_u64_zero(rack_sack_attacks_reversed);
+ counter_u64_zero(rack_sack_used_next_merge);
+ counter_u64_zero(rack_sack_used_prev_merge);
+ counter_u64_zero(rack_sack_splits);
+ counter_u64_zero(rack_sack_skipped_acked);
+ counter_u64_zero(rack_ack_total);
+ counter_u64_zero(rack_express_sack);
+ counter_u64_zero(rack_sack_total);
+ counter_u64_zero(rack_move_none);
+ counter_u64_zero(rack_move_some);
counter_u64_zero(rack_used_tlpmethod);
counter_u64_zero(rack_used_tlpmethod2);
counter_u64_zero(rack_enter_tlp_calc);
counter_u64_zero(rack_progress_drops);
counter_u64_zero(rack_tlp_does_nada);
+ counter_u64_zero(rack_collapsed_win);
+
}
rack_clear_counter = 0;
return (0);
@@ -482,18 +516,9 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
static void
rack_init_sysctls()
{
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "map_limit", CTLFLAG_RW,
- &rack_map_entries_limit , 1024,
- "Is there a limit on how big the sendmap can grow? ");
-
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "map_splitlimit", CTLFLAG_RW,
- &rack_map_split_limit , 256,
- "Is there a limit on how much splitting a peer can do?");
-
+ struct sysctl_oid *rack_counters;
+ struct sysctl_oid *rack_attack;
+
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "rate_sample_method", CTLFLAG_RW,
@@ -501,31 +526,52 @@ rack_init_sysctls()
"What method should we use for rate sampling 0=high, 1=low ");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
+ &rack_hw_tls_max_seg , 0,
+ "Do we have a multiplier of TLS records we can send as a max (0=1 TLS record)? ");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "data_after_close", CTLFLAG_RW,
&rack_ignore_data_after_close, 0,
"Do we hold off sending a RST until all pending data is ack'd");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "cheat_rxt", CTLFLAG_RW,
+ &use_rack_cheat, 1,
+ "Do we use the rxt cheat for rack?");
+
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "persmin", CTLFLAG_RW,
+ &rack_persist_min, 250,
+ "What is the minimum time in milliseconds between persists");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "persmax", CTLFLAG_RW,
+ &rack_persist_max, 1000,
+ "What is the largest delay in milliseconds between persists");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "no_sack_needed", CTLFLAG_RW,
+ &rack_sack_not_required, 0,
+ "Do we allow rack to run on connections not supporting SACK?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "tlpmethod", CTLFLAG_RW,
&rack_tlp_threshold_use, TLP_USE_TWO_ONE,
"What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "gp_percentage", CTLFLAG_RW,
+ &rack_per_of_gp, 50,
+ "Do we pace to percentage of goodput (0=old method)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "min_pace_time", CTLFLAG_RW,
&rack_min_pace_time, 0,
"Should we enforce a minimum pace time of 1ms");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "min_pace_segs", CTLFLAG_RW,
- &rack_min_pace_time_seg_req, 6,
- "How many segments have to be in the len to enforce min-pace-time");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
- &rack_reduce_largest_on_idle, 0,
- "Should we reduce the largest cwnd seen to IW on idle reduction");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "bb_verbose", CTLFLAG_RW,
&rack_verbose_logging, 0,
"Should RACK black box logging be verbose");
@@ -546,26 +592,11 @@ rack_init_sysctls()
"TLP minimum timeout per the specification (10ms)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "precache", CTLFLAG_RW,
- &rack_precache, 0,
- "Where should we precache the mcopy (0 is not at all)");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "sblklimit", CTLFLAG_RW,
- &rack_sack_block_limit, 128,
- "When do we start paying attention to small sack blocks");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "send_oldest", CTLFLAG_RW,
&rack_always_send_oldest, 1,
"Should we always send the oldest TLP and RACK-TLP");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
- &rack_tlp_in_recovery, 1,
- "Can we do a TLP during recovery?");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "rack_tlimit", CTLFLAG_RW,
&rack_limited_retran, 0,
"How many times can a rack timeout drive out sends");
@@ -607,12 +638,12 @@ rack_init_sysctls()
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
- &rack_pace_every_seg, 1,
- "Should we pace out every segment hptsi");
+ &rack_pace_every_seg, 0,
+ "Should we use the original pacing mechanism that did not pace much?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
- &rack_hptsi_segments, 6,
+ &rack_hptsi_segments, 40,
"Should we pace out only a limited size of segments");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -626,11 +657,6 @@ rack_init_sysctls()
"Minimum rack timeout in milliseconds");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
- &rack_early_recovery_max_seg, 6,
- "Max segments in early recovery");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "earlyrecovery", CTLFLAG_RW,
&rack_early_recovery, 1,
"Do we do early recovery with rack");
@@ -654,221 +680,376 @@ rack_init_sysctls()
OID_AUTO, "pktdelay", CTLFLAG_RW,
&rack_pkt_delay, 1,
"Extra RACK time (in ms) besides reordering thresh");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
+
+ rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "inc_var", CTLFLAG_RW,
- &rack_inc_var, 0,
- "Should rack add to the TLP timer the variance in rtt calculation");
+ OID_AUTO,
+ "stats",
+ CTLFLAG_RW, 0,
+ "Rack Counters");
rack_badfr = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "badfr", CTLFLAG_RD,
&rack_badfr, "Total number of bad FRs");
rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "badfr_bytes", CTLFLAG_RD,
&rack_badfr_bytes, "Total number of bad FRs");
rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "prrsndret", CTLFLAG_RD,
&rack_rtm_prr_retran,
"Total number of prr based retransmits");
rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "prrsndnew", CTLFLAG_RD,
&rack_rtm_prr_newdata,
"Total number of prr based new transmits");
rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tsnf", CTLFLAG_RD,
&rack_timestamp_mismatch,
"Total number of timestamps that we could not find the reported ts");
rack_find_high = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "findhigh", CTLFLAG_RD,
&rack_find_high,
"Total number of FIN causing find-high");
rack_reorder_seen = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "reordering", CTLFLAG_RD,
&rack_reorder_seen,
"Total number of times we added delay due to reordering");
rack_tlp_tot = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_to_total", CTLFLAG_RD,
&rack_tlp_tot,
"Total number of tail loss probe expirations");
rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_new", CTLFLAG_RD,
&rack_tlp_newdata,
"Total number of tail loss probe sending new data");
rack_tlp_retran = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_retran", CTLFLAG_RD,
&rack_tlp_retran,
"Total number of tail loss probe sending retransmitted data");
rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
&rack_tlp_retran_bytes,
"Total bytes of tail loss probe sending retransmitted data");
rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
&rack_tlp_retran_fail,
"Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
rack_to_tot = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "rack_to_tot", CTLFLAG_RD,
&rack_to_tot,
"Total number of times the rack to expired?");
rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "arm_rack", CTLFLAG_RD,
&rack_to_arm_rack,
"Total number of times the rack timer armed?");
rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "arm_tlp", CTLFLAG_RD,
&rack_to_arm_tlp,
"Total number of times the tlp timer armed?");
+
+ rack_calc_zero = counter_u64_alloc(M_WAITOK);
+ rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "calc_zero", CTLFLAG_RD,
+ &rack_calc_zero,
+ "Total number of times pacing time worked out to zero?");
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "calc_nonzero", CTLFLAG_RD,
+ &rack_calc_nonzero,
+ "Total number of times pacing time worked out to non-zero?");
rack_paced_segments = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "paced", CTLFLAG_RD,
&rack_paced_segments,
"Total number of times a segment send caused hptsi");
rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "unpaced", CTLFLAG_RD,
&rack_unpaced_segments,
"Total number of times a segment did not cause hptsi");
rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "saw_enobufs", CTLFLAG_RD,
&rack_saw_enobuf,
"Total number of times a segment did not cause hptsi");
rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
&rack_saw_enetunreach,
"Total number of times a segment did not cause hptsi");
rack_to_alloc = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "allocs", CTLFLAG_RD,
&rack_to_alloc,
"Total allocations of tracking structures");
rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "allochard", CTLFLAG_RD,
&rack_to_alloc_hard,
"Total allocations done with sleeping the hard way");
rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "allocemerg", CTLFLAG_RD,
&rack_to_alloc_emerg,
"Total allocations done from emergency cache");
rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "alloc_limited", CTLFLAG_RD,
&rack_to_alloc_limited,
"Total allocations dropped due to limit");
rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
&rack_alloc_limited_conns,
"Connections with allocations dropped due to limit");
rack_split_limited = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "split_limited", CTLFLAG_RD,
&rack_split_limited,
"Split allocations dropped due to limit");
rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "sack_long", CTLFLAG_RD,
&rack_sack_proc_all,
"Total times we had to walk whole list for sack processing");
rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "sack_restart", CTLFLAG_RD,
&rack_sack_proc_restart,
"Total times we had to walk whole list due to a restart");
rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "sack_short", CTLFLAG_RD,
&rack_sack_proc_short,
"Total times we took shortcut for sack processing");
rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
&rack_enter_tlp_calc,
"Total times we called calc-tlp");
rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
&rack_used_tlpmethod,
"Total number of runt sacks");
rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
&rack_used_tlpmethod2,
- "Total number of runt sacks 2");
- rack_runt_sacks = counter_u64_alloc(M_WAITOK);
- SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ "Total number of times we hit TLP method 2");
+ /* Sack Attacker detection stuff */
+ rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "runtsacks", CTLFLAG_RD,
- &rack_runt_sacks,
- "Total number of runt sacks");
+ OID_AUTO,
+ "sack_attack",
+ CTLFLAG_RW, 0,
+ "Rack Sack Attack Counters and Controls");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
+ &rack_highest_sack_thresh_seen, 0,
+ "Highest sack to ack ratio seen");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
+ &rack_highest_move_thresh_seen, 0,
+ "Highest move to non-move ratio seen");
+ rack_ack_total = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "acktotal", CTLFLAG_RD,
+ &rack_ack_total,
+ "Total number of Ack's");
+
+ rack_express_sack = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
+ &rack_express_sack,
+ "Total express number of Sack's");
+ rack_sack_total = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "sacktotal", CTLFLAG_RD,
+ &rack_sack_total,
+ "Total number of SACK's");
+ rack_move_none = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "move_none", CTLFLAG_RD,
+ &rack_move_none,
+ "Total number of SACK index reuse of positions under threshold");
+ rack_move_some = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "move_some", CTLFLAG_RD,
+ &rack_move_some,
+ "Total number of SACK index reuse of positions over threshold");
+ rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "attacks", CTLFLAG_RD,
+ &rack_sack_attacks_detected,
+ "Total number of SACK attackers that had sack disabled");
+ rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "reversed", CTLFLAG_RD,
+ &rack_sack_attacks_reversed,
+ "Total number of SACK attackers that were later determined false positive");
+ rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "nextmerge", CTLFLAG_RD,
+ &rack_sack_used_next_merge,
+ "Total number of times we used the next merge");
+ rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "prevmerge", CTLFLAG_RD,
+ &rack_sack_used_prev_merge,
+ "Total number of times we used the prev merge");
+ rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "skipacked", CTLFLAG_RD,
+ &rack_sack_skipped_acked,
+ "Total number of times we skipped previously sacked");
+ rack_sack_splits = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "ofsplit", CTLFLAG_RD,
+ &rack_sack_splits,
+ "Total number of times we did the old fashion tree split");
rack_progress_drops = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "prog_drops", CTLFLAG_RD,
&rack_progress_drops,
"Total number of progress drops");
rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
&rack_input_idle_reduces,
"Total number of idle reductions on input");
+ rack_collapsed_win = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "collapsed_win", CTLFLAG_RD,
+ &rack_collapsed_win,
+ "Total number of collapsed windows");
rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
+ SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "tlp_nada", CTLFLAG_RD,
&rack_tlp_does_nada,
"Total number of nada tlp calls");
+
+ rack_tls_rwnd = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_rwnd", CTLFLAG_RD,
+ &rack_tls_rwnd,
+ "Total hdwr tls rwnd limited");
+
+ rack_tls_cwnd = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_cwnd", CTLFLAG_RD,
+ &rack_tls_cwnd,
+ "Total hdwr tls cwnd limited");
+
+ rack_tls_app = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_app", CTLFLAG_RD,
+ &rack_tls_app,
+ "Total hdwr tls app limited");
+
+ rack_tls_other = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_other", CTLFLAG_RD,
+ &rack_tls_other,
+ "Total hdwr tls other limited");
+
+ rack_tls_filled = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_filled", CTLFLAG_RD,
+ &rack_tls_filled,
+ "Total hdwr tls filled");
+
+ rack_tls_rxt = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_rxt", CTLFLAG_RD,
+ &rack_tls_rxt,
+ "Total hdwr rxt");
+
+ rack_tls_tlp = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "tls_tlp", CTLFLAG_RD,
+ &rack_tls_tlp,
+ "Total hdwr tls tlp");
+ rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "timer_hole", CTLFLAG_RD,
+ &rack_per_timer_hole,
+ "Total persists start in timer hole");
+
COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "outsize", CTLFLAG_RD,
@@ -883,10 +1064,52 @@ rack_init_sysctls()
&rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}
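The counters collected above under the new "stats" and "sack_attack" nodes are ordinary 64-bit read-only sysctls, so they can be sampled from userland. A sketch, assuming the stack registered its sysctl root as net.inet.tcp.rack:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

static void
example_read_rack_counter(void)
{
	uint64_t val;
	size_t len = sizeof(val);

	/* OID path assumes the default "rack" stack name for the sysctl root. */
	if (sysctlbyname("net.inet.tcp.rack.stats.tlp_to_total",
	    &val, &len, NULL, 0) == 0)
		printf("tlp_to_total: %ju\n", (uintmax_t)val);
}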
+static __inline int
+rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
+{
+ if (SEQ_GEQ(b->r_start, a->r_start) &&
+ SEQ_LT(b->r_start, a->r_end)) {
+ /*
+ * The entry b is within the
+ * block a. i.e.:
+ * a -- |-------------|
+ * b -- |----|
+ * <or>
+ * b -- |------|
+ * <or>
+ * b -- |-----------|
+ */
+ return (0);
+ } else if (SEQ_GEQ(b->r_start, a->r_end)) {
+ /*
+ * b falls as either the next
+ * sequence block after a so a
+ * is said to be smaller than b.
+ * i.e:
+ * a -- |------|
+ * b -- |--------|
+ * or
+ * b -- |-----|
+ */
+ return (1);
+ }
+ /*
+ * Whats left is where a is
+ * larger than b. i.e:
+ * a -- |-------|
+ * b -- |---|
+ * or even possibly
+ * b -- |--------------|
+ */
+ return (-1);
+}
+
+RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
+RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
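The comparator treats any key whose r_start lands inside a block's [r_start, r_end) as equal to that block, so a plain RB_FIND() doubles as a "which block covers this sequence number" lookup. A hypothetical illustration in the style of the surrounding code (the tree head itself lives in rack's control block and is assumed to be passed in):

static struct rack_sendmap *
example_find_covering_block(struct rack_rb_tree_head *head, tcp_seq seq)
{
	struct rack_sendmap key;

	/* rb_map_cmp() only consults the key's r_start for the equal case. */
	memset(&key, 0, sizeof(key));
	key.r_start = seq;
	return (RB_FIND(rack_rb_tree_head, head, &key));
}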
+
static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
-#ifdef NETFLIX_PROGRESS
if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
/*
@@ -897,21 +1120,52 @@ rack_progress_timeout_check(struct tcpcb *tp)
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
counter_u64_add(rack_progress_drops, 1);
+#ifdef NETFLIX_STATS
TCPSTAT_INC(tcps_progdrops);
+#endif
rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
return (1);
}
}
-#endif
return (0);
}
+
+static void
+rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = tsused;
+ log.u_bbr.flex2 = thresh;
+ log.u_bbr.flex3 = rsm->r_flags;
+ log.u_bbr.flex4 = rsm->r_dupack;
+ log.u_bbr.flex5 = rsm->r_start;
+ log.u_bbr.flex6 = rsm->r_end;
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_SETTINGS_CHG, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+
+
static void
rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
@@ -920,22 +1174,27 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
log.u_bbr.flex4 = slot;
log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
+ log.u_bbr.flex7 = rack->rc_in_persist;
log.u_bbr.flex8 = which;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_TIMERSTAR, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
static void
-rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
+rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no)
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
@@ -943,11 +1202,15 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
log.u_bbr.flex8 = to_num;
log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
log.u_bbr.flex2 = rack->rc_rack_rtt;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.flex3 = no;
+ log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_RTO, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
@@ -957,6 +1220,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
{
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
@@ -969,11 +1233,14 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
- TCP_LOG_EVENT(tp, NULL,
+ log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRRTT, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
@@ -989,10 +1256,16 @@ rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log, 0, sizeof(log));
/* Convert our ms to a microsecond */
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = rtt * 1000;
+ log.u_bbr.flex2 = rack->r_ctl.ack_count;
+ log.u_bbr.flex3 = rack->r_ctl.sack_count;
+ log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
+ log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
+ log.u_bbr.flex8 = rack->sack_attack_disable;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1007,6 +1280,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
{
if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
@@ -1016,11 +1290,13 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
log.u_bbr.flex3 = tp->t_maxunacktime;
log.u_bbr.flex4 = tp->t_acktime;
log.u_bbr.flex8 = event;
- TCP_LOG_EVENT(tp, NULL,
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_PROGRESS, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
@@ -1029,18 +1305,22 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
log.u_bbr.flex1 = slot;
+ log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
log.u_bbr.flex8 = rack->rc_in_persist;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRSND, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
@@ -1049,41 +1329,76 @@ rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = did_out;
log.u_bbr.flex2 = nxt_pkt;
log.u_bbr.flex3 = way_out;
log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex7 = rack->r_wanted_output;
log.u_bbr.flex8 = rack->rc_in_persist;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_DOSEG_DONE, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
+static void
+rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+ uint32_t cts;
+ memset(&log, 0, sizeof(log));
+ cts = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex4 = len;
+ log.u_bbr.flex5 = orig_len;
+ log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
+ log.u_bbr.flex7 = mod;
+ log.u_bbr.flex8 = frm;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ TCP_HDWR_TLS, 0,
+ 0, &log, false, &tv);
+ }
+}
+
static void
rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
log.u_bbr.flex1 = slot;
log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex7 = hpts_calling;
log.u_bbr.flex8 = rack->rc_in_persist;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_JUSTRET, 0,
- tlen, &log, false);
+ tlen, &log, false, &tv);
}
}
@@ -1092,6 +1407,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
@@ -1100,13 +1416,16 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
log.u_bbr.flex2 = 0;
log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = 0;
+ log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
log.u_bbr.flex8 = hpts_removed;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_TIMERCANC, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
@@ -1115,6 +1434,7 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = timers;
@@ -1122,15 +1442,76 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t
log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex5 = cts;
- TCP_LOG_EVENT(rack->rc_tp, NULL,
+ log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_TO_PROCESS, 0,
- 0, &log, false);
+ 0, &log, false, &tv);
}
}
static void
+rack_log_to_prr(struct tcp_rack *rack, int frm)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
+ log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
+ log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
+ log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
+ log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
+ log.u_bbr.flex8 = frm;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_BBRUPD, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+#ifdef NETFLIX_EXP_DETECTION
+static void
+rack_log_sad(struct tcp_rack *rack, int event)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = rack->r_ctl.sack_count;
+ log.u_bbr.flex2 = rack->r_ctl.ack_count;
+ log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
+ log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
+ log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
+ log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
+ log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
+ log.u_bbr.lt_epoch = (tcp_force_detection << 8);
+ log.u_bbr.lt_epoch |= rack->do_detection;
+ log.u_bbr.applimited = tcp_map_minimum;
+ log.u_bbr.flex7 = rack->sack_attack_disable;
+ log.u_bbr.flex8 = event;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.delivered = tcp_sad_decay_val;
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_SAD_DETECTION, 0,
+ 0, &log, false, &tv);
+ }
+}
+#endif
+
+static void
rack_counter_destroy()
{
counter_u64_free(rack_badfr);
@@ -1158,14 +1539,15 @@ rack_counter_destroy()
counter_u64_free(rack_sack_proc_restart);
counter_u64_free(rack_to_alloc);
counter_u64_free(rack_to_alloc_limited);
+ counter_u64_free(rack_alloc_limited_conns);
counter_u64_free(rack_split_limited);
counter_u64_free(rack_find_high);
- counter_u64_free(rack_runt_sacks);
counter_u64_free(rack_enter_tlp_calc);
counter_u64_free(rack_used_tlpmethod);
counter_u64_free(rack_used_tlpmethod2);
counter_u64_free(rack_progress_drops);
counter_u64_free(rack_input_idle_reduces);
+ counter_u64_free(rack_collapsed_win);
counter_u64_free(rack_tlp_does_nada);
COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
@@ -1185,7 +1567,7 @@ rack_alloc(struct tcp_rack *rack)
if (rack->rc_free_cnt) {
counter_u64_add(rack_to_alloc_emerg, 1);
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
- TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
rack->rc_free_cnt--;
return (rsm);
}
@@ -1195,8 +1577,9 @@ rack_alloc(struct tcp_rack *rack)
static struct rack_sendmap *
rack_alloc_full_limit(struct tcp_rack *rack)
{
- if ((rack_map_entries_limit > 0) &&
- (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+ if ((rack_tcp_map_entries_limit > 0) &&
+ (rack->do_detection == 0) &&
+ (rack->r_ctl.rc_num_maps_alloced >= rack_tcp_map_entries_limit)) {
counter_u64_add(rack_to_alloc_limited, 1);
if (!rack->alloc_limit_reported) {
rack->alloc_limit_reported = 1;
@@ -1215,8 +1598,9 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
if (limit_type) {
/* currently there is only one limit type */
- if (rack_map_split_limit > 0 &&
- rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
+ if (rack_tcp_map_split_limit > 0 &&
+ (rack->do_detection == 0) &&
+ rack->r_ctl.rc_num_split_allocs >= rack_tcp_map_split_limit) {
counter_u64_add(rack_split_limited, 1);
if (!rack->alloc_limit_reported) {
rack->alloc_limit_reported = 1;
@@ -1244,13 +1628,11 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
}
if (rack->r_ctl.rc_tlpsend == rsm)
rack->r_ctl.rc_tlpsend = NULL;
- if (rack->r_ctl.rc_next == rsm)
- rack->r_ctl.rc_next = NULL;
if (rack->r_ctl.rc_sacklast == rsm)
rack->r_ctl.rc_sacklast = NULL;
if (rack->rc_free_cnt < rack_free_cache) {
memset(rsm, 0, sizeof(struct rack_sendmap));
- TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
rsm->r_limit_type = 0;
rack->rc_free_cnt++;
return;
@@ -1271,13 +1653,12 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui
#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
-
tp->ccv->nsegs = nsegs;
tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
uint32_t max;
- max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
+ max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
if (tp->ccv->bytes_this_ack > max) {
tp->ccv->bytes_this_ack = max;
}
@@ -1295,6 +1676,12 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui
SEQ_GEQ(th->th_ack, tp->gput_ack)) {
gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
max(1, tcp_ts_getticks() - tp->gput_ts);
+ /* We store it in bytes per ms (or kbytes per sec) */
+ rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8;
+ rack->r_ctl.rc_gp_hist_idx++;
+ if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST)
+ rack->r_ctl.rc_gp_hist_filled = 1;
+ rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST;
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
gput);
/*
@@ -1309,6 +1696,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui
tp->t_stats_gput_prev);
tp->t_flags &= ~TF_GPUTINPROG;
tp->t_stats_gput_prev = gput;
+
if (tp->t_maxpeakrate) {
/*
* We update t_peakrate_thr. This gives us roughly
@@ -1320,7 +1708,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui
#endif
if (tp->snd_cwnd > tp->snd_ssthresh) {
tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
- nsegs * V_tcp_abc_l_var * tp->t_maxseg);
+ nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
if (tp->t_bytes_acked >= tp->snd_cwnd) {
tp->t_bytes_acked -= tp->snd_cwnd;
tp->ccv->flags |= CCF_ABC_SENTAWND;
@@ -1392,9 +1780,12 @@ rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
/* Suck the next prr cnt back into cwnd */
tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
rack->r_ctl.rc_prr_sndcnt = 0;
+ rack_log_to_prr(rack, 1);
}
tp->snd_recover = tp->snd_una;
EXIT_RECOVERY(tp->t_flags);
+
+
}
static void
@@ -1407,13 +1798,15 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
rack = (struct tcp_rack *)tp->t_fb_ptr;
switch (type) {
case CC_NDUPACK:
-/* rack->r_ctl.rc_ssthresh_set = 1;*/
+ tp->t_flags &= ~TF_WASFRECOVERY;
+ tp->t_flags &= ~TF_WASCRECOVERY;
if (!IN_FASTRECOVERY(tp->t_flags)) {
rack->r_ctl.rc_tlp_rtx_out = 0;
rack->r_ctl.rc_prr_delivered = 0;
rack->r_ctl.rc_prr_out = 0;
rack->r_ctl.rc_loss_count = 0;
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 2);
rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_ECN_PERMIT)
@@ -1433,8 +1826,8 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
tp->t_bytes_acked = 0;
EXIT_RECOVERY(tp->t_flags);
tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
- tp->t_maxseg) * tp->t_maxseg;
- tp->snd_cwnd = tp->t_maxseg;
+ ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
+ tp->snd_cwnd = ctf_fixed_maxseg(tp);
break;
case CC_RTO_ERR:
TCPSTAT_INC(tcps_sndrexmitbad);
@@ -1442,10 +1835,14 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
tp->snd_cwnd = tp->snd_cwnd_prev;
tp->snd_ssthresh = tp->snd_ssthresh_prev;
tp->snd_recover = tp->snd_recover_prev;
- if (tp->t_flags & TF_WASFRECOVERY)
+ if (tp->t_flags & TF_WASFRECOVERY) {
ENTER_FASTRECOVERY(tp->t_flags);
- if (tp->t_flags & TF_WASCRECOVERY)
+ tp->t_flags &= ~TF_WASFRECOVERY;
+ }
+ if (tp->t_flags & TF_WASCRECOVERY) {
ENTER_CONGRECOVERY(tp->t_flags);
+ tp->t_flags &= ~TF_WASCRECOVERY;
+ }
tp->snd_nxt = tp->snd_max;
tp->t_badrxtwin = 0;
break;
@@ -1461,7 +1858,7 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
static inline void
-rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
+rack_cc_after_idle(struct tcpcb *tp)
{
uint32_t i_cwnd;
@@ -1475,29 +1872,11 @@ rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
if (CC_ALGO(tp)->after_idle != NULL)
CC_ALGO(tp)->after_idle(tp->ccv);
- if (V_tcp_initcwnd_segments)
- i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
- max(2 * tp->t_maxseg, 14600));
- else if (V_tcp_do_rfc3390)
- i_cwnd = min(4 * tp->t_maxseg,
- max(2 * tp->t_maxseg, 4380));
- else {
- /* Per RFC5681 Section 3.1 */
- if (tp->t_maxseg > 2190)
- i_cwnd = 2 * tp->t_maxseg;
- else if (tp->t_maxseg > 1095)
- i_cwnd = 3 * tp->t_maxseg;
- else
- i_cwnd = 4 * tp->t_maxseg;
- }
- if (reduce_largest) {
- /*
- * Do we reduce the largest cwnd to make
- * rack play nice on restart hptsi wise?
- */
- if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd)
- ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
- }
+ if (tp->snd_cwnd == 1)
+ i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
+ else
+ i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
+
/*
	 * Being idle is no different than the initial window. If the cc
	 * clamps it down below the initial window, raise it to the initial
@@ -1526,320 +1905,6 @@ rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
(tlen <= tp->t_maxseg) && \
(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
-static inline void
-rack_calc_rwin(struct socket *so, struct tcpcb *tp)
-{
- int32_t win;
-
- /*
- * Calculate amount of space in receive window, and then do TCP
- * input processing. Receive window is amount of space in rcv queue,
- * but not less than advertised window.
- */
- win = sbspace(&so->so_rcv);
- if (win < 0)
- win = 0;
- tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
-}
-
-static void
-rack_do_drop(struct mbuf *m, struct tcpcb *tp)
-{
- /*
- * Drop space held by incoming segment and return.
- */
- if (tp != NULL)
- INP_WUNLOCK(tp->t_inpcb);
- if (m)
- m_freem(m);
-}
-
-static void
-rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen)
-{
- if (tp != NULL) {
- tcp_dropwithreset(m, th, tp, tlen, rstreason);
- INP_WUNLOCK(tp->t_inpcb);
- } else
- tcp_dropwithreset(m, th, NULL, tlen, rstreason);
-}
-
-/*
- * The value in ret_val informs the caller
- * if we dropped the tcb (and lock) or not.
- * 1 = we dropped it, 0 = the TCB is still locked
- * and valid.
- */
-static void
-rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
-{
- /*
- * Generate an ACK dropping incoming segment if it occupies sequence
- * space, where the ACK reflects our state.
- *
- * We can now skip the test for the RST flag since all paths to this
- * code happen after packets containing RST have been dropped.
- *
- * In the SYN-RECEIVED state, don't send an ACK unless the segment
- * we received passes the SYN-RECEIVED ACK test. If it fails send a
- * RST. This breaks the loop in the "LAND" DoS attack, and also
- * prevents an ACK storm between two listening ports that have been
- * sent forged SYN segments, each with the source address of the
- * other.
- */
- struct tcp_rack *rack;
-
- if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
- (SEQ_GT(tp->snd_una, th->th_ack) ||
- SEQ_GT(th->th_ack, tp->snd_max))) {
- *ret_val = 1;
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
- return;
- } else
- *ret_val = 0;
- rack = (struct tcp_rack *)tp->t_fb_ptr;
- rack->r_wanted_output++;
- tp->t_flags |= TF_ACKNOW;
- if (m)
- m_freem(m);
-}
-
-
-static int
-rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
-{
- /*
- * RFC5961 Section 3.2
- *
- * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
- * window, we send challenge ACK.
- *
- * Note: to take into account delayed ACKs, we should test against
- * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
- * of closed window, not covered by the RFC.
- */
- int dropped = 0;
-
- if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
- SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
- (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
-
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- KASSERT(tp->t_state != TCPS_SYN_SENT,
- ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
- __func__, th, tp));
-
- if (V_tcp_insecure_rst ||
- (tp->last_ack_sent == th->th_seq) ||
- (tp->rcv_nxt == th->th_seq) ||
- ((tp->last_ack_sent - 1) == th->th_seq)) {
- TCPSTAT_INC(tcps_drops);
- /* Drop the connection. */
- switch (tp->t_state) {
- case TCPS_SYN_RECEIVED:
- so->so_error = ECONNREFUSED;
- goto close;
- case TCPS_ESTABLISHED:
- case TCPS_FIN_WAIT_1:
- case TCPS_FIN_WAIT_2:
- case TCPS_CLOSE_WAIT:
- case TCPS_CLOSING:
- case TCPS_LAST_ACK:
- so->so_error = ECONNRESET;
- close:
- tcp_state_change(tp, TCPS_CLOSED);
- /* FALLTHROUGH */
- default:
- tp = tcp_close(tp);
- }
- dropped = 1;
- rack_do_drop(m, tp);
- } else {
- TCPSTAT_INC(tcps_badrst);
- /* Send challenge ACK. */
- tcp_respond(tp, mtod(m, void *), th, m,
- tp->rcv_nxt, tp->snd_nxt, TH_ACK);
- tp->last_ack_sent = tp->rcv_nxt;
- }
- } else {
- m_freem(m);
- }
- return (dropped);
-}
-
-/*
- * The value in ret_val informs the caller
- * if we dropped the tcb (and lock) or not.
- * 1 = we dropped it, 0 = the TCB is still locked
- * and valid.
- */
-static void
-rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
-{
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-
- TCPSTAT_INC(tcps_badsyn);
- if (V_tcp_insecure_syn &&
- SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
- SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
- tp = tcp_drop(tp, ECONNRESET);
- *ret_val = 1;
- rack_do_drop(m, tp);
- } else {
- /* Send challenge ACK. */
- tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
- tp->snd_nxt, TH_ACK);
- tp->last_ack_sent = tp->rcv_nxt;
- m = NULL;
- *ret_val = 0;
- rack_do_drop(m, NULL);
- }
-}
-
-/*
- * rack_ts_check returns 1 for you should not proceed. It places
- * in ret_val what should be returned 1/0 by the caller. The 1 indicates
- * that the TCB is unlocked and probably dropped. The 0 indicates the
- * TCB is still valid and locked.
- */
-static int
-rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
-{
-
- /* Check to see if ts_recent is over 24 days old. */
- if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
- /*
- * Invalidate ts_recent. If this segment updates ts_recent,
- * the age will be reset later and ts_recent will get a
- * valid value. If it does not, setting ts_recent to zero
- * will at least satisfy the requirement that zero be placed
- * in the timestamp echo reply when ts_recent isn't valid.
- * The age isn't reset until we get a valid ts_recent
- * because we don't want out-of-order segments to be dropped
- * when ts_recent is old.
- */
- tp->ts_recent = 0;
- } else {
- TCPSTAT_INC(tcps_rcvduppack);
- TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
- TCPSTAT_INC(tcps_pawsdrop);
- *ret_val = 0;
- if (tlen) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
- } else {
- rack_do_drop(m, NULL);
- }
- return (1);
- }
- return (0);
-}
-
-/*
- * rack_drop_checks returns 1 for you should not proceed. It places
- * in ret_val what should be returned 1/0 by the caller. The 1 indicates
- * that the TCB is unlocked and probably dropped. The 0 indicates the
- * TCB is still valid and locked.
- */
-static int
-rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
-{
- int32_t todrop;
- int32_t thflags;
- int32_t tlen;
-
- thflags = *thf;
- tlen = *tlenp;
- todrop = tp->rcv_nxt - th->th_seq;
- if (todrop > 0) {
- if (thflags & TH_SYN) {
- thflags &= ~TH_SYN;
- th->th_seq++;
- if (th->th_urp > 1)
- th->th_urp--;
- else
- thflags &= ~TH_URG;
- todrop--;
- }
- /*
- * Following if statement from Stevens, vol. 2, p. 960.
- */
- if (todrop > tlen
- || (todrop == tlen && (thflags & TH_FIN) == 0)) {
- /*
- * Any valid FIN must be to the left of the window.
- * At this point the FIN must be a duplicate or out
- * of sequence; drop it.
- */
- thflags &= ~TH_FIN;
- /*
- * Send an ACK to resynchronize and drop any data.
- * But keep on processing for RST or ACK.
- */
- tp->t_flags |= TF_ACKNOW;
- todrop = tlen;
- TCPSTAT_INC(tcps_rcvduppack);
- TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
- } else {
- TCPSTAT_INC(tcps_rcvpartduppack);
- TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
- }
- if (tp->t_flags & TF_SACK_PERMIT) {
- /*
- * record the left, to-be-dropped edge of data
- * here, for use as dsack block further down
- */
- tcp_update_sack_list(tp, th->th_seq,
- th->th_seq + todrop);
- /*
- * ACK now, as the next in-sequence segment
- * will clear the DSACK block again
- */
- tp->t_flags |= TF_ACKNOW;
- }
- *drop_hdrlen += todrop; /* drop from the top afterwards */
- th->th_seq += todrop;
- tlen -= todrop;
- if (th->th_urp > todrop)
- th->th_urp -= todrop;
- else {
- thflags &= ~TH_URG;
- th->th_urp = 0;
- }
- }
- /*
- * If segment ends after window, drop trailing data (and PUSH and
- * FIN); if nothing left, just ACK.
- */
- todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
- if (todrop > 0) {
- TCPSTAT_INC(tcps_rcvpackafterwin);
- if (todrop >= tlen) {
- TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
- /*
- * If window is closed can only take segments at
- * window edge, and have to drop data and PUSH from
- * incoming segments. Continue processing, but
- * remember to ack. Otherwise, drop segment and
- * ack.
- */
- if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
- tp->t_flags |= TF_ACKNOW;
- TCPSTAT_INC(tcps_rcvwinprobe);
- } else {
- rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
- return (1);
- }
- } else
- TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
- m_adj(m, -todrop);
- tlen -= todrop;
- thflags &= ~(TH_PUSH | TH_FIN);
- }
- *thf = thflags;
- *tlenp = tlen;
- return (0);
-}
-
static struct rack_sendmap *
rack_find_lowest_rsm(struct tcp_rack *rack)
{
@@ -1872,7 +1937,7 @@ rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
*/
counter_u64_add(rack_find_high, 1);
prsm = rsm;
- TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
+ RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
continue;
}
@@ -1944,7 +2009,6 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
thresh += 1;
}
	/* We don't let the rack timeout be above an RTO */
-
if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
}
@@ -1971,7 +2035,7 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
thresh = (srtt * 2);
/* Get the previous sent packet, if any */
- maxseg = tcp_maxseg(tp);
+ maxseg = ctf_fixed_maxseg(tp);
counter_u64_add(rack_enter_tlp_calc, 1);
len = rsm->r_end - rsm->r_start;
if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
@@ -2044,6 +2108,24 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
return (thresh);
}
+static uint32_t
+rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ /*
+ * We want the rack_rtt which is the
+ * last rtt we measured. However if that
+ * does not exist we fallback to the srtt (which
+ * we probably will never do) and then as a last
+ * resort we use RACK_INITIAL_RTO if no srtt is
+ * yet set.
+ */
+ if (rack->rc_rack_rtt)
+ return(rack->rc_rack_rtt);
+ else if (tp->t_srtt == 0)
+ return(RACK_INITIAL_RTO);
+ return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT));
+}
+
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
{
@@ -2055,17 +2137,12 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
struct tcp_rack *rack;
struct rack_sendmap *rsm;
int32_t idx;
- uint32_t srtt_cur, srtt, thresh;
+ uint32_t srtt, thresh;
rack = (struct tcp_rack *)tp->t_fb_ptr;
- if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
+ if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
return (NULL);
}
- srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
- srtt = TICKS_2_MSEC(srtt_cur);
- if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
- srtt = rack->rc_rack_rtt;
-
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if (rsm == NULL)
return (NULL);
@@ -2076,6 +2153,7 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
return (NULL);
}
idx = rsm->r_rtr_cnt - 1;
+ srtt = rack_grab_rtt(tp, rack);
thresh = rack_calc_thresh_rack(rack, srtt, tsused);
if (tsused < rsm->r_tim_lastsent[idx]) {
return (NULL);
@@ -2100,7 +2178,7 @@ rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
- tcp_persmin, tcp_persmax);
+ rack_persist_min, rack_persist_max);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
@@ -2109,7 +2187,7 @@ rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
}
static uint32_t
-rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
{
/*
* Start the FR timer, we do this based on getting the first one in
@@ -2117,7 +2195,7 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
* events we need to stop the running timer (if its running) before
* starting the new one.
*/
- uint32_t thresh, exp, to, srtt, time_since_sent;
+ uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
uint32_t srtt_cur;
int32_t idx;
int32_t is_tlp_timer = 0;
@@ -2131,13 +2209,31 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
/* We can't start any timer in persists */
return (rack_get_persists_timer_val(tp, rack));
}
+ if ((tp->t_state < TCPS_ESTABLISHED) ||
+ ((tp->t_flags & TF_SACK_PERMIT) == 0))
+ goto activate_rxt;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
- if (rsm == NULL) {
+ if ((rsm == NULL) || sup_rack) {
/* Nothing on the send map */
activate_rxt:
+ time_since_sent = 0;
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm) {
+ idx = rsm->r_rtr_cnt - 1;
+ if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
+ tstmp_touse = rsm->r_tim_lastsent[idx];
+ else
+ tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
+ if (TSTMP_GT(tstmp_touse, cts))
+ time_since_sent = cts - tstmp_touse;
+ }
if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
to = TICKS_2_MSEC(tp->t_rxtcur);
+ if (to > time_since_sent)
+ to -= time_since_sent;
+ else
+ to = rack->r_ctl.rc_min_to;
if (to == 0)
to = 1;
return (to);
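As a worked example of the deflation added just above (names as in the hunk): if TICKS_2_MSEC(tp->t_rxtcur) is 400 and the newest entry in the time map (or rc_tlp_rxt_last_time) is 150 ms old, time_since_sent is 150 and the retransmit timer is armed for 400 - 150 = 250 ms; when the elapsed time already meets or exceeds the full value, the timer falls back to rack->r_ctl.rc_min_to instead of going to zero.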
@@ -2151,6 +2247,16 @@ activate_rxt:
goto activate_rxt;
}
}
+ if (rack->sack_attack_disable) {
+ /*
+ * We don't want to do
+ * any TLP's if you are an attacker.
+ * Though if you are doing what
+ * is expected you may still have
+ * SACK-PASSED marks.
+ */
+ goto activate_rxt;
+ }
/* Convert from ms to usecs */
if (rsm->r_flags & RACK_SACK_PASSED) {
if ((tp->t_flags & TF_SENTFIN) &&
@@ -2162,12 +2268,20 @@ activate_rxt:
*/
goto activate_rxt;
}
- if (tp->t_srtt) {
- srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
- srtt = TICKS_2_MSEC(srtt_cur);
- } else
- srtt = RACK_INITIAL_RTO;
-
+ if ((rack->use_rack_cheat == 0) &&
+ (IN_RECOVERY(tp->t_flags)) &&
+ (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
+ /*
+ * We are not cheating, in recovery and
+		 * not enough acks yet to get our next
+ * retransmission out.
+ *
+ * Note that classified attackers do not
+ * get to use the rack-cheat.
+ */
+ goto activate_tlp;
+ }
+ srtt = rack_grab_rtt(tp, rack);
thresh = rack_calc_thresh_rack(rack, srtt, cts);
idx = rsm->r_rtr_cnt - 1;
exp = rsm->r_tim_lastsent[idx] + thresh;
@@ -2181,6 +2295,7 @@ activate_rxt:
}
} else {
/* Ok we need to do a TLP not RACK */
+activate_tlp:
if ((rack->rc_tlp_in_progress != 0) ||
(rack->r_ctl.rc_tlp_rtx_out != 0)) {
/*
@@ -2189,12 +2304,6 @@ activate_rxt:
*/
goto activate_rxt;
}
- if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) {
- /*
- * Peer collapsed rwnd, don't do TLP.
- */
- goto activate_rxt;
- }
rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
if (rsm == NULL) {
/* We found no rsm to TLP with. */
@@ -2206,10 +2315,13 @@ activate_rxt:
goto activate_rxt;
}
idx = rsm->r_rtr_cnt - 1;
- if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx]))
- time_since_sent = cts - rsm->r_tim_lastsent[idx];
- else
- time_since_sent = 0;
+ time_since_sent = 0;
+ if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
+ tstmp_touse = rsm->r_tim_lastsent[idx];
+ else
+ tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
+ if (TSTMP_GT(tstmp_touse, cts))
+ time_since_sent = cts - tstmp_touse;
is_tlp_timer = 1;
if (tp->t_srtt) {
srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
@@ -2260,10 +2372,6 @@ static void
rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
if (rack->rc_in_persist == 0) {
- if (((tp->t_flags & TF_SENTFIN) == 0) &&
- (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
- /* Must need to send more data to enter persist */
- return;
rack->r_ctl.rc_went_idle_time = cts;
rack_timer_cancel(tp, rack, cts, __LINE__);
tp->t_rxtshift = 0;
@@ -2285,8 +2393,8 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
}
static void
-rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
- int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
+rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
+ int32_t slot, uint32_t tot_len_this_send, int sup_rack)
{
struct inpcb *inp;
uint32_t delayed_ack = 0;
@@ -2299,7 +2407,6 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int
/* A previous call is already set up */
return;
}
-
if ((tp->t_state == TCPS_CLOSED) ||
(tp->t_state == TCPS_LISTEN)) {
return;
@@ -2308,6 +2415,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int
if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
left = rack->r_ctl.rc_timer_exp - cts;
}
+ rack->tlp_timer_up = 0;
rack->r_ctl.rc_timer_exp = 0;
if (rack->rc_inp->inp_in_hpts == 0) {
rack->r_ctl.rc_hpts_flags = 0;
@@ -2325,23 +2433,21 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int
else
slot = 1;
}
- if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
- /* No send window.. we must enter persist */
- rack_enter_persist(tp, rack, cts);
- } else if ((frm_out_sbavail &&
- (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
- (tp->snd_wnd < tp->t_maxseg)) &&
- TCPS_HAVEESTABLISHED(tp->t_state)) {
+ hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
+ if (rack->sack_attack_disable &&
+ (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) {
/*
- * If we have no window or we can't send a segment (and have
- * data to send.. we cheat here and frm_out_sbavail is
- * passed in with the sbavail(sb) only from bbr_output) and
- * we are established, then we must enter persits (if not
- * already in persits).
+ * We have a potential attacker on
+ * the line. We have possibly some
+		 * (or no) pacing time set. We want to
+		 * slow down the processing of sacks by some
+		 * amount (if it is an attacker). Set the default
+		 * slot for attackers in place (unless the original
+		 * interval is longer). It's stored in
+		 * micro-seconds, so let's convert to msecs.
*/
- rack_enter_persist(tp, rack, cts);
+ slot = USEC_TO_MSEC(tcp_sad_pacing_interval);
}
- hpts_timeout = rack_timer_start(tp, rack, cts);
if (tp->t_flags & TF_DELACK) {
delayed_ack = TICKS_2_MSEC(tcp_delacktime);
rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
@@ -2398,6 +2504,11 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int
rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
}
if (slot) {
+ rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ else
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
rack->r_ctl.rc_last_output_to = cts + slot;
if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
if (rack->rc_inp->inp_in_hpts == 0)
@@ -2413,6 +2524,15 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int
rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
}
} else if (hpts_timeout) {
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) {
+ /* For a rack timer, don't wake us */
+ rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ } else {
+ /* All other timers wake us up */
+ rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ }
if (rack->rc_inp->inp_in_hpts == 0)
tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
@@ -2448,7 +2568,7 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
* settings.
*/
struct rack_sendmap *rsm;
- int32_t recovery;
+ int32_t recovery, ll;
if (tp->t_timers->tt_flags & TT_STOPPED) {
return (1);
@@ -2457,12 +2577,16 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
/* Its not time yet */
return (0);
}
- rack_log_to_event(rack, RACK_TO_FRM_RACK);
recovery = IN_RECOVERY(tp->t_flags);
counter_u64_add(rack_to_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
rsm = rack_check_recovery_mode(tp, cts);
+ if (rsm)
+ ll = rsm->r_end - rsm->r_start;
+ else
+ ll = 0;
+ rack_log_to_event(rack, RACK_TO_FRM_RACK, ll);
if (rsm) {
uint32_t rtt;
@@ -2470,23 +2594,23 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
if (rtt == 0)
rtt = 1;
if ((recovery == 0) &&
- (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
+ (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
/*
* The rack-timeout that enter's us into recovery
* will force out one MSS and set us up so that we
* can do one more send in 2*rtt (transitioning the
* rack timeout into a rack-tlp).
*/
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
- } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
- ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 3);
+ } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) &&
+ rack->use_rack_cheat) {
/*
- * When a rack timer goes, we have to send at
- * least one segment. They will be paced a min of 1ms
- * apart via the next rack timer (or further
- * if the rack timer dictates it).
+ * When a rack timer goes, if the rack cheat is
+ * on, arrange it so we can send a full segment.
*/
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 4);
}
} else {
/* This is a case that should happen rarely if ever */
@@ -2500,6 +2624,24 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
return (0);
}
+static __inline void
+rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
+ struct rack_sendmap *rsm, uint32_t start)
+{
+ int idx;
+
+ nrsm->r_start = start;
+ nrsm->r_end = rsm->r_end;
+ nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_flags = rsm->r_flags;
+ nrsm->r_dupack = rsm->r_dupack;
+ nrsm->r_rtr_bytes = 0;
+ rsm->r_end = nrsm->r_start;
+ for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
+ nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ }
+}
+
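A quick worked example of rack_clone_rsm() with invented sequence numbers: if rsm covers [1000, 3896) and the caller splits at 2448 (the TLP path further down does exactly this with rsm->r_end - ctf_fixed_maxseg(tp) for a 1448-byte segment), the original rsm is trimmed to [1000, 2448) and nrsm takes [2448, 3896), inheriting r_flags, r_dupack, r_rtr_cnt and the r_tim_lastsent[] history while starting with r_rtr_bytes of zero.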
static struct rack_sendmap *
rack_merge_rsm(struct tcp_rack *rack,
struct rack_sendmap *l_rsm,
@@ -2515,19 +2657,32 @@ rack_merge_rsm(struct tcp_rack *rack,
* is any reason we need to try to find
* the oldest (or last oldest retransmitted).
*/
+ struct rack_sendmap *rm;
+
l_rsm->r_end = r_rsm->r_end;
+ if (l_rsm->r_dupack < r_rsm->r_dupack)
+ l_rsm->r_dupack = r_rsm->r_dupack;
if (r_rsm->r_rtr_bytes)
l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
if (r_rsm->r_in_tmap) {
/* This really should not happen */
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
+ r_rsm->r_in_tmap = 0;
}
/* Now the flags */
if (r_rsm->r_flags & RACK_HAS_FIN)
l_rsm->r_flags |= RACK_HAS_FIN;
if (r_rsm->r_flags & RACK_TLP)
l_rsm->r_flags |= RACK_TLP;
- TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next);
+ if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
+ l_rsm->r_flags |= RACK_RWND_COLLAPSED;
+ rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
+#ifdef INVARIANTS
+ if (rm != r_rsm) {
+ panic("removing head in rack:%p rsm:%p rm:%p",
+ rack, r_rsm, rm);
+ }
+#endif
if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
/* Transfer the split limit to the map we free */
r_rsm->r_limit_type = l_rsm->r_limit_type;
@@ -2552,9 +2707,11 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
* Tail Loss Probe.
*/
struct rack_sendmap *rsm = NULL;
+ struct rack_sendmap *insret;
struct socket *so;
uint32_t amm, old_prr_snd = 0;
uint32_t out, avail;
+ int collapsed_win = 0;
if (tp->t_timers->tt_flags & TT_STOPPED) {
return (1);
@@ -2571,14 +2728,28 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
* A TLP timer has expired. We have been idle for 2 rtts. So we now
* need to figure out how to force a full MSS segment out.
*/
- rack_log_to_event(rack, RACK_TO_FRM_TLP);
+ rack_log_to_event(rack, RACK_TO_FRM_TLP, 0);
counter_u64_add(rack_tlp_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
so = tp->t_inpcb->inp_socket;
+#ifdef KERN_TLS
+ if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
+ /*
+ * For hardware TLS we do *not* want to send
+ * new data, lets instead just do a retransmission.
+ */
+ goto need_retran;
+ }
+#endif
avail = sbavail(&so->so_snd);
out = tp->snd_max - tp->snd_una;
- rack->rc_timer_up = 1;
+ rack->tlp_timer_up = 1;
+ if (out > tp->snd_wnd) {
+ /* special case, we need a retransmission */
+ collapsed_win = 1;
+ goto need_retran;
+ }
/*
* If we are in recovery we can jazz out a segment if new data is
* present simply by setting rc_prr_sndcnt to a segment.
@@ -2587,18 +2758,19 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
/* New data is available */
amm = avail - out;
- if (amm > tp->t_maxseg) {
- amm = tp->t_maxseg;
- } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
+ if (amm > ctf_fixed_maxseg(tp)) {
+ amm = ctf_fixed_maxseg(tp);
+ } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) {
/* not enough to fill a MTU and no-delay is off */
goto need_retran;
}
if (IN_RECOVERY(tp->t_flags)) {
/* Unlikely */
old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
- if (out + amm <= tp->snd_wnd)
+ if (out + amm <= tp->snd_wnd) {
rack->r_ctl.rc_prr_sndcnt = amm;
- else
+ rack_log_to_prr(rack, 4);
+ } else
goto need_retran;
} else {
/* Set the send-new override */
@@ -2618,28 +2790,52 @@ need_retran:
* Ok we need to arrange the last un-acked segment to be re-sent, or
* optionally the first un-acked segment.
*/
- if (rack_always_send_oldest)
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
- else {
- rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
- if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
- rsm = rack_find_high_nonack(rack, rsm);
+ if (collapsed_win == 0) {
+ if (rack_always_send_oldest)
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ else {
+ rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
+ rsm = rack_find_high_nonack(rack, rsm);
+ }
}
- }
- if (rsm == NULL) {
- counter_u64_add(rack_tlp_does_nada, 1);
+ if (rsm == NULL) {
+ counter_u64_add(rack_tlp_does_nada, 1);
#ifdef TCP_BLACKBOX
- tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
+ tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
#endif
- goto out;
+ goto out;
+ }
+ } else {
+ /*
+ * We must find the last segment
+ * that was acceptable by the client.
+		 * that was acceptable to the client.
+ RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
+ if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
+ /* Found one */
+ break;
+ }
+ }
+ if (rsm == NULL) {
+ /* None? if so send the first */
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (rsm == NULL) {
+ counter_u64_add(rack_tlp_does_nada, 1);
+#ifdef TCP_BLACKBOX
+ tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
+#endif
+ goto out;
+ }
+ }
}
- if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
+ if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
/*
* We need to split this the last segment in two.
*/
- int32_t idx;
struct rack_sendmap *nrsm;
+
nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
/*
@@ -2649,17 +2845,15 @@ need_retran:
counter_u64_add(rack_tlp_does_nada, 1);
goto out;
}
- nrsm->r_start = (rsm->r_end - tp->t_maxseg);
- nrsm->r_end = rsm->r_end;
- nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
- nrsm->r_flags = rsm->r_flags;
- nrsm->r_sndcnt = rsm->r_sndcnt;
- nrsm->r_rtr_bytes = 0;
- rsm->r_end = nrsm->r_start;
- for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
- nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
- }
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+ rack_clone_rsm(rack, nrsm, rsm,
+ (rsm->r_end - ctf_fixed_maxseg(tp)));
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
+ }
+#endif
if (rsm->r_in_tmap) {
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
@@ -2684,11 +2878,12 @@ send:
* peer in max times. We need the retransmit timer to take
* over.
*/
-restore:
+ restore:
rack->r_ctl.rc_tlpsend = NULL;
if (rsm)
rsm->r_flags &= ~RACK_TLP;
rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
+ rack_log_to_prr(rack, 5);
counter_u64_add(rack_tlp_retran_fail, 1);
goto out;
} else if (rsm) {
@@ -2708,7 +2903,7 @@ restore:
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
return (0);
out:
- rack->rc_timer_up = 0;
+ rack->tlp_timer_up = 0;
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
return (0);
}
@@ -2727,7 +2922,7 @@ rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
if (tp->t_timers->tt_flags & TT_STOPPED) {
return (1);
}
- rack_log_to_event(rack, RACK_TO_FRM_DELACK);
+ rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0);
tp->t_flags &= ~TF_DELACK;
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
@@ -2746,8 +2941,9 @@ rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
static int
rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
+ struct tcptemp *t_template;
struct inpcb *inp;
- int32_t retval = 0;
+ int32_t retval = 1;
inp = tp->t_inpcb;
@@ -2795,9 +2991,22 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
goto out;
}
- tp->t_flags |= TF_FORCEDATA;
+ t_template = tcpip_maketemplate(rack->rc_inp);
+ if (t_template) {
+ tcp_respond(tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ tp->rcv_nxt, tp->snd_una - 1, 0);
+ /* This sends an ack */
+ if (tp->t_flags & TF_DELACK)
+ tp->t_flags &= ~TF_DELACK;
+ free(t_template, M_TEMP);
+ }
+ if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+ tp->t_rxtshift++;
out:
- rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
+ rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0);
+ rack_start_hpts_timer(rack, tp, cts,
+ 0, 0, 0);
return (retval);
}
@@ -2818,7 +3027,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
}
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
inp = tp->t_inpcb;
- rack_log_to_event(rack, RACK_TO_FRM_KEEP);
+ rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0);
/*
* Keep-alive timer went off; send something or drop connection if
* idle for too long.
@@ -2849,7 +3058,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
free(t_template, M_TEMP);
}
}
- rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
+ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
return (1);
dropit:
TCPSTAT_INC(tcps_keepdrops);
@@ -2874,7 +3083,7 @@ rack_remxt_tmr(struct tcpcb *tp)
rack = (struct tcp_rack *)tp->t_fb_ptr;
rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
- rack_log_to_event(rack, RACK_TO_FRM_TMR);
+ rack_log_to_event(rack, RACK_TO_FRM_TMR, 0);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
/*
@@ -2885,10 +3094,11 @@ rack_remxt_tmr(struct tcpcb *tp)
* so for now we will just let the normal rxt timer
* and tlp timer take care of it.
*/
- TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
+ RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
if (rsm->r_flags & RACK_ACKED) {
cnt++;
- rsm->r_sndcnt = 0;
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
if (rsm->r_in_tmap == 0) {
/* We must re-add it back to the tlist */
if (trsm == NULL) {
@@ -2897,9 +3107,9 @@ rack_remxt_tmr(struct tcpcb *tp)
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
}
rsm->r_in_tmap = 1;
- trsm = rsm;
}
}
+ trsm = rsm;
rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
}
/* Clear the count (we just un-acked them) */
@@ -2907,10 +3117,9 @@ rack_remxt_tmr(struct tcpcb *tp)
/* Clear the tlp rtx mark */
rack->r_ctl.rc_tlp_rtx_out = 0;
rack->r_ctl.rc_tlp_seg_send_cnt = 0;
- rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
- /* Setup so we send one segment */
- if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rack->r_ctl.rc_prr_sndcnt = 0;
+ rack_log_to_prr(rack, 6);
rack->r_timer_override = 1;
}
@@ -2944,7 +3153,18 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
* retransmit interval. Back off to a longer retransmit interval
* and retransmit one segment.
*/
- if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ rack_remxt_tmr(tp);
+ if ((rack->r_ctl.rc_resend == NULL) ||
+ ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
+ /*
+ * If the rwnd collapsed on
+ * the one we are retransmitting
+ * it does not count against the
+ * rxt count.
+ */
+ tp->t_rxtshift++;
+ }
+ if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
retval = 1;
@@ -2952,7 +3172,6 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
(tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
goto out;
}
- rack_remxt_tmr(tp);
if (tp->t_state == TCPS_SYN_SENT) {
/*
* If the SYN was retransmitted, indicate CWND to be limited
@@ -2987,7 +3206,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
TCPSTAT_INC(tcps_rexmttimeo);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
- rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
+ rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
@@ -3089,16 +3308,6 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
}
}
/*
- * Disable RFC1323 and SACK if we haven't got any response to our
- * third SYN to work-around some broken terminal servers (most of
- * which have hopefully been retired) that have bad VJ header
- * compression code which trashes TCP segments containing
- * unknown-to-them TCP options.
- */
- if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
- (tp->t_rxtshift == 3))
- tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
- /*
* If we backed off this far, our srtt estimate is probably bogus.
* Clobber it so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current retransmit
@@ -3168,10 +3377,13 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8
if (timers & PACE_TMR_DELACK) {
ret = rack_timeout_delack(tp, rack, cts);
} else if (timers & PACE_TMR_RACK) {
+ rack->r_ctl.rc_tlp_rxt_last_time = cts;
ret = rack_timeout_rack(tp, rack, cts);
} else if (timers & PACE_TMR_TLP) {
+ rack->r_ctl.rc_tlp_rxt_last_time = cts;
ret = rack_timeout_tlp(tp, rack, cts);
} else if (timers & PACE_TMR_RXT) {
+ rack->r_ctl.rc_tlp_rxt_last_time = cts;
ret = rack_timeout_rxt(tp, rack, cts);
} else if (timers & PACE_TMR_PERSIT) {
ret = rack_timeout_persist(tp, rack, cts);
@@ -3262,7 +3474,8 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
int32_t idx;
rsm->r_rtr_cnt++;
- rsm->r_sndcnt++;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ rsm->r_dupack = 0;
if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
rsm->r_flags |= RACK_OVERMAX;
@@ -3280,6 +3493,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
}
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
}
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
@@ -3288,23 +3502,20 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
rsm->r_flags &= ~RACK_SACK_PASSED;
rsm->r_flags |= RACK_WAS_SACKPASS;
}
- /* Update memory for next rtr */
- rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
}
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
+ struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
{
/*
* We (re-)transmitted starting at rsm->r_start for some length
* (possibly less than r_end.
*/
- struct rack_sendmap *nrsm;
+ struct rack_sendmap *nrsm, *insret;
uint32_t c_end;
int32_t len;
- int32_t idx;
len = *lenp;
c_end = rsm->r_start + len;
@@ -3346,17 +3557,16 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
* we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
* 1, 6 and the new piece will be 6, 11.
*/
- nrsm->r_start = c_end;
- nrsm->r_end = rsm->r_end;
- nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
- nrsm->r_flags = rsm->r_flags;
- nrsm->r_sndcnt = rsm->r_sndcnt;
- nrsm->r_rtr_bytes = 0;
- rsm->r_end = c_end;
- for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
- nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ rack_clone_rsm(rack, nrsm, rsm, c_end);
+ nrsm->r_dupack = 0;
+ rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
}
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+#endif
if (rsm->r_in_tmap) {
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
@@ -3374,9 +3584,8 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint8_t pass, struct rack_sendmap *hintrsm)
{
struct tcp_rack *rack;
- struct rack_sendmap *rsm, *nrsm;
+ struct rack_sendmap *rsm, *nrsm, *insret, fe;
register uint32_t snd_max, snd_una;
- int32_t idx;
/*
* Add to the RACK log of packets in flight or retransmitted. If
@@ -3426,7 +3635,10 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
end = seq_out + len;
seq_out = snd_una;
- len = end - seq_out;
+ if (SEQ_GEQ(end, seq_out))
+ len = end - seq_out;
+ else
+ len = 0;
}
snd_max = tp->snd_max;
if (th_flags & (TH_SYN | TH_FIN)) {
@@ -3456,8 +3668,9 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
if (IN_RECOVERY(tp->t_flags)) {
rack->r_ctl.rc_prr_out += len;
}
- /* First question is it a retransmission? */
+ /* First question is it a retransmission or new? */
if (seq_out == snd_max) {
+ /* Its new */
again:
rsm = rack_alloc(rack);
if (rsm == NULL) {
@@ -3475,10 +3688,24 @@ again:
rsm->r_tim_lastsent[0] = ts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
- rsm->r_start = seq_out;
- rsm->r_end = rsm->r_start + len;
- rsm->r_sndcnt = 0;
- TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
+ if (th_flags & TH_SYN) {
+ /* The data space is one beyond snd_una */
+ rsm->r_start = seq_out + 1;
+ rsm->r_end = rsm->r_start + (len - 1);
+ } else {
+ /* Normal case */
+ rsm->r_start = seq_out;
+ rsm->r_end = rsm->r_start + len;
+ }
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
+ }
+#endif
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
return;
@@ -3486,22 +3713,16 @@ again:
/*
* If we reach here its a retransmission and we need to find it.
*/
+ memset(&fe, 0, sizeof(fe));
more:
if (hintrsm && (hintrsm->r_start == seq_out)) {
rsm = hintrsm;
hintrsm = NULL;
- } else if (rack->r_ctl.rc_next) {
- /* We have a hint from a previous run */
- rsm = rack->r_ctl.rc_next;
} else {
/* No hints sorry */
rsm = NULL;
}
if ((rsm) && (rsm->r_start == seq_out)) {
- /*
- * We used rc_next or hintrsm to retransmit, hopefully the
- * likely case.
- */
seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
if (len == 0) {
return;
@@ -3510,14 +3731,16 @@ more:
}
}
/* Ok it was not the last pointer go through it the hard way. */
- TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
+refind:
+ fe.r_start = seq_out;
+ rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ if (rsm) {
if (rsm->r_start == seq_out) {
seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
- rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
if (len == 0) {
return;
} else {
- continue;
+ goto refind;
}
}
if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
@@ -3535,17 +3758,14 @@ more:
* copy rsm to nrsm and then trim the front of rsm
* to not include this part.
*/
- nrsm->r_start = seq_out;
- nrsm->r_end = rsm->r_end;
- nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
- nrsm->r_flags = rsm->r_flags;
- nrsm->r_sndcnt = rsm->r_sndcnt;
- nrsm->r_rtr_bytes = 0;
- for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
- nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
+ rack_clone_rsm(rack, nrsm, rsm, seq_out);
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
}
- rsm->r_end = nrsm->r_start;
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
+#endif
if (rsm->r_in_tmap) {
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
@@ -3568,7 +3788,7 @@ more:
printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
seq_out, len, tp->snd_una, tp->snd_max);
printf("Starting Dump of all rack entries\n");
- TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
+ RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
printf("rsm:%p start:%u end:%u\n",
rsm, rsm->r_start, rsm->r_end);
}
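The hunks in this region replace the linear rc_map list walk with a red-black tree (rc_mtree) keyed on r_start, so a retransmission or SACK block can be located by sequence number with RB_FIND() instead of a list scan. A minimal, self-contained sketch of the same sys/tree.h pattern follows; the ex_* names are invented for illustration and are not part of the patch, which uses struct rack_sendmap, rack_rb_tree_head and rc_mtree.

#include <sys/types.h>
#include <sys/tree.h>
#include <netinet/tcp_seq.h>	/* SEQ_LT/SEQ_GT */

struct ex_sendmap {
	RB_ENTRY(ex_sendmap) r_node;	/* red-black tree linkage */
	uint32_t r_start;		/* first sequence this entry covers */
	uint32_t r_end;			/* one past the last sequence */
};

RB_HEAD(ex_tree_head, ex_sendmap);

static int
ex_cmp(struct ex_sendmap *a, struct ex_sendmap *b)
{
	/* Order entries strictly by starting sequence number. */
	if (SEQ_LT(a->r_start, b->r_start))
		return (-1);
	if (SEQ_GT(a->r_start, b->r_start))
		return (1);
	return (0);
}

RB_GENERATE_STATIC(ex_tree_head, ex_sendmap, r_node, ex_cmp);

/*
 * Exact-match lookup by start sequence: the same pattern the hunk
 * above uses with fe.r_start = seq_out and RB_FIND().
 */
static struct ex_sendmap *
ex_find(struct ex_tree_head *head, uint32_t seq)
{
	struct ex_sendmap key;

	key.r_start = seq;
	return (RB_FIND(ex_tree_head, head, &key));
}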
@@ -3799,7 +4019,7 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
rack->r_ctl.rc_rack_min_rtt = 1;
}
}
- tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
+ tcp_rack_xmit_timer(rack, t + 1);
if ((rsm->r_flags & RACK_TLP) &&
(!IN_RECOVERY(tp->t_flags))) {
/* Segment was a TLP and our retrans matched */
@@ -3812,9 +4032,9 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
* When we enter recovery we need to assure
* we send one packet.
*/
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
- } else
- rack->r_ctl.rc_tlp_rtx_out = 0;
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 7);
+ }
}
if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
/* New more recent rack_tmit_time */
@@ -3833,7 +4053,7 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
if ((to->to_flags & TOF_TS) &&
(ack_type == CUM_ACKED) &&
(to->to_tsecr) &&
- ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
+ ((rsm->r_flags & RACK_OVERMAX) == 0)) {
/*
* Now which timestamp does it match? In this block the ACK
* must be coming from a previous transmission.
@@ -3930,11 +4150,7 @@ rack_log_sack_passed(struct tcpcb *tp,
struct tcp_rack *rack, struct rack_sendmap *rsm)
{
struct rack_sendmap *nrsm;
- uint32_t ts;
- int32_t idx;
- idx = rsm->r_rtr_cnt - 1;
- ts = rsm->r_tim_lastsent[idx];
nrsm = rsm;
TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
rack_head, r_tnext) {
@@ -3943,7 +4159,11 @@ rack_log_sack_passed(struct tcpcb *tp,
continue;
}
if (nrsm->r_flags & RACK_ACKED) {
- /* Skip ack'd segments */
+ /*
+ * Skip ack'd segments, though we
+ * should not see these, since tmap
+ * should not have ack'd segments.
+ */
continue;
}
if (nrsm->r_flags & RACK_SACK_PASSED) {
@@ -3954,146 +4174,219 @@ rack_log_sack_passed(struct tcpcb *tp,
*/
break;
}
- idx = nrsm->r_rtr_cnt - 1;
- if (ts == nrsm->r_tim_lastsent[idx]) {
- /*
- * For this case lets use seq no, if we sent in a
- * big block (TSO) we would have a bunch of segments
- * sent at the same time.
- *
- * We would only get a report if its SEQ is earlier.
- * If we have done multiple retransmits the times
- * would not be equal.
- */
- if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
- nrsm->r_flags |= RACK_SACK_PASSED;
- nrsm->r_flags &= ~RACK_WAS_SACKPASS;
- }
- } else {
- /*
- * Here they were sent at different times, not a big
- * block. Since we transmitted this one later and
- * see it sack'd then this must also be missing (or
- * we would have gotten a sack block for it)
- */
- nrsm->r_flags |= RACK_SACK_PASSED;
- nrsm->r_flags &= ~RACK_WAS_SACKPASS;
- }
+ nrsm->r_flags |= RACK_SACK_PASSED;
+ nrsm->r_flags &= ~RACK_WAS_SACKPASS;
}
}
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
- struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
+ struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
{
- int32_t idx;
- int32_t times = 0;
uint32_t start, end, changed = 0;
- struct rack_sendmap *rsm, *nrsm;
+ struct rack_sendmap stack_map;
+ struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
int32_t used_ref = 1;
+ int moved = 0;
start = sack->start;
end = sack->end;
rsm = *prsm;
- if (rsm && SEQ_LT(start, rsm->r_start)) {
- TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
- if (SEQ_GEQ(start, rsm->r_start) &&
- SEQ_LT(start, rsm->r_end)) {
- goto do_rest_ofb;
- }
- }
- }
- if (rsm == NULL) {
-start_at_beginning:
- rsm = NULL;
+ memset(&fe, 0, sizeof(fe));
+do_rest_ofb:
+ if ((rsm == NULL) ||
+ (SEQ_LT(end, rsm->r_start)) ||
+ (SEQ_GEQ(start, rsm->r_end)) ||
+ (SEQ_LT(start, rsm->r_start))) {
+ /*
+ * We are not in the right spot,
+ * find the correct spot in the tree.
+ */
used_ref = 0;
+ fe.r_start = start;
+ rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ moved++;
}
- /* First lets locate the block where this guy is */
- TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
- if (SEQ_GEQ(start, rsm->r_start) &&
- SEQ_LT(start, rsm->r_end)) {
- break;
- }
- }
-do_rest_ofb:
if (rsm == NULL) {
- /*
- * This happens when we get duplicate sack blocks with the
- * same end. For example SACK 4: 100 SACK 3: 100 The sort
- * will not change there location so we would just start at
- * the end of the first one and get lost.
- */
- if (tp->t_flags & TF_SENTFIN) {
- /*
- * Check to see if we have not logged the FIN that
- * went out.
+ /* TSNH */
+ goto out;
+ }
+ /* Ok we have an ACK for some piece of this rsm */
+ if (rsm->r_start != start) {
+ if ((rsm->r_flags & RACK_ACKED) == 0) {
+ /**
+ * Need to split this in two pieces the before and after,
+ * the before remains in the map, the after must be
+ * added. In other words we have:
+ * rsm |--------------|
+ * sackblk |------->
+ * rsm will become
+ * rsm |---|
+ * and nrsm will be the sacked piece
+ * nrsm |----------|
+ *
+ * But before we start down that path lets
+ * see if the sack spans over on top of
+ * the next guy and it is already sacked.
*/
- nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
- if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
- /*
- * Ok we did not get the FIN logged.
+ next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ if (next && (next->r_flags & RACK_ACKED) &&
+ SEQ_GEQ(end, next->r_start)) {
+ /**
+ * So the next one is already acked, and
+ * we can thus by hookery use our stack_map
+ * to reflect the piece being sacked and
+ * then adjust the two tree entries moving
+ * the start and ends around. So we start like:
+ * rsm |------------| (not-acked)
+ * next |-----------| (acked)
+ * sackblk |-------->
+ * We want to end like so:
+ * rsm |------| (not-acked)
+ * next |-----------------| (acked)
+ * nrsm |-----|
+ * Where nrsm is a temporary stack piece we
+ * use to update all the gizmos.
*/
- nrsm->r_end++;
- rsm = nrsm;
- goto do_rest_ofb;
- }
- }
- if (times == 1) {
+ /* Copy up our fudge block */
+ nrsm = &stack_map;
+ memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
+ /* Now adjust our tree blocks */
+ rsm->r_end = start;
+ next->r_start = start;
+ /* Clear out the dup ack count of the remainder */
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ /* Now lets make sure our fudge block is right */
+ nrsm->r_start = start;
+ /* Now lets update all the stats and such */
+ rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
+ changed += (nrsm->r_end - nrsm->r_start);
+ rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
+ if (nrsm->r_flags & RACK_SACK_PASSED) {
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ /*
+ * Now we want to go up from rsm (the
+ * one left un-acked) to the next one
+ * in the tmap. We do this so when
+ * we walk backwards we include marking
+ * sack-passed on rsm (The one passed in
+ * is skipped since it is generally called
+ * on something sacked before removing it
+ * from the tmap).
+ */
+ if (rsm->r_in_tmap) {
+ nrsm = TAILQ_NEXT(rsm, r_tnext);
+ /*
+ * Now that we have the next
+ * one walk backwards from there.
+ */
+ if (nrsm && nrsm->r_in_tmap)
+ rack_log_sack_passed(tp, rack, nrsm);
+ }
+ /* Now are we done? */
+ if (SEQ_LT(end, next->r_end) ||
+ (end == next->r_end)) {
+ /* Done with block */
+ goto out;
+ }
+ counter_u64_add(rack_sack_used_next_merge, 1);
+				/* Position for the next block */
+ start = next->r_end;
+ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
+ if (rsm == NULL)
+ goto out;
+ } else {
+ /**
+ * We can't use any hookery here, so we
+ * need to split the map. We enter like
+ * so:
+ * rsm |--------|
+ * sackblk |----->
+ * We will add the new block nrsm and
+ * that will be the new portion, and then
+			 * fall through after resetting rsm. So we
+ * split and look like this:
+ * rsm |----|
+ * sackblk |----->
+ * nrsm |---|
+			 * We then fall through resetting
+ * rsm to nrsm, so the next block
+ * picks it up.
+ */
+ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
+ if (nrsm == NULL) {
+ /*
+				 * failed XXXrrs what can we do but lose the sack
+ * info?
+ */
+ goto out;
+ }
+ counter_u64_add(rack_sack_splits, 1);
+ rack_clone_rsm(rack, nrsm, rsm, start);
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
#ifdef INVARIANTS
- panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
- tp, rack, sack, to, prsm);
-#else
- goto out;
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
+ }
#endif
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ /* Position us to point to the new nrsm that starts the sack blk */
+ rsm = nrsm;
+ }
+ } else {
+ /* Already sacked this piece */
+ counter_u64_add(rack_sack_skipped_acked, 1);
+ moved++;
+ if (end == rsm->r_end) {
+ /* Done with block */
+ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ goto out;
+ } else if (SEQ_LT(end, rsm->r_end)) {
+			/* A partial sack to an already sacked block */
+ moved++;
+ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ goto out;
+ } else {
+ /*
+			 * The end goes beyond this guy,
+			 * reposition the start to the
+			 * next block.
+ */
+ start = rsm->r_end;
+ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ if (rsm == NULL)
+ goto out;
+ }
}
- times++;
- counter_u64_add(rack_sack_proc_restart, 1);
- goto start_at_beginning;
- }
- /* Ok we have an ACK for some piece of rsm */
- if (rsm->r_start != start) {
- /*
- * Need to split this in two pieces the before and after.
- */
- nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
- if (nrsm == NULL) {
- /*
- * failed XXXrrs what can we do but loose the sack
- * info?
- */
- goto out;
- }
- nrsm->r_start = start;
- nrsm->r_rtr_bytes = 0;
- nrsm->r_end = rsm->r_end;
- nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
- nrsm->r_flags = rsm->r_flags;
- nrsm->r_sndcnt = rsm->r_sndcnt;
- for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
- nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
- }
- rsm->r_end = nrsm->r_start;
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
- if (rsm->r_in_tmap) {
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
- nrsm->r_in_tmap = 1;
- }
- rsm->r_flags &= (~RACK_HAS_FIN);
- rsm = nrsm;
}
if (SEQ_GEQ(end, rsm->r_end)) {
- /*
+ /**
* The end of this block is either beyond this guy or right
- * at this guy.
+ * at this guy. I.e.:
+ * rsm --- |-----|
+ * end |-----|
+ * <or>
+ * end |---------|
*/
-
+ if (rsm->r_flags & RACK_TLP)
+ rack->r_ctl.rc_tlp_rtx_out = 0;
if ((rsm->r_flags & RACK_ACKED) == 0) {
rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
changed += (rsm->r_end - rsm->r_start);
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
- rack_log_sack_passed(tp, rack, rsm);
+ if (rsm->r_in_tmap) /* should be true */
+ rack_log_sack_passed(tp, rack, rsm);
/* Is Reordering occuring? */
if (rsm->r_flags & RACK_SACK_PASSED) {
+ rsm->r_flags &= ~RACK_SACK_PASSED;
counter_u64_add(rack_reorder_seen, 1);
rack->r_ctl.rc_reorder_ts = cts;
}
@@ -4103,80 +4396,171 @@ do_rest_ofb:
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
}
+ } else {
+ counter_u64_add(rack_sack_skipped_acked, 1);
+ moved++;
}
if (end == rsm->r_end) {
- /* This block only - done */
+ /* This block only - done, setup for next */
goto out;
}
- /* There is more not coverend by this rsm move on */
+ /*
+ * There is more not covered by this rsm, move on
+ * to the next block in the RB tree.
+ */
+ nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
start = rsm->r_end;
- nrsm = TAILQ_NEXT(rsm, r_next);
rsm = nrsm;
- times = 0;
+ if (rsm == NULL)
+ goto out;
goto do_rest_ofb;
}
- /* Ok we need to split off this one at the tail */
- nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
- if (nrsm == NULL) {
- /* failed rrs what can we do but loose the sack info? */
- goto out;
- }
- /* Clone it */
- nrsm->r_start = end;
- nrsm->r_end = rsm->r_end;
- nrsm->r_rtr_bytes = 0;
- nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
- nrsm->r_flags = rsm->r_flags;
- nrsm->r_sndcnt = rsm->r_sndcnt;
- for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
- nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
- }
- /* The sack block does not cover this guy fully */
- rsm->r_flags &= (~RACK_HAS_FIN);
- rsm->r_end = end;
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
- if (rsm->r_in_tmap) {
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
- nrsm->r_in_tmap = 1;
- }
- if (rsm->r_flags & RACK_ACKED) {
- /* Been here done that */
- goto out;
- }
- rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
- changed += (rsm->r_end - rsm->r_start);
- rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
- rack_log_sack_passed(tp, rack, rsm);
- /* Is Reordering occuring? */
- if (rsm->r_flags & RACK_SACK_PASSED) {
- counter_u64_add(rack_reorder_seen, 1);
- rack->r_ctl.rc_reorder_ts = cts;
- }
- rsm->r_flags |= RACK_ACKED;
- rsm->r_flags &= ~RACK_TLP;
- if (rsm->r_in_tmap) {
- TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
- rsm->r_in_tmap = 0;
+ /**
+ * The end of this sack block is smaller than
+ * our rsm i.e.:
+ * rsm --- |-----|
+ * end |--|
+ */
+ if ((rsm->r_flags & RACK_ACKED) == 0) {
+ prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ if (prev && (prev->r_flags & RACK_ACKED)) {
+ /**
+ * Goal, we want the right remainder of rsm to shrink
+ * in place and span from (rsm->r_start = end) to rsm->r_end.
+ * We want to expand prev to go all the way
+ * to prev->r_end <- end.
+ * so in the tree we have before:
+ * prev |--------| (acked)
+ * rsm |-------| (non-acked)
+ * sackblk |-|
+ * We churn it so we end up with
+ * prev |----------| (acked)
+ * rsm |-----| (non-acked)
+ * nrsm |-| (temporary)
+ */
+ nrsm = &stack_map;
+ memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
+ prev->r_end = end;
+ rsm->r_start = end;
+ /* Now adjust nrsm (stack copy) to be
+ * the one that is the small
+ * piece that was "sacked".
+ */
+ nrsm->r_end = end;
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ /*
+ * Now nrsm is our new little piece
+ * that is acked (which was merged
+ * to prev). Update the rtt and changed
+ * based on that. Also check for reordering.
+ */
+ rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
+ changed += (nrsm->r_end - nrsm->r_start);
+ rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
+ if (nrsm->r_flags & RACK_SACK_PASSED) {
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ rsm = prev;
+ counter_u64_add(rack_sack_used_prev_merge, 1);
+ } else {
+ /**
+ * This is the case where our previous
+ * block is not acked either, so we must
+ * split the block in two.
+ */
+ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
+ if (nrsm == NULL) {
+ /* failed rrs what can we do but lose the sack info? */
+ goto out;
+ }
+ /**
+ * In this case nrsm becomes
+ * nrsm->r_start = end;
+ * nrsm->r_end = rsm->r_end;
+ * which is un-acked.
+ * <and>
+ * rsm->r_end = nrsm->r_start;
+ * i.e. the remaining un-acked
+ * piece is left on the left
+ * hand side.
+ *
+ * So we start like this
+ * rsm |----------| (not acked)
+ * sackblk |---|
+ * build it so we have
+ * rsm |---| (acked)
+ * nrsm |------| (not acked)
+ */
+ counter_u64_add(rack_sack_splits, 1);
+ rack_clone_rsm(rack, nrsm, rsm, end);
+ rsm->r_flags &= (~RACK_HAS_FIN);
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
+ }
+#endif
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ nrsm->r_dupack = 0;
+ rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
+ if (rsm->r_flags & RACK_TLP)
+ rack->r_ctl.rc_tlp_rtx_out = 0;
+ rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
+ changed += (rsm->r_end - rsm->r_start);
+ rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+ if (rsm->r_in_tmap) /* should be true */
+ rack_log_sack_passed(tp, rack, rsm);
+ /* Is Reordering occurring? */
+ if (rsm->r_flags & RACK_SACK_PASSED) {
+ rsm->r_flags &= ~RACK_SACK_PASSED;
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = cts;
+ }
+ rsm->r_flags |= RACK_ACKED;
+ rsm->r_flags &= ~RACK_TLP;
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ }
+ } else if (start != end) {
+ /*
+ * The block was already acked.
+ */
+ counter_u64_add(rack_sack_skipped_acked, 1);
+ moved++;
}
out:
if (rsm && (rsm->r_flags & RACK_ACKED)) {
/*
- * Now can we merge this newly acked
- * block with either the previous or
+ * Now can we merge where we worked
+ * with either the previous or
* next block?
*/
- nrsm = TAILQ_NEXT(rsm, r_next);
- if (nrsm &&
- (nrsm->r_flags & RACK_ACKED)) {
+ next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ while (next) {
+ if (next->r_flags & RACK_ACKED) {
/* yep this and next can be merged */
- rsm = rack_merge_rsm(rack, rsm, nrsm);
+ rsm = rack_merge_rsm(rack, rsm, next);
+ next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ } else
+ break;
}
/* Now what about the previous? */
- nrsm = TAILQ_PREV(rsm, rack_head, r_next);
- if (nrsm &&
- (nrsm->r_flags & RACK_ACKED)) {
+ prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ while (prev) {
+ if (prev->r_flags & RACK_ACKED) {
/* yep the previous and this can be merged */
- rsm = rack_merge_rsm(rack, nrsm, rsm);
+ rsm = rack_merge_rsm(rack, prev, rsm);
+ prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ } else
+ break;
}
}
if (used_ref == 0) {
@@ -4184,12 +4568,14 @@ out:
} else {
counter_u64_add(rack_sack_proc_short, 1);
}
- /* Save off where we last were */
- if (rsm)
- rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
+ /* Save off the next one for quick reference. */
+ if (rsm)
+ nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
else
- rack->r_ctl.rc_sacklast = NULL;
- *prsm = rsm;
+ nrsm = NULL;
+ *prsm = rack->r_ctl.rc_sacklast = nrsm;
+ /* Pass back the moved. */
+ *moved_two = moved;
return (changed);
}
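
The paths above either reuse a neighbouring acked block or split the overlapped rack_sendmap entry so that exactly the SACKed span is accounted for. Below is a minimal user-space sketch of the split-at-boundary idea; a plain linked list and hypothetical helpers stand in for the kernel RB tree, rack_alloc_limit() and rack_clone_rsm().

/*
 * Minimal sketch of splitting a [r_start, r_end) send-map entry at a
 * SACK boundary.  Types and helpers here are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct seg {
	uint32_t r_start;	/* first sequence covered */
	uint32_t r_end;		/* one past the last sequence */
	struct seg *next;
};

/* Keep [r_start, at) in place, hang [at, r_end) after it. */
static struct seg *
seg_split(struct seg *s, uint32_t at)
{
	struct seg *n;

	if (at <= s->r_start || at >= s->r_end)
		return (NULL);		/* nothing to split */
	if ((n = malloc(sizeof(*n))) == NULL)
		return (NULL);		/* like the kernel: lose the SACK info */
	*n = *s;			/* clone, cf. rack_clone_rsm() */
	n->r_start = at;
	s->r_end = at;
	n->next = s->next;
	s->next = n;
	return (n);
}

int
main(void)
{
	struct seg a = { 1000, 3000, NULL };
	struct seg *p;

	seg_split(&a, 2000);	/* a SACK block covered only [2000, 3000) */
	for (p = &a; p != NULL; p = p->next)
		printf("[%u,%u)\n", (unsigned)p->r_start, (unsigned)p->r_end);
	free(a.next);
	return (0);
}
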
@@ -4218,7 +4604,7 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac
tmap = rsm;
}
tmap->r_in_tmap = 1;
- rsm = TAILQ_NEXT(rsm, r_next);
+ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
}
/*
* Now lets possibly clear the sack filter so we start
@@ -4230,16 +4616,72 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac
}
static void
+rack_do_decay(struct tcp_rack *rack)
+{
+ struct timeval res;
+
+#define timersub(tvp, uvp, vvp) \
+ do { \
+ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
+ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
+ if ((vvp)->tv_usec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_usec += 1000000; \
+ } \
+ } while (0)
+
+ timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res);
+#undef timersub
+
+ rack->r_ctl.input_pkt++;
+ if ((rack->rc_in_persist) ||
+ (res.tv_sec >= 1) ||
+ (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
+ /*
+ * Check for decay of non-SAD,
+ * we want all SAD detection metrics to
+ * decay 1/4 per second (or more) passed.
+ */
+ uint32_t pkt_delta;
+
+ pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
+ /* Update our saved tracking values */
+ rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
+ rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
+ /* Now do we escape without decay? */
+ if (rack->rc_in_persist ||
+ (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
+ (pkt_delta < tcp_sad_low_pps)){
+ /*
+ * We don't decay idle connections
+ * or ones that have a low input pps.
+ */
+ return;
+ }
+ /* Decay the counters */
+ rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
+ tcp_sad_decay_val);
+ rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
+ tcp_sad_decay_val);
+ rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
+ tcp_sad_decay_val);
+ rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
+ tcp_sad_decay_val);
+ }
+}
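
rack_do_decay() ages the SACK-attack-detection counters once roughly a second of ack activity has passed, skipping idle or low-pps connections. A hedged sketch of the percentage decay that ctf_decay_count() appears to apply follows; the 75% value below only illustrates the "decay 1/4 per second" comment and is not the sysctl default.

/*
 * Illustrative percentage decay; assumes ctf_decay_count(count, pct)
 * returns roughly count * pct / 100.  Not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
decay_count(uint32_t count, uint32_t pct)
{
	uint64_t v = (uint64_t)count * pct;

	return ((uint32_t)(v / 100));
}

int
main(void)
{
	uint32_t sack_count = 4000;
	int sec;

	for (sec = 1; sec <= 4; sec++) {
		sack_count = decay_count(sack_count, 75);	/* lose ~1/4 per tick */
		printf("after %d s: %u\n", sec, (unsigned)sack_count);
	}
	return (0);
}
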
+
+static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
{
- uint32_t changed, last_seq, entered_recovery = 0;
+ uint32_t changed, entered_recovery = 0;
struct tcp_rack *rack;
- struct rack_sendmap *rsm;
+ struct rack_sendmap *rsm, *rm;
struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
register uint32_t th_ack;
int32_t i, j, k, num_sack_blks = 0;
uint32_t cts, acked, ack_point, sack_changed = 0;
-
+ int loop_start = 0, moved_two = 0;
+
INP_WLOCK_ASSERT(tp->t_inpcb);
if (th->th_flags & TH_RST) {
/* We don't log resets */
@@ -4247,10 +4689,31 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
}
rack = (struct tcp_rack *)tp->t_fb_ptr;
cts = tcp_ts_getticks();
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
changed = 0;
th_ack = th->th_ack;
+ if (rack->sack_attack_disable == 0)
+ rack_do_decay(rack);
+ if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
+ /*
+ * You only get credit for
+ * MSS and greater (and you get extra
+ * credit for larger cum-ack moves).
+ */
+ int ac;
+ ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
+ rack->r_ctl.ack_count += ac;
+ counter_u64_add(rack_ack_total, ac);
+ }
+ if (rack->r_ctl.ack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
if (SEQ_GT(th_ack, tp->snd_una)) {
rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
tp->t_acktime = ticks;
@@ -4264,8 +4727,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
* RTT's.
*/
rack->r_wanted_output++;
-more:
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ more:
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
if (rsm == NULL) {
if ((th_ack - 1) == tp->iss) {
/*
@@ -4282,9 +4745,9 @@ more:
}
#ifdef INVARIANTS
panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
- tp,
- th, tp->t_state, rack,
- tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
+ tp,
+ th, tp->t_state, rack,
+ tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
#endif
goto proc_sack;
}
@@ -4292,8 +4755,8 @@ more:
/* Huh map is missing this */
#ifdef INVARIANTS
printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
- rsm->r_start,
- th_ack, tp->t_state, rack->r_state);
+ rsm->r_start,
+ th_ack, tp->t_state, rack->r_state);
#endif
goto proc_sack;
}
@@ -4305,15 +4768,19 @@ more:
rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
rsm->r_rtr_bytes = 0;
- TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
+ if (rsm->r_flags & RACK_TLP)
+ rack->r_ctl.rc_tlp_rtx_out = 0;
+ rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#ifdef INVARIANTS
+ if (rm != rsm) {
+ panic("removing head in rack:%p rsm:%p rm:%p",
+ rack, rsm, rm);
+ }
+#endif
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
}
- if (rack->r_ctl.rc_next == rsm) {
- /* scoot along the marker */
- rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
- }
if (rsm->r_flags & RACK_ACKED) {
/*
* It was acked on the scoreboard -- remove
@@ -4322,10 +4789,11 @@ more:
rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
} else if (rsm->r_flags & RACK_SACK_PASSED) {
/*
- * There are acked segments ACKED on the
+ * There are segments ACKED on the
* scoreboard further up. We are seeing
* reordering.
*/
+ rsm->r_flags &= ~RACK_SACK_PASSED;
counter_u64_add(rack_reorder_seen, 1);
rsm->r_flags |= RACK_ACKED;
rack->r_ctl.rc_reorder_ts = cts;
@@ -4357,13 +4825,31 @@ more:
*/
rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
}
- rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
- rsm->r_rtr_bytes = 0;
+ /*
+ * Clear the dup ack count for
+ * the piece that remains.
+ */
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ if (rsm->r_rtr_bytes) {
+ /*
+ * It was retransmitted adjust the
+ * sack holes for what was acked.
+ */
+ int ack_am;
+
+ ack_am = (th_ack - rsm->r_start);
+ if (ack_am >= rsm->r_rtr_bytes) {
+ rack->r_ctl.rc_holes_rxt -= ack_am;
+ rsm->r_rtr_bytes -= ack_am;
+ }
+ }
+ /* Update where the piece starts */
rsm->r_start = th_ack;
}
proc_sack:
/* Check for reneging */
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
/*
* The peer has moved snd_una up to
@@ -4382,15 +4868,9 @@ proc_sack:
rack_peer_reneges(rack, rsm, th->th_ack);
}
if ((to->to_flags & TOF_SACK) == 0) {
- /* We are done nothing left to log */
+ /* We are done nothing left */
goto out;
}
- rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
- if (rsm) {
- last_seq = rsm->r_end;
- } else {
- last_seq = tp->snd_max;
- }
/* Sack block processing */
if (SEQ_GT(th_ack, tp->snd_una))
ack_point = th_ack;
@@ -4398,7 +4878,7 @@ proc_sack:
ack_point = tp->snd_una;
for (i = 0; i < to->to_nsacks; i++) {
bcopy((to->to_sacks + i * TCPOLEN_SACK),
- &sack, sizeof(sack));
+ &sack, sizeof(sack));
sack.start = ntohl(sack.start);
sack.end = ntohl(sack.end);
if (SEQ_GT(sack.end, sack.start) &&
@@ -4406,28 +4886,19 @@ proc_sack:
SEQ_LT(sack.start, tp->snd_max) &&
SEQ_GT(sack.end, ack_point) &&
SEQ_LEQ(sack.end, tp->snd_max)) {
- if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
- (SEQ_LT(sack.end, last_seq)) &&
- ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
- /*
- * Not the last piece and its smaller than
- * 1/8th of a MSS. We ignore this.
- */
- counter_u64_add(rack_runt_sacks, 1);
- continue;
- }
sack_blocks[num_sack_blks] = sack;
num_sack_blks++;
+#ifdef NETFLIX_STATS
} else if (SEQ_LEQ(sack.start, th_ack) &&
SEQ_LEQ(sack.end, th_ack)) {
/*
* Its a D-SACK block.
*/
-/* tcp_record_dsack(sack.start, sack.end); */
+ tcp_record_dsack(sack.start, sack.end);
+#endif
}
+
}
- if (num_sack_blks == 0)
- goto out;
/*
* Sort the SACK blocks so we can update the rack scoreboard with
* just one pass.
@@ -4437,7 +4908,12 @@ proc_sack:
num_sack_blks, th->th_ack);
ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
}
+ if (num_sack_blks == 0) {
+ /* Nothing to sack (DSACKs?) */
+ goto out_with_totals;
+ }
if (num_sack_blks < 2) {
+ /* Only one, we don't need to sort */
goto do_sack_work;
}
/* Sort the sacks */
@@ -4452,9 +4928,11 @@ proc_sack:
}
/*
* Now are any of the sack block ends the same (yes some
- * implememtations send these)?
+ * implementations send these)?
*/
again:
+ if (num_sack_blks == 0)
+ goto out_with_totals;
if (num_sack_blks > 1) {
for (i = 0; i < num_sack_blks; i++) {
for (j = i + 1; j < num_sack_blks; j++) {
@@ -4488,16 +4966,230 @@ again:
}
}
do_sack_work:
+ /*
+ * First lets look to see if
+ * we have retransmitted and
+ * can use the transmit next?
+ */
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm &&
+ SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
+ SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
+ /*
+ * We probably did the FR and the next
+ * SACK continues on as we would expect.
+ */
+ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
+ if (acked) {
+ rack->r_wanted_output++;
+ changed += acked;
+ sack_changed += acked;
+ }
+ if (num_sack_blks == 1) {
+ /*
+ * This is what we would expect from
+ * a normal implementation to happen
+ * after we have retransmitted the FR,
+ * i.e. the sack-filter pushes down
+ * to 1 block and the next to be retransmitted
+ * is the sequence in the sack block (as more
+ * are acked). Count this as ACK'd data to boost
+ * up the chances of recovering any false positives.
+ */
+ rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
+ counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
+ counter_u64_add(rack_express_sack, 1);
+ if (rack->r_ctl.ack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
+ goto out_with_totals;
+ } else {
+ /*
+ * Start the loop through the
+ * rest of blocks, past the first block.
+ */
+ moved_two = 0;
+ loop_start = 1;
+ }
+ }
+ /* It's a sack of some sort */
+ rack->r_ctl.sack_count++;
+ if (rack->r_ctl.sack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
+ counter_u64_add(rack_sack_total, 1);
+ if (rack->sack_attack_disable) {
+ /* An attacker disablement is in place */
+ if (num_sack_blks > 1) {
+ rack->r_ctl.sack_count += (num_sack_blks - 1);
+ rack->r_ctl.sack_moved_extra++;
+ counter_u64_add(rack_move_some, 1);
+ if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
+ rack->r_ctl.sack_moved_extra /= 2;
+ rack->r_ctl.sack_noextra_move /= 2;
+ }
+ }
+ goto out;
+ }
rsm = rack->r_ctl.rc_sacklast;
- for (i = 0; i < num_sack_blks; i++) {
- acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
+ for (i = loop_start; i < num_sack_blks; i++) {
+ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
if (acked) {
rack->r_wanted_output++;
changed += acked;
sack_changed += acked;
}
+ if (moved_two) {
+ /*
+ * If we did not get a SACK for at least a MSS and
+ * had to move at all, or if we moved more than our
+ * threshold, it counts against the "extra" move.
+ */
+ rack->r_ctl.sack_moved_extra += moved_two;
+ counter_u64_add(rack_move_some, 1);
+ } else {
+ /*
+ * else we did not have to move
+ * any more than we would expect.
+ */
+ rack->r_ctl.sack_noextra_move++;
+ counter_u64_add(rack_move_none, 1);
+ }
+ if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
+ /*
+ * If the SACK was not a full MSS then
+ * we add to sack_count the number of
+ * MSS's (or possibly more than
+ * a MSS if it's a TSO send) we had to skip by.
+ */
+ rack->r_ctl.sack_count += moved_two;
+ counter_u64_add(rack_sack_total, moved_two);
+ }
+ /*
+ * Now we need to setup for the next
+ * round. First we make sure we won't
+ * exceed the size of our uint32_t on
+ * the various counts, and then clear out
+ * moved_two.
+ */
+ if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
+ (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
+ rack->r_ctl.sack_moved_extra /= 2;
+ rack->r_ctl.sack_noextra_move /= 2;
+ }
+ if (rack->r_ctl.sack_count > 0xfff00000) {
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
+ moved_two = 0;
+ }
+out_with_totals:
+ if (num_sack_blks > 1) {
+ /*
+ * You get an extra stroke if
+ * you have more than one sack-blk, this
+ * could be where we are skipping forward
+ * and the sack-filter is still working, or
+ * it could be an attacker constantly
+ * moving us.
+ */
+ rack->r_ctl.sack_moved_extra++;
+ counter_u64_add(rack_move_some, 1);
}
out:
+#ifdef NETFLIX_EXP_DETECTION
+ if ((rack->do_detection || tcp_force_detection) &&
+ tcp_sack_to_ack_thresh &&
+ tcp_sack_to_move_thresh &&
+ ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
+ /*
+ * We have thresholds set to find
+ * possible attackers and disable sack.
+ * Check them.
+ */
+ uint64_t ackratio, moveratio, movetotal;
+
+ /* Log detecting */
+ rack_log_sad(rack, 1);
+ ackratio = (uint64_t)(rack->r_ctl.sack_count);
+ ackratio *= (uint64_t)(1000);
+ if (rack->r_ctl.ack_count)
+ ackratio /= (uint64_t)(rack->r_ctl.ack_count);
+ else {
+ /* We really should not hit here */
+ ackratio = 1000;
+ }
+ if ((rack->sack_attack_disable == 0) &&
+ (ackratio > rack_highest_sack_thresh_seen))
+ rack_highest_sack_thresh_seen = (uint32_t)ackratio;
+ movetotal = rack->r_ctl.sack_moved_extra;
+ movetotal += rack->r_ctl.sack_noextra_move;
+ moveratio = rack->r_ctl.sack_moved_extra;
+ moveratio *= (uint64_t)1000;
+ if (movetotal)
+ moveratio /= movetotal;
+ else {
+ /* No moves, that's pretty good */
+ moveratio = 0;
+ }
+ if ((rack->sack_attack_disable == 0) &&
+ (moveratio > rack_highest_move_thresh_seen))
+ rack_highest_move_thresh_seen = (uint32_t)moveratio;
+ if (rack->sack_attack_disable == 0) {
+ if ((ackratio > tcp_sack_to_ack_thresh) &&
+ (moveratio > tcp_sack_to_move_thresh)) {
+ /* Disable sack processing */
+ rack->sack_attack_disable = 1;
+ if (rack->r_rep_attack == 0) {
+ rack->r_rep_attack = 1;
+ counter_u64_add(rack_sack_attacks_detected, 1);
+ }
+ if (tcp_attack_on_turns_on_logging) {
+ /*
+ * Turn on logging, used for debugging
+ * false positives.
+ */
+ rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
+ }
+ /* Clamp the cwnd at flight size */
+ rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
+ rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ rack_log_sad(rack, 2);
+ }
+ } else {
+ /* We are sack-disabled, check for false positives */
+ if ((ackratio <= tcp_restoral_thresh) ||
+ (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) {
+ rack->sack_attack_disable = 0;
+ rack_log_sad(rack, 3);
+ /* Restart counting */
+ rack->r_ctl.sack_count = 0;
+ rack->r_ctl.sack_moved_extra = 0;
+ rack->r_ctl.sack_noextra_move = 1;
+ rack->r_ctl.ack_count = max(1,
+ (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp)));
+
+ if (rack->r_rep_reverse == 0) {
+ rack->r_rep_reverse = 1;
+ counter_u64_add(rack_sack_attacks_reversed, 1);
+ }
+ /* Restore the cwnd */
+ if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
+ rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
+ }
+ }
+ }
+#endif
if (changed) {
/* Something changed cancel the rack timer */
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
@@ -4523,12 +5215,13 @@ out:
* When we enter recovery we need to assure we send
* one packet.
*/
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 8);
rack->r_timer_override = 1;
}
}
if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
- /* Deal with changed an PRR here (in recovery only) */
+ /* Deal with changed and PRR here (in recovery only) */
uint32_t pipe, snd_una;
rack->r_ctl.rc_prr_delivered += changed;
@@ -4547,6 +5240,7 @@ out:
sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
else {
rack->r_ctl.rc_prr_sndcnt = 0;
+ rack_log_to_prr(rack, 9);
sndcnt = 0;
}
sndcnt++;
@@ -4555,6 +5249,7 @@ out:
else
sndcnt = 0;
rack->r_ctl.rc_prr_sndcnt = sndcnt;
+ rack_log_to_prr(rack, 10);
} else {
uint32_t limit;
@@ -4564,19 +5259,38 @@ out:
limit = 0;
if (changed > limit)
limit = changed;
- limit += tp->t_maxseg;
+ limit += ctf_fixed_maxseg(tp);
if (tp->snd_ssthresh > pipe) {
rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
+ rack_log_to_prr(rack, 11);
} else {
rack->r_ctl.rc_prr_sndcnt = min(0, limit);
+ rack_log_to_prr(rack, 12);
}
}
- if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
+ if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) {
rack->r_timer_override = 1;
}
}
}
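
The NETFLIX_EXP_DETECTION block above avoids floating point by scaling both detection ratios by 1000: ackratio = sack_count * 1000 / ack_count and moveratio = sack_moved_extra * 1000 / (sack_moved_extra + sack_noextra_move). A small worked sketch of that fixed-point comparison follows; the threshold values are placeholders, not the tcp_sack_to_ack_thresh / tcp_sack_to_move_thresh defaults.

/*
 * Sketch of the scaled-by-1000 SAD ratios.  Thresholds are placeholders;
 * the real ones come from sysctls.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t sack_count = 900, ack_count = 120;
	uint64_t moved_extra = 700, noextra_move = 300;
	uint64_t ack_thresh = 700, move_thresh = 600;	/* hypothetical */
	uint64_t ackratio, moveratio, movetotal;

	ackratio = ack_count ? (sack_count * 1000) / ack_count : 1000;
	movetotal = moved_extra + noextra_move;
	moveratio = movetotal ? (moved_extra * 1000) / movetotal : 0;
	printf("ackratio=%ju moveratio=%ju -> %s\n",
	    (uintmax_t)ackratio, (uintmax_t)moveratio,
	    (ackratio > ack_thresh && moveratio > move_thresh) ?
	    "disable SACK processing" : "leave SACK processing on");
	return (0);
}
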
+static void
+rack_strike_dupack(struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm && (rsm->r_dupack < 0xff)) {
+ rsm->r_dupack++;
+ if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
+ rack->r_wanted_output = 1;
+ rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
+ } else {
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
+ }
+ }
+}
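
rack_strike_dupack() bumps a saturating per-segment duplicate-ACK counter and requests output once DUP_ACK_THRESHOLD is reached. A standalone sketch of that counter follows; the threshold of 3 is an assumption for illustration.

/* Saturating dup-ack counter, modelled on rack_strike_dupack(). */
#include <stdint.h>
#include <stdio.h>

#define DUP_ACK_THRESHOLD 3		/* assumed classic 3-dupack trigger */

struct mini_rsm {
	uint8_t r_dupack;
};

static int
strike_dupack(struct mini_rsm *rsm)
{
	if (rsm->r_dupack < 0xff)
		rsm->r_dupack++;
	return (rsm->r_dupack >= DUP_ACK_THRESHOLD);
}

int
main(void)
{
	struct mini_rsm rsm = { 0 };
	int i;

	for (i = 1; i <= 4; i++)
		printf("dupack %d -> want output: %d\n", i, strike_dupack(&rsm));
	return (0);
}
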
+
/*
* Return value of 1, we do not need to call rack_process_data().
* return value of 0, rack_process_data can be called.
@@ -4598,10 +5312,15 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (SEQ_GT(th->th_ack, tp->snd_max)) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ rack->r_wanted_output++;
return (1);
}
if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
+ if (rack->rc_in_persist)
+ tp->t_rxtshift = 0;
+ if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
+ rack_strike_dupack(rack);
rack_log_ack(tp, to, th);
}
if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
@@ -4675,9 +5394,6 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
rack->r_wanted_output++;
}
- /*
- * If no data (only SYN) was ACK'd, skip rest of ACK processing.
- */
if (acked == 0) {
if (ofia)
*ofia = ourfinisacked;
@@ -4732,7 +5448,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
- tp->t_acktime = 0;
+ if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
+ tp->t_acktime = 0;
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
/* Set need output so persist might get set */
rack->r_wanted_output++;
@@ -4748,7 +5465,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
*ret_val = 1;
tp = tcp_close(tp);
- rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
return (1);
}
}
@@ -4757,6 +5474,91 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (0);
}
+static void
+rack_collapsed_window(struct tcp_rack *rack)
+{
+ /*
+ * Now we must walk the
+ * send map and divide the
+ * ones left stranded. These
+ * guys can't cause us to abort
+ * the connection and are really
+ * "unsent". However if a buggy
+ * client actually did keep some
+ * of the data i.e. collapsed the win
+ * and refused to ack and then opened
+ * the win and acked that data. We would
+ * get into an ack war, the simplier
+ * method then of just pretending we
+ * did not send those segments something
+ * won't work.
+ */
+ struct rack_sendmap *rsm, *nrsm, fe, *insret;
+ tcp_seq max_seq;
+ uint32_t maxseg;
+
+ max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
+ maxseg = ctf_fixed_maxseg(rack->rc_tp);
+ memset(&fe, 0, sizeof(fe));
+ fe.r_start = max_seq;
+ /* Find the first seq past or at maxseq */
+ rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ if (rsm == NULL) {
+ /* Nothing to do, strange */
+ rack->rc_has_collapsed = 0;
+ return;
+ }
+ /*
+ * Now do we need to split at
+ * the collapse point?
+ */
+ if (SEQ_GT(max_seq, rsm->r_start)) {
+ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
+ if (nrsm == NULL) {
+ /* We can't get a rsm, mark all? */
+ nrsm = rsm;
+ goto no_split;
+ }
+ /* Clone it */
+ rack_clone_rsm(rack, nrsm, rsm, max_seq);
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ nrsm, insret, rack, rsm);
+ }
+#endif
+ if (rsm->r_in_tmap) {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
+ nrsm->r_in_tmap = 1;
+ }
+ /*
+ * Set in the new RSM as the
+ * collapsed starting point
+ */
+ rsm = nrsm;
+ }
+no_split:
+ counter_u64_add(rack_collapsed_win, 1);
+ RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
+ nrsm->r_flags |= RACK_RWND_COLLAPSED;
+ rack->rc_has_collapsed = 1;
+ }
+}
+
+static void
+rack_un_collapse_window(struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+
+ RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
+ if (rsm->r_flags & RACK_RWND_COLLAPSED)
+ rsm->r_flags &= ~RACK_RWND_COLLAPSED;
+ else
+ break;
+ }
+ rack->rc_has_collapsed = 0;
+}
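
rack_collapsed_window() locates the first entry at or beyond snd_una + snd_wnd, splits the straddling entry if needed, and flags everything from there on as RACK_RWND_COLLAPSED; rack_un_collapse_window() clears the flags walking backwards from the tail. A sketch of the marking walk over a sorted array standing in for the RB tree (the boundary split is omitted here):

/*
 * Sketch of collapsing/un-collapsing the send map over a sorted array;
 * the kernel does the same walk over the RB tree.
 */
#include <stdint.h>
#include <stdio.h>

#define RWND_COLLAPSED 0x1

struct seg {
	uint32_t start, end, flags;
};

static void
collapse(struct seg *s, int n, uint32_t max_seq)
{
	int i;

	for (i = 0; i < n; i++)
		if (s[i].start >= max_seq)	/* at or past the shrunken edge */
			s[i].flags |= RWND_COLLAPSED;
}

static void
un_collapse(struct seg *s, int n)
{
	int i;

	for (i = n - 1; i >= 0; i--) {
		if (s[i].flags & RWND_COLLAPSED)
			s[i].flags &= ~RWND_COLLAPSED;
		else
			break;			/* stop at the first unmarked entry */
	}
}

int
main(void)
{
	struct seg segs[3] = { {0, 1000, 0}, {1000, 2000, 0}, {2000, 3000, 0} };

	collapse(segs, 3, 1500);	/* peer shrank the window to 1500 */
	printf("collapsed: %x %x %x\n", (unsigned)segs[0].flags,
	    (unsigned)segs[1].flags, (unsigned)segs[2].flags);
	un_collapse(segs, 3);		/* window re-opened */
	printf("restored:  %x %x %x\n", (unsigned)segs[0].flags,
	    (unsigned)segs[1].flags, (unsigned)segs[2].flags);
	return (0);
}
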
/*
* Return value of 1, the TCB is unlocked and most
@@ -4773,11 +5575,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
* send garbage on first SYN.
*/
int32_t nsegs;
-#ifdef TCP_RFC7413
int32_t tfo_syn;
-#else
-#define tfo_syn (FALSE)
-#endif
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -4804,13 +5602,36 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_wl2 = th->th_ack;
}
}
+ if (tp->snd_wnd < ctf_outstanding(tp))
+ /* The peer collapsed the window */
+ rack_collapsed_window(rack);
+ else if (rack->rc_has_collapsed)
+ rack_un_collapse_window(rack);
/* Was persist timer active and now we have window space? */
- if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
+ if ((rack->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
+ rack->r_ctl.rc_pace_min_segs))) {
rack_exit_persist(tp, rack);
tp->snd_nxt = tp->snd_max;
/* Make sure we output to start the timer */
rack->r_wanted_output++;
}
+ /* Do we enter persists? */
+ if ((rack->rc_in_persist == 0) &&
+ (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_max == tp->snd_una) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
+ (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
+ /*
+ * Here the rwnd is less than
+ * the pacing size, we are established,
+ * nothing is outstanding, and there is
+ * data to send. Enter persists.
+ */
+ tp->snd_nxt = tp->snd_una;
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ }
if (tp->t_flags2 & TF2_DROP_AF_DATA) {
m_freem(m);
return (0);
@@ -4886,10 +5707,8 @@ dodata: /* XXX */
* PRU_RCVD). If a FIN has already been received on this connection
* then we just ignore the text.
*/
-#ifdef TCP_RFC7413
tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
- (tp->t_flags & TF_FASTOPEN));
-#endif
+ IS_FASTOPEN(tp->t_flags));
if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
@@ -4912,6 +5731,20 @@ dodata: /* XXX */
SEGQ_EMPTY(tp) &&
(TCPS_HAVEESTABLISHED(tp->t_state) ||
tfo_syn)) {
+#ifdef NETFLIX_SB_LIMITS
+ u_int mcnt, appended;
+
+ if (so->so_rcv.sb_shlim) {
+ mcnt = m_memcnt(m);
+ appended = 0;
+ if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
+ CFO_NOSLEEP, NULL) == false) {
+ counter_u64_add(tcp_sb_shlim_fails, 1);
+ m_freem(m);
+ return (0);
+ }
+ }
+#endif
if (DELAY_ACK(tp, tlen) || tfo_syn) {
rack_timer_cancel(tp, rack,
rack->r_ctl.rc_rcvtime, __LINE__);
@@ -4925,12 +5758,20 @@ dodata: /* XXX */
TCPSTAT_ADD(tcps_rcvpack, nsegs);
TCPSTAT_ADD(tcps_rcvbyte, tlen);
SOCKBUF_LOCK(&so->so_rcv);
- if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
- else
- sbappendstream_locked(&so->so_rcv, m, 0);
+ } else
+#ifdef NETFLIX_SB_LIMITS
+ appended =
+#endif
+ sbappendstream_locked(&so->so_rcv, m, 0);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
+#ifdef NETFLIX_SB_LIMITS
+ if (so->so_rcv.sb_shlim && appended != mcnt)
+ counter_fo_release(so->so_rcv.sb_shlim,
+ mcnt - appended);
+#endif
} else {
/*
* XXX: Due to the header drop above "th" is
@@ -5063,6 +5904,9 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
int32_t nsegs;
int32_t newsize = 0; /* automatic sockbuf scaling */
struct tcp_rack *rack;
+#ifdef NETFLIX_SB_LIMITS
+ u_int mcnt, appended;
+#endif
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -5112,10 +5956,21 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
nsegs = max(1, m->m_pkthdr.lro_nsegs);
-
+#ifdef NETFLIX_SB_LIMITS
+ if (so->so_rcv.sb_shlim) {
+ mcnt = m_memcnt(m);
+ appended = 0;
+ if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
+ CFO_NOSLEEP, NULL) == false) {
+ counter_u64_add(tcp_sb_shlim_fails, 1);
+ m_freem(m);
+ return (1);
+ }
+ }
+#endif
/* Clean receiver SACK report if present */
if (tp->rcv_numsacks)
- tcp_clean_sackreport(tp);
+ tcp_clean_sackreport(tp);
TCPSTAT_INC(tcps_preddat);
tp->rcv_nxt += tlen;
/*
@@ -5149,11 +6004,18 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
newsize, so, NULL))
so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
- sbappendstream_locked(&so->so_rcv, m, 0);
- rack_calc_rwin(so, tp);
+#ifdef NETFLIX_SB_LIMITS
+ appended =
+#endif
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ ctf_calc_rwin(so, tp);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
+#ifdef NETFLIX_SB_LIMITS
+ if (so->so_rcv.sb_shlim && mcnt != appended)
+ counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
+#endif
if (DELAY_ACK(tp, tlen)) {
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
tp->t_flags |= TF_DELACK;
@@ -5231,6 +6093,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* Ok if we reach here, we can process a fast-ack */
nsegs = max(1, m->m_pkthdr.lro_nsegs);
rack_log_ack(tp, to, th);
+ /*
+ * We made progress, clear the tlp
+ * out flag so we could start a TLP
+ * again.
+ */
+ rack->r_ctl.rc_tlp_rtx_out = 0;
/* Did the window get updated? */
if (tiwin != tp->snd_wnd) {
tp->snd_wnd = tiwin;
@@ -5238,9 +6106,28 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (tp->snd_wnd > tp->max_sndwnd)
tp->max_sndwnd = tp->snd_wnd;
}
- if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
+ /* Do we exit persists? */
+ if ((rack->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
+ rack->r_ctl.rc_pace_min_segs))) {
rack_exit_persist(tp, rack);
}
+ /* Do we enter persists? */
+ if ((rack->rc_in_persist == 0) &&
+ (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_max == tp->snd_una) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
+ (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
+ /*
+ * Here the rwnd is less than
+ * the pacing size, we are established,
+ * nothing is outstanding, and there is
+ * data to send. Enter persists.
+ */
+ tp->snd_nxt = tp->snd_una;
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ }
/*
* If last ACK falls within this segment's sequence numbers, record
* the timestamp. NOTE that the test is modified according to the
@@ -5290,6 +6177,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
tp->snd_una = th->th_ack;
+ if (tp->snd_wnd < ctf_outstanding(tp)) {
+ /* The peer collapsed the window */
+ rack_collapsed_window(rack);
+ } else if (rack->rc_has_collapsed)
+ rack_un_collapse_window(rack);
+
/*
* Pull snd_wl2 up to prevent seq wrap relative to th_ack.
*/
@@ -5313,7 +6206,8 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
#endif
if (tp->snd_una == tp->snd_max) {
rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
- tp->t_acktime = 0;
+ if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
+ tp->t_acktime = 0;
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
}
/* Wake up the socket if we have room to write more */
@@ -5337,8 +6231,9 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
int32_t ret_val = 0;
int32_t todrop;
int32_t ourfinisacked = 0;
+ struct tcp_rack *rack;
- rack_calc_rwin(so, tp);
+ ctf_calc_rwin(so, tp);
/*
* If the state is SYN_SENT: if seg contains an ACK, but not for our
* SYN, drop the input. if seg contains a RST, then drop the
@@ -5353,27 +6248,30 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
TCP_PROBE5(connect__refused, NULL, tp,
mtod(m, const char *), tp, th);
tp = tcp_drop(tp, ECONNREFUSED);
- rack_do_drop(m, tp);
+ ctf_do_drop(m, tp);
return (1);
}
if (thflags & TH_RST) {
- rack_do_drop(m, tp);
+ ctf_do_drop(m, tp);
return (1);
}
if (!(thflags & TH_SYN)) {
- rack_do_drop(m, tp);
+ ctf_do_drop(m, tp);
return (1);
}
tp->irs = th->th_seq;
tcp_rcvseqinit(tp);
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
if (thflags & TH_ACK) {
+ int tfo_partial = 0;
+
TCPSTAT_INC(tcps_connects);
soisconnected(so);
#ifdef MAC
@@ -5387,22 +6285,47 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->rcv_adv += min(tp->rcv_wnd,
TCP_MAXWIN << tp->rcv_scale);
/*
+ * If not all the data that was sent in the TFO SYN
+ * has been acked, resend the remainder right away.
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ (tp->snd_una != tp->snd_max)) {
+ tp->snd_nxt = th->th_ack;
+ tfo_partial = 1;
+ }
+ /*
* If there's data, delay ACK; if there's also a FIN ACKNOW
* will be turned on later.
*/
- if (DELAY_ACK(tp, tlen) && tlen != 0) {
- rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
- ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
+ if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
+ rack_timer_cancel(tp, rack,
+ rack->r_ctl.rc_rcvtime, __LINE__);
tp->t_flags |= TF_DELACK;
} else {
- ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
+ rack->r_wanted_output++;
tp->t_flags |= TF_ACKNOW;
}
- if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+ if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
+ V_tcp_do_ecn) {
tp->t_flags |= TF_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
}
+ if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ /*
+ * We advance snd_una for the
+ * fast open case. If th_ack is
+ * acknowledging data beyond
+ * snd_una we can't just call
+ * ack-processing since the
+ * data stream in our send-map
+ * will start at snd_una + 1 (one
+ * beyond the SYN). If it's just
+ * equal we don't need to do that
+ * and there is no send_map.
+ */
+ tp->snd_una++;
+ }
/*
* Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
* SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
@@ -5454,6 +6377,16 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
* of step 5, ack processing. Otherwise, goto step 6.
*/
if (thflags & TH_ACK) {
+ /* For syn-sent we need to possibly update the rtt */
+ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
+ uint32_t t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_rack_xmit_timer(rack, t + 1);
+ tcp_rack_xmit_timer_commit(rack, tp);
+ }
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
return (ret_val);
/* We may have changed to FIN_WAIT_1 above */
@@ -5486,7 +6419,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
}
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
- tiwin, thflags, nxt_pkt));
+ tiwin, thflags, nxt_pkt));
}
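
The SYN-SENT ACK path above now seeds the RTT filter from the echoed timestamp, t = tcp_ts_getticks() - to->to_tsecr, with unsigned arithmetic absorbing counter wrap. A tiny worked example of that subtraction and the t_rttlow update; the tick values are made up.

/*
 * Worked example of the timestamp RTT sample; unsigned subtraction
 * handles counter wrap.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t tsecr = 4294967290u;	/* echoed timestamp just before wrap */
	uint32_t now = 6;		/* ticks after the counter wrapped */
	uint32_t t = now - tsecr;	/* 12 ticks, thanks to modular math */
	uint32_t t_rttlow = 0;

	if (!t_rttlow || t_rttlow > t)
		t_rttlow = t;
	printf("rtt sample %u ticks, t_rttlow %u\n", (unsigned)t, (unsigned)t_rttlow);
	return (0);
}
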
/*
@@ -5499,62 +6432,52 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
{
+ struct tcp_rack *rack;
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
- rack_calc_rwin(so, tp);
-
+ ctf_calc_rwin(so, tp);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
-#ifdef TCP_RFC7413
- if (tp->t_flags & TF_FASTOPEN) {
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (IS_FASTOPEN(tp->t_flags)) {
/*
- * When a TFO connection is in SYN_RECEIVED, the only valid
- * packets are the initial SYN, a retransmit/copy of the
- * initial SYN (possibly with a subset of the original
- * data), a valid ACK, a FIN, or a RST.
+ * When a TFO connection is in SYN_RECEIVED, the
+ * only valid packets are the initial SYN, a
+ * retransmit/copy of the initial SYN (possibly with
+ * a subset of the original data), a valid ACK, a
+ * FIN, or a RST.
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
- struct tcp_rack *rack;
-
- rack = (struct tcp_rack *)tp->t_fb_ptr;
if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
(rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
(rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
} else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
-#endif
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
- /*
- * RFC5961 Section 4.2 Send challenge ACK for any SYN in
- * synchronized state.
- */
- if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
- return (ret_val);
- }
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC 1323 PAWS: If we have a timestamp reply on this segment and
* it's less than ts_recent, drop it.
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
/*
@@ -5565,10 +6488,10 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
* "LAND" DoS attack.
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -5592,18 +6515,16 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->ts_recent_age = tcp_ts_getticks();
tp->ts_recent = to->to_tsval;
}
+ tp->snd_wnd = tiwin;
/*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
* is on (half-synchronized state), then queue data for later
* processing; else drop segment and return.
*/
if ((thflags & TH_ACK) == 0) {
-#ifdef TCP_RFC7413
- if (tp->t_flags & TF_FASTOPEN) {
- tp->snd_wnd = tiwin;
+ if (IS_FASTOPEN(tp->t_flags)) {
cc_conn_init(tp);
}
-#endif
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
}
@@ -5613,13 +6534,22 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
(TF_RCVD_SCALE | TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
- tp->snd_wnd = tiwin;
}
/*
* Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
* FIN-WAIT-1
*/
tp->t_starttime = ticks;
+ if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+
+ /*
+ * Account for the ACK of our SYN prior to
+ * regular ACK processing below.
+ */
+ tp->snd_una++;
+ }
if (tp->t_flags & TF_NEEDFIN) {
tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
@@ -5627,25 +6557,13 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_state_change(tp, TCPS_ESTABLISHED);
TCP_PROBE5(accept__established, NULL, tp,
mtod(m, const char *), tp, th);
-#ifdef TCP_RFC7413
- if (tp->t_tfo_pending) {
- tcp_fastopen_decrement_counter(tp->t_tfo_pending);
- tp->t_tfo_pending = NULL;
-
- /*
- * Account for the ACK of our SYN prior to regular
- * ACK processing below.
- */
- tp->snd_una++;
- }
/*
* TFO connections call cc_conn_init() during SYN
* processing. Calling it again here for such connections
* is not harmless as it would undo the snd_cwnd reduction
* that occurs when a TFO SYN|ACK is retransmitted.
*/
- if (!(tp->t_flags & TF_FASTOPEN))
-#endif
+ if (!IS_FASTOPEN(tp->t_flags))
cc_conn_init(tp);
}
/*
@@ -5653,9 +6571,19 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
* not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
- (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
+ (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
tp->snd_wl1 = th->th_seq - 1;
+ /* For syn-recv we need to possibly update the rtt */
+ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
+ uint32_t t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_rack_xmit_timer(rack, t + 1);
+ tcp_rack_xmit_timer_commit(rack, tp);
+ }
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
return (ret_val);
}
@@ -5735,17 +6663,18 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
}
}
- rack_calc_rwin(so, tp);
+ ctf_calc_rwin(so, tp);
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC5961 Section 4.2 Send challenge ACK for any SYN in
* synchronized state.
*/
if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
+ ctf_challenge_ack(m, th, tp, &ret_val);
return (ret_val);
}
/*
@@ -5754,10 +6683,10 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -5793,10 +6722,11 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
return (ret_val);
} else {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
@@ -5809,7 +6739,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (rack_progress_timeout_check(tp)) {
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -5830,15 +6760,16 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
{
int32_t ret_val = 0;
- rack_calc_rwin(so, tp);
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
+ ctf_calc_rwin(so, tp);
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC5961 Section 4.2 Send challenge ACK for any SYN in
* synchronized state.
*/
if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
+ ctf_challenge_ack(m, th, tp, &ret_val);
return (ret_val);
}
/*
@@ -5847,10 +6778,10 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -5885,10 +6816,11 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
return (ret_val);
} else {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
@@ -5901,7 +6833,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (rack_progress_timeout_check(tp)) {
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -5913,7 +6845,7 @@ static int
rack_check_data_after_close(struct mbuf *m,
struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
{
- struct tcp_rack *rack;
+ struct tcp_rack *rack;
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -5921,7 +6853,7 @@ rack_check_data_after_close(struct mbuf *m,
close_now:
tp = tcp_close(tp);
TCPSTAT_INC(tcps_rcvafterclose);
- rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
return (1);
}
if (sbavail(&so->so_snd) == 0)
@@ -5947,16 +6879,17 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
- rack_calc_rwin(so, tp);
+ ctf_calc_rwin(so, tp);
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC5961 Section 4.2 Send challenge ACK for any SYN in
* synchronized state.
*/
if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
+ ctf_challenge_ack(m, th, tp, &ret_val);
return (ret_val);
}
/*
@@ -5965,10 +6898,10 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -6010,10 +6943,11 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
return (ret_val);
} else {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
@@ -6045,7 +6979,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (rack_progress_timeout_check(tp)) {
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -6066,16 +7000,17 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
- rack_calc_rwin(so, tp);
+ ctf_calc_rwin(so, tp);
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC5961 Section 4.2 Send challenge ACK for any SYN in
* synchronized state.
*/
if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
+ ctf_challenge_ack(m, th, tp, &ret_val);
return (ret_val);
}
/*
@@ -6084,10 +7019,10 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -6129,10 +7064,11 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
return (ret_val);
} else {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
@@ -6151,7 +7087,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (rack_progress_timeout_check(tp)) {
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -6172,16 +7108,17 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
- rack_calc_rwin(so, tp);
+ ctf_calc_rwin(so, tp);
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC5961 Section 4.2 Send challenge ACK for any SYN in
* synchronized state.
*/
if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
+ ctf_challenge_ack(m, th, tp, &ret_val);
return (ret_val);
}
/*
@@ -6190,10 +7127,10 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -6235,10 +7172,11 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
return (ret_val);
} else {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
@@ -6251,13 +7189,13 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ourfinisacked) {
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_close(tp);
- rack_do_drop(m, tp);
+ ctf_do_drop(m, tp);
return (1);
}
if (sbavail(&so->so_snd)) {
if (rack_progress_timeout_check(tp)) {
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -6279,17 +7217,18 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
- rack_calc_rwin(so, tp);
+ ctf_calc_rwin(so, tp);
/* Reset receive buffer auto scaling when not in bulk receive mode. */
- if (thflags & TH_RST)
- return (rack_process_rst(m, th, so, tp));
+ if ((thflags & TH_RST) ||
+ (tp->t_fin_is_rst && (thflags & TH_FIN)))
+ return (ctf_process_rst(m, th, so, tp));
/*
* RFC5961 Section 4.2 Send challenge ACK for any SYN in
* synchronized state.
*/
if (thflags & TH_SYN) {
- rack_challenge_ack(m, th, tp, &ret_val);
+ ctf_challenge_ack(m, th, tp, &ret_val);
return (ret_val);
}
/*
@@ -6298,10 +7237,10 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to->to_tsval, tp->ts_recent)) {
- if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
+ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
return (ret_val);
}
/*
@@ -6344,10 +7283,11 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
- rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
return (ret_val);
} else {
- rack_do_drop(m, NULL);
+ ctf_do_drop(m, NULL);
return (0);
}
}
@@ -6360,7 +7300,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (rack_progress_timeout_check(tp)) {
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -6377,10 +7317,43 @@ rack_clear_rate_sample(struct tcp_rack *rack)
rack->r_ctl.rack_rs.rs_rtt_tot = 0;
}
+static void
+rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ uint32_t tls_seg = 0;
+
+#ifdef KERN_TLS
+ if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
+ tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd);
+ rack->r_ctl.rc_pace_min_segs = tls_seg;
+ } else
+#endif
+ rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
+ rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs;
+ if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES)
+ rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
+#ifdef KERN_TLS
+ if (tls_seg != 0) {
+ if (rack_hw_tls_max_seg > 1) {
+ rack->r_ctl.rc_pace_max_segs /= tls_seg;
+ if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs)
+ rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
+ } else {
+ rack->r_ctl.rc_pace_max_segs = 1;
+ }
+ if (rack->r_ctl.rc_pace_max_segs == 0)
+ rack->r_ctl.rc_pace_max_segs = 1;
+ rack->r_ctl.rc_pace_max_segs *= tls_seg;
+ }
+#endif
+ rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2);
+}
+
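A rough userland restatement of what rack_set_pace_segments() computes above; PACE_MAX_IP_BYTES and the values in main() are assumptions for illustration, not the kernel's actual numbers.

#include <stdint.h>
#include <stdio.h>

#define PACE_MAX_IP_BYTES 65536	/* assumed stand-in for the kernel cap */

/*
 * Sizing rule: the max pacing burst is a multiple of the MSS, capped, and
 * when ifnet TLS is active it is rounded to whole TLS records with
 * hw_tls_max_seg (mirroring rack_hw_tls_max_seg) as an upper bound.
 */
static uint32_t
pace_max_bytes(uint32_t maxseg, uint32_t pace_segs, uint32_t tls_seg,
    uint32_t hw_tls_max_seg)
{
	uint32_t max_bytes = maxseg * pace_segs;

	if (max_bytes > PACE_MAX_IP_BYTES)
		max_bytes = PACE_MAX_IP_BYTES;
	if (tls_seg != 0) {
		uint32_t recs = 1;

		if (hw_tls_max_seg > 1) {
			recs = max_bytes / tls_seg;	/* whole records that fit */
			if (recs > hw_tls_max_seg)
				recs = hw_tls_max_seg;
		}
		if (recs == 0)
			recs = 1;	/* never pace less than one full record */
		max_bytes = recs * tls_seg;
	}
	return (max_bytes);
}

int
main(void)
{
	/* 1448-byte MSS, 6-segment bursts, 16 KB TLS records, 3-record hw cap */
	printf("%u\n", pace_max_bytes(1448, 6, 16384, 3));
	return (0);
}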
static int
rack_init(struct tcpcb *tp)
{
struct tcp_rack *rack = NULL;
+ struct rack_sendmap *insret;
tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
if (tp->t_fb_ptr == NULL) {
@@ -6395,13 +7368,14 @@ rack_init(struct tcpcb *tp)
memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
rack = (struct tcp_rack *)tp->t_fb_ptr;
- TAILQ_INIT(&rack->r_ctl.rc_map);
+ RB_INIT(&rack->r_ctl.rc_mtree);
TAILQ_INIT(&rack->r_ctl.rc_free);
TAILQ_INIT(&rack->r_ctl.rc_tmap);
rack->rc_tp = tp;
if (tp->t_inpcb) {
rack->rc_inp = tp->t_inpcb;
}
+ tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
/* Probably not needed but lets be sure */
rack_clear_rate_sample(rack);
rack->r_cpu = 0;
@@ -6409,27 +7383,36 @@ rack_init(struct tcpcb *tp)
rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
rack->rc_pace_reduce = rack_slot_reduction;
+ if (use_rack_cheat)
+ rack->use_rack_cheat = 1;
if (V_tcp_delack_enabled)
tp->t_delayed_ack = 1;
else
tp->t_delayed_ack = 0;
rack->rc_pace_max_segs = rack_hptsi_segments;
- rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
- rack->r_idle_reduce_largest = rack_reduce_largest_on_idle;
rack->r_enforce_min_pace = rack_min_pace_time;
- rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
rack->r_ctl.rc_prop_rate = rack_proportional_rate;
rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
rack->r_ctl.rc_early_recovery = rack_early_recovery;
rack->rc_always_pace = rack_pace_every_seg;
+ rack_set_pace_segments(tp, rack);
+ rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
rack->r_ctl.rc_min_to = rack_min_to;
- rack->r_ctl.rc_prr_inc_var = rack_inc_var;
+ rack->rack_per_of_gp = rack_per_of_gp;
+ microuptime(&rack->r_ctl.rc_last_ack);
+ rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
+ rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks();
+ /* Do we force on detection? */
+ if (tcp_force_detection)
+ rack->do_detection = 1;
+ else
+ rack->do_detection = 0;
if (tp->snd_una != tp->snd_max) {
/* Create a send map for the current outstanding data */
struct rack_sendmap *rsm;
@@ -6441,18 +7424,24 @@ rack_init(struct tcpcb *tp)
return (ENOMEM);
}
rsm->r_flags = RACK_OVERMAX;
- rsm->r_tim_lastsent[0] = tcp_ts_getticks();
+ rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
rsm->r_start = tp->snd_una;
rsm->r_end = tp->snd_max;
- rsm->r_sndcnt = 0;
- TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
+ rsm->r_dupack = 0;
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#ifdef INVARIANTS
+ if (insret != NULL) {
+ panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
+ insret, rack, rsm);
+ }
+#endif
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
}
rack_stop_all_timers(tp);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
return (0);
}
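rack_init() now seeds an RB tree (rc_mtree) keyed on r_start instead of the old rc_map TAILQ. Reduced to a self-contained userland sketch against FreeBSD's <sys/tree.h>, with a cut-down sendmap structure rather than the real struct rack_sendmap, the pattern looks roughly like this:

#include <sys/tree.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sendmap {
	RB_ENTRY(sendmap) r_node;
	uint32_t r_start;	/* first sequence number covered */
	uint32_t r_end;		/* one past the last sequence number */
};

RB_HEAD(sendmap_tree, sendmap);

static int
sendmap_cmp(struct sendmap *a, struct sendmap *b)
{
	/* ordered by starting sequence number (ignoring wrap for the sketch) */
	if (a->r_start < b->r_start)
		return (-1);
	return (a->r_start > b->r_start);
}

RB_GENERATE_STATIC(sendmap_tree, sendmap, r_node, sendmap_cmp);

int
main(void)
{
	struct sendmap_tree head = RB_INITIALIZER(&head);
	struct sendmap *rsm, *tmp, key;
	uint32_t starts[] = { 3000, 1000, 2000 };

	for (int i = 0; i < 3; i++) {
		rsm = calloc(1, sizeof(*rsm));
		rsm->r_start = starts[i];
		rsm->r_end = starts[i] + 1000;
		RB_INSERT(sendmap_tree, &head, rsm);	/* NULL unless a duplicate key exists */
	}
	key.r_start = 2000;
	rsm = RB_FIND(sendmap_tree, &head, &key);	/* O(log n) lookup by r_start */
	printf("found [%u,%u)\n", rsm->r_start, rsm->r_end);
	RB_FOREACH_SAFE(rsm, sendmap_tree, &head, tmp) {	/* teardown, as in rack_fini() */
		RB_REMOVE(sendmap_tree, &head, rsm);
		free(rsm);
	}
	return (0);
}

The payoff is the logarithmic RB_FIND/RB_MIN lookups the input path uses further down, where the TAILQ forced linear walks.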
@@ -6472,7 +7461,7 @@ rack_handoff_ok(struct tcpcb *tp)
*/
return (EAGAIN);
}
- if (tp->t_flags & TF_SACK_PERMIT) {
+ if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
return (0);
}
/*
@@ -6487,21 +7476,28 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
if (tp->t_fb_ptr) {
struct tcp_rack *rack;
- struct rack_sendmap *rsm;
-
+ struct rack_sendmap *rsm, *nrsm, *rm;
+ if (tp->t_inpcb) {
+ tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
+ tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ }
rack = (struct tcp_rack *)tp->t_fb_ptr;
#ifdef TCP_BLACKBOX
tcp_log_flowend(tp);
#endif
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
- while (rsm) {
- TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
+ RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
+ rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#ifdef INVARIANTS
+ if (rm != rsm) {
+ panic("At fini, rack:%p rsm:%p rm:%p",
+ rack, rsm, rm);
+ }
+#endif
uma_zfree(rack_zone, rsm);
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
}
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
while (rsm) {
- TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
uma_zfree(rack_zone, rsm);
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
}
@@ -6513,6 +7509,7 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
tp->snd_nxt = tp->snd_max;
}
+
static void
rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
{
@@ -6526,6 +7523,7 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
rack->r_substate = rack_do_syn_recv;
break;
case TCPS_ESTABLISHED:
+ rack_set_pace_segments(tp, rack);
rack->r_state = TCPS_ESTABLISHED;
rack->r_substate = rack_do_established;
break;
@@ -6600,21 +7598,13 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
return;
}
}
- if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
- if ((tp->t_flags & TF_SENTFIN) &&
- ((tp->snd_max - tp->snd_una) == 1) &&
- (rsm->r_flags & RACK_HAS_FIN)) {
- /* needs to be a RXT */
- if (tmr_up == PACE_TMR_RXT)
- return;
- } else if (tmr_up == PACE_TMR_RACK)
- return;
- } else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
+ if (SEQ_GT(tp->snd_max, tp->snd_una) &&
((tmr_up == PACE_TMR_TLP) ||
+ (tmr_up == PACE_TMR_RACK) ||
(tmr_up == PACE_TMR_RXT))) {
/*
- * Either a TLP or RXT is fine if no sack-passed
- * is in place and data is outstanding.
+ * Either a Rack, TLP or RXT is fine if we
+ * have outstanding data.
*/
return;
} else if (tmr_up == PACE_TMR_DELACK) {
@@ -6633,11 +7623,11 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
* with the slot set to what was in the saved slot.
*/
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
}
-static void
-rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
+static int
+rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
int32_t nxt_pkt, struct timeval *tv)
{
@@ -6650,6 +7640,10 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct rack_sendmap *rsm;
int32_t prev_state = 0;
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
+ tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
+ }
cts = tcp_tv_to_mssectick(tv);
rack = (struct tcp_rack *)tp->t_fb_ptr;
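The M_TSTMP_LRO branch above just splits the NIC's nanosecond receive timestamp into a struct timeval; a quick hedged illustration of that split (the input value is arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

/* Mirror of the M_TSTMP_LRO handling: split a nanosecond timestamp into the
 * seconds/microseconds pair the rest of the stack expects. */
static void
ns_to_timeval(uint64_t tstmp_ns, struct timeval *tv)
{
	tv->tv_sec = tstmp_ns / 1000000000;
	tv->tv_usec = (tstmp_ns % 1000000000) / 1000;
}

int
main(void)
{
	struct timeval tv;

	ns_to_timeval(1569349091123456789ULL, &tv);	/* arbitrary example value */
	printf("%lld.%06ld\n", (long long)tv.tv_sec, (long)tv.tv_usec);
	return (0);
}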
@@ -6662,34 +7656,55 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* allow the tcbinfo to be in either locked or unlocked, as the
* caller may have unnecessarily acquired a lock due to a race.
*/
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ }
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
__func__));
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
__func__));
- {
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
- TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
- tlen, &log, true);
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
+ TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
+ tlen, &log, true, &tv);
+ }
+ if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
+ way_out = 4;
+ retval = 0;
+ goto done_with_input;
+ }
+ /*
+ * If a segment with the ACK-bit set arrives in the SYN-SENT state
+ * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
+ */
+ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return(1);
}
/*
* Segment received on connection. Reset idle time and keep-alive
* timer. XXX: This should be done after segment validation to
* ignore broken/spoofed segs.
*/
- if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
- if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
- counter_u64_add(rack_input_idle_reduces, 1);
- rack_cc_after_idle(tp,
- (rack->r_idle_reduce_largest ? 1 :0));
- }
+ if (tp->t_idle_reduce &&
+ (tp->snd_max == tp->snd_una) &&
+ ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
+ counter_u64_add(rack_input_idle_reduces, 1);
+ rack_cc_after_idle(tp);
}
- rack->r_ctl.rc_rcvtime = cts;
tp->t_rcvtime = ticks;
/*
@@ -6700,6 +7715,8 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
#ifdef NETFLIX_STATS
stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
#endif
+ if (tiwin > rack->r_ctl.rc_high_rwnd)
+ rack->r_ctl.rc_high_rwnd = tiwin;
/*
* TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
* this to occur after we've validated the segment.
@@ -6782,6 +7799,22 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((tp->t_flags & TF_SACK_PERMIT) &&
(to.to_flags & TOF_SACKPERM) == 0)
tp->t_flags &= ~TF_SACK_PERMIT;
+ if (IS_FASTOPEN(tp->t_flags)) {
+ if (to.to_flags & TOF_FASTOPEN) {
+ uint16_t mss;
+
+ if (to.to_flags & TOF_MSS)
+ mss = to.to_mss;
+ else
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
+ mss = TCP6_MSS;
+ else
+ mss = TCP_MSS;
+ tcp_fastopen_update_cache(tp, mss,
+ to.to_tfo_len, to.to_tfo_cookie);
+ } else
+ tcp_fastopen_disable_path(tp);
+ }
}
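The unbraced if/else ladder inside the TOF_FASTOPEN branch picks the MSS to cache with the client cookie. Flattened into a helper (the TCP_MSS/TCP6_MSS values here are assumed stand-ins for the kernel defines), the precedence is:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define TCP_MSS  536	/* assumed IPv4 default */
#define TCP6_MSS 1220	/* assumed IPv6 default */

/* Same precedence as the TOF_FASTOPEN branch: a peer-advertised MSS wins,
 * otherwise fall back to the address-family default. */
static uint16_t
tfo_cache_mss(bool have_mss_opt, uint16_t advertised, bool is_ipv6)
{
	if (have_mss_opt)
		return (advertised);
	return (is_ipv6 ? TCP6_MSS : TCP_MSS);
}

int
main(void)
{
	printf("%u %u %u\n",
	    (unsigned)tfo_cache_mss(true, 1460, false),	/* 1460 */
	    (unsigned)tfo_cache_mss(false, 0, false),	/* 536 */
	    (unsigned)tfo_cache_mss(false, 0, true));	/* 1220 */
	return (0);
}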
/*
* At this point we are at the initial call. Here we decide
@@ -6793,7 +7826,7 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_switch_back_to_default(tp);
(*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
tlen, iptos);
- return;
+ return (1);
}
/* Set the flag */
rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
@@ -6805,9 +7838,12 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* always. All other times (timers etc) we must have a rack-state
* set (so we assure we have done the checks above for SACK).
*/
+ memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval));
+ rack->r_ctl.rc_rcvtime = cts;
if (rack->r_state != tp->t_state)
rack_set_state(tp, rack);
- if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
+ if (SEQ_GT(th->th_ack, tp->snd_una) &&
+ (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
kern_prefetch(rsm, &prev_state);
prev_state = rack->r_state;
rack->r_ctl.rc_tlp_send_cnt = 0;
@@ -6828,15 +7864,24 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* is gone.
*/
INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (rack->set_pacing_done_a_iw == 0) {
+ /* How much has been acked? */
+ if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
+ /* We have enough to set in the pacing segment size */
+ rack->set_pacing_done_a_iw = 1;
+ rack_set_pace_segments(tp, rack);
+ }
+ }
tcp_rack_xmit_timer_commit(rack, tp);
- if (nxt_pkt == 0) {
+ if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) {
if (rack->r_wanted_output != 0) {
did_out = 1;
(void)tp->t_fb->tfb_tcp_output(tp);
}
- rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
+ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
}
- if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
+ if ((nxt_pkt == 0) &&
+ ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
(SEQ_GT(tp->snd_max, tp->snd_una) ||
(tp->t_flags & TF_DELACK) ||
((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
@@ -6844,20 +7889,24 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* We could not send (probably in the hpts but stopped the timer earlier)? */
if ((tp->snd_max == tp->snd_una) &&
((tp->t_flags & TF_DELACK) == 0) &&
+ (rack->rc_inp->inp_in_hpts) &&
(rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
/* keep alive not needed if we are hptsi output yet */
;
} else {
- if (rack->rc_inp->inp_in_hpts)
+ if (rack->rc_inp->inp_in_hpts) {
tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
+ counter_u64_add(rack_per_timer_hole, 1);
+ }
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
}
way_out = 1;
- } else {
+ } else if (nxt_pkt == 0) {
/* Do we have the correct timer running? */
rack_timer_audit(tp, rack, &so->so_snd);
way_out = 2;
}
+ done_with_input:
rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
if (did_out)
rack->r_wanted_output = 0;
@@ -6868,8 +7917,8 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
retval, tp, prev_state);
}
#endif
- INP_WUNLOCK(tp->t_inpcb);
}
+ return (retval);
}
void
@@ -6877,29 +7926,24 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
{
struct timeval tv;
-#ifdef RSS
- struct tcp_function_block *tfb;
- struct tcp_rack *rack;
- struct inpcb *inp;
- rack = (struct tcp_rack *)tp->t_fb_ptr;
- if (rack->r_state == 0) {
- /*
- * Initial input (ACK to SYN-ACK etc)lets go ahead and get
- * it processed
- */
+	/* First let's see if we have old packets */
+ if (tp->t_in_pkt) {
+ if (ctf_do_queued_segments(so, tp, 1)) {
+ m_freem(m);
+ return;
+ }
+ }
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
+ } else {
+		/* Should not happen; should we kassert instead? */
tcp_get_usecs(&tv);
- rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
- tlen, iptos, 0, &tv);
- return;
}
- tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
- INP_WUNLOCK(tp->t_inpcb);
-#else
- tcp_get_usecs(&tv);
- rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
- tlen, iptos, 0, &tv);
-#endif
+	if (rack_do_segment_nounlock(m, th, so, tp,
+ drop_hdrlen, tlen, iptos, 0, &tv) == 0)
+ INP_WUNLOCK(tp->t_inpcb);
}
struct rack_sendmap *
@@ -6907,10 +7951,10 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
{
struct rack_sendmap *rsm = NULL;
int32_t idx;
- uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
+ uint32_t srtt = 0, thresh = 0, ts_low = 0;
/* Return the next guy to be re-transmitted */
- if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
+ if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
return (NULL);
}
if (tp->t_flags & TF_SENTFIN) {
@@ -6927,10 +7971,6 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
return (NULL);
}
check_it:
- srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
- srtt = TICKS_2_MSEC(srtt_cur);
- if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
- srtt = rack->rc_rack_rtt;
if (rsm->r_flags & RACK_ACKED) {
return (NULL);
}
@@ -6938,18 +7978,133 @@ check_it:
/* Its not yet ready */
return (NULL);
}
+ srtt = rack_grab_rtt(tp, rack);
idx = rsm->r_rtr_cnt - 1;
ts_low = rsm->r_tim_lastsent[idx];
thresh = rack_calc_thresh_rack(rack, srtt, tsused);
- if (tsused <= ts_low) {
+ if ((tsused == ts_low) ||
+ (TSTMP_LT(tsused, ts_low))) {
+ /* No time since sending */
+ return (NULL);
+ }
+ if ((tsused - ts_low) < thresh) {
+ /* It has not been long enough yet */
return (NULL);
}
- if ((tsused - ts_low) >= thresh) {
+ if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ ((rsm->r_flags & RACK_SACK_PASSED) &&
+ (rack->sack_attack_disable == 0))) {
+ /*
+ * We have passed the dup-ack threshold <or>
+ * a SACK has indicated this is missing.
+ * Note that if you are a declared attacker
+ * it is only the dup-ack threshold that
+ * will cause retransmits.
+ */
+ /* log retransmit reason */
+ rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
return (rsm);
}
return (NULL);
}
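Collapsed into one predicate, the new tail of tcp_rack_output() only offers an rsm for retransmit when it has aged past the RACK threshold and there is independent loss evidence. A hedged sketch, with DUP_ACK_THRESHOLD assumed to be 3 and timestamp wrap ignored:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DUP_ACK_THRESHOLD 3	/* assumed value of the kernel constant */

/* The segment must have aged past the RACK threshold, and either enough
 * dup-acks accumulated or a SACK passed over it (the latter ignored while a
 * SACK attacker is suspected). */
static bool
rack_rxt_ready(uint32_t tsused, uint32_t ts_low, uint32_t thresh,
    uint8_t dupacks, bool sack_passed, bool attack_suspected)
{
	if (tsused <= ts_low || (tsused - ts_low) < thresh)
		return (false);		/* not enough time since last send */
	if (dupacks >= DUP_ACK_THRESHOLD)
		return (true);		/* dup-ack evidence always counts */
	return (sack_passed && !attack_suspected);
}

int
main(void)
{
	/* sent at 1000ms, now 1070ms, threshold 50ms, one SACK passed over it */
	printf("%d\n", rack_rxt_ready(1070, 1000, 50, 0, true, false));
	return (0);
}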
+static int32_t
+rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len)
+{
+ int32_t slot = 0;
+
+ if ((rack->rack_per_of_gp == 0) ||
+ (rack->rc_always_pace == 0)) {
+ /*
+ * We use the most optimistic possible cwnd/srtt for
+ * sending calculations. This will make our
+ * calculation anticipate getting more through
+		 * quicker than possible. But that's ok; we don't want
+ * the peer to have a gap in data sending.
+ */
+ uint32_t srtt, cwnd, tr_perms = 0;
+
+old_method:
+ if (rack->r_ctl.rc_rack_min_rtt)
+ srtt = rack->r_ctl.rc_rack_min_rtt;
+ else
+ srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
+ if (rack->r_ctl.rc_rack_largest_cwnd)
+ cwnd = rack->r_ctl.rc_rack_largest_cwnd;
+ else
+ cwnd = tp->snd_cwnd;
+ tr_perms = cwnd / srtt;
+ if (tr_perms == 0) {
+ tr_perms = ctf_fixed_maxseg(tp);
+ }
+ /*
+ * Calculate how long this will take to drain, if
+		 * the calculation comes out to zero, that's ok; we
+		 * will use send_a_lot to possibly spin around for
+		 * more, increasing tot_len_this_send to the point
+		 * that it's going to require a pace, or we hit the
+		 * cwnd. In that case we are just waiting for
+		 * an ACK.
+ */
+ slot = len / tr_perms;
+ /* Now do we reduce the time so we don't run dry? */
+ if (slot && rack->rc_pace_reduce) {
+ int32_t reduce;
+
+ reduce = (slot / rack->rc_pace_reduce);
+ if (reduce < slot) {
+ slot -= reduce;
+ } else
+ slot = 0;
+ }
+ } else {
+ int cnt;
+ uint64_t bw_est, bw_raise, res, lentim;
+
+ bw_est = 0;
+ for (cnt=0; cnt<RACK_GP_HIST; cnt++) {
+ if ((rack->r_ctl.rc_gp_hist_filled == 0) &&
+ (rack->r_ctl.rc_gp_history[cnt] == 0))
+ break;
+ bw_est += rack->r_ctl.rc_gp_history[cnt];
+ }
+ if (bw_est == 0) {
+ /*
+ * No way yet to make a b/w estimate
+ * (no goodput est yet).
+ */
+ goto old_method;
+ }
+		/* Convert to bytes per second */
+ bw_est *= MSEC_IN_SECOND;
+ /*
+ * Now ratchet it up by our percentage. Note
+ * that the minimum you can do is 1 which would
+ * get you 101% of the average last N goodput estimates.
+		 * The max you can do is 256 which would yield you
+ * 356% of the last N goodput estimates.
+ */
+ bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp;
+ bw_est += bw_raise;
+ /* average by the number we added */
+ bw_est /= cnt;
+ /* Now calculate a rate based on this b/w */
+ lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND;
+ res = lentim / bw_est;
+ slot = (uint32_t)res;
+ }
+ if (rack->r_enforce_min_pace &&
+ (slot == 0)) {
+ /* We are enforcing a minimum pace time of 1ms */
+ slot = rack->r_enforce_min_pace;
+ }
+ if (slot)
+ counter_u64_add(rack_calc_nonzero, 1);
+ else
+ counter_u64_add(rack_calc_zero, 1);
+ return (slot);
+}
+
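The goodput branch of rack_get_pacing_delay() above reduces to: sum the recent goodput samples, scale them up by rack_per_of_gp, average, and spread len over the resulting rate. A userland sketch, with an assumed history depth and samples assumed to be in bytes per millisecond:

#include <stdint.h>
#include <stdio.h>

#define MSEC_IN_SECOND 1000
#define RACK_GP_HIST 4		/* assumed history depth */

/* Average the last few goodput samples, raise them by per_of_gp, and return
 * the number of milliseconds needed to pace 'len' bytes at that rate. */
static uint32_t
pacing_slot_ms(const uint32_t *gp_hist, int hist_len, uint32_t per_of_gp,
    uint32_t len)
{
	uint64_t bw_est = 0, bw_raise, lentim;
	int cnt;

	for (cnt = 0; cnt < hist_len; cnt++) {
		if (gp_hist[cnt] == 0)
			break;
		bw_est += gp_hist[cnt];
	}
	if (bw_est == 0 || cnt == 0)
		return (0);		/* caller falls back to the cwnd/srtt method */
	bw_est *= MSEC_IN_SECOND;	/* summed bytes/msec -> summed bytes/sec */
	bw_raise = bw_est * (uint64_t)per_of_gp;
	bw_est += bw_raise;
	bw_est /= cnt;			/* average of the raised samples */
	lentim = (uint64_t)len * MSEC_IN_SECOND;
	return ((uint32_t)(lentim / bw_est));
}

int
main(void)
{
	uint32_t hist[RACK_GP_HIST] = { 1500, 1500, 1500, 0 };	/* ~1.5 MB/s goodput */

	/* pace a 64 KB burst; per_of_gp of 1 mirrors the smallest allowed setting */
	printf("%u ms\n", pacing_slot_ms(hist, RACK_GP_HIST, 1, 65536));
	return (0);
}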
static int
rack_output(struct tcpcb *tp)
{
@@ -6961,22 +8116,19 @@ rack_output(struct tcpcb *tp)
struct mbuf *mb;
uint32_t if_hw_tsomaxsegcount = 0;
uint32_t if_hw_tsomaxsegsize;
+ int32_t maxseg;
long tot_len_this_send = 0;
struct ip *ip = NULL;
#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
#endif
-#ifdef NETFLIX_TCP_O_UDP
struct udphdr *udp = NULL;
-#endif
struct tcp_rack *rack;
struct tcphdr *th;
uint8_t pass = 0;
+ uint8_t wanted_cookie = 0;
u_char opt[TCP_MAXOLEN];
- unsigned ipoptlen, optlen, hdrlen;
-#ifdef NETFLIX_TCP_O_UDP
- unsigned ulen;
-#endif
+ unsigned ipoptlen, optlen, hdrlen, ulen=0;
uint32_t rack_seq;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
@@ -6987,13 +8139,16 @@ rack_output(struct tcpcb *tp)
int32_t sub_from_prr = 0;
volatile int32_t sack_rxmit;
struct rack_sendmap *rsm = NULL;
- int32_t tso, mtu, would_have_fin = 0;
+ int32_t tso, mtu;
struct tcpopt to;
int32_t slot = 0;
+ int32_t sup_rack = 0;
uint32_t cts;
- uint8_t hpts_calling, doing_tlp = 0;
+ uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0;
int32_t do_a_prefetch;
int32_t prefetch_rsm = 0;
+ int force_tso = 0;
+ int32_t orig_len;
int32_t prefetch_so_done = 0;
struct tcp_log_buffer *lgb = NULL;
struct inpcb *inp;
@@ -7002,11 +8157,8 @@ rack_output(struct tcpcb *tp)
struct ip6_hdr *ip6 = NULL;
int32_t isipv6;
#endif
-#ifdef KERN_TLS
- const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
-#else
- const bool hw_tls = false;
-#endif
+ uint8_t filled_all = 0;
+ bool hw_tls = false;
/* setup and take the cache hits here */
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -7015,24 +8167,26 @@ rack_output(struct tcpcb *tp)
sb = &so->so_snd;
kern_prefetch(sb, &do_a_prefetch);
do_a_prefetch = 1;
+
+#ifdef KERN_TLS
+ hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
+#endif
INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
#endif
-
-#ifdef TCP_RFC7413
+ maxseg = ctf_fixed_maxseg(tp);
/*
* For TFO connections in SYN_RECEIVED, only allow the initial
* SYN|ACK and those sent by the retransmit timer.
*/
- if ((tp->t_flags & TF_FASTOPEN) &&
+ if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED) &&
- SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */
- (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
+ SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
+ (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */
return (0);
-#endif
#ifdef INET6
if (rack->r_state) {
/* Use the cache line loaded if possible */
@@ -7075,6 +8229,17 @@ rack_output(struct tcpcb *tp)
rack->r_wanted_output = 0;
rack->r_timer_override = 0;
/*
+ * For TFO connections in SYN_SENT or SYN_RECEIVED,
+ * only allow the initial SYN or SYN|ACK and those sent
+ * by the retransmit timer.
+ */
+ if (IS_FASTOPEN(tp->t_flags) &&
+ ((tp->t_state == TCPS_SYN_RECEIVED) ||
+ (tp->t_state == TCPS_SYN_SENT)) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
+ (tp->t_rxtshift == 0)) /* not a retransmit */
+ return (0);
+ /*
* Determine length of data that should be transmitted, and flags
* that will be used. If there is some data or critical controls
* (SYN, RST) to send, then transmit; otherwise, investigate
@@ -7083,8 +8248,7 @@ rack_output(struct tcpcb *tp)
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (tp->t_idle_reduce) {
if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
- rack_cc_after_idle(tp,
- (rack->r_idle_reduce_largest ? 1 :0));
+ rack_cc_after_idle(tp);
}
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
@@ -7107,17 +8271,6 @@ again:
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
- /*
- * Send any SACK-generated retransmissions. If we're explicitly
- * trying to send out new data (when sendalot is 1), bypass this
- * function. If we retransmit in fast recovery mode, decrement
- * snd_cwnd, since we're replacing a (future) new transmission with
- * a retransmission now, and we previously incremented snd_cwnd in
- * tcp_input().
- */
- /*
- * Still in sack recovery , reset rxmit flag to zero.
- */
while (rack->rc_free_cnt < rack_free_cache) {
rsm = rack_alloc(rack);
if (rsm == NULL) {
@@ -7126,7 +8279,7 @@ again:
slot = 1;
goto just_return_nolock;
}
- TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
rack->rc_free_cnt++;
rsm = NULL;
}
@@ -7145,18 +8298,24 @@ again:
long tlen;
doing_tlp = 1;
- rsm = rack->r_ctl.rc_tlpsend;
+ /*
+ * Check if we can do a TLP with a RACK'd packet
+ * this can happen if we are not doing the rack
+ * cheat and we skipped to a TLP and it
+ * went off.
+ */
+ rsm = tcp_rack_output(tp, rack, cts);
+ if (rsm == NULL)
+ rsm = rack->r_ctl.rc_tlpsend;
rack->r_ctl.rc_tlpsend = NULL;
sack_rxmit = 1;
tlen = rsm->r_end - rsm->r_start;
- if (tlen > tp->t_maxseg)
- tlen = tp->t_maxseg;
-#ifdef INVARIANTS
- if (SEQ_GT(tp->snd_una, rsm->r_start)) {
- panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
- tp, rack, tp->snd_una, rsm, rsm->r_start);
- }
-#endif
+ if (tlen > ctf_fixed_maxseg(tp))
+ tlen = ctf_fixed_maxseg(tp);
+ KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
+ ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
+ __func__, __LINE__,
+ rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
cwin = min(tp->snd_wnd, tlen);
len = cwin;
@@ -7167,16 +8326,19 @@ again:
len = rsm->r_end - rsm->r_start;
sack_rxmit = 1;
sendalot = 0;
+ KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
+ ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
+ __func__, __LINE__,
+ rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
- if (len >= tp->t_maxseg) {
- len = tp->t_maxseg;
+ if (len >= ctf_fixed_maxseg(tp)) {
+ len = ctf_fixed_maxseg(tp);
}
- KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
- __func__, sb_offset));
} else if ((rack->rc_in_persist == 0) &&
((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
- long tlen;
+ int maxseg;
+ maxseg = ctf_fixed_maxseg(tp);
if ((!IN_RECOVERY(tp->t_flags)) &&
((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
/* Enter recovery if not induced by a time-out */
@@ -7188,7 +8350,8 @@ again:
* When we enter recovery we need to assure we send
* one packet.
*/
- rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 13);
}
#ifdef INVARIANTS
if (SEQ_LT(rsm->r_start, tp->snd_una)) {
@@ -7196,39 +8359,57 @@ again:
tp, rack, rsm, rsm->r_start, tp->snd_una);
}
#endif
- tlen = rsm->r_end - rsm->r_start;
+ len = rsm->r_end - rsm->r_start;
+ KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
+ ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
+ __func__, __LINE__,
+ rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
- if (tlen > rack->r_ctl.rc_prr_sndcnt) {
- len = rack->r_ctl.rc_prr_sndcnt;
- } else {
- len = tlen;
- }
- if (len >= tp->t_maxseg) {
- sendalot = 1;
- len = tp->t_maxseg;
- } else {
- sendalot = 0;
- if ((rack->rc_timer_up == 0) &&
- (len < tlen)) {
+ /* Can we send it within the PRR boundary? */
+ if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) {
+ /* It does not fit */
+ if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) &&
+ (rack->r_ctl.rc_prr_sndcnt < maxseg)) {
/*
- * If its not a timer don't send a partial
- * segment.
+ * prr is less than a segment, we
+ * have more acks due in besides
+ * what we need to resend. Lets not send
+ * to avoid sending small pieces of
+ * what we need to retransmit.
*/
len = 0;
goto just_return_nolock;
}
+ len = rack->r_ctl.rc_prr_sndcnt;
+ }
+ sendalot = 0;
+ if (len >= maxseg) {
+ len = maxseg;
}
- KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
- __func__, sb_offset));
if (len > 0) {
sub_from_prr = 1;
sack_rxmit = 1;
TCPSTAT_INC(tcps_sack_rexmits);
TCPSTAT_ADD(tcps_sack_rexmit_bytes,
- min(len, tp->t_maxseg));
+ min(len, ctf_fixed_maxseg(tp)));
counter_u64_add(rack_rtm_prr_retran, 1);
}
}
+ /*
+ * Enforce a connection sendmap count limit if set
+	 * as long as we are not retransmitting.
+ */
+ if ((rsm == NULL) &&
+ (rack->do_detection == 0) &&
+ (rack_tcp_map_entries_limit > 0) &&
+ (rack->r_ctl.rc_num_maps_alloced >= rack_tcp_map_entries_limit)) {
+ counter_u64_add(rack_to_alloc_limited, 1);
+ if (!rack->alloc_limit_reported) {
+ rack->alloc_limit_reported = 1;
+ counter_u64_add(rack_alloc_limited_conns, 1);
+ }
+ goto just_return_nolock;
+ }
if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
/* we are retransmitting the fin */
len--;
@@ -7244,20 +8425,6 @@ again:
/* For debugging */
rack->r_ctl.rc_rsm_at_retran = rsm;
#endif
- /*
- * Enforce a connection sendmap count limit if set
- * as long as we are not retransmiting.
- */
- if ((rsm == NULL) &&
- (rack_map_entries_limit > 0) &&
- (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
- counter_u64_add(rack_to_alloc_limited, 1);
- if (!rack->alloc_limit_reported) {
- rack->alloc_limit_reported = 1;
- counter_u64_add(rack_alloc_limited_conns, 1);
- }
- goto just_return_nolock;
- }
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
@@ -7299,7 +8466,9 @@ again:
flags &= ~TH_FIN;
sendwin = 1;
} else {
- if (rack->rc_in_persist)
+ if ((rack->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
+ rack->r_ctl.rc_pace_min_segs)))
rack_exit_persist(tp, rack);
/*
* If we are dropping persist mode then we need to
@@ -7328,7 +8497,7 @@ again:
uint32_t avail;
avail = sbavail(sb);
- if (SEQ_GT(tp->snd_nxt, tp->snd_una))
+ if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
sb_offset = tp->snd_nxt - tp->snd_una;
else
sb_offset = 0;
@@ -7343,7 +8512,7 @@ again:
else
len = rack->r_ctl.rc_tlp_new_data;
rack->r_ctl.rc_tlp_new_data = 0;
- doing_tlp = 1;
+ new_data_tlp = doing_tlp = 1;
} else {
if (sendwin > avail) {
/* use the available */
@@ -7387,13 +8556,12 @@ again:
if (len > 0) {
if (len > rack->r_ctl.rc_prr_sndcnt)
len = rack->r_ctl.rc_prr_sndcnt;
-
if (len > 0) {
sub_from_prr = 1;
counter_u64_add(rack_rtm_prr_newdata, 1);
}
}
- if (len > tp->t_maxseg) {
+ if (len > ctf_fixed_maxseg(tp)) {
/*
* We should never send more than a MSS when
* retransmitting or sending new data in prr
@@ -7402,8 +8570,8 @@ again:
* let us send a lot as well :-)
*/
if (rack->r_ctl.rc_prr_sendalot == 0)
- len = tp->t_maxseg;
- } else if (len < tp->t_maxseg) {
+ len = ctf_fixed_maxseg(tp);
+ } else if (len < ctf_fixed_maxseg(tp)) {
/*
* Do we send any? The idea here is if the
* send empty's the socket buffer we want to
@@ -7429,19 +8597,18 @@ again:
* SYN-SENT state and if segment contains data and if we don't know
* that foreign host supports TAO, suppress sending segment.
*/
- if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
- if ((tp->t_state != TCPS_SYN_RECEIVED) &&
- (tp->t_state != TCPS_SYN_SENT))
+ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
+ ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
+ if (tp->t_state != TCPS_SYN_RECEIVED)
flags &= ~TH_SYN;
-#ifdef TCP_RFC7413
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
*/
- if ((tp->t_flags & TF_FASTOPEN) &&
+ if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED))
flags &= ~TH_SYN;
-#endif
+ sb_offset--, len++;
}
/*
* Be careful not to send data and/or FIN on SYN segments. This
@@ -7452,16 +8619,30 @@ again:
len = 0;
flags &= ~TH_FIN;
}
-#ifdef TCP_RFC7413
/*
- * When retransmitting SYN|ACK on a passively-created TFO socket,
- * don't include data, as the presence of data may have caused the
- * original SYN|ACK to have been dropped by a middlebox.
+ * On TFO sockets, ensure no data is sent in the following cases:
+ *
+ * - When retransmitting SYN|ACK on a passively-created socket
+ *
+ * - When retransmitting SYN on an actively created socket
+ *
+ * - When sending a zero-length cookie (cookie request) on an
+ * actively created socket
+ *
+ * - When the socket is in the CLOSED state (RST is being sent)
*/
- if ((tp->t_flags & TF_FASTOPEN) &&
- ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)))
+ if (IS_FASTOPEN(tp->t_flags) &&
+ (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
+ ((tp->t_state == TCPS_SYN_SENT) &&
+ (tp->t_tfo_client_cookie_len == 0)) ||
+ (flags & TH_RST))) {
+ sack_rxmit = 0;
len = 0;
-#endif
+ }
+ /* Without fast-open there should never be data sent on a SYN */
+ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
+ len = 0;
+ orig_len = len;
if (len <= 0) {
/*
* If FIN has been sent but not acked, but we haven't been
@@ -7481,10 +8662,68 @@ again:
len = 0;
if ((tp->snd_wnd == 0) &&
(TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ (tp->snd_una == tp->snd_max) &&
(sb_offset < (int)sbavail(sb))) {
tp->snd_nxt = tp->snd_una;
rack_enter_persist(tp, rack, cts);
}
+ } else if ((rsm == NULL) &&
+ ((doing_tlp == 0) || (new_data_tlp == 1)) &&
+ (len < rack->r_ctl.rc_pace_max_segs)) {
+ /*
+ * We are not sending a full segment for
+ * some reason. Should we not send anything (think
+ * sws or persists)?
+ */
+ if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ (len < (int)(sbavail(sb) - sb_offset))) {
+ /*
+ * Here the rwnd is less than
+ * the pacing size, this is not a retransmit,
+ * we are established and
+ * the send is not the last in the socket buffer
+ * we send nothing, and may enter persists.
+ */
+ len = 0;
+ if (tp->snd_max == tp->snd_una) {
+ /*
+ * Nothing out we can
+ * go into persists.
+ */
+ rack_enter_persist(tp, rack, cts);
+ tp->snd_nxt = tp->snd_una;
+ }
+ } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) &&
+ (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
+ (len < (int)(sbavail(sb) - sb_offset)) &&
+ (len < rack->r_ctl.rc_pace_min_segs)) {
+ /*
+ * Here we are not retransmitting, and
+ * the cwnd is not so small that we could
+ * not send at least a min size (rxt timer
+ * not having gone off), We have 2 segments or
+ * more already in flight, its not the tail end
+ * of the socket buffer and the cwnd is blocking
+ * us from sending out a minimum pacing segment size.
+ * Lets not send anything.
+ */
+ len = 0;
+ } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
+ min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
+ (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
+ (len < (int)(sbavail(sb) - sb_offset)) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state))) {
+ /*
+ * Here we have a send window but we have
+ * filled it up and we can't send another pacing segment.
+ * We also have in flight more than 2 segments
+ * and we are not completing the sb i.e. we allow
+ * the last bytes of the sb to go out even if
+ * its not a full pacing segment.
+ */
+ len = 0;
+ }
}
/* len will be >= 0 after this point. */
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
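The three new else-if cases above all hold back a sub-pacing-size send that is neither a retransmit nor the tail of the socket buffer. Folded into a single predicate (argument names are illustrative; the TLP exemption and sequence wrap are left to the caller, as in the diff), they amount to:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t umin(uint32_t a, uint32_t b) { return (a < b ? a : b); }
static uint32_t umax(uint32_t a, uint32_t b) { return (a > b ? a : b); }

/* Returns true when a short, non-retransmit, non-tail send should be held
 * back rather than emitted as a runt pacing segment. */
static bool
hold_short_send(uint32_t len, uint32_t sb_remaining, uint32_t snd_wnd,
    uint32_t cwnd, uint32_t flight, uint32_t outstanding,
    uint32_t high_rwnd, uint32_t pace_min, uint32_t maxseg, bool established)
{
	uint32_t sws_thresh = umin(high_rwnd / 2, pace_min);

	if (len >= sb_remaining)
		return (false);			/* emptying the socket buffer: let it go */
	if (established && snd_wnd < sws_thresh)
		return (true);			/* peer window too small: persist instead */
	if (cwnd >= umax(pace_min, 4 * maxseg) && flight > 2 * maxseg &&
	    len < pace_min)
		return (true);			/* cwnd-limited runt with plenty in flight */
	if (established && flight > 2 * maxseg &&
	    (snd_wnd - outstanding) < sws_thresh)
		return (true);			/* send window filled up to a runt's worth */
	return (false);
}

int
main(void)
{
	/* peer has shrunk to a 1 KB window: hold the 512-byte runt */
	printf("%d\n", hold_short_send(512, 8192, 1024, 20000, 0, 0,
	    65535, 1448, 1448, true));
	return (0);
}

In the first case the connection also drops into persist when nothing is outstanding, which is why len is zeroed rather than clamped.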
@@ -7537,10 +8776,8 @@ again:
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
ipoptlen += ipsec_optlen;
#endif
- if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
-#ifdef NETFLIX_TCP_O_UDP
+ if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) &&
(tp->t_port == 0) &&
-#endif
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
ipoptlen == 0)
@@ -7556,16 +8793,7 @@ again:
*/
outstanding--;
}
- if (outstanding > 0) {
- /*
- * This is sub-optimal. We only send a stand alone
- * FIN on its own segment.
- */
- if (flags & TH_FIN) {
- flags &= ~TH_FIN;
- would_have_fin = 1;
- }
- } else if (sack_rxmit) {
+ if (sack_rxmit) {
if ((rsm->r_flags & RACK_HAS_FIN) == 0)
flags &= ~TH_FIN;
} else {
@@ -7587,7 +8815,7 @@ again:
* limited the window size) - we need to retransmit
*/
if (len) {
- if (len >= tp->t_maxseg) {
+ if (len >= ctf_fixed_maxseg(tp)) {
pass = 1;
goto send;
}
@@ -7677,10 +8905,10 @@ again:
if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
goto dontupdate;
- if (adv >= (int32_t)(2 * tp->t_maxseg) &&
+ if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) &&
(adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
- so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
+ so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) {
pass = 7;
goto send;
}
@@ -7709,13 +8937,10 @@ dontupdate:
* If our state indicates that FIN should be sent and we have not
* yet done so, then we need to send.
*/
- if (flags & TH_FIN) {
- if ((tp->t_flags & TF_SENTFIN) ||
- (((tp->t_flags & TF_SENTFIN) == 0) &&
- (tp->snd_nxt == tp->snd_una))) {
- pass = 11;
- goto send;
- }
+ if ((flags & TH_FIN) &&
+ (tp->snd_nxt == tp->snd_una)) {
+ pass = 11;
+ goto send;
}
/*
* No reason to send a segment, just return.
@@ -7725,12 +8950,38 @@ just_return:
just_return_nolock:
if (tot_len_this_send == 0)
counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
- rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
+ if (slot) {
+ /* set the rack tcb into the slot N */
+ counter_u64_add(rack_paced_segments, 1);
+ } else if (tot_len_this_send) {
+ counter_u64_add(rack_unpaced_segments, 1);
+ }
+ /* Check if we need to go into persists or not */
+ if ((rack->rc_in_persist == 0) &&
+ (tp->snd_max == tp->snd_una) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
+ (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) &&
+ (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) {
+ /* Yes lets make sure to move to persist before timer-start */
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ }
+ rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
tp->t_flags &= ~TF_FORCEDATA;
return (0);
send:
+ if ((flags & TH_FIN) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
+ /*
+ * We do not transmit a FIN
+ * with data outstanding. We
+ * need to make it so all data
+ * is acked first.
+ */
+ flags &= ~TH_FIN;
+ }
if (doing_tlp == 0) {
/*
* Data not a TLP, and its not the rxt firing. If it is the
@@ -7743,7 +8994,7 @@ send:
}
SOCKBUF_LOCK_ASSERT(sb);
if (len > 0) {
- if (len >= tp->t_maxseg)
+ if (len >= ctf_fixed_maxseg(tp))
tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
else
tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
@@ -7774,27 +9025,44 @@ send:
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCP_O_UDP
+#ifdef NETFLIX_TCPOUDP
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
#endif
to.to_flags |= TOF_MSS;
-#ifdef TCP_RFC7413
+
/*
- * Only include the TFO option on the first
- * transmission of the SYN|ACK on a
- * passively-created TFO socket, as the presence of
- * the TFO option may have caused the original
- * SYN|ACK to have been dropped by a middlebox.
+ * On SYN or SYN|ACK transmits on TFO connections,
+ * only include the TFO option if it is not a
+ * retransmit, as the presence of the TFO option may
+ * have caused the original SYN or SYN|ACK to have
+ * been dropped by a middlebox.
*/
- if ((tp->t_flags & TF_FASTOPEN) &&
- (tp->t_state == TCPS_SYN_RECEIVED) &&
+ if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_rxtshift == 0)) {
- to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN;
- to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
- to.to_flags |= TOF_FASTOPEN;
+ if (tp->t_state == TCPS_SYN_RECEIVED) {
+ to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
+ to.to_tfo_cookie =
+ (u_int8_t *)&tp->t_tfo_cookie.server;
+ to.to_flags |= TOF_FASTOPEN;
+ wanted_cookie = 1;
+ } else if (tp->t_state == TCPS_SYN_SENT) {
+ to.to_tfo_len =
+ tp->t_tfo_client_cookie_len;
+ to.to_tfo_cookie =
+ tp->t_tfo_cookie.client;
+ to.to_flags |= TOF_FASTOPEN;
+ wanted_cookie = 1;
+ /*
+ * If we wind up having more data to
+ * send with the SYN than can fit in
+ * one segment, don't send any more
+ * until the SYN|ACK comes back from
+ * the other end.
+ */
+ sendalot = 0;
+ }
}
-#endif
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
@@ -7829,8 +9097,15 @@ send:
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
+ /*
+ * If we wanted a TFO option to be added, but it was unable
+ * to fit, ensure no data is sent.
+ */
+ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
+ !(to.to_flags & TOF_FASTOPEN))
+ len = 0;
}
-#ifdef NETFLIX_TCP_O_UDP
+#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
if (V_tcp_udp_tunneling_port == 0) {
/* The port was removed?? */
@@ -7854,16 +9129,18 @@ send:
ipoptlen += ipsec_optlen;
#endif
+#ifdef KERN_TLS
+ /* force TSO for so TLS offload can get mss */
+ if (sb->sb_flags & SB_TLS_IFNET) {
+ force_tso = 1;
+ }
+#endif
/*
* Adjust data length if insertion of options will bump the packet
* length beyond the t_maxseg length. Clear the FIN bit because we
* cut off the tail of the segment.
*/
if (len + optlen + ipoptlen > tp->t_maxseg) {
- if (flags & TH_FIN) {
- would_have_fin = 1;
- flags &= ~TH_FIN;
- }
if (tso) {
uint32_t if_hw_tsomax;
uint32_t moff;
@@ -7896,18 +9173,19 @@ send:
* unless the send sockbuf can be emptied:
*/
max_len = (tp->t_maxseg - optlen);
- if ((sb_offset + len) < sbavail(sb)) {
+ if (((sb_offset + len) < sbavail(sb)) &&
+ (hw_tls == 0)) {
moff = len % (u_int)max_len;
if (moff != 0) {
len -= moff;
sendalot = 1;
}
- }
- /*
+ }
+ /*
* In case there are too many small fragments don't
* use TSO:
*/
- if (len <= max_len) {
+ if (len <= maxseg) {
len = max_len;
sendalot = 1;
tso = 0;
@@ -7974,9 +9252,11 @@ send:
uint32_t moff;
if (rack->rc_pace_max_segs)
- max_val = rack->rc_pace_max_segs * tp->t_maxseg;
+ max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp);
else
max_val = len;
+ if (rack->r_ctl.rc_pace_max_segs < max_val)
+ max_val = rack->r_ctl.rc_pace_max_segs;
/*
* We allow a limit on sending with hptsi.
*/
@@ -8017,9 +9297,17 @@ send:
msb = NULL;
else
msb = sb;
- m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len,
- if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
- hw_tls /*, NULL */);
+ m->m_next = tcp_m_copym(
+#ifdef NETFLIX_COPY_ARGS
+ tp,
+#endif
+ mb, moff, &len,
+ if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
+ ((rsm == NULL) ? hw_tls : 0)
+#ifdef NETFLIX_COPY_ARGS
+ , &filled_all
+#endif
+ );
if (len <= (tp->t_maxseg - optlen)) {
/*
* Must have ran out of mbufs for the copy
@@ -8053,8 +9341,6 @@ send:
* TLP should not count in retran count, but
* in its own bin
*/
-/* tp->t_sndtlppack++;*/
-/* tp->t_sndtlpbyte += len;*/
counter_u64_add(rack_tlp_retran, 1);
counter_u64_add(rack_tlp_retran_bytes, len);
} else {
@@ -8085,7 +9371,7 @@ send:
flags |= TH_PUSH;
/*
- * Are we doing hptsi, if so we must calculate the slot. We
+	 * Are we doing pacing? If so, we must calculate the slot. We
* only do hptsi in ESTABLISHED and with no RESET being
* sent where we have data to send.
*/
@@ -8094,56 +9380,10 @@ send:
((tp->t_state == TCPS_FIN_WAIT_1) &&
((tp->t_flags & TF_SENTFIN) == 0) &&
((flags & TH_FIN) == 0))) &&
- ((flags & TH_RST) == 0) &&
- (rack->rc_always_pace)) {
- /*
- * We use the most optimistic possible cwnd/srtt for
- * sending calculations. This will make our
- * calculation anticipate getting more through
- * quicker then possible. But thats ok we don't want
- * the peer to have a gap in data sending.
- */
- uint32_t srtt, cwnd, tr_perms = 0;
-
- if (rack->r_ctl.rc_rack_min_rtt)
- srtt = rack->r_ctl.rc_rack_min_rtt;
- else
- srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
- if (rack->r_ctl.rc_rack_largest_cwnd)
- cwnd = rack->r_ctl.rc_rack_largest_cwnd;
- else
- cwnd = tp->snd_cwnd;
- tr_perms = cwnd / srtt;
- if (tr_perms == 0) {
- tr_perms = tp->t_maxseg;
- }
+ ((flags & TH_RST) == 0)) {
+ /* Get our pacing rate */
tot_len_this_send += len;
- /*
- * Calculate how long this will take to drain, if
- * the calculation comes out to zero, thats ok we
- * will use send_a_lot to possibly spin around for
- * more increasing tot_len_this_send to the point
- * that its going to require a pace, or we hit the
- * cwnd. Which in that case we are just waiting for
- * a ACK.
- */
- slot = tot_len_this_send / tr_perms;
- /* Now do we reduce the time so we don't run dry? */
- if (slot && rack->rc_pace_reduce) {
- int32_t reduce;
-
- reduce = (slot / rack->rc_pace_reduce);
- if (reduce < slot) {
- slot -= reduce;
- } else
- slot = 0;
- }
- if (rack->r_enforce_min_pace &&
- (slot == 0) &&
- (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
- /* We are enforcing a minimum pace time of 1ms */
- slot = rack->r_enforce_min_pace;
- }
+ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send);
}
SOCKBUF_UNLOCK(sb);
} else {
@@ -8180,7 +9420,7 @@ send:
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCP_O_UDP
+#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -8188,10 +9428,14 @@ send:
ulen = hdrlen + len - sizeof(struct ip6_hdr);
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
- } else
+ } else
#endif
th = (struct tcphdr *)(ip6 + 1);
- tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th);
+ tcpip_fillheaders(inp,
+#ifdef NETFLIX_TCPOUDP
+ tp->t_port,
+#endif
+ ip6, th);
} else
#endif /* INET6 */
{
@@ -8199,7 +9443,7 @@ send:
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
#endif
-#ifdef NETFLIX_TCP_O_UDP
+#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -8210,7 +9454,11 @@ send:
} else
#endif
th = (struct tcphdr *)(ip + 1);
- tcpip_fillheaders(inp,/*tp->t_port, */ ip, th);
+ tcpip_fillheaders(inp,
+#ifdef NETFLIX_TCPOUDP
+ tp->t_port,
+#endif
+ ip, th);
}
/*
* Fill in fields, remembering maximum advertised window for use in
@@ -8301,15 +9549,20 @@ send:
/*
* Calculate receive window. Don't shrink window, but avoid silly
* window syndrome.
+ * If a RST segment is sent, advertise a window of zero.
*/
- if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
- recwin < (long)tp->t_maxseg)
+ if (flags & TH_RST) {
recwin = 0;
- if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
- recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
- recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
- if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
- recwin = (long)TCP_MAXWIN << tp->rcv_scale;
+ } else {
+ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
+ recwin < (long)ctf_fixed_maxseg(tp))
+ recwin = 0;
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+ recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+ recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
+ if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
+ recwin = (long)TCP_MAXWIN << tp->rcv_scale;
+ }
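The reworked window advertisement above can be read as a pure function of the current state: zero on RST, zero instead of a silly-window sliver, never below what was already advertised, capped at the scaled maximum. A hedged restatement, ignoring sequence-number wrap:

#include <stdint.h>
#include <stdio.h>

#define TCP_MAXWIN 65535

static long
calc_recwin(long recwin, int sending_rst, long sb_hiwat, long maxseg,
    uint32_t rcv_adv, uint32_t rcv_nxt, int rcv_scale)
{
	if (sending_rst)
		return (0);			/* RST advertises a zero window */
	if (recwin < sb_hiwat / 4 && recwin < maxseg)
		recwin = 0;			/* avoid silly-window offers */
	if (rcv_adv > rcv_nxt && recwin < (long)(rcv_adv - rcv_nxt))
		recwin = (long)(rcv_adv - rcv_nxt);	/* never shrink the window */
	if (recwin > (long)TCP_MAXWIN << rcv_scale)
		recwin = (long)TCP_MAXWIN << rcv_scale;
	return (recwin);
}

int
main(void)
{
	/* 500 bytes free in a 64 KB buffer, 1448-byte MSS: offer zero, not a sliver */
	printf("%ld\n", calc_recwin(500, 0, 65536, 1448, 1000, 1000, 7));
	return (0);
}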
/*
* According to RFC1323 the window field in a SYN (i.e., a <SYN> or
@@ -8376,7 +9629,6 @@ send:
* ip6_plen is not need to be filled now, and will be filled
* in ip6_output.
*/
-#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -8384,15 +9636,12 @@ send:
th->th_sum = htons(0);
UDPSTAT_INC(udps_opackets);
} else {
-#endif
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in6_cksum_pseudo(ip6,
sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
0);
-#ifdef NETFLIX_TCP_O_UDP
}
-#endif
}
#endif
#if defined(INET6) && defined(INET)
@@ -8400,7 +9649,6 @@ send:
#endif
#ifdef INET
{
-#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -8409,28 +9657,24 @@ send:
th->th_sum = htons(0);
UDPSTAT_INC(udps_opackets);
} else {
-#endif
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
IPPROTO_TCP + len + optlen));
-#ifdef NETFLIX_TCP_O_UDP
}
-#endif
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
}
#endif
-
/*
* Enable TSO and specify the size of the segments. The TCP pseudo
* header checksum is always provided. XXX: Fixme: This is currently
* not the case for IPv6.
*/
- if (tso) {
- KASSERT(len > tp->t_maxseg - optlen,
+ if (tso || force_tso) {
+ KASSERT(force_tso || len > tp->t_maxseg - optlen,
("%s: len <= tso_segsz", __func__));
m->m_pkthdr.csum_flags |= CSUM_TSO;
m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
@@ -8443,7 +9687,6 @@ send:
/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
-
#ifdef TCPDEBUG
/*
* Trace.
@@ -8470,18 +9713,29 @@ send:
/* We're getting ready to send; log now. */
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex4 = orig_len;
+ if (filled_all)
+ log.u_bbr.flex5 = 0x80000000;
+ else
+ log.u_bbr.flex5 = 0;
if (rsm || sack_rxmit) {
log.u_bbr.flex8 = 1;
} else {
log.u_bbr.flex8 = 0;
}
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
- len, &log, false, NULL, NULL, 0, NULL);
+ len, &log, false, NULL, NULL, 0, &tv);
} else
lgb = NULL;
@@ -8585,7 +9839,7 @@ out:
if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->t_flags & TF_SACK_PERMIT) &&
tp->rcv_numsacks > 0)
- tcp_clean_dsack_blocks(tp);
+ tcp_clean_dsack_blocks(tp);
if (len == 0)
counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
else if (len == 1) {
@@ -8593,12 +9847,38 @@ out:
} else if (len > 1) {
int idx;
- idx = (len / tp->t_maxseg) + 3;
+ idx = (len / ctf_fixed_maxseg(tp)) + 3;
if (idx >= TCP_MSS_ACCT_ATIMER)
counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
else
counter_u64_add(rack_out_size[idx], 1);
}
+ if (hw_tls && len > 0) {
+ if (filled_all) {
+ counter_u64_add(rack_tls_filled, 1);
+ rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1);
+ } else {
+ if (rsm) {
+ counter_u64_add(rack_tls_rxt, 1);
+ rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1);
+ } else if (doing_tlp) {
+ counter_u64_add(rack_tls_tlp, 1);
+ rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1);
+ } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) {
+ counter_u64_add(rack_tls_app, 1);
+ rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1);
+ } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) {
+ counter_u64_add(rack_tls_cwnd, 1);
+ rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1);
+ } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) {
+ counter_u64_add(rack_tls_rwnd, 1);
+ rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1);
+ } else {
+ rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1);
+ counter_u64_add(rack_tls_other, 1);
+ }
+ }
+ }
}
if (sub_from_prr && (error == 0)) {
if (rack->r_ctl.rc_prr_sndcnt >= len)
@@ -8609,17 +9889,20 @@ out:
sub_from_prr = 0;
rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
pass, rsm);
+ if ((error == 0) &&
+ (len > 0) &&
+ (tp->snd_una == tp->snd_max))
+ rack->r_ctl.rc_tlp_rxt_last_time = cts;
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
(rack->rc_in_persist == 0)) {
-#ifdef NETFLIX_STATS
tcp_seq startseq = tp->snd_nxt;
-#endif
+
/*
* Advance snd_nxt over sequence space of this segment.
*/
if (error)
/* We don't log or do anything with errors */
- goto timer;
+ goto nomore;
if (flags & (TH_SYN | TH_FIN)) {
if (flags & TH_SYN)
@@ -8631,7 +9914,7 @@ out:
}
/* In the ENOBUFS case we do *not* update snd_max */
if (sack_rxmit)
- goto timer;
+ goto nomore;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
@@ -8644,6 +9927,17 @@ out:
tp->t_acktime = ticks;
}
tp->snd_max = tp->snd_nxt;
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ * This is only relevant in case of switching back to
+ * the base stack.
+ */
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ TCPSTAT_INC(tcps_segstimed);
+ }
#ifdef NETFLIX_STATS
if (!(tp->t_flags & TF_GPUTINPROG) && len) {
tp->t_flags |= TF_GPUTINPROG;
@@ -8654,26 +9948,6 @@ out:
}
#endif
}
- /*
- * Set retransmit timer if not currently set, and not doing
- * a pure ack or a keep-alive probe. Initial value for
- * retransmit timer is smoothed round-trip time + 2 *
- * round-trip time variance. Initialize shift counter which
- * is used for backoff of retransmit time.
- */
-timer:
- if ((tp->snd_wnd == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state)) {
- /*
- * If the persists timer was set above (right before
- * the goto send), and still needs to be on. Lets
- * make sure all is canceled. If the persist timer
- * is not running, we want to get it up.
- */
- if (rack->rc_in_persist == 0) {
- rack_enter_persist(tp, rack, cts);
- }
- }
} else {
/*
* Persist case, update snd_max but since we are in persist
@@ -8755,7 +10029,7 @@ nomore:
goto again;
}
slot = 10;
- rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
+ rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
tp->t_flags &= ~TF_FORCEDATA;
return (error);
case ENETUNREACH:
@@ -8769,7 +10043,7 @@ nomore:
/* FALLTHROUGH */
default:
slot = 10;
- rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
+ rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
tp->t_flags &= ~TF_FORCEDATA;
return (error);
}
@@ -8789,15 +10063,22 @@ nomore:
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
enobufs:
rack->r_tlp_running = 0;
- if ((flags & TH_RST) || (would_have_fin == 1)) {
+ if (flags & TH_RST) {
/*
- * We don't send again after a RST. We also do *not* send
- * again if we would have had a find, but now have
- * outstanding data.
+ * We don't send again after sending a RST.
*/
slot = 0;
sendalot = 0;
}
+ if (rsm && (slot == 0)) {
+ /*
+ * Dup ack retransmission possibly, so
+		 * let's assure we have at least the min rack
+		 * time; if it's a rack resend then the rack
+		 * t-o will also be set to this.
+ */
+ slot = rack->r_ctl.rc_min_to;
+ }
if (slot) {
/* set the rack tcb into the slot N */
counter_u64_add(rack_paced_segments, 1);
@@ -8811,7 +10092,7 @@ enobufs:
counter_u64_add(rack_unpaced_segments, 1);
}
tp->t_flags &= ~TF_FORCEDATA;
- rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
+ rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
return (error);
}
@@ -8847,8 +10128,10 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
case TCP_RACK_TLP_INC_VAR:
case TCP_RACK_IDLE_REDUCE_HIGH:
case TCP_RACK_MIN_PACE:
- case TCP_RACK_MIN_PACE_SEG:
+ case TCP_RACK_GP_INCREASE:
case TCP_BBR_RACK_RTT_USE:
+ case TCP_BBR_USE_RACK_CHEAT:
+ case TCP_RACK_DO_DETECTION:
case TCP_DATA_AFTER_CLOSE:
break;
default:
@@ -8867,6 +10150,13 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
tp = intotcpcb(inp);
rack = (struct tcp_rack *)tp->t_fb_ptr;
switch (sopt->sopt_name) {
+ case TCP_RACK_DO_DETECTION:
+ RACK_OPTS_INC(tcp_rack_no_sack);
+ if (optval == 0)
+ rack->do_detection = 0;
+ else
+ rack->do_detection = 1;
+ break;
case TCP_RACK_PROP_RATE:
if ((optval <= 0) || (optval >= 100)) {
error = EINVAL;
@@ -8919,6 +10209,7 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
/* Max segments in a pace */
RACK_OPTS_INC(tcp_rack_max_seg);
rack->rc_pace_max_segs = optval;
+ rack_set_pace_segments(tp, rack);
break;
case TCP_RACK_PRR_SENDALOT:
/* Allow PRR to send more than one seg */
@@ -8956,6 +10247,13 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
else
error = EINVAL;
break;
+ case TCP_BBR_USE_RACK_CHEAT:
+ RACK_OPTS_INC(tcp_rack_cheat);
+ if (optval)
+ rack->use_rack_cheat = 1;
+ else
+ rack->use_rack_cheat = 0;
+ break;
case TCP_RACK_PKT_DELAY:
/* RACK added ms i.e. rack-rtt + reord + N */
RACK_OPTS_INC(tcp_rack_pkt_delay);
@@ -8963,15 +10261,10 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
break;
case TCP_RACK_TLP_INC_VAR:
/* Does TLP include rtt variance in t-o */
- RACK_OPTS_INC(tcp_rack_tlp_inc_var);
- rack->r_ctl.rc_prr_inc_var = optval;
+ return (EINVAL);
break;
case TCP_RACK_IDLE_REDUCE_HIGH:
- RACK_OPTS_INC(tcp_rack_idle_reduce_high);
- if (optval)
- rack->r_idle_reduce_largest = 1;
- else
- rack->r_idle_reduce_largest = 0;
+ return (EINVAL);
break;
case TCP_DELACK:
if (optval == 0)
@@ -8991,12 +10284,13 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
else
rack->r_enforce_min_pace = optval;
break;
- case TCP_RACK_MIN_PACE_SEG:
- RACK_OPTS_INC(tcp_rack_min_pace_seg);
- if (optval >= 16)
- rack->r_min_pace_seg_thresh = 15;
+ case TCP_RACK_GP_INCREASE:
+ if ((optval >= 0) &&
+ (optval <= 256))
+ rack->rack_per_of_gp = optval;
else
- rack->r_min_pace_seg_thresh = optval;
+ error = EINVAL;
+
break;
case TCP_BBR_RACK_RTT_USE:
if ((optval != USE_RTT_HIGH) &&
@@ -9016,7 +10310,9 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt,
return (tcp_default_ctloutput(so, sopt, inp, tp));
break;
}
-/* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/
+#ifdef NETFLIX_STATS
+ tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
+#endif
INP_WUNLOCK(inp);
return (error);
}
@@ -9034,6 +10330,10 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt,
* impact to this routine.
*/
switch (sopt->sopt_name) {
+ case TCP_RACK_DO_DETECTION:
+ optval = rack->do_detection;
+ break;
+
case TCP_RACK_PROP_RATE:
optval = rack->r_ctl.rc_prop_rate;
break;
@@ -9081,6 +10381,10 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt,
/* Does reordering fade after ms time */
optval = rack->r_ctl.rc_reorder_fade;
break;
+ case TCP_BBR_USE_RACK_CHEAT:
+ /* Do we use the rack cheat for rxt */
+ optval = rack->use_rack_cheat;
+ break;
case TCP_RACK_TLP_THRESH:
/* RACK TLP theshold i.e. srtt+(srtt/N) */
optval = rack->r_ctl.rc_tlp_threshold;
@@ -9094,16 +10398,16 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt,
break;
case TCP_RACK_TLP_INC_VAR:
/* Does TLP include rtt variance in t-o */
- optval = rack->r_ctl.rc_prr_inc_var;
+ return (EINVAL);
break;
case TCP_RACK_IDLE_REDUCE_HIGH:
- optval = rack->r_idle_reduce_largest;
+ return (EINVAL);
break;
case TCP_RACK_MIN_PACE:
optval = rack->r_enforce_min_pace;
break;
- case TCP_RACK_MIN_PACE_SEG:
- optval = rack->r_min_pace_seg_thresh;
+ case TCP_RACK_GP_INCREASE:
+ optval = rack->rack_per_of_gp;
break;
case TCP_BBR_RACK_RTT_USE:
optval = rack->r_ctl.rc_rate_sample_method;
@@ -9145,9 +10449,11 @@ out:
}
-struct tcp_function_block __tcp_rack = {
+static struct tcp_function_block __tcp_rack = {
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = rack_output,
+ .tfb_do_queued_segments = ctf_do_queued_segments,
+ .tfb_do_segment_nounlock = rack_do_segment_nounlock,
.tfb_tcp_do_segment = rack_do_segment,
.tfb_tcp_ctloutput = rack_ctloutput,
.tfb_tcp_fb_init = rack_init,
@@ -9202,7 +10508,11 @@ tcp_addrack(module_t mod, int32_t type, void *data)
rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
OID_AUTO,
+#ifdef STACKALIAS
+ __XSTRING(STACKALIAS),
+#else
__XSTRING(STACKNAME),
+#endif
CTLFLAG_RW, 0,
"");
if (rack_sysctl_root == NULL) {
@@ -9226,6 +10536,7 @@ free_uma:
printf("Failed to register rack module -- err:%d\n", err);
return (err);
}
+ tcp_lro_reg_mbufq();
rack_mod_inited = true;
break;
case MOD_QUIESCE:
@@ -9242,6 +10553,7 @@ free_uma:
rack_counter_destroy();
rack_mod_inited = false;
}
+ tcp_lro_dereg_mbufq();
err = 0;
break;
default:
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index 96553320cd97..a61c8c4eedf1 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -40,7 +40,7 @@ __FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
-/*#include "opt_kern_tls.h"*/
+#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
@@ -50,20 +50,25 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
+#include <sys/qmath.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
-#include <sys/sockbuf_tls.h>
+#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tree.h>
+#ifdef NETFLIX_STATS
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
+#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
+#include <sys/tim_filter.h>
#include <sys/time.h>
#include <vm/uma.h>
#include <sys/kern_prefetch.h>
@@ -85,6 +90,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
@@ -133,14 +139,14 @@ __FBSDID("$FreeBSD$");
uint32_t
ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
{
- struct sbtls_info *tls;
+ struct ktls_session *tls;
uint32_t len;
again:
tls = so->so_snd.sb_tls_info;
- len = tls->sb_params.sb_maxlen; /* max tls payload */
- len += tls->sb_params.sb_tls_hlen; /* tls header len */
- len += tls->sb_params.sb_tls_tlen; /* tls trailer len */
+ len = tls->params.max_frame_len; /* max tls payload */
+ len += tls->params.tls_hlen; /* tls header len */
+ len += tls->params.tls_tlen; /* tls trailer len */
if ((len * 4) > rwnd) {
/*
* Stroke this will suck counter and what
@@ -148,10 +154,10 @@ again:
* TCP perspective I am not sure
* what should be done...
*/
- if (tls->sb_params.sb_maxlen > 4096) {
- tls->sb_params.sb_maxlen -= 4096;
- if (tls->sb_params.sb_maxlen < 4096)
- tls->sb_params.sb_maxlen = 4096;
+ if (tls->params.max_frame_len > 4096) {
+ tls->params.max_frame_len -= 4096;
+ if (tls->params.max_frame_len < 4096)
+ tls->params.max_frame_len = 4096;
goto again;
}
}
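+	/*
+	 * Illustrative sketch with hypothetical numbers: with a 16KB TLS
+	 * record and a 24KB receive window, 4 * (16384 + hlen + tlen)
+	 * exceeds 24576, so max_frame_len is stepped down 4KB at a time
+	 * until at least four full records fit the advertised window, or
+	 * the 4KB floor is reached.
+	 */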
@@ -414,7 +420,13 @@ skip_vnet:
* have been called (if we can).
*/
m->m_pkthdr.lro_nsegs = 1;
- tcp_get_usecs(&tv);
+ if (m->m_flags & M_TSTMP_LRO) {
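+			/*
+			 * Note: rcv_tstmp is assumed to be a nanosecond
+			 * timestamp when M_TSTMP_LRO is set, e.g.
+			 * 1500000250000 ns -> tv_sec = 1500, tv_usec = 250.
+			 */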
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
+ } else {
+			/* Should not happen; should we KASSERT instead? */
+ tcp_get_usecs(&tv);
+ }
/* Now what about next packet? */
if (m_save || has_pkt)
nxt_pkt = 1;
@@ -425,7 +437,7 @@ skip_vnet:
if (retval) {
/* We lost the lock and tcb probably */
m = m_save;
- while (m) {
+ while(m) {
m_save = m->m_nextpkt;
m->m_nextpkt = NULL;
m_freem(m);
@@ -434,7 +446,7 @@ skip_vnet:
if (no_vn == 0)
CURVNET_RESTORE();
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- return (retval);
+ return(retval);
}
skipped_pkt:
m = m_save;
@@ -442,7 +454,7 @@ skipped_pkt:
if (no_vn == 0)
CURVNET_RESTORE();
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- return (retval);
+ return(retval);
}
int
@@ -457,7 +469,7 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
tp->t_tail_pkt = NULL;
if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
/* We lost the tcpcb (maybe a RST came in)? */
- return (1);
+ return(1);
}
}
return (0);
@@ -466,14 +478,14 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
uint32_t
ctf_outstanding(struct tcpcb *tp)
{
- return (tp->snd_max - tp->snd_una);
+ return(tp->snd_max - tp->snd_una);
}
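+
+/*
+ * Illustrative note: ctf_outstanding() is the unacknowledged window
+ * (snd_max - snd_una); ctf_flight_size() below subtracts the SACKed
+ * byte count from it, e.g. 10000 bytes outstanding with 3000 bytes
+ * SACKed gives a flight size of 7000.
+ */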
uint32_t
ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
{
if (rc_sacked <= ctf_outstanding(tp))
- return (ctf_outstanding(tp) - rc_sacked);
+ return(ctf_outstanding(tp) - rc_sacked);
else {
/* TSNH */
#ifdef INVARIANTS
@@ -908,5 +920,5 @@ ctf_decay_count(uint32_t count, uint32_t decay)
* count decay value.
*/
decayed_count = count - (uint32_t)perc_count;
- return (decayed_count);
+ return(decayed_count);
}
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h
index 822208338d67..6cb2fed7c2fa 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.h
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -1,7 +1,7 @@
#ifndef __pacer_timer_h__
#define __pacer_timer_h__
/*-
- * Copyright (c) 2017 Netflix, Inc.
+ * Copyright (c) 2017-9 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c
index 2ef0eadfa944..c4b35d5b8ca8 100644
--- a/sys/netinet/tcp_stacks/sack_filter.c
+++ b/sys/netinet/tcp_stacks/sack_filter.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2017 Netflix, Inc.
+ * Copyright (c) 2017-9 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -140,6 +140,7 @@ static int32_t
is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
{
int32_t i, cnt;
+
for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
if (sack_blk_used(sf, i)) {
if (SEQ_LT(b->start, sf->sf_ack)) {
@@ -150,8 +151,9 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
/* End back behind too */
b->end = sf->sf_ack;
}
- if (b->start == b->end)
+ if (b->start == b->end) {
return(1);
+ }
/* Jonathans Rule 1 */
if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
@@ -312,21 +314,22 @@ sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq
if (num == 0)
return(num);
- /* Now what we are left is either
+ /* Now what we are left with is either
* completely merged on to the board
- * from the above steps, or are new
+ * from the above steps, or is new
* and need to be added to the board
* with the last one updated to current.
*
- * First copy it out we want to return that
+ * First copy it out, we want to return that
* to our caller for processing.
*/
memcpy(in, blkboard, (num * sizeof(struct sackblk)));
numblks = num;
/* Now go through and add to our board as needed */
for(i=(num-1); i>=0; i--) {
- if (is_sack_on_board(sf, &blkboard[i]))
+ if (is_sack_on_board(sf, &blkboard[i])) {
continue;
+ }
/* Add this guy its not listed */
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
@@ -463,25 +466,60 @@ sack_board_collapse(struct sack_filter *sf)
}
#ifndef _KERNEL
+uint64_t saved=0;
+uint64_t tot_sack_blks=0;
+
+static void
+sack_filter_dump(FILE *out, struct sack_filter *sf)
+{
+ int i;
+ fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
+ sf->sf_ack, sf->sf_bits,
+ sf->sf_cur, sf->sf_used);
+
+ for(i=0; i<SACK_FILTER_BLOCKS; i++) {
+ if (sack_blk_used(sf, i)) {
+ fprintf(out, "Entry:%d start:%u end:%u\n", i,
+ sf->sf_blks[i].start,
+ sf->sf_blks[i].end);
+ }
+ }
+}
+#endif
+
+#ifndef _KERNEL
static
#endif
int
-sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
+sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
+ tcp_seq th_ack)
{
int32_t i, ret;
if (numblks > TCP_MAX_SACK) {
+#ifdef _KERNEL
panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
sf, in,
numblks);
+#endif
return(numblks);
}
+#ifndef _KERNEL
+ if ((sf->sf_used > 1) && (no_collapse == 0))
+ sack_board_collapse(sf);
+
+#else
+ if (sf->sf_used > 1)
+ sack_board_collapse(sf);
+#endif
if ((sf->sf_used == 0) && numblks) {
/*
* We are brand new add the blocks in
* reverse order. Note we can see more
* than one in new, since ack's could be lost.
*/
+ int cnt_added = 0;
+
sf->sf_ack = th_ack;
for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
@@ -489,6 +527,7 @@ sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_se
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
sf->sf_used++;
+ cnt_added++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
@@ -496,7 +535,8 @@ sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_se
}
if (sf->sf_cur)
sf->sf_cur--;
- return(numblks);
+
+ return (cnt_added);
}
if (SEQ_GT(th_ack, sf->sf_ack)) {
sack_filter_prune(sf, th_ack);
@@ -509,51 +549,82 @@ sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_se
}
} else
ret = 0;
-#ifndef _KERNEL
- if ((sf->sf_used > 1) && (no_collapse == 0))
- sack_board_collapse(sf);
-
-#else
- if (sf->sf_used > 1)
- sack_board_collapse(sf);
-
-#endif
return (ret);
}
-#ifndef _KERNEL
-uint64_t saved=0;
-uint64_t tot_sack_blks=0;
-
-static void
-sack_filter_dump(FILE *out, struct sack_filter *sf)
+void
+sack_filter_reject(struct sack_filter *sf, struct sackblk *in)
{
+ /*
+	 * Given a specified block (that has made
+	 * it past the sack filter), reject that
+	 * block, trimming it off of any sack-filter block
+	 * that holds it. Usually this is because the block was
+	 * too small and did not cover a whole send.
+ *
+ * This function will only "undo" sack-blocks
+ * that are fresh and touch the edges of
+ * blocks in our filter.
+ */
int i;
- fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
- sf->sf_ack, sf->sf_bits,
- sf->sf_cur, sf->sf_used);
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
- if (sack_blk_used(sf, i)) {
- fprintf(out, "Entry:%d start:%u end:%u\n", i,
- sf->sf_blks[i].start,
- sf->sf_blks[i].end);
+ if (sack_blk_used(sf, i) == 0)
+ continue;
+ /*
+		 * Now, does the given sack-filter block touch
+		 * one of the ends of the rejected block?
+ */
+ if (sf->sf_blks[i].end == in->end) {
+ /* The end moves back to start */
+ if (SEQ_GT(in->start, sf->sf_blks[i].start))
+ /* in-blk |----| */
+ /* sf-blk |---------| */
+ sf->sf_blks[i].end = in->start;
+ else {
+ /* It consumes this block */
+ /* in-blk |---------| */
+ /* sf-blk |------| */
+ /* <or> */
+ /* sf-blk |---------| */
+ sf->sf_bits = sack_blk_clr(sf, i);
+ sf->sf_used--;
+ }
+ continue;
+ }
+ if (sf->sf_blks[i].start == in->start) {
+ if (SEQ_LT(in->end, sf->sf_blks[i].end)) {
+ /* in-blk |----| */
+ /* sf-blk |---------| */
+ sf->sf_blks[i].start = in->end;
+ } else {
+ /* It consumes this block */
+ /* in-blk |----------| */
+ /* sf-blk |-------| */
+ /* <or> */
+ /* sf-blk |----------| */
+ sf->sf_bits = sack_blk_clr(sf, i);
+ sf->sf_used--;
+ }
+ continue;
}
}
}
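+
+/*
+ * Illustrative example: if the filter holds the block [1000, 9000)
+ * and a caller rejects [5000, 9000) because that SACK turned out to
+ * be a runt, the entry is trimmed back to [1000, 5000), while
+ * rejecting [1000, 9000) itself clears the entry and decrements
+ * sf_used.
+ */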
+#ifndef _KERNEL
+
int
main(int argc, char **argv)
{
char buffer[512];
struct sackblk blks[TCP_MAX_SACK];
FILE *err;
- tcp_seq th_ack, snd_una;
+ tcp_seq th_ack, snd_una, snd_max = 0;
struct sack_filter sf;
int32_t numblks,i;
int snd_una_set=0;
double a, b, c;
- int invalid_sack_print = 0;
+ int invalid_sack_print = 0;
uint32_t chg_remembered=0;
uint32_t sack_chg=0;
char line_buf[10][256];
@@ -604,7 +675,11 @@ main(int argc, char **argv)
line_buf_at++;
if (strncmp(buffer, "QUIT", 4) == 0) {
break;
- } else if (strncmp(buffer, "DONE", 4) == 0) {
+ } else if (strncmp(buffer, "DUMP", 4) == 0) {
+ sack_filter_dump(out, &sf);
+ } else if (strncmp(buffer, "MAX:", 4) == 0) {
+ snd_max = strtoul(&buffer[4], NULL, 0);
+ } else if (strncmp(buffer, "COMMIT", 6) == 0) {
int nn, ii;
if (numblks) {
uint32_t szof, tot_chg;
@@ -660,6 +735,7 @@ main(int argc, char **argv)
char *end=NULL;
uint32_t start;
uint32_t endv;
+
start = strtoul(&buffer[5], &end, 0);
if (end) {
endv = strtoul(&end[1], NULL, 0);
@@ -667,6 +743,8 @@ main(int argc, char **argv)
fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
continue;
}
+ if (SEQ_GT(endv, snd_max))
+ snd_max = endv;
if (SEQ_LT(endv, start)) {
fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
continue;
@@ -678,6 +756,28 @@ main(int argc, char **argv)
blks[numblks].start = start;
blks[numblks].end = endv;
numblks++;
+ } else if (strncmp(buffer, "REJ:n:n", 4) == 0) {
+ struct sackblk in;
+ char *end=NULL;
+
+ in.start = strtoul(&buffer[4], &end, 0);
+ if (end) {
+ in.end = strtoul(&end[1], NULL, 0);
+ sack_filter_reject(&sf, &in);
+ } else
+ fprintf(out, "Invalid input END:A:B\n");
+ } else if (strncmp(buffer, "HELP", 4) == 0) {
+ fprintf(out, "You can input:\n");
+ fprintf(out, "SACK:S:E -- to define a sack block\n");
+ fprintf(out, "RXT -- to clear the filter without changing the remembered\n");
+ fprintf(out, "EXIT -- To clear the sack filter and start all fresh\n");
+ fprintf(out, "ACK:N -- To advance the cum-ack to N\n");
+ fprintf(out, "MAX:N -- To set send-max to N\n");
+ fprintf(out, "COMMIT -- To apply the sack you built to the filter and dump the filter\n");
+ fprintf(out, "DUMP -- To display the current contents of the sack filter\n");
+ fprintf(out, "QUIT -- To exit this program\n");
+ } else {
+ fprintf(out, "Command %s unknown\n", buffer);
}
memset(buffer, 0, sizeof(buffer));
}
diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h
index 3ef986209566..2d01a0c15471 100644
--- a/sys/netinet/tcp_stacks/sack_filter.h
+++ b/sys/netinet/tcp_stacks/sack_filter.h
@@ -1,7 +1,7 @@
#ifndef __sack_filter_h__
#define __sack_filter_h__
/*-
- * Copyright (c) 2017 Netflix, Inc.
+ * Copyright (c) 2017-9 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -50,7 +50,8 @@ struct sack_filter {
};
#ifdef _KERNEL
void sack_filter_clear(struct sack_filter *sf, tcp_seq seq);
-int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack);
-
+int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
+ tcp_seq th_ack);
+void sack_filter_reject(struct sack_filter *sf, struct sackblk *in);
#endif
#endif
diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h
new file mode 100644
index 000000000000..f09e25a18390
--- /dev/null
+++ b/sys/netinet/tcp_stacks/tcp_bbr.h
@@ -0,0 +1,845 @@
+/*-
+ * Copyright (c) 2016-9
+ * Netflix Inc. All rights reserved.
+ * Author Randall R. Stewart
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_BBR_H_
+#define _NETINET_TCP_BBR_H_
+
+#define BBR_INITIAL_RTO 1000000 /* 1 second in micro-seconds */
+/* Send map flags */
+#define BBR_ACKED 0x0001 /* The remote endpoint acked this */
+#define BBR_WAS_RENEGED 0x0002 /* The peer reneged the ack */
+#define BBR_RXT_CLEARED 0x0004 /* ACK Cleared by the RXT timer */
+#define BBR_OVERMAX 0x0008	/* We have more retransmits than we can
+				 * fit */
+#define BBR_SACK_PASSED 0x0010 /* A sack was done above this block */
+#define BBR_WAS_SACKPASS 0x0020 /* We retransmitted due to SACK pass */
+#define BBR_HAS_FIN 0x0040 /* segment is sent with fin */
+#define BBR_TLP 0x0080 /* segment sent as tail-loss-probe */
+#define BBR_HAS_SYN 0x0100 /* segment has the syn */
+#define BBR_MARKED_LOST 0x0200 /*
+				 * This segment is lost and
+ * totaled into bbr->rc_ctl.rc_lost
+ */
+#define BBR_RWND_COLLAPSED 0x0400 /* The peer collapsed the rwnd on the segment */
+#define BBR_NUM_OF_RETRANS 7
+
+/* Defines for socket options to set pacing overheads */
+#define BBR_INCL_ENET_OH 0x01
+#define BBR_INCL_IP_OH 0x02
+#define BBR_INCL_TCP_OH 0x03
+
+/*
+ * With the addition of both measurement algorithms
+ * this structure had to grow beyond the size of a
+ * cache line (unfortunately). For now there is
+ * no way around this. We may be able to cut it back
+ * at some point, I hope.
+ */
+struct bbr_sendmap {
+ TAILQ_ENTRY(bbr_sendmap) r_next; /* seq number arrayed next */
+ TAILQ_ENTRY(bbr_sendmap) r_tnext; /* Time of tmit based next */
+ uint32_t r_start; /* Sequence number of the segment */
+ uint32_t r_end; /* End seq, this is 1 beyond actually */
+
+ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
+ uint32_t r_delivered; /* Delivered amount at send */
+
+ uint32_t r_del_time; /* The time of the last delivery update */
+ uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time
+ * sent */
+ unused_bit:1,
+ r_is_drain:1, /* In a draining cycle */
+ r_app_limited:1,/* We went app limited */
+ r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */
+ uint8_t r_dupack; /* Dup ack count */
+ uint8_t r_in_tmap:1, /* Flag to see if its in the r_tnext array */
+ r_is_smallmap:1,/* Was logged as a small-map send-map item */
+ r_is_gain:1, /* Was in gain cycle */
+ r_bbr_state:5; /* The BBR state at send */
+ uint8_t r_limit_type; /* is this entry counted against a limit? */
+
+ uint16_t r_flags; /* Flags as defined above */
+ uint16_t r_spare16;
+	uint32_t r_del_ack_ts;	/* The peer's timestamp at send time (if r_ts_valid set) */
+ /****************Cache line*****************/
+ uint32_t r_tim_lastsent[BBR_NUM_OF_RETRANS];
+ /*
+ * Question, should we instead just grab the sending b/w
+ * from the filter with the gain and store it in a
+ * uint64_t instead?
+ */
+ uint32_t r_first_sent_time; /* Time of first pkt in flight sent */
+ uint32_t r_pacing_delay; /* pacing delay of this send */
+ uint32_t r_flight_at_send; /* flight at the time of the send */
+#ifdef _KERNEL
+} __aligned(CACHE_LINE_SIZE);
+#else
+};
+#endif
+#define BBR_LIMIT_TYPE_SPLIT 1
+
+TAILQ_HEAD(bbr_head, bbr_sendmap);
+
+#define BBR_SEGMENT_TIME_SIZE 1500 /* How many bytes in time_between */
+
+#define BBR_MIN_SEG 1460 /* MSS size */
+#define BBR_MAX_GAIN_VALUE 0xffff
+
+#define BBR_TIMER_FUDGE 1500 /* 1.5ms in micro seconds */
+
+/* BW twiddle secret codes */
+#define BBR_RED_BW_CONGSIG 0 /* We enter recovery and set using b/w */
+#define BBR_RED_BW_RATECAL 1 /* We are calculating the loss rate */
+#define BBR_RED_BW_USELRBW 2 /* We are dropping the lower b/w with
+ * cDR */
+#define BBR_RED_BW_SETHIGHLOSS 3 /* We have set our highloss value at
+ * exit from probe-rtt */
+#define BBR_RED_BW_PE_CLREARLY 4 /* We have decided to clear the
+ * reduction early */
+#define BBR_RED_BW_PE_CLAFDEL 5 /* We are clearing it on schedule
+ * delayed */
+#define BBR_RED_BW_REC_ENDCLL 6	/* Recovery exits, save high if needed
+					 * and clear to start measuring */
+#define BBR_RED_BW_PE_NOEARLY_OUT 7 /* Set pkt epoch judged that we do not
+ * get out of jail early */
+/* codes for just-return */
+#define BBR_JR_SENT_DATA 0
+#define BBR_JR_CWND_LIMITED 1
+#define BBR_JR_RWND_LIMITED 2
+#define BBR_JR_APP_LIMITED 3
+#define BBR_JR_ASSESSING 4
+/* For calculating a rate */
+#define BBR_CALC_BW 1
+#define BBR_CALC_LOSS 2
+
+#define BBR_RTT_BY_TIMESTAMP 0
+#define BBR_RTT_BY_EXACTMATCH 1
+#define BBR_RTT_BY_EARLIER_RET 2
+#define BBR_RTT_BY_THIS_RETRAN 3
+#define BBR_RTT_BY_SOME_RETRAN 4
+#define BBR_RTT_BY_TSMATCHING 5
+
+/* Markers to track where we enter persists from */
+#define BBR_PERSISTS_FROM_1 1
+#define BBR_PERSISTS_FROM_2 2
+#define BBR_PERSISTS_FROM_3 3
+#define BBR_PERSISTS_FROM_4 4
+#define BBR_PERSISTS_FROM_5 5
+
+/* magic cookies to ask for the RTT */
+#define BBR_RTT_PROP 0
+#define BBR_RTT_RACK 1
+#define BBR_RTT_PKTRTT 2
+#define BBR_SRTT 3
+
+#define BBR_SACKED 0
+#define BBR_CUM_ACKED 1
+
+/* threshold in useconds where we consider we need a higher min cwnd */
+#define BBR_HIGH_SPEED 1000
+#define BBR_HIGHSPEED_NUM_MSS 12
+
+#define MAX_REDUCE_RXT 3	/* The maximum number of times we are willing to
+ * reduce b/w in RTX's. Setting this has a
+ * multiplicative effect e.g. if we are
+ * reducing by 20% then setting it to 3 means
+ * you will have reduced the b/w estimate by >
+ * 60% before you stop. */
+/*
+ * We use the rate sample structure to
+ * assist in single sack/ack rate and rtt
+ * calculation. In the future we will expand
+ * this in BBR to do forward rate sample
+ * b/w estimation.
+ */
+#define BBR_RS_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */
+#define BBR_RS_BW_EMPTY 0x00000002 /* Nothing yet stored in cDR */
+#define BBR_RS_RTT_VALID 0x00000004 /* We have at least one valid RTT */
+#define BBR_RS_BW_VAILD 0x00000008 /* We have a valid cDR */
+#define BBR_RS_EMPTY (BBR_RS_RTT_EMPTY|BBR_RS_BW_EMPTY)
+struct bbr_rtt_sample {
+ uint32_t rs_flags;
+ uint32_t rs_rtt_lowest;
+ uint32_t rs_rtt_lowest_sendtime;
+ uint32_t rs_rtt_low_seq_start;
+
+ uint32_t rs_rtt_highest;
+ uint32_t rs_rtt_cnt;
+
+ uint64_t rs_rtt_tot;
+ uint32_t cur_rtt;
+ uint32_t cur_rtt_bytecnt;
+
+ uint32_t cur_rtt_rsmcnt;
+ uint32_t rc_crtt_set:1,
+ avail_bits:31;
+ uint64_t rs_cDR;
+};
+
+/* RTT shrink reasons */
+#define BBR_RTTS_INIT 0
+#define BBR_RTTS_NEWRTT 1
+#define BBR_RTTS_RTTPROBE 2
+#define BBR_RTTS_WASIDLE 3
+#define BBR_RTTS_PERSIST 4
+#define BBR_RTTS_REACHTAR 5
+#define BBR_RTTS_ENTERPROBE 6
+#define BBR_RTTS_SHRINK_PG 7
+#define BBR_RTTS_SHRINK_PG_FINAL 8
+#define BBR_RTTS_NEW_TARGET 9
+#define BBR_RTTS_LEAVE_DRAIN 10
+#define BBR_RTTS_RESETS_VALUES 11
+
+#define BBR_NUM_RATES 5
+/* Rate flags */
+#define BBR_RT_FLAG_FREE 0x00 /* Is on the free list */
+#define BBR_RT_FLAG_INUSE 0x01 /* Has been allocated */
+#define BBR_RT_FLAG_READY 0x02 /* Ready to initiate a measurement. */
+#define BBR_RT_FLAG_CAPPED_PRE 0x04 /* Ready to cap if we send the next segment */
+#define BBR_RT_FLAG_CAPPED 0x08 /* Measurement is capped */
+#define BBR_RT_FLAG_PASTFA 0x10 /* Past the first ack. */
+#define BBR_RT_FLAG_LIMITED 0x20 /* Saw application/cwnd or rwnd limited period */
+#define BBR_RT_SEEN_A_ACK 0x40 /* A ack has been saved */
+#define BBR_RT_PREV_RTT_SET 0x80 /* There was a RTT set in */
+#define BBR_RT_PREV_SEND_TIME 0x100 /*
+					 * There was a RTT send time set that can be used
+ * no snd_limits
+ */
+#define BBR_RT_SET_GRADIENT 0x200
+#define BBR_RT_TS_VALID 0x400
+
+
+struct bbr_log {
+ union {
+ struct bbr_sendmap *rsm; /* For alloc/free */
+ uint64_t sb_acc; /* For out/ack or t-o */
+ };
+ struct tcpcb *tp;
+ uint32_t t_flags;
+ uint32_t th_seq;
+ uint32_t th_ack;
+ uint32_t snd_una;
+ uint32_t snd_nxt;
+ uint32_t snd_max;
+ uint32_t snd_cwnd;
+ uint32_t snd_wnd;
+ uint32_t rc_lost;
+ uint32_t target_cwnd; /* UU */
+ uint32_t inflight; /* UU */
+ uint32_t applimited; /* UU */
+ /* Things for BBR */
+ uint32_t delivered; /* UU */
+ uint64_t cur_del_rate; /* UU */
+ uint64_t delRate; /* UU */
+ uint64_t rttProp; /* UU */
+ uint64_t lt_bw; /* UU */
+ uint32_t timeStamp;
+ uint32_t time;
+ uint32_t slot; /* UU */
+ uint32_t delayed_by;
+ uint32_t exp_del;
+ uint32_t pkts_out;
+ uint32_t new_win;
+ uint32_t hptsi_gain; /* UU */
+ uint32_t cwnd_gain; /* UU */
+ uint32_t epoch; /* UU */
+ uint32_t lt_epoch; /* UU */
+ /* Sack fun */
+ uint32_t blk_start[4]; /* xx */
+ uint32_t blk_end[4];
+ uint32_t len; /* Timeout T3=1, TLP=2, RACK=3 */
+ uint8_t type;
+ uint8_t n_sackblks;
+ uint8_t applied; /* UU */
+ uint8_t inhpts; /* UU */
+ uint8_t ininput; /* UU */
+ uint8_t use_lt_bw; /* UU */
+};
+
+struct bbr_log_sysctl_out {
+ uint32_t bbr_log_at;
+ uint32_t bbr_log_max;
+ struct bbr_log entries[0];
+};
+
+/*
+ * Magic numbers for logging timeout events if the
+ * logging is enabled.
+ */
+#define BBR_TO_FRM_TMR 1
+#define BBR_TO_FRM_TLP 2
+#define BBR_TO_FRM_RACK 3
+#define BBR_TO_FRM_KEEP 4
+#define BBR_TO_FRM_PERSIST 5
+#define BBR_TO_FRM_DELACK 6
+
+#define BBR_SEES_STRETCH_ACK 1
+#define BBR_SEES_COMPRESSED_ACKS 2
+
+
+/*
+ * As we get each SACK we wade through the
+ * rc_map and mark off what is acked.
+ * We also increment rc_sacked as well.
+ *
+ * We also pay attention to missing entries
+ * based on the time and possibly mark them
+ * for retransmit. If we do and we are not already
+ * in recovery we enter recovery. In doing
+ * so we clear prr_delivered/holes_rxt and prr_sent_dur_rec.
+ * We also setup rc_next/rc_snd_nxt/rc_send_end so
+ * we will know where to send from. When not in
+ * recovery rc_next will be NULL and rc_snd_nxt should
+ * equal snd_max.
+ *
+ * Whenever we retransmit from recovery we increment
+ * rc_holes_rxt as we retransmit a block and mark it as retransmitted
+ * with the time it was sent. During non-recovery sending we
+ * add to our map and note the time down of any send expanding
+ * the rc_map at the tail and moving rc_snd_nxt up with snd_max.
+ *
+ * In recovery during SACK/ACK processing if a chunk has
+ * been retransmitted and it is now acked, we decrement rc_holes_rxt.
+ * When we retransmit from the scoreboard we use
+ * rc_next and rc_snd_nxt/rc_send_end to help us
+ * find what needs to be retran.
+ *
+ * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt
+ * This gets us the effect of RFC6675 pipe, counting twice for
+ * bytes retransmitted.
+ */
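+/*
+ * For example, with snd_una = 1000, snd_max = 11000 and
+ * rc_holes_rxt = 2000, pipe works out to (11000 - 1000) + 2000 =
+ * 12000 bytes; the 2000 retransmitted and still-unacked bytes are
+ * counted twice, as noted above.
+ */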
+
+#define TT_BBR_FR_TMR 0x2001
+
+#define BBR_SCALE 8
+#define BBR_UNIT (1 << BBR_SCALE)
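+/*
+ * Gains are thus expressed in 1/256 units, e.g. a 5/4 gain is
+ * (5 * BBR_UNIT) / 4 = 320 and a 3/4 gain is 192.
+ */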
+
+#define BBR_NUM_RTTS_FOR_DEL_LIMIT 8 /* How many pkt-rtts do we keep
+ * Delivery rate for */
+#define BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT 10 /* How many pkt-rtts do we keep
+ * Delivery rate for google */
+
+#define BBR_SECONDS_NO_RTT 10 /* 10 seconds with no RTT shrinkage */
+#define BBR_PROBERTT_MAX 200 /* 200ms */
+#define BBR_PROBERTT_NUM_MSS 4
+#define BBR_STARTUP_EPOCHS 3
+#define USECS_IN_MSEC 1000
+#define BBR_TIME_TO_SECONDS(a) (a / USECS_IN_SECOND)
+#define BBR_TIME_TO_MILLI(a) (a / MS_IN_USEC)
+
+
+/* BBR keeps time in usec's so we divide by 1000 and round up */
+#define BBR_TS_TO_MS(t) ((t+999)/MS_IN_USEC)
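+/* For example, BBR_TS_TO_MS(2500) = (2500 + 999) / 1000 = 3 ms. */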
+
+/*
+ * Locking for the rack control block.
+ * a) Locked by INP_WLOCK
+ * b) Locked by the hpts-mutex
+ *
+ */
+#define BBR_STATE_STARTUP 0x01
+#define BBR_STATE_DRAIN 0x02
+#define BBR_STATE_PROBE_BW 0x03
+#define BBR_STATE_PROBE_RTT 0x04
+#define BBR_STATE_IDLE_EXIT 0x05
+
+/* Substate defines for STATE == PROBE_BW */
+#define BBR_SUB_GAIN 0 /* State 0 where we are 5/4 BBR_UNIT */
+#define BBR_SUB_DRAIN 1 /* State 1 where we are at 3/4 BBR_UNIT */
+#define BBR_SUB_LEVEL1 2 /* State 1 first BBR_UNIT */
+#define BBR_SUB_LEVEL2 3 /* State 2nd BBR_UNIT */
+#define BBR_SUB_LEVEL3 4 /* State 3rd BBR_UNIT */
+#define BBR_SUB_LEVEL4 5 /* State 4th BBR_UNIT */
+#define BBR_SUB_LEVEL5 6 /* State 5th BBR_UNIT */
+#define BBR_SUB_LEVEL6 7 /* State last BBR_UNIT */
+#define BBR_SUBSTATE_COUNT 8
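+/*
+ * Illustrative note (from the gain comments above): the eight
+ * substates form the probe_bw gain cycle, one phase pacing at 5/4 of
+ * the estimated bandwidth, one at 3/4, then six at 1x before wrapping.
+ */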
+
+/* Single remaining reduce log */
+#define BBR_REDUCE_AT_FR 5
+
+#define BBR_BIG_LOG_SIZE 300000
+
+/* Bits per second in bytes per second */
+#define FORTY_EIGHT_MBPS 6000000 /* 48 megabits in bytes */
+#define THIRTY_MBPS 3750000 /* 30 megabits in bytes */
+#define TWENTY_THREE_MBPS 2896000
+#define FIVETWELVE_MBPS 64000000 /* 512 megabits in bytes */
+#define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */
+
+struct bbr_stats {
+ uint64_t bbr_badfr; /* 0 */
+ uint64_t bbr_badfr_bytes; /* 1 */
+ uint64_t bbr_saw_oerr; /* 2 */
+ uint64_t bbr_saw_emsgsiz; /* 3 */
+ uint64_t bbr_reorder_seen; /* 4 */
+ uint64_t bbr_tlp_tot; /* 5 */
+ uint64_t bbr_tlp_newdata; /* 6 */
+ uint64_t bbr_offset_recovery; /* 7 */
+ uint64_t bbr_tlp_retran_fail; /* 8 */
+ uint64_t bbr_to_tot; /* 9 */
+ uint64_t bbr_to_arm_rack; /* 10 */
+ uint64_t bbr_enter_probertt; /* 11 */
+ uint64_t bbr_tlp_set; /* 12 */
+ uint64_t bbr_resends_set; /* 13 */
+ uint64_t bbr_force_output; /* 14 */
+ uint64_t bbr_to_arm_tlp; /* 15 */
+ uint64_t bbr_paced_segments; /* 16 */
+ uint64_t bbr_saw_enobuf; /* 17 */
+ uint64_t bbr_to_alloc_failed; /* 18 */
+ uint64_t bbr_to_alloc_emerg; /* 19 */
+ uint64_t bbr_sack_proc_all; /* 20 */
+ uint64_t bbr_sack_proc_short; /* 21 */
+ uint64_t bbr_sack_proc_restart; /* 22 */
+ uint64_t bbr_to_alloc; /* 23 */
+ uint64_t bbr_offset_drop; /* 24 */
+ uint64_t bbr_runt_sacks; /* 25 */
+ uint64_t bbr_sack_passed; /* 26 */
+ uint64_t bbr_rlock_left_ret0; /* 27 */
+ uint64_t bbr_rlock_left_ret1; /* 28 */
+ uint64_t bbr_dynamic_rwnd; /* 29 */
+ uint64_t bbr_static_rwnd; /* 30 */
+ uint64_t bbr_sack_blocks; /* 31 */
+ uint64_t bbr_sack_blocks_skip; /* 32 */
+ uint64_t bbr_sack_search_both; /* 33 */
+ uint64_t bbr_sack_search_fwd; /* 34 */
+ uint64_t bbr_sack_search_back; /* 35 */
+ uint64_t bbr_plain_acks; /* 36 */
+ uint64_t bbr_acks_with_sacks; /* 37 */
+ uint64_t bbr_progress_drops; /* 38 */
+ uint64_t bbr_early; /* 39 */
+ uint64_t bbr_reneges_seen; /* 40 */
+ uint64_t bbr_persist_reneg; /* 41 */
+ uint64_t bbr_dropped_af_data; /* 42 */
+ uint64_t bbr_failed_mbuf_aloc; /* 43 */
+ uint64_t bbr_cwnd_limited; /* 44 */
+ uint64_t bbr_rwnd_limited; /* 45 */
+ uint64_t bbr_app_limited; /* 46 */
+ uint64_t bbr_force_timer_start; /* 47 */
+ uint64_t bbr_hpts_min_time; /* 48 */
+ uint64_t bbr_meets_tso_thresh; /* 49 */
+ uint64_t bbr_miss_tso_rwnd; /* 50 */
+ uint64_t bbr_miss_tso_cwnd; /* 51 */
+ uint64_t bbr_miss_tso_app; /* 52 */
+ uint64_t bbr_miss_retran; /* 53 */
+ uint64_t bbr_miss_tlp; /* 54 */
+ uint64_t bbr_miss_unknown; /* 55 */
+ uint64_t bbr_hdwr_rl_add_ok; /* 56 */
+ uint64_t bbr_hdwr_rl_add_fail; /* 57 */
+ uint64_t bbr_hdwr_rl_mod_ok; /* 58 */
+ uint64_t bbr_hdwr_rl_mod_fail; /* 59 */
+ uint64_t bbr_collapsed_win; /* 60 */
+ uint64_t bbr_alloc_limited; /* 61 */
+ uint64_t bbr_alloc_limited_conns; /* 62 */
+ uint64_t bbr_split_limited; /* 63 */
+};
+
+/*
+ * The structure bbr_opt_stats is a simple
+ * way to see how many options are being
+ * changed in the stack.
+ */
+struct bbr_opts_stats {
+ uint64_t tcp_bbr_pace_per_sec;
+ uint64_t tcp_bbr_pace_del_tar;
+ uint64_t tcp_bbr_pace_seg_max;
+ uint64_t tcp_bbr_pace_seg_min;
+ uint64_t tcp_bbr_pace_cross;
+ uint64_t tcp_bbr_drain_inc_extra;
+ uint64_t tcp_bbr_unlimited;
+ uint64_t tcp_bbr_iwintso;
+ uint64_t tcp_bbr_rec_over_hpts;
+ uint64_t tcp_bbr_recforce;
+ uint64_t tcp_bbr_startup_pg;
+ uint64_t tcp_bbr_drain_pg;
+ uint64_t tcp_bbr_rwnd_is_app;
+ uint64_t tcp_bbr_probe_rtt_int;
+ uint64_t tcp_bbr_one_retran;
+ uint64_t tcp_bbr_startup_loss_exit;
+ uint64_t tcp_bbr_use_lowgain;
+ uint64_t tcp_bbr_lowgain_thresh;
+ uint64_t tcp_bbr_lowgain_half;
+ uint64_t tcp_bbr_lowgain_fd;
+ uint64_t tcp_bbr_usedel_rate;
+ uint64_t tcp_bbr_min_rto;
+ uint64_t tcp_bbr_max_rto;
+ uint64_t tcp_rack_pace_max_seg;
+ uint64_t tcp_rack_min_to;
+ uint64_t tcp_rack_reord_thresh;
+ uint64_t tcp_rack_reord_fade;
+ uint64_t tcp_rack_tlp_thresh;
+ uint64_t tcp_rack_pkt_delay;
+ uint64_t tcp_bbr_startup_exit_epoch;
+ uint64_t tcp_bbr_ack_comp_alg;
+ uint64_t tcp_rack_cheat;
+ uint64_t tcp_iwnd_tso;
+ uint64_t tcp_utter_max_tso;
+ uint64_t tcp_hdwr_pacing;
+ uint64_t tcp_extra_state;
+ uint64_t tcp_floor_min_tso;
+ /* New */
+ uint64_t tcp_bbr_algorithm;
+ uint64_t tcp_bbr_tslimits;
+ uint64_t tcp_bbr_probertt_len;
+ uint64_t tcp_bbr_probertt_gain;
+ uint64_t tcp_bbr_topaceout;
+ uint64_t tcp_use_rackcheat;
+ uint64_t tcp_delack;
+ uint64_t tcp_maxpeak;
+ uint64_t tcp_retran_wtso;
+ uint64_t tcp_data_ac;
+ uint64_t tcp_ts_raises;
+ uint64_t tcp_pacing_oh_tmr;
+ uint64_t tcp_pacing_oh;
+ uint64_t tcp_policer_det;
+};
+
+
+#ifdef _KERNEL
+#define BBR_STAT_SIZE (sizeof(struct bbr_stats)/sizeof(uint64_t))
+extern counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
+#define BBR_STAT_ADD(name, amm) counter_u64_add(bbr_stat_arry[(offsetof(struct bbr_stats, name)/sizeof(uint64_t))], (amm))
+#define BBR_STAT_INC(name) BBR_STAT_ADD(name, 1)
+#define BBR_OPTS_SIZE (sizeof(struct bbr_opts_stats)/sizeof(uint64_t))
+extern counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
+#define BBR_OPTS_ADD(name, amm) counter_u64_add(bbr_opts_arry[(offsetof(struct bbr_opts_stats, name)/sizeof(uint64_t))], (amm))
+#define BBR_OPTS_INC(name) BBR_OPTS_ADD(name, 1)
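+/*
+ * Usage sketch: e.g. BBR_STAT_INC(bbr_tlp_tot) adds 1 to the counter
+ * whose array index is derived from the field's offset within
+ * struct bbr_stats, keeping the counter array and the stats structure
+ * in sync without a separate index enum.
+ */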
+#endif
+
+#define BBR_NUM_LOSS_RATES 3
+#define BBR_NUM_BW_RATES 3
+
+#define BBR_RECOVERY_LOWRTT 1
+#define BBR_RECOVERY_MEDRTT 2
+#define BBR_RECOVERY_HIGHRTT 3
+#define BBR_RECOVERY_EXTREMERTT 4
+
+
+struct bbr_control {
+ /*******************************/
+ /* Cache line 2 from bbr start */
+ /*******************************/
+ struct bbr_head rc_map; /* List of all segments Lock(a) */
+ struct bbr_head rc_tmap; /* List in transmit order Lock(a) */
+ struct bbr_sendmap *rc_resend; /* something we have been asked to
+ * resend */
+ uint32_t rc_last_delay_val; /* How much we expect to delay Lock(a) */
+ uint32_t rc_bbr_hptsi_gain:16, /* Current hptsi gain Lock(a) */
+		 rc_hpts_flags:16;	/* flags on what's on the pacer wheel */
+
+	uint32_t rc_delivered;	/* BBR delivered amount Lock(a) */
+ uint32_t rc_hptsi_agg_delay; /* How much time are we behind */
+
+ uint32_t rc_flight_at_input;
+ uint32_t rc_lost_bytes; /* Total bytes currently marked lost */
+ /*******************************/
+ /* Cache line 3 from bbr start */
+ /*******************************/
+ struct time_filter rc_delrate;
+ /*******************************/
+ /* Cache line 4 from bbr start */
+ /*******************************/
+ struct bbr_head rc_free; /* List of Free map entries Lock(a) */
+ struct bbr_sendmap *rc_tlp_send; /* something we have been
+ * asked to resend */
+ uint32_t rc_del_time;
+ uint32_t rc_target_at_state; /* Target for a state */
+
+ uint16_t rc_free_cnt; /* Number of free entries on the rc_free list
+ * Lock(a) */
+ uint16_t rc_startup_pg;
+
+ uint32_t cur_rtt; /* Last RTT from ack */
+
+
+	uint32_t rc_went_idle_time;	/* Used for persists to see if it is
+ * probe-rtt qualified */
+ uint32_t rc_pace_max_segs:17, /* How much in any single TSO we send Lock(a) */
+ rc_pace_min_segs:15; /* The minimum single segment size before we enter persists */
+
+ uint32_t rc_rtt_shrinks; /* Time of last rtt shrinkage Lock(a) */
+ uint32_t r_app_limited_until;
+ uint32_t rc_timer_exp; /* If a timer ticks of expiry */
+ uint32_t rc_rcv_epoch_start; /* Start time of the Epoch Lock(a) */
+
+ /*******************************/
+ /* Cache line 5 from bbr start */
+ /*******************************/
+
+ uint32_t rc_lost_at_pktepoch; /* what the lost value was at the last
+ * pkt-epoch */
+ uint32_t r_measurement_count; /* count of measurement applied lock(a) */
+
+
+ uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */
+ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
+ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
+
+ struct bbr_sendmap *rc_sacklast; /* sack remembered place
+ * Lock(a) */
+ struct bbr_sendmap *rc_next; /* remembered place where we next
+ * retransmit at Lock(a) */
+
+ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */
+ uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */
+
+ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
+ uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */
+ /*- ---
+	 * used only at init and close
+ */
+ uint32_t rc_high_rwnd; /* Highest rwnd seen */
+ uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */
+
+ uint32_t rc_last_rtt; /* Last valid measured RTT that ack'd data */
+ uint32_t bbr_cross_over;
+
+ /*******************************/
+ /* Cache line 6 from bbr start */
+ /*******************************/
+ struct sack_filter bbr_sf;
+
+ /*******************************/
+ /* Cache line 7 from bbr start */
+ /*******************************/
+ struct time_filter_small rc_rttprop;
+ uint32_t last_inbound_ts; /* Peers last timestamp */
+
+ uint32_t rc_inc_tcp_oh: 1,
+ rc_inc_ip_oh: 1,
+ rc_inc_enet_oh:1,
+ rc_incr_tmrs:1,
+ restrict_growth:28;
+ uint32_t rc_lt_epoch_use; /* When we started lt-bw use Lock(a) */
+
+ uint32_t rc_recovery_start; /* Time we start recovery Lock(a) */
+ uint32_t rc_lt_del; /* Delivered at lt bw sampling start Lock(a) */
+
+ uint64_t rc_bbr_cur_del_rate; /* Current measured delivery rate
+ * Lock(a) */
+
+ /*******************************/
+ /* Cache line 8 from bbr start */
+ /*******************************/
+ uint32_t rc_cwnd_on_ent; /* On entry to recovery the cwnd
+ * Lock(a) */
+ uint32_t rc_agg_early; /* aggregate amount early */
+
+ uint32_t rc_rcvtime; /* When we last received data Lock(a) */
+ uint32_t rc_pkt_epoch_del; /* seq num that we need for RTT epoch */
+
+ uint32_t rc_pkt_epoch; /* Epoch based on packet RTTs */
+ uint32_t rc_pkt_epoch_time; /* Time we started the pkt epoch */
+
+ uint32_t rc_pkt_epoch_rtt; /* RTT using the packet epoch */
+ uint32_t rc_rtt_epoch; /* Current RTT epoch, it ticks every rttProp
+ * Lock(a) */
+ uint32_t lowest_rtt;
+ uint32_t bbr_smallest_srtt_this_state;
+
+ uint32_t rc_lt_epoch; /* LT epoch start of bw_sampling */
+ uint32_t rc_lost_at_startup;
+
+ uint32_t rc_bbr_state_atflight;
+ uint32_t rc_bbr_last_startup_epoch; /* Last startup epoch where we
+ * increased 20% */
+ uint32_t rc_bbr_enters_probertt; /* Timestamp we entered
+ * probertt Lock(a) */
+ uint32_t rc_lt_time; /* Time of lt sampling start Lock(a) */
+
+ /*******************************/
+ /* Cache line 9 from bbr start */
+ /*******************************/
+ uint64_t rc_lt_bw; /* LT bw calculated Lock(a) */
+ uint64_t rc_bbr_lastbtlbw; /* For startup, what was last btlbw I
+ * saw to check the 20% gain Lock(a) */
+
+
+ uint32_t rc_bbr_cwnd_gain; /* Current cwnd gain Lock(a) */
+ uint32_t rc_pkt_epoch_loss_rate; /* pkt-epoch loss rate */
+
+ uint32_t rc_saved_cwnd; /* Saved cwnd during Probe-rtt drain Lock(a) */
+ uint32_t substate_pe;
+
+ uint32_t rc_lost; /* Number of bytes lost Lock(a) */
+ uint32_t rc_exta_time_gd; /* How much extra time we got in d/g */
+
+ uint32_t rc_lt_lost; /* Number of lt bytes lost at sampling start
+ * Lock(a) */
+ uint32_t rc_bbr_state_time;
+
+ uint32_t rc_min_to; /* Socket option value Lock(a) */
+ uint32_t rc_initial_hptsi_bw; /* Our initial startup bw Lock(a) */
+
+ uint32_t bbr_lost_at_state; /* Temp counter debug lost value as we
+ * enter a state */
+ /*******************************/
+ /* Cache line 10 from bbr start */
+ /*******************************/
+ uint32_t rc_level_state_extra;
+ uint32_t rc_red_cwnd_pe;
+ const struct tcp_hwrate_limit_table *crte;
+ uint64_t red_bw;
+
+ uint32_t rc_probertt_int;
+ uint32_t rc_probertt_srttchktim; /* Time we last did a srtt
+ * check */
+ uint32_t gain_epoch; /* Epoch we should be out of gain */
+ uint32_t rc_min_rto_ms;
+
+ uint32_t rc_reorder_fade; /* Socket option value Lock(a) */
+ uint32_t last_startup_measure;
+
+ int32_t bbr_hptsi_per_second;
+ int32_t bbr_hptsi_segments_delay_tar;
+
+ int32_t bbr_hptsi_segments_max;
+ uint32_t bbr_rttprobe_gain_val;
+ /*******************************/
+ /* Cache line 11 from bbr start */
+ /*******************************/
+ uint32_t cur_rtt_send_time; /* Time we sent our rtt measured packet */
+	uint32_t bbr_peer_tsratio;	/* Our calculated ts ratio to multiply */
+ uint32_t bbr_ts_check_tstmp; /* When we filled it the TS that came on the ack */
+ uint32_t bbr_ts_check_our_cts; /* When we filled it the cts of the send */
+ uint32_t rc_tlp_rxt_last_time;
+ uint32_t bbr_smallest_srtt_state2;
+ uint32_t bbr_hdwr_cnt_noset_snt; /* count of hw pacing sends during delay */
+ uint32_t startup_last_srtt;
+ uint32_t rc_ack_hdwr_delay;
+ uint32_t highest_hdwr_delay; /* Largest delay we have seen from hardware */
+ uint32_t non_gain_extra;
+ uint32_t recovery_lr; /* The sum of the loss rate from the pe's during recovery */
+ uint32_t last_in_probertt;
+ uint32_t flightsize_at_drain; /* In draining what was the last marked flight size */
+ uint32_t rc_pe_of_prtt; /* PE we went into probe-rtt */
+ uint32_t ts_in; /* ts that went with the last rtt */
+
+ uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent
+ * rc_last_tlp_seq Lock(a) */
+ uint16_t rc_drain_pg;
+ uint32_t rc_num_maps_alloced; /* num send map entries allocated */
+ uint32_t rc_num_split_allocs; /* num split map entries allocated */
+ uint16_t rc_num_small_maps_alloced; /* Number of sack blocks
+ * allocated */
+ uint16_t bbr_hptsi_bytes_min;
+
+ uint16_t bbr_hptsi_segments_floor;
+ uint16_t bbr_utter_max;
+ uint16_t bbr_google_discount;
+
+};
+
+
+struct socket;
+struct tcp_bbr {
+ /* First cache line 0x00 */
+ int32_t(*r_substate) (struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, struct tcpopt *,
+ int32_t, int32_t, uint32_t, int32_t, int32_t); /* Lock(a) */
+ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
+ struct inpcb *rc_inp; /* The inpcb Lock(a) */
+ struct timeval rc_tv;
+ uint32_t rc_pacer_started; /* Time we started the pacer */
+ uint16_t no_pacing_until:8, /* No pacing until N packet epochs */
+ ts_can_raise:1,/* TS b/w calculations can raise the bw higher */
+ skip_gain:1, /* Skip the gain cycle (hardware pacing) */
+ gain_is_limited:1, /* With hardware pacing we are limiting gain */
+ output_error_seen:1,
+ oerror_cnt:4,
+ hw_pacing_set:1; /* long enough has passed for us to start pacing */
+ uint16_t xxx_r_ack_count; /* During recovery count of ack's received
+ * that added data since output */
+	uint16_t bbr_segs_rcvd;	/* Inbound segment count since we sent an ack */
+
+ uint8_t bbr_timer_src:4, /* Used for debugging Lock(a) */
+ bbr_use_rack_cheat:1, /* Use the rack cheat */
+ bbr_init_win_cheat:1, /* Send full IW for TSO */
+ bbr_attempt_hdwr_pace:1,/* Try to do hardware pacing */
+ bbr_hdrw_pacing:1; /* Hardware pacing is available */
+ uint8_t bbr_hdw_pace_ena:1, /* Does the connection allow hardware pacing to be attempted */
+ bbr_prev_in_rec:1, /* We were previously in recovery */
+ pkt_conservation:1,
+ use_policer_detection:1,
+ xxx_bbr_hdw_pace_idx:4; /* If hardware pacing is on, index to slot in pace tbl */
+ uint16_t r_wanted_output:1,
+ rtt_valid:1,
+ rc_timer_first:1,
+ rc_output_starts_timer:1,
+ rc_resends_use_tso:1,
+ rc_all_timers_stopped:1,
+ rc_loss_exit:1,
+ rc_ack_was_delayed:1,
+ rc_lt_is_sampling:1,
+ rc_filled_pipe:1,
+ rc_tlp_new_data:1,
+ rc_hit_state_1:1,
+ rc_ts_valid:1,
+ rc_prtt_set_ts:1,
+ rc_is_pkt_epoch_now:1,
+ rc_has_collapsed:1;
+
+ uint8_t r_state:4, /* Current bbr state Lock(a) */
+ r_agg_early_set:1, /* Did we get called early */
+ r_init_rtt:1,
+ r_use_policer:1, /* For google mode only */
+ r_recovery_bw:1;
+ uint8_t r_timer_override:1, /* pacer override Lock(a) 0/1 */
+ rc_in_persist:1,
+ rc_lt_use_bw:1,
+ rc_allow_data_af_clo:1,
+ rc_tlp_rtx_out:1, /* A TLP is in flight */
+ rc_tlp_in_progress:1, /* a TLP timer is running needed? */
+ rc_use_idle_restart:1; /* Do we restart fast after idle (persist or applim) */
+ uint8_t rc_bbr_state:3, /* What is the major BBR state */
+ rc_bbr_substate:3, /* For probeBW state */
+ r_is_v6:1,
+ rc_past_init_win:1;
+ uint8_t rc_last_options;
+ uint8_t rc_tlp_threshold; /* Socket option value Lock(a) */
+ uint8_t rc_max_rto_sec;
+ uint8_t rc_cwnd_limited:1, /* We are cwnd limited */
+ rc_tmr_stopped:7; /* What timers have been stopped */
+ uint8_t rc_use_google:1,
+ rc_use_ts_limit:1,
+ rc_ts_data_set:1, /* We have filled a set point to determine */
+ rc_ts_clock_set:1, /* We have determined the ts type */
+ rc_ts_cant_be_used:1, /* We determined we can't use ts values */
+ rc_ack_is_cumack:1,
+ rc_no_pacing:1,
+ alloc_limit_reported:1;
+ uint8_t rc_init_win;
+ /* Cache line 2 0x40 */
+ struct bbr_control r_ctl;
+#ifdef _KERNEL
+} __aligned(CACHE_LINE_SIZE);
+#else
+};
+#endif
+
+#endif
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index 235951999e64..a19fc8969d9f 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2016 Netflix, Inc.
+ * Copyright (c) 2016-9 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -28,39 +28,39 @@
#ifndef _NETINET_TCP_RACK_H_
#define _NETINET_TCP_RACK_H_
-#define RACK_ACKED 0x0001/* The remote endpoint acked this */
-#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */
-#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */
-#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */
-#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
-#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */
-#define RACK_HAS_FIN 0x0040/* segment is sent with fin */
-#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */
-
+#define RACK_ACKED 0x0001/* The remote endpoint acked this */
+#define RACK_TO_MIXED	0x0002/* A timeout occurred that mixed the send order - not used */
+#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */
+#define RACK_OVERMAX	0x0008/* We have more retransmits than we can fit */
+#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
+#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */
+#define RACK_HAS_FIN 0x0040/* segment is sent with fin */
+#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */
+#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */
struct rack_sendmap {
- TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */
- TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
- uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
+ TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
+ RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time
* sent */
- uint8_t r_flags; /* Flags as defined above */
- uint8_t r_sndcnt; /* Retran count, not limited by
- * RACK_NUM_OF_RETRANS */
+ uint16_t r_flags; /* Flags as defined above */
+ uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
+ uint8_t r_dupack; /* Dup ack count */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
uint8_t r_limit_type; /* is this entry counted against a limit? */
- uint8_t r_resv[2];
+ uint8_t r_resv[49];
};
-#define RACK_LIMIT_TYPE_SPLIT 1
+RB_HEAD(rack_rb_tree_head, rack_sendmap);
TAILQ_HEAD(rack_head, rack_sendmap);
+#define RACK_LIMIT_TYPE_SPLIT 1
/*
* We use the rate sample structure to
@@ -136,6 +136,8 @@ struct rack_opts_stats {
uint64_t rack_no_timer_in_hpts;
uint64_t tcp_rack_min_pace_seg;
uint64_t tcp_rack_min_pace;
+ uint64_t tcp_rack_cheat;
+ uint64_t tcp_rack_no_sack;
};
#define TLP_USE_ID 1 /* Internet draft behavior */
@@ -188,15 +190,19 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
* b) Locked by the hpts-mutex
*
*/
+#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
- struct rack_head rc_map;/* List of all segments Lock(a) */
+ struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */
struct rack_head rc_tmap; /* List in transmit order Lock(a) */
struct rack_sendmap *rc_tlpsend; /* Remembered place for
* tlp_sending Lock(a) */
struct rack_sendmap *rc_resend; /* something we have been asked to
* resend */
+ struct timeval rc_last_time_decay; /* SAD time decay happened here */
+ uint32_t input_pkt;
+ uint32_t saved_input_pkt;
uint32_t rc_hpts_flags;
uint32_t rc_timer_exp; /* If a timer ticks of expiry */
uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */
@@ -244,22 +250,32 @@ struct rack_control {
* have allocated */
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_num_split_allocs; /* num split map entries allocated */
+
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
struct rack_sendmap *rc_sacklast; /* sack remembered place
* Lock(a) */
- struct rack_sendmap *rc_next; /* remembered place where we next
- * retransmit at Lock(a) */
struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for
* cache line alignment
* Lock(a) */
+ struct timeval rc_last_ack;
/* Cache line split 0x100 */
struct sack_filter rack_sf;
/* Cache line split 0x140 */
/* Flags for various things */
+ uint32_t rc_pace_max_segs;
+ uint32_t rc_pace_min_segs;
+ uint32_t rc_high_rwnd;
+ uint32_t ack_count;
+ uint32_t sack_count;
+ uint32_t sack_noextra_move;
+ uint32_t sack_moved_extra;
struct rack_rtt_sample rack_rs;
+ uint32_t rc_tlp_rxt_last_time;
+ uint32_t rc_saved_cwnd;
+ uint32_t rc_gp_history[RACK_GP_HIST];
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
@@ -270,9 +286,11 @@ struct rack_control {
uint8_t rc_early_recovery; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_min_to; /* Socket option value Lock(a) */
- uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */
uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */
uint8_t rc_rate_sample_method;
+ uint8_t rc_gp_hist_idx: 7,
+ rc_gp_hist_filled: 1;
+
};
#ifdef _KERNEL
@@ -305,16 +323,22 @@ struct tcp_rack {
rc_last_pto_set : 1, /* XXX not used */
rc_tlp_in_progress : 1,
rc_always_pace : 1, /* Socket option value Lock(a) */
- rc_timer_up : 1; /* The rack timer is up flag Lock(a) */
- uint8_t r_idle_reduce_largest : 1,
- r_enforce_min_pace : 2,
- r_min_pace_seg_thresh : 5;
+ tlp_timer_up : 1; /* The tlp timer is up flag Lock(a) */
+ uint8_t r_enforce_min_pace : 2,
+ rc_has_collapsed : 1,
+ r_rep_attack : 1,
+ r_rep_reverse : 1,
+ r_xxx_min_pace_seg_thresh : 3;
uint8_t rack_tlp_threshold_use;
uint8_t rc_allow_data_af_clo: 1,
delayed_ack : 1,
+ set_pacing_done_a_iw : 1,
+ use_rack_cheat : 1,
alloc_limit_reported : 1,
- rc_avail : 5;
- uint8_t r_resv[2]; /* Fill to cache line boundary */
+ sack_attack_disable : 1,
+ do_detection : 1,
+ rc_avail : 1;
+ uint16_t rack_per_of_gp;
/* Cache line 2 0x40 */
struct rack_control r_ctl;
} __aligned(CACHE_LINE_SIZE);