/*-
* Copyright (c) 2016-2020 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/**
* Author: Randall Stewart <rrs@netflix.com>
* This work is based on the ACM Queue paper
* BBR - Congestion Based Congestion Control
* and also numerous discussions with Neal, Yuchung and Van.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/eventhandler.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/tim_filter.h>
#include <sys/time.h>
#include <sys/protosw.h>
#include <vm/uma.h>
#include <sys/kern_prefetch.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
#define TCPSTATES /* for logging */
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_lro.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_fastopen.h>
#include <netipsec/ipsec_support.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_bbr.h"
#include "rack_bbr_common.h"
uma_zone_t bbr_zone;
uma_zone_t bbr_pcb_zone;
struct sysctl_ctx_list bbr_sysctl_ctx;
struct sysctl_oid *bbr_sysctl_root;
#define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
(tv) = (value); \
if ((u_long)(tv) < (u_long)(tvmin)) \
(tv) = (tvmin); \
if ((u_long)(tv) > (u_long)(tvmax)) \
(tv) = (tvmax); \
} while(0)
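/*
 * Example of intended use (a sketch): clamp a computed timeout into
 * [tvmin, tvmax], e.g.
 *
 *	TCPT_RANGESET_NOSLOP(to, srtt * 2, bbr_persist_min, bbr_persist_max);
 *
 * leaves "to" at bbr_persist_min if srtt * 2 computed below it. The
 * "NOSLOP" name presumably contrasts with the stock TCPT_RANGESET,
 * which folds in the rexmit slop before clamping.
 */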
/*#define BBR_INVARIANTS 1*/
/*
* initial window
*/
static uint32_t bbr_def_init_win = 10;
static int32_t bbr_persist_min = 250000; /* 250ms */
static int32_t bbr_persist_max = 1000000; /* 1 Second */
static int32_t bbr_cwnd_may_shrink = 0;
static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
static int32_t bbr_hardware_pacing_limit = 8000;
static int32_t bbr_quanta = 3; /* How much extra quanta do we get? */
static int32_t bbr_no_retran = 0;
static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
static int32_t bbr_max_net_error_cnt = 10;
/* Should the following be dynamic too -- loss wise */
static int32_t bbr_rtt_gain_thresh = 0;
/* Measurement controls */
static int32_t bbr_use_google_algo = 1;
static int32_t bbr_ts_limiting = 1;
static int32_t bbr_ts_can_raise = 0;
static int32_t bbr_do_red = 600;
static int32_t bbr_red_scale = 20000;
static int32_t bbr_red_mul = 1;
static int32_t bbr_red_div = 2;
static int32_t bbr_red_growth_restrict = 1;
static int32_t bbr_target_is_bbunit = 0;
static int32_t bbr_drop_limit = 0;
/*
* How much gain do we need to see to
* stay in startup?
*/
static int32_t bbr_marks_rxt_sack_passed = 0;
static int32_t bbr_start_exit = 25;
static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */
static int32_t bbr_startup_loss_thresh = 2000; /* 20.00% loss */
static int32_t bbr_hptsi_max_mul = 1; /* These two mul/div assure a min pacing */
static int32_t bbr_hptsi_max_div = 2; /* time, 0 means turned off. We need this
* if we ever go back to where the pacer
* has priority over timers.
*/
static int32_t bbr_policer_call_from_rack_to = 0;
static int32_t bbr_policer_detection_enabled = 1;
static int32_t bbr_min_measurements_req = 1; /* We need at least 2
* measurements before our
* b/w estimate is "good".
* Note that a setting of 1
* means two measurements,
* because we use a >
* comparison: if min_measure
* were 0, a single measurement
* would satisfy
* num-measures > 0 and you
* would be "good". Set to 1,
* you must have two
* measurements (this prevents
* it being ok to have no
* measurements at all). */
static int32_t bbr_no_pacing_until = 4;
static int32_t bbr_min_usec_delta = 20000; /* 20,000 usecs */
static int32_t bbr_min_peer_delta = 20; /* 20 units */
static int32_t bbr_delta_percent = 150; /* 15.0 % */
static int32_t bbr_target_cwnd_mult_limit = 8;
/*
* bbr_cwnd_min_val is the number of
* segments we hold to in the RTT probe
* state typically 4.
*/
static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;
static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;
static int32_t bbr_gain_to_target = 1;
static int32_t bbr_gain_gets_extra_too = 1;
/*
* bbr_high_gain is the 2/ln(2) value we need
* to double the sending rate in startup. This
* is used for both cwnd and hptsi gains.
*/
static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
static int32_t bbr_use_lower_gain_in_startup = 1;
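/*
 * Worked numbers for the two gains above (a sketch, assuming
 * BBR_UNIT is 256 as defined in tcp_bbr.h): 2/ln(2) ~= 2.885, so
 * bbr_high_gain = 256 * 2885 / 1000 + 1 = 739, i.e. a 739/256 ~= 2.887
 * gain (the +1 rounds the integer truncation up). bbr_startup_lower
 * works out the same way to 385, a 385/256 ~= 1.504 gain.
 */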
/* thresholds for reduction on drain in sub-states/drain */
static int32_t bbr_drain_rtt = BBR_SRTT;
static int32_t bbr_drain_floor = 88;
static int32_t google_allow_early_out = 1;
static int32_t google_consider_lost = 1;
static int32_t bbr_drain_drop_mul = 4;
static int32_t bbr_drain_drop_div = 5;
static int32_t bbr_rand_ot = 50;
static int32_t bbr_can_force_probertt = 0;
static int32_t bbr_can_adjust_probertt = 1;
static int32_t bbr_probertt_sets_rtt = 0;
static int32_t bbr_can_use_ts_for_rtt = 1;
static int32_t bbr_is_ratio = 0;
static int32_t bbr_sub_drain_app_limit = 1;
static int32_t bbr_prtt_slam_cwnd = 1;
static int32_t bbr_sub_drain_slam_cwnd = 1;
static int32_t bbr_slam_cwnd_in_main_drain = 1;
static int32_t bbr_filter_len_sec = 6; /* How long does the rttProp filter
* hold */
static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
/*
* bbr_drain_gain is the inverse of the high_gain,
* designed to drain back out the standing queue
* that is formed in startup by the larger
* hptsi gain, thus draining the packets
* in flight.
*/
static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
static int32_t bbr_rttprobe_gain = 192;
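/*
 * Sanity math on the pair (same BBR_UNIT == 256 assumption):
 * bbr_drain_gain = 256 * 1000 / 2885 = 88, and 88 * 739 = 65032,
 * within 1% of 256 * 256 = 65536, so drain is in effect the
 * reciprocal of the high gain. bbr_rttprobe_gain of 192 appears to
 * correspond to 192/256 = 0.75 of the normal value.
 */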
/*
* The cwnd_gain is the default cwnd gain applied when
* calculating a target cwnd. Note that the cwnd is
* a secondary factor in the way BBR works (see the
* paper and think about it, it will take some time).
* Basically the hptsi_gain spreads the packets out
* so you never get more than BDP to the peer even
* if the cwnd is high. In our implementation that
* means in non-recovery/retransmission scenarios
* cwnd will never be reached by the flight-size.
*/
static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
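/*
 * A worked example of what the 2 * BBR_UNIT cwnd gain implies (a
 * sketch with made-up numbers, not the exact bbr_get_target_cwnd()
 * code): with an estimated bw of 12,500,000 bytes/sec (100 Mbps) and
 * an rttProp of 40000 usec, BDP = bw * rtt = 500,000 bytes, so the
 * target cwnd comes to roughly 2 * BDP = 1,000,000 bytes, while the
 * hptsi gain keeps the actual flight size near one BDP.
 */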
static int32_t bbr_tlp_type_to_use = BBR_SRTT;
static int32_t bbr_delack_time = 100000; /* 100ms in useconds */
static int32_t bbr_sack_not_required = 0; /* set to one to allow non-sack to use bbr */
static int32_t bbr_initial_bw_bps = 62500; /* 500kbps in bytes ps */
static int32_t bbr_ignore_data_after_close = 1;
static int16_t bbr_hptsi_gain[] = {
(BBR_UNIT *5 / 4),
(BBR_UNIT * 3 / 4),
BBR_UNIT,
BBR_UNIT,
BBR_UNIT,
BBR_UNIT,
BBR_UNIT,
BBR_UNIT
};
int32_t bbr_use_rack_resend_cheat = 1;
int32_t bbr_sends_full_iwnd = 1;
#define BBR_HPTSI_GAIN_MAX 8
/*
* The BBR module incorporates a number of
* TCP ideas that have been put out into the IETF
* over the last few years:
* - Yuchung Cheng's RACK TCP (for which it is named) that
* will stop us using the number of dup acks and instead
* use time as the gauge of when we retransmit.
* - Reorder Detection of RFC4737 and the Tail-Loss probe draft
* of Dukkipati et al.
* - Van Jacobson et al.'s BBR.
*
* RACK depends on SACK, so if an endpoint arrives that
* cannot do SACK the state machine below will shuttle the
* connection back to using the "default" TCP stack that is
* in FreeBSD.
*
* To implement BBR and RACK the original TCP stack was first decomposed
* into a functional state machine with individual states
* for each of the possible TCP connection states. The do_segment
* function's role in life is to mandate that the connection supports SACK
* initially and then assure that the RACK state matches the connection
* state before calling the state's do_segment function. Data processing
* of inbound segments also now happens in the hpts_do_segment in general
* with only one exception. This is so we can keep the connection on
* a single CPU.
*
* Each state is simplified due to the fact that the original do_segment
* has been decomposed and we *know* what state we are in (no
* switches on the state) and all tests for SACK are gone. This
* greatly simplifies what each state does.
*
* TCP output is also overwritten with a new version since it
* must maintain the new rack scoreboard and has had hptsi
* integrated as a requirement. Still to do is to eliminate the
* use of the callout_() system and use the hpts for all
* timers as well.
*/
static uint32_t bbr_rtt_probe_time = 200000; /* 200ms in micro seconds */
static uint32_t bbr_rtt_probe_cwndtarg = 4; /* How many mss's outstanding */
static const int32_t bbr_min_req_free = 2; /* The min we must have on the
* free list */
static int32_t bbr_tlp_thresh = 1;
static int32_t bbr_reorder_thresh = 2;
static int32_t bbr_reorder_fade = 60000000; /* 0 - never fade, def
* 60,000,000 - 60 seconds */
static int32_t bbr_pkt_delay = 1000;
static int32_t bbr_min_to = 1000; /* Number of usec's minimum timeout */
static int32_t bbr_incr_timers = 1;
static int32_t bbr_tlp_min = 10000; /* 10ms in usecs */
static int32_t bbr_delayed_ack_time = 200000; /* 200ms in usecs */
static int32_t bbr_exit_startup_at_loss = 1;
/*
* bbr_lt_bw_ratio is 1/8th
* bbr_lt_bw_diff is < 4 Kbit/sec
*/
static uint64_t bbr_lt_bw_diff = 4000 / 8; /* In bytes per second */
static uint64_t bbr_lt_bw_ratio = 8; /* For 1/8th */
static uint32_t bbr_lt_bw_max_rtts = 48; /* How many rtt's do we use
* the lt_bw for */
static uint32_t bbr_lt_intvl_min_rtts = 4; /* Min num of RTT's to measure
* lt_bw */
static int32_t bbr_lt_intvl_fp = 0; /* False positive epoch diff */
static int32_t bbr_lt_loss_thresh = 196; /* Lost vs delivered % */
static int32_t bbr_lt_fd_thresh = 100; /* false detection % */
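/*
 * A sketch of how the lt_bw (long-term, i.e. policer) thresholds
 * above combine, mirroring the BBR draft's lt_bw sampling rather
 * than quoting the exact code below: two interval bandwidth samples
 * are treated as consistent (possibly policed) when they are close
 * either absolutely or relatively, i.e.
 *
 *	diff = (bw > lt_bw) ? (bw - lt_bw) : (lt_bw - bw);
 *	close = (diff <= bbr_lt_bw_diff) ||	(within 500 bytes/sec)
 *	    (diff * bbr_lt_bw_ratio <= lt_bw);	(within 1/8th of lt_bw)
 */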
static int32_t bbr_verbose_logging = 0;
/*
* Currently regular tcp has a rto_min of 30ms
* the backoff goes 12 times so that ends up
* being a total of 122.850 seconds before a
* connection is killed.
*/
static int32_t bbr_rto_min_ms = 30; /* 30ms same as main freebsd */
static int32_t bbr_rto_max_sec = 4; /* 4 seconds */
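/*
 * The 122.850 figure above is straight doubling with no cap: the sum
 * of the 12 backed-off timeouts is (2^12 - 1) * 30 ms = 4095 * 30 ms
 * = 122.85 seconds (a sketch of the arithmetic; the deployed schedule
 * is also bounded by the maximum RTO).
 */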
/****************************************************/
/* DEFAULT TSO SIZING (cpu performance impacting) */
/****************************************************/
/* What amount is our formula using to get TSO size */
static int32_t bbr_hptsi_per_second = 1000;
/*
* For hptsi under bbr_cross_over connections, a delay
* target of 7ms (in usec) combined with a seg_max of 2
* gets us close to identical google behavior in
* TSO size selection (possibly more 1-MSS sends).
*/
static int32_t bbr_hptsi_segments_delay_tar = 7000;
/* Does pacing delay include overhead's in its time calculations? */
static int32_t bbr_include_enet_oh = 0;
static int32_t bbr_include_ip_oh = 1;
static int32_t bbr_include_tcp_oh = 1;
static int32_t bbr_google_discount = 10;
/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
static int32_t bbr_state_is_pkt_epoch = 0;
static int32_t bbr_state_drain_2_tar = 1;
/* What is the max TSO target that the 0 - bbr_cross_over Mbps
* range can reach using our delay target. Note that this
* value becomes the floor for the cross-over
* algorithm.
*/
static int32_t bbr_hptsi_segments_max = 2;
static int32_t bbr_hptsi_segments_floor = 1;
static int32_t bbr_hptsi_utter_max = 0;
/* What is the min the 0 - bbr_cross_over Mbps TSO target can be */
static int32_t bbr_hptsi_bytes_min = 1460;
static int32_t bbr_all_get_min = 0;
/* Cross over point from algo-a to algo-b */
static uint32_t bbr_cross_over = TWENTY_THREE_MBPS;
/* Do we deal with our restart state? */
static int32_t bbr_uses_idle_restart = 0;
static int32_t bbr_idle_restart_threshold = 100000; /* 100ms in useconds */
/* Do we allow hardware pacing? */
static int32_t bbr_allow_hdwr_pacing = 0;
static int32_t bbr_hdwr_pace_adjust = 2; /* multiplier when we calc the tso size */
static int32_t bbr_hdwr_pace_floor = 1;
static int32_t bbr_hdwr_pacing_delay_cnt = 10;
/****************************************************/
static int32_t bbr_resends_use_tso = 0;
static int32_t bbr_tlp_max_resend = 2;
static int32_t bbr_sack_block_limit = 128;
#define BBR_MAX_STAT 19
counter_u64_t bbr_state_time[BBR_MAX_STAT];
counter_u64_t bbr_state_lost[BBR_MAX_STAT];
counter_u64_t bbr_state_resend[BBR_MAX_STAT];
counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t bbr_flows_whdwr_pacing;
counter_u64_t bbr_flows_nohdwr_pacing;
counter_u64_t bbr_nohdwr_pacing_enobuf;
counter_u64_t bbr_hdwr_pacing_enobuf;
static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr);
/*
* Static definitions we need for forward declarations.
*/
static uint32_t
bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain,
uint32_t useconds_time, uint64_t bw);
static uint32_t
bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain);
static void
bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win);
static void
bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses);
static void
bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line,
int dolog);
static uint32_t
bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain);
static void
bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch,
int32_t pkt_epoch, uint32_t losses);
static uint32_t
bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm);
static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp);
static uint32_t
bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
struct bbr_sendmap *rsm, uint32_t srtt,
uint32_t cts);
static void
bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
int32_t line);
static void
bbr_set_state_target(struct tcp_bbr *bbr, int line);
static void
bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line);
static void
bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line);
static void
tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt,
uint32_t line, uint8_t is_start, uint16_t set);
static struct bbr_sendmap *
bbr_find_lowest_rsm(struct tcp_bbr *bbr);
static __inline uint32_t
bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
static void
bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which);
static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
uint32_t thresh, uint32_t to);
static void
bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);
static void
bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay);
static void
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr,
uint32_t cts, int32_t line);
static void
bbr_stop_all_timers(struct tcpcb *tp);
static void
bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts);
static void
bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod);
static int
bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp,
struct tcpcb *tp);
static inline uint8_t
bbr_state_val(struct tcp_bbr *bbr)
{
return (bbr->rc_bbr_substate);
}
static inline uint32_t
get_min_cwnd(struct tcp_bbr *bbr)
{
int mss;
mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED)
return (bbr_cwnd_min_val_hs * mss);
else
return (bbr_cwnd_min_val * mss);
}
static uint32_t
bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr)
{
uint64_t srtt, var;
uint64_t ret_val;
bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
if (tp->t_srtt == 0) {
srtt = (uint64_t)BBR_INITIAL_RTO;
var = 0;
} else {
srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT);
}
TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]),
bbr_persist_min, bbr_persist_max);
return ((uint32_t)ret_val);
}
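/*
 * Worked example for the persist calculation above (hypothetical
 * values): with an srtt of 100 ms, an rttvar of 50 ms and t_rxtshift
 * of 1 (tcp_backoff[1] == 2), the raw value is (100000 + 50000) * 2 =
 * 300000 usec, inside [bbr_persist_min, bbr_persist_max] =
 * [250 ms, 1 sec], so it is returned as-is; at t_rxtshift 0 the raw
 * 150000 would be clamped up to 250000.
 */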
static uint32_t
bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
{
/*
* Start the FR timer, we do this based on getting the first one in
* the rc_tmap. Note that if it's NULL we must stop the timer. In all
* events we need to stop the running timer (if it's running) before
* starting the new one. The timer armed ends up being one of RXT
* (nothing sendable or nothing on the send map), RACK (a sack-passed
* segment exists) or TLP (data outstanding, nothing sack-passed).
*/
uint32_t thresh, exp, to, time_since_sent, tstmp_touse;
uint32_t srtt = 0;	/* srtt is logged on the RXT path before being set */
int32_t idx;
int32_t is_tlp_timer = 0;
struct bbr_sendmap *rsm;
if (bbr->rc_all_timers_stopped) {
/* All timers have been stopped none are to run */
return (0);
}
if (bbr->rc_in_persist) {
/* We can't start any timer in persists */
return (bbr_get_persists_timer_val(tp, bbr));
}
rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
if ((rsm == NULL) ||
((tp->t_flags & TF_SACK_PERMIT) == 0) ||
(tp->t_state < TCPS_ESTABLISHED)) {
/* Nothing on the send map */
activate_rxt:
if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
uint64_t tov;
time_since_sent = 0;
rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
if (rsm) {
idx = rsm->r_rtr_cnt - 1;
if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
tstmp_touse = rsm->r_tim_lastsent[idx];
else
tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
if (TSTMP_GT(tstmp_touse, cts))
time_since_sent = cts - tstmp_touse;
}
bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
if (tp->t_srtt == 0)
tov = BBR_INITIAL_RTO;
else
tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) +
((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT);
if (tp->t_rxtshift)
tov *= tcp_backoff[tp->t_rxtshift];
if (tov > time_since_sent)
tov -= time_since_sent;
else
tov = bbr->r_ctl.rc_min_to;
TCPT_RANGESET_NOSLOP(to, tov,
(bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC),
(bbr->rc_max_rto_sec * USECS_IN_SECOND));
bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to);
return (to);
}
return (0);
}
if (rsm->r_flags & BBR_ACKED) {
rsm = bbr_find_lowest_rsm(bbr);
if (rsm == NULL) {
/* No lowest? */
goto activate_rxt;
}
}
/* Convert from ms to usecs */
if (rsm->r_flags & BBR_SACK_PASSED) {
if ((tp->t_flags & TF_SENTFIN) &&
((tp->snd_max - tp->snd_una) == 1) &&
(rsm->r_flags & BBR_HAS_FIN)) {
/*
* We don't start a bbr rack timer if all we have is
* a FIN outstanding.
*/
goto activate_rxt;
}
srtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm);
idx = rsm->r_rtr_cnt - 1;
exp = rsm->r_tim_lastsent[idx] + thresh;
if (SEQ_GEQ(exp, cts)) {
to = exp - cts;
if (to < bbr->r_ctl.rc_min_to) {
to = bbr->r_ctl.rc_min_to;
}
} else {
to = bbr->r_ctl.rc_min_to;
}
} else {
/* Ok we need to do a TLP not RACK */
if (bbr->rc_tlp_in_progress != 0) {
/*
* The previous send was a TLP.
*/
goto activate_rxt;
}
rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
if (rsm == NULL) {
/* We found no rsm to TLP with. */
goto activate_rxt;
}
if (rsm->r_flags & BBR_HAS_FIN) {
/* If its a FIN we don't do TLP */
rsm = NULL;
goto activate_rxt;
}
time_since_sent = 0;
idx = rsm->r_rtr_cnt - 1;
if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
tstmp_touse = rsm->r_tim_lastsent[idx];
else
tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
if (TSTMP_GT(tstmp_touse, cts))
time_since_sent = cts - tstmp_touse;
is_tlp_timer = 1;
srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use);
thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts);
if (thresh > time_since_sent)
to = thresh - time_since_sent;
else
to = bbr->r_ctl.rc_min_to;
if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
/*
* If the TLP time works out to be larger than the max
* RTO, let's not do TLP.. just RTO.
*/
goto activate_rxt;
}
if ((bbr->rc_tlp_rtx_out == 1) &&
(rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) {
/*
* Second retransmit of the same TLP,
* let's not.
*/
bbr->rc_tlp_rtx_out = 0;
goto activate_rxt;
}
if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) {
/*
* The tail is no longer the last one I did a probe
* on
*/
bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
}
}
if (is_tlp_timer == 0) {
BBR_STAT_INC(bbr_to_arm_rack);
bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
} else {
bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to);
if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
/*
* We have exceeded how many times we can retransmit on the
* current TLP timer, switch to the RTO timer.
*/
goto activate_rxt;
} else {
BBR_STAT_INC(bbr_to_arm_tlp);
bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
}
}
return (to);
}
static inline int32_t
bbr_minseg(struct tcp_bbr *bbr)
{
return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options);
}
static void
bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
{
struct inpcb *inp;
struct hpts_diag diag;
uint32_t delayed_ack = 0;
uint32_t left = 0;
uint32_t hpts_timeout;
uint8_t stopped;
int32_t delay_calc = 0;
uint32_t prev_delay = 0;
inp = tp->t_inpcb;
if (inp->inp_in_hpts) {
/* A previous call is already set up */
return;
}
if ((tp->t_state == TCPS_CLOSED) ||
(tp->t_state == TCPS_LISTEN)) {
return;
}
stopped = bbr->rc_tmr_stopped;
if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
left = bbr->r_ctl.rc_timer_exp - cts;
}
bbr->r_ctl.rc_hpts_flags = 0;
bbr->r_ctl.rc_timer_exp = 0;
prev_delay = bbr->r_ctl.rc_last_delay_val;
if (bbr->r_ctl.rc_last_delay_val &&
(slot == 0)) {
/*
* If a previous pacer delay was in place we
* are not coming from the output side (where
* we would calculate a delay); more likely a
* timer is restarting us.
*/
slot = bbr->r_ctl.rc_last_delay_val;
if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
/* Compensate for time passed */
delay_calc = cts - bbr->rc_pacer_started;
if (delay_calc <= slot)
slot -= delay_calc;
}
}
/* Do we have accumulated "early" time to make up for by pushing out the pacing time? */
if (bbr->r_agg_early_set) {
bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
slot += bbr->r_ctl.rc_agg_early;
bbr->r_ctl.rc_agg_early = 0;
bbr->r_agg_early_set = 0;
}
/* Are we running a total debt that needs to be compensated for? */
if (bbr->r_ctl.rc_hptsi_agg_delay) {
if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
/* We nuke the delay */
slot -= bbr->r_ctl.rc_hptsi_agg_delay;
bbr->r_ctl.rc_hptsi_agg_delay = 0;
} else {
/* We nuke some of the delay, put in a minimal 100usecs */
bbr->r_ctl.rc_hptsi_agg_delay -= slot;
bbr->r_ctl.rc_last_delay_val = slot = 100;
}
}
bbr->r_ctl.rc_last_delay_val = slot;
hpts_timeout = bbr_timer_start(tp, bbr, cts);
if (tp->t_flags & TF_DELACK) {
if (bbr->rc_in_persist == 0) {
delayed_ack = bbr_delack_time;
} else {
/*
* We are in persists and have
* gotten a new data element.
*/
if (hpts_timeout > bbr_delack_time) {
/*
* Let's make the persists timer (which acks)
* be the smaller of hpts_timeout and bbr_delack_time.
*/
hpts_timeout = bbr_delack_time;
}
}
}
if (delayed_ack &&
((hpts_timeout == 0) ||
(delayed_ack < hpts_timeout))) {
/* We need a Delayed ack timer */
bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
hpts_timeout = delayed_ack;
}
if (slot) {
/* Mark that we have a pacing timer up */
BBR_STAT_INC(bbr_paced_segments);
bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
}
/*
* If no timers are going to run and we will fall off the hptsi
* wheel, we resort to a keep-alive timer if it's configured.
*/
if ((hpts_timeout == 0) &&
(slot == 0)) {
if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING)) {
/*
* Ok we have no timer (persists, rack, tlp, rxt or
* del-ack), we don't have segments being paced. So
* all that is left is the keepalive timer.
*/
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
} else {
hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
}
bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
}
}
if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
(bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
/*
* RACK, TLP, persists and RXT timers all are restartable
* based on input actions, i.e. we received a packet (ack
* or sack) and that changes things (rwnd, or snd_una etc).
* Thus we can restart them with a new value. For
* keep-alive and delayed-ack we keep track of what was left
* and restart the timer with a smaller value.
*/
if (left < hpts_timeout)
hpts_timeout = left;
}
if (bbr->r_ctl.rc_incr_tmrs && slot &&
(bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
/*
* If configured to do so, and the timer is either
* the TLP or RXT timer, we need to increase the timeout
* by the pacing time. Consider the bottleneck at my
* machine as an example, we are sending something
* to start a TLP on. The last packet won't be emitted
* fully until the pacing time (the bottleneck will hold
* the data in place). Once the packet is emitted that
* is when we want to start waiting for the TLP. This
* is most evident with hardware pacing (where the nic
* is holding the packet(s) before emitting). But it
* can also show up in the network so we do it for all
* cases. Technically we would take off one packet from
* this extra delay but this is easier and being more
* conservative is probably better.
*/
hpts_timeout += slot;
}
if (hpts_timeout) {
/*
* Hack alert: for now we can't time-out over 2147 seconds (a
* bit more than 35 min).
*/
if (hpts_timeout > 0x7ffffffe)
hpts_timeout = 0x7ffffffe;
bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
} else
bbr->r_ctl.rc_timer_exp = 0;
if ((slot) &&
(bbr->rc_use_google ||
bbr->output_error_seen ||
(slot <= hpts_timeout)) ) {
/*
* Tell LRO that it can queue packets while
* we pace.
*/
bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
(bbr->rc_cwnd_limited == 0)) {
/*
* If we are not cwnd limited and we
* are running a rack timer, we put on
* the do-not-disturb even for sack.
*/
inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
} else
inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
bbr->rc_pacer_started = cts;
(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
__LINE__, &diag);
bbr->rc_timer_first = 0;
bbr->bbr_timer_src = frm;
bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
bbr_log_hpts_diag(bbr, cts, &diag);
} else if (hpts_timeout) {
(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
/*
* We add the flag here as well if the slot is set,
* since hpts will call in to clear the queue first before
* calling the output routine (which does our timers).
* We don't want to set the flag if it's just a timer,
* else the arrival of data (that causes us
* to send more) might get delayed. Imagine being
* on a keep-alive timer and a request comes in for
* more data.
*/
if (slot)
bbr->rc_pacer_started = cts;
if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
(bbr->rc_cwnd_limited == 0)) {
/*
* For a rack timer, don't wake us even
* if a sack arrives as long as we are
* not cwnd limited.
*/
bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
} else {
/* All other timers wake us up */
bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
}
bbr->bbr_timer_src = frm;
bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
bbr_log_hpts_diag(bbr, cts, &diag);
bbr->rc_timer_first = 1;
}
bbr->rc_tmr_stopped = 0;
bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
}
static void
bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb)
{
/*
* We received an ack, and then did not call send or were bounced
* out because hpts was running. Now a timer is up as well; is it
* the right timer?
*/
struct inpcb *inp;
struct bbr_sendmap *rsm;
uint32_t hpts_timeout;
int tmr_up;
tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
return;
rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
(tmr_up == PACE_TMR_RXT)) {
/* Should be an RXT */
return;
}
inp = bbr->rc_inp;
if (rsm == NULL) {
/* Nothing outstanding? */
if (tp->t_flags & TF_DELACK) {
if (tmr_up == PACE_TMR_DELACK)
/*
* We are supposed to have delayed ack up
* and we do
*/
return;
} else if (sbavail(&inp->inp_socket->so_snd) &&
(tmr_up == PACE_TMR_RXT)) {
/*
* if we hit enobufs then we would expect the
* possibility of nothing outstanding and the RXT up
* (and the hptsi timer).
*/
return;
} else if (((V_tcp_always_keepalive ||
inp->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING)) &&
(tmr_up == PACE_TMR_KEEP) &&
(tp->snd_max == tp->snd_una)) {
/* We should have keep alive up and we do */
return;
}
}
if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) {
if ((tp->t_flags & TF_SENTFIN) &&
((tp->snd_max - tp->snd_una) == 1) &&
(rsm->r_flags & BBR_HAS_FIN)) {
/* needs to be a RXT */
if (tmr_up == PACE_TMR_RXT)
return;
else
goto wrong_timer;
} else if (tmr_up == PACE_TMR_RACK)
return;
else
goto wrong_timer;
} else if (rsm && (tmr_up == PACE_TMR_RACK)) {
/* Rack timer has priority if we have data out */
return;
} else if (SEQ_GT(tp->snd_max, tp->snd_una) &&
((tmr_up == PACE_TMR_TLP) ||
(tmr_up == PACE_TMR_RXT))) {
/*
* Either a TLP or RXT is fine if no sack-passed is in place
* and data is outstanding.
*/
return;
} else if (tmr_up == PACE_TMR_DELACK) {
/*
* If the delayed ack was going to go off before the
* rtx/tlp/rack timer was going to expire, then that would
* be the timer in control. Note we don't check the time
* here, trusting the code is correct.
*/
return;
}
if (SEQ_GT(tp->snd_max, tp->snd_una) &&
((tmr_up == PACE_TMR_RXT) ||
(tmr_up == PACE_TMR_TLP) ||
(tmr_up == PACE_TMR_RACK))) {
/*
* We have outstanding data and
* we *do* have a RACK, TLP or RXT
* timer running. We won't restart
* anything here since that's probably OK;
* we will get called with some timer here shortly.
*/
return;
}
/*
* Ok, the timer originally started is not what we want now. We will
* force the hpts to be stopped if it is running, and restart with the
* slot set to what was in the saved slot.
*/
wrong_timer:
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
if (inp->inp_in_hpts)
tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
bbr_timer_cancel(bbr, __LINE__, cts);
bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
0);
} else {
/*
* Output is hptsi so we just need to switch the type of
* timer. We don't bother with keep-alive, since when we
* jump through the output, it will start the keep-alive if
* nothing is sent.
*
* We only need a delayed-ack added and/or the hpts_timeout.
*/
hpts_timeout = bbr_timer_start(tp, bbr, cts);
if (tp->t_flags & TF_DELACK) {
if (hpts_timeout == 0) {
hpts_timeout = bbr_delack_time;
bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
}
else if (hpts_timeout > bbr_delack_time) {
hpts_timeout = bbr_delack_time;
bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
}
}
if (hpts_timeout) {
if (hpts_timeout > 0x7ffffffe)
hpts_timeout = 0x7ffffffe;
bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
}
}
}
int32_t bbr_clear_lost = 0;
/*
* Considers the two time values now (cts) and earlier.
* If cts is smaller than earlier, we could have
* had a sequence wrap (our counter wraps every
* 70 min or so) or it could be just clock skew
* getting us two different time values. Clock skew
* will show up within 10ms or so. So in such
* a case (where cts is behind earlier time by
* less than 10ms) we return 0. Otherwise we
* return the true difference between them.
*/
static inline uint32_t
bbr_calc_time(uint32_t cts, uint32_t earlier_time)
{
/*
* Given two timestamps, the current time stamp cts, and some other
* time-stamp taken in theory earlier return the difference. The
* trick is here sometimes locking will get the other timestamp
* after the cts. If this occurs we need to return 0.
*/
if (TSTMP_GEQ(cts, earlier_time))
return (cts - earlier_time);
/*
* cts is behind earlier_time; if it's by less than 10ms, consider it 0.
* If it's more than a 10ms difference then we had a time wrap. Else
* it's just the normal locking foo. I wonder if we should not go to
* 64-bit TS and get rid of this issue.
*/
if (TSTMP_GEQ((cts + 10000), earlier_time))
return (0);
/*
* Ok the time must have wrapped. So we need to answer a large
* amount of time, which the normal subtraction should do.
*/
return (cts - earlier_time);
}
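/*
 * Example behavior (hypothetical timestamps): bbr_calc_time(100, 90)
 * returns 10. bbr_calc_time(100, 105) returns 0, since earlier_time
 * was sampled 5 usec "late", well under the 10ms skew allowance. A
 * wrap such as cts = 5 with earlier_time = 0xfffffff0 is handled by
 * the signed compare inside TSTMP_GEQ, which treats cts as ahead, so
 * the first branch returns the true 21 usec difference.
 */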
static int
sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS)
{
uint32_t stat;
int32_t error;
error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t));
if (error || req->newptr == NULL)
return (error);
error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
if (error)
return (error);
if (stat == 1) {
#ifdef BBR_INVARIANTS
printf("Clearing BBR lost counters\n");
#endif
COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT);
COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT);
COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT);
} else if (stat == 2) {
#ifdef BBR_INVARIANTS
printf("Clearing BBR option counters\n");
#endif
COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE);
} else if (stat == 3) {
#ifdef BBR_INVARIANTS
printf("Clearing BBR stats counters\n");
#endif
COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE);
} else if (stat == 4) {
#ifdef BBR_INVARIANTS
printf("Clearing BBR out-size counters\n");
#endif
COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE);
}
bbr_clear_lost = 0;
return (0);
}
static void
bbr_init_sysctls(void)
{
struct sysctl_oid *bbr_probertt;
struct sysctl_oid *bbr_hptsi;
struct sysctl_oid *bbr_measure;
struct sysctl_oid *bbr_cwnd;
struct sysctl_oid *bbr_timeout;
struct sysctl_oid *bbr_states;
struct sysctl_oid *bbr_startup;
struct sysctl_oid *bbr_policer;
/* Probe rtt controls */
bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"probertt",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "gain", CTLFLAG_RW,
&bbr_rttprobe_gain, 192,
"What is the filter gain drop in probe_rtt (0=disable)?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "cwnd", CTLFLAG_RW,
&bbr_rtt_probe_cwndtarg, 4,
"How many mss's are outstanding during probe-rtt");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "int", CTLFLAG_RW,
&bbr_rtt_probe_limit, 4000000,
"If RTT has not shrank in this many micro-seconds enter probe-rtt");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "mintime", CTLFLAG_RW,
&bbr_rtt_probe_time, 200000,
"How many microseconds in probe-rtt");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "filter_len_sec", CTLFLAG_RW,
&bbr_filter_len_sec, 6,
"How long in seconds does the rttProp filter run?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "drain_rtt", CTLFLAG_RW,
&bbr_drain_rtt, BBR_SRTT,
"What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "can_force", CTLFLAG_RW,
&bbr_can_force_probertt, 0,
"If we keep setting new low rtt's but delay going in probe-rtt can we force in??");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "enter_sets_force", CTLFLAG_RW,
&bbr_probertt_sets_rtt, 0,
"In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "can_adjust", CTLFLAG_RW,
&bbr_can_adjust_probertt, 1,
"Can we dynamically adjust the probe-rtt limits and times?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "is_ratio", CTLFLAG_RW,
&bbr_is_ratio, 0,
"is the limit to filter a ratio?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "use_cwnd", CTLFLAG_RW,
&bbr_prtt_slam_cwnd, 0,
"Should we set/recover cwnd?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_probertt),
OID_AUTO, "can_use_ts", CTLFLAG_RW,
&bbr_can_use_ts_for_rtt, 1,
"Can we use the ms timestamp if available for retransmistted rtt calculations?");
/* Pacing controls */
bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"pacing",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "hw_pacing", CTLFLAG_RW,
&bbr_allow_hdwr_pacing, 1,
"Do we allow hardware pacing?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "hw_pacing_limit", CTLFLAG_RW,
&bbr_hardware_pacing_limit, 4000,
"Do we have a limited number of connections for pacing chelsio (0=no limit)?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "hw_pacing_adj", CTLFLAG_RW,
&bbr_hdwr_pace_adjust, 2,
"Multiplier to calculated tso size?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "hw_pacing_floor", CTLFLAG_RW,
&bbr_hdwr_pace_floor, 1,
"Do we invoke the hardware pacing floor?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW,
&bbr_hdwr_pacing_delay_cnt, 10,
"How many packets must be sent after hdwr pacing is enabled");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "bw_cross", CTLFLAG_RW,
&bbr_cross_over, 3000000,
"What is the point where we cross over to linux like TSO size set");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "seg_deltarg", CTLFLAG_RW,
&bbr_hptsi_segments_delay_tar, 7000,
"What is the worse case delay target for hptsi < 48Mbp connections");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "enet_oh", CTLFLAG_RW,
&bbr_include_enet_oh, 0,
"Do we include the ethernet overhead in calculating pacing delay?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "ip_oh", CTLFLAG_RW,
&bbr_include_ip_oh, 1,
"Do we include the IP overhead in calculating pacing delay?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "tcp_oh", CTLFLAG_RW,
&bbr_include_tcp_oh, 0,
"Do we include the TCP overhead in calculating pacing delay?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "google_discount", CTLFLAG_RW,
&bbr_google_discount, 10,
"What is the default google discount percentage wise for pacing (11 = 1.1%%)?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "all_get_min", CTLFLAG_RW,
&bbr_all_get_min, 0,
"If you are less than a MSS do you just get the min?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "tso_min", CTLFLAG_RW,
&bbr_hptsi_bytes_min, 1460,
"For 0 -> 24Mbps what is floor number of segments for TSO");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "seg_tso_max", CTLFLAG_RW,
&bbr_hptsi_segments_max, 6,
"For 0 -> 24Mbps what is top number of segments for TSO");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "seg_floor", CTLFLAG_RW,
&bbr_hptsi_segments_floor, 1,
"Minimum TSO size we will fall too in segments");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "utter_max", CTLFLAG_RW,
&bbr_hptsi_utter_max, 0,
"The absolute maximum that any pacing (outside of hardware) can be");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "seg_divisor", CTLFLAG_RW,
&bbr_hptsi_per_second, 100,
"What is the divisor in our hptsi TSO calculation 512Mbps < X > 24Mbps ");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "srtt_mul", CTLFLAG_RW,
&bbr_hptsi_max_mul, 1,
"The multiplier for pace len max");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_hptsi),
OID_AUTO, "srtt_div", CTLFLAG_RW,
&bbr_hptsi_max_div, 2,
"The divisor for pace len max");
/* Measurement controls */
bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"measure",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Measurement controls");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "min_i_bw", CTLFLAG_RW,
&bbr_initial_bw_bps, 62500,
"Minimum initial b/w in bytes per second");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "no_sack_needed", CTLFLAG_RW,
&bbr_sack_not_required, 0,
"Do we allow bbr to run on connections not supporting SACK?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "use_google", CTLFLAG_RW,
&bbr_use_google_algo, 0,
"Use has close to google V1.0 has possible?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "ts_limiting", CTLFLAG_RW,
&bbr_ts_limiting, 1,
"Do we attempt to use the peers timestamp to limit b/w caculations?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "ts_can_raise", CTLFLAG_RW,
&bbr_ts_can_raise, 0,
"Can we raise the b/w via timestamp b/w calculation?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "ts_delta", CTLFLAG_RW,
&bbr_min_usec_delta, 20000,
"How long in usec between ts of our sends in ts validation code?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
&bbr_min_peer_delta, 20,
"What min numerical value should be between the peer deltas?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
&bbr_delta_percent, 150,
"What percentage (150 = 15.0) do we allow variance for?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
&bbr_min_measurements_req, 1,
"What is the minimum measurment count we need before we switch to our b/w estimate");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
&bbr_no_pacing_until, 4,
"How many pkt-epoch's (0 is off) do we need before pacing is on?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "quanta", CTLFLAG_RW,
&bbr_quanta, 2,
"Extra quanta to add when calculating the target (ID section 4.2.3.2).");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "noretran", CTLFLAG_RW,
&bbr_no_retran, 0,
"Should google mode not use retransmission measurements for the b/w estimation?");
/* State controls */
bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"states",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"State controls");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "idle_restart", CTLFLAG_RW,
&bbr_uses_idle_restart, 0,
"Do we use a new special idle_restart state to ramp back up quickly?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "idle_restart_threshold", CTLFLAG_RW,
&bbr_idle_restart_threshold, 100000,
"How long must we be idle before we restart??");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "use_pkt_epoch", CTLFLAG_RW,
&bbr_state_is_pkt_epoch, 0,
"Do we use a pkt-epoch for substate if 0 rttProp?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "startup_rtt_gain", CTLFLAG_RW,
&bbr_rtt_gain_thresh, 0,
"What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "drain_floor", CTLFLAG_RW,
&bbr_drain_floor, 88,
"What is the lowest we can drain (pg) too?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "drain_2_target", CTLFLAG_RW,
&bbr_state_drain_2_tar, 1,
"Do we drain to target in drain substate?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "gain_2_target", CTLFLAG_RW,
&bbr_gain_to_target, 1,
"Does probe bw gain to target??");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "gain_extra_time", CTLFLAG_RW,
&bbr_gain_gets_extra_too, 1,
"Does probe bw gain get the extra time too?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "ld_div", CTLFLAG_RW,
&bbr_drain_drop_div, 5,
"Long drain drop divider?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "ld_mul", CTLFLAG_RW,
&bbr_drain_drop_mul, 4,
"Long drain drop multiplier?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "rand_ot_disc", CTLFLAG_RW,
&bbr_rand_ot, 50,
"Random discount of the ot?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "dr_filter_life", CTLFLAG_RW,
&bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT,
"How many packet-epochs does the b/w delivery rate last?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "subdrain_applimited", CTLFLAG_RW,
&bbr_sub_drain_app_limit, 0,
"Does our sub-state drain invoke app limited if its long?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW,
&bbr_sub_drain_slam_cwnd, 0,
"Should we set/recover cwnd for sub-state drain?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW,
&bbr_slam_cwnd_in_main_drain, 0,
"Should we set/recover cwnd for main-state drain?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "google_gets_earlyout", CTLFLAG_RW,
&google_allow_early_out, 1,
"Should we allow google probe-bw/drain to exit early at flight target?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_states),
OID_AUTO, "google_exit_loss", CTLFLAG_RW,
&google_consider_lost, 1,
"Should we have losses exit gain of probebw in google mode??");
/* Startup controls */
bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"startup",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Startup controls");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_startup),
OID_AUTO, "cheat_iwnd", CTLFLAG_RW,
&bbr_sends_full_iwnd, 1,
"Do we not pace but burst out initial windows has our TSO size?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_startup),
OID_AUTO, "loss_threshold", CTLFLAG_RW,
&bbr_startup_loss_thresh, 2000,
"In startup what is the loss threshold in a pe that will exit us from startup?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_startup),
OID_AUTO, "use_lowerpg", CTLFLAG_RW,
&bbr_use_lower_gain_in_startup, 1,
"Should we use a lower hptsi gain if we see loss in startup?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_startup),
OID_AUTO, "gain", CTLFLAG_RW,
&bbr_start_exit, 25,
"What gain percent do we need to see to stay in startup??");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_startup),
OID_AUTO, "low_gain", CTLFLAG_RW,
&bbr_low_start_exit, 15,
"What gain percent do we need to see to stay in the lower gain startup??");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_startup),
OID_AUTO, "loss_exit", CTLFLAG_RW,
&bbr_exit_startup_at_loss, 1,
"Should we exit startup at loss in an epoch if we are not gaining?");
/* CWND controls */
bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"cwnd",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Cwnd controls");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "tar_rtt", CTLFLAG_RW,
&bbr_cwndtarget_rtt_touse, 0,
"Target cwnd rtt measurment to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "may_shrink", CTLFLAG_RW,
&bbr_cwnd_may_shrink, 0,
"Can the cwnd shrink if it would grow to more than the target?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "max_target_limit", CTLFLAG_RW,
&bbr_target_cwnd_mult_limit, 8,
"Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink 0=no?");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "highspeed_min", CTLFLAG_RW,
&bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS,
"What is the high-speed min cwnd (rttProp under 1ms)");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "lowspeed_min", CTLFLAG_RW,
&bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS,
"What is the min cwnd (rttProp > 1ms)");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "initwin", CTLFLAG_RW,
&bbr_def_init_win, 10,
"What is the BBR initial window, if 0 use tcp version");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "do_loss_red", CTLFLAG_RW,
&bbr_do_red, 600,
"Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "red_scale", CTLFLAG_RW,
&bbr_red_scale, 20000,
"What RTT do we scale with?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "red_growslow", CTLFLAG_RW,
&bbr_red_growth_restrict, 1,
"Do we restrict cwnd growth for whats in flight?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "red_div", CTLFLAG_RW,
&bbr_red_div, 2,
"If we reduce whats the divisor?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "red_mul", CTLFLAG_RW,
&bbr_red_mul, 1,
"If we reduce whats the mulitiplier?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "target_is_unit", CTLFLAG_RW,
&bbr_target_is_bbunit, 0,
"Is the state target the pacing_gain or BBR_UNIT?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "drop_limit", CTLFLAG_RW,
&bbr_drop_limit, 0,
"Number of segments limit for drop (0=use min_cwnd w/flight)?");
/* Timeout controls */
bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"timeout",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Time out controls");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "delack", CTLFLAG_RW,
&bbr_delack_time, 100000,
"BBR's delayed ack time");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "tlp_uses", CTLFLAG_RW,
&bbr_tlp_type_to_use, 3,
"RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "persmin", CTLFLAG_RW,
&bbr_persist_min, 250000,
"What is the minimum time in microseconds between persists");
SYSCTL_ADD_U32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "persmax", CTLFLAG_RW,
&bbr_persist_max, 1000000,
"What is the largest delay in microseconds between persists");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "tlp_minto", CTLFLAG_RW,
&bbr_tlp_min, 10000,
"TLP Min timeout in usecs");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "tlp_dack_time", CTLFLAG_RW,
&bbr_delayed_ack_time, 200000,
"TLP delayed ack compensation value");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "minrto", CTLFLAG_RW,
&bbr_rto_min_ms, 30,
"Minimum RTO in ms");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "maxrto", CTLFLAG_RW,
&bbr_rto_max_sec, 4,
"Maxiumum RTO in seconds -- should be at least as large as min_rto");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "tlp_retry", CTLFLAG_RW,
&bbr_tlp_max_resend, 2,
"How many times does TLP retry a single segment or multiple with no ACK");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "minto", CTLFLAG_RW,
&bbr_min_to, 1000,
"Minimum rack timeout in useconds");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "pktdelay", CTLFLAG_RW,
&bbr_pkt_delay, 1000,
"Extra RACK time (in useconds) besides reordering thresh");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "incr_tmrs", CTLFLAG_RW,
&bbr_incr_timers, 1,
"Increase the RXT/TLP timer by the pacing time used?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_timeout),
OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW,
&bbr_marks_rxt_sack_passed, 0,
"Mark sack passed on all those not ack'd when a RXT hits?");
/* Policer controls */
bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO,
"policer",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Policer controls");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "detect_enable", CTLFLAG_RW,
&bbr_policer_detection_enabled, 1,
"Is policer detection enabled??");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "min_pes", CTLFLAG_RW,
&bbr_lt_intvl_min_rtts, 4,
"Minimum number of PE's?");
SYSCTL_ADD_U64(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "bwdiff", CTLFLAG_RW,
&bbr_lt_bw_diff, (4000/8),
"Minimal bw diff?");
SYSCTL_ADD_U64(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "bwratio", CTLFLAG_RW,
&bbr_lt_bw_ratio, 8,
"Minimal bw diff?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "from_rack_rxt", CTLFLAG_RW,
&bbr_policer_call_from_rack_to, 0,
"Do we call the policer detection code from a rack-timeout?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "false_postive", CTLFLAG_RW,
&bbr_lt_intvl_fp, 0,
"What packet epoch do we do false-postive detection at (0=no)?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "loss_thresh", CTLFLAG_RW,
&bbr_lt_loss_thresh, 196,
"Loss threshold 196 = 19.6%?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_policer),
OID_AUTO, "false_postive_thresh", CTLFLAG_RW,
&bbr_lt_fd_thresh, 100,
"What percentage is the false detection threshold (150=15.0)?");
/* All the rest */
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "cheat_rxt", CTLFLAG_RW,
&bbr_use_rack_resend_cheat, 0,
"Do we burst 1ms between sends on retransmissions (like rack)?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "error_paceout", CTLFLAG_RW,
&bbr_error_base_paceout, 10000,
"When we hit an error what is the min to pace out in usec's?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "kill_paceout", CTLFLAG_RW,
&bbr_max_net_error_cnt, 10,
"When we hit this many errors in a row, kill the session?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "data_after_close", CTLFLAG_RW,
&bbr_ignore_data_after_close, 1,
"Do we hold off sending a RST until all pending data is ack'd");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "resend_use_tso", CTLFLAG_RW,
&bbr_resends_use_tso, 0,
"Can resends use TSO?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "sblklimit", CTLFLAG_RW,
&bbr_sack_block_limit, 128,
"When do we start ignoring small sack blocks");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "bb_verbose", CTLFLAG_RW,
&bbr_verbose_logging, 0,
"Should BBR black box logging be verbose");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "reorder_thresh", CTLFLAG_RW,
&bbr_reorder_thresh, 2,
"What factor for rack will be added when seeing reordering (shift right)");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "reorder_fade", CTLFLAG_RW,
&bbr_reorder_fade, 0,
"Does reorder detection fade, if so how many ms (0 means never)");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
&bbr_tlp_thresh, 1,
"what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
/* Stats and counters */
/* The pacing counters for hdwr/software can't be in the array */
bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD,
&bbr_hdwr_pacing_enobuf,
"Total number of enobufs for hardware paced flows");
SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD,
&bbr_nohdwr_pacing_enobuf,
"Total number of enobufs for non-hardware paced flows");
bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "hdwr_pacing", CTLFLAG_RD,
&bbr_flows_whdwr_pacing,
"Total number of hardware paced flows");
bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "software_pacing", CTLFLAG_RD,
&bbr_flows_nohdwr_pacing,
"Total number of software paced flows");
COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "stats", CTLFLAG_RD,
bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats");
COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "opts", CTLFLAG_RD,
bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats");
COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "lost", CTLFLAG_RD,
bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur");
COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "stateresend", CTLFLAG_RD,
bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend");
COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "statetime", CTLFLAG_RD,
bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states");
COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "outsize", CTLFLAG_RD,
bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls");
SYSCTL_ADD_PROC(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_sysctl_root),
OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters");
}
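/*
 * A usage sketch (hypothetical OID paths, assuming the root node built
 * above is exported as net.inet.tcp.bbr): once the module is loaded,
 * the CTLFLAG_RW knobs registered here can be tuned at runtime, e.g.
 *
 *   sysctl net.inet.tcp.bbr.bb_verbose=1
 *   sysctl net.inet.tcp.bbr.policer.detect_enable=1
 *
 * while the CTLFLAG_RD counters (stats, opts, lost, ...) are read-only.
 */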
static void
bbr_counter_destroy(void)
{
COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE);
COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE);
COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE);
COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT);
COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT);
COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT);
counter_u64_free(bbr_nohdwr_pacing_enobuf);
counter_u64_free(bbr_hdwr_pacing_enobuf);
counter_u64_free(bbr_flows_whdwr_pacing);
counter_u64_free(bbr_flows_nohdwr_pacing);
}
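/*
 * Fill in the stack-specific portion of a black box log record with a
 * snapshot of the connection's current state: b/w estimates, flight
 * size, epochs, gains and pacing state, timestamped at cts.
 */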
static __inline void
bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts)
{
memset(l, 0, sizeof(union tcp_log_stackspecific));
l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate;
l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate);
l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
l->bw_inuse = bbr_get_bw(bbr);
l->inflight = ctf_flight_size(bbr->rc_tp,
(bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
l->applimited = bbr->r_ctl.r_app_limited_until;
l->delivered = bbr->r_ctl.rc_delivered;
l->timeStamp = cts;
l->lost = bbr->r_ctl.rc_lost;
l->bbr_state = bbr->rc_bbr_state;
l->bbr_substate = bbr_state_val(bbr);
l->epoch = bbr->r_ctl.rc_rtt_epoch;
l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
l->inhpts = bbr->rc_inp->inp_in_hpts;
l->ininput = bbr->rc_inp->inp_in_input;
l->use_lt_bw = bbr->rc_lt_use_bw;
l->pkts_out = bbr->r_ctl.rc_flight_at_input;
l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
}
static void
bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = 0;
log.u_bbr.flex2 = 0;
log.u_bbr.flex5 = 0;
log.u_bbr.flex3 = 0;
log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate;
log.u_bbr.flex7 = reason;
log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt;
log.u_bbr.flex8 = 0;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_BW_RED_EV, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = seq;
log.u_bbr.flex2 = count;
log.u_bbr.flex8 = mode;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_LOWGAIN, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling,
uint8_t reason, uint32_t p_maxseg, int len)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = p_maxseg;
log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags;
log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
log.u_bbr.flex4 = reason;
log.u_bbr.flex5 = bbr->rc_in_persist;
log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val;
log.u_bbr.flex7 = p_maxseg;
log.u_bbr.flex8 = bbr->rc_in_persist;
log.u_bbr.pkts_out = 0;
log.u_bbr.applimited = len;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_JUSTRET, 0,
tlen, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = seq;
log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_ENTREC, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
{
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = tso;
log.u_bbr.flex2 = maxseg;
log.u_bbr.flex3 = mtu;
log.u_bbr.flex4 = csum_flags;
TCP_LOG_EVENTP(tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_MSGSIZE, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_flowend(struct tcp_bbr *bbr)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
struct sockbuf *r, *s;
struct timeval tv;
if (bbr->rc_inp->inp_socket) {
r = &bbr->rc_inp->inp_socket->so_rcv;
s = &bbr->rc_inp->inp_socket->so_snd;
} else {
r = s = NULL;
}
bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv));
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
r, s,
TCP_LOG_FLOWEND, 0,
0, &log, false, &tv);
}
}
static void
bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line,
uint32_t lost, uint32_t del)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = lost;
log.u_bbr.flex2 = del;
log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw;
log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt;
log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
log.u_bbr.flex7 = line;
log.u_bbr.flex8 = 0;
log.u_bbr.inflight = bbr->r_ctl.r_measurement_count;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_PKT_EPOCH, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->r_ctl.rc_lost;
log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat;
log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat;
log.u_bbr.flex7 = line;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_TIME_EPOCH, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex2 = new_tar;
log.u_bbr.flex3 = line;
log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
log.u_bbr.flex5 = bbr_quanta;
log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs;
log.u_bbr.flex7 = bbr->rc_last_options;
log.u_bbr.flex8 = meth;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_STATE_TARGET, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
if (bbr_state_is_pkt_epoch)
log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
else
log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP);
log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000);
log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra;
log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_STATE, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt;
log.u_bbr.flex4 = applied;
log.u_bbr.flex5 = rtt;
log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex7 = cond;
log.u_bbr.flex8 = reas;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_RTT_SHRINKS, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_exit_rec(struct tcp_bbr *bbr)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start;
log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_EXITREC, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg,
uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = prev_acked;
log.u_bbr.flex3 = bytes_this_ack;
log.u_bbr.flex4 = chg;
log.u_bbr.flex5 = th_ack;
log.u_bbr.flex6 = target;
log.u_bbr.flex8 = meth;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_CWND, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin)
{
/*
* Log the rtt sample we are applying to the srtt algorithm in
* useconds.
*/
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = rtt;
log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time;
log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay;
log.u_bbr.flex4 = bbr->rc_tp->ts_offset;
log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv);
log.u_bbr.flex6 = tsin;
log.u_bbr.flex7 = 0;
log.u_bbr.flex8 = bbr->rc_ack_was_delayed;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
TCP_LOG_RTT, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = time_in;
log.u_bbr.flex2 = line;
log.u_bbr.flex8 = enter_exit;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_PERSIST, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age;
log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time;
log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_ACKCLEAR, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen,
uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
struct timeval tv;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = nsegs;
log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes;
if (m) {
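			/*
			 * Pull the mbuf's timestamps into the record: a
			 * hardware timestamp (M_TSTMP) goes in lt_epoch and
			 * an LRO arrival time (M_TSTMP_LRO) in flex5, both
			 * converted from nanoseconds to usecs; the current
			 * time lands in pkts_out.
			 */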
struct timespec ts;
log.u_bbr.flex3 = m->m_flags;
if (m->m_flags & M_TSTMP) {
mbuf_tstmp2timespec(m, &ts);
tv.tv_sec = ts.tv_sec;
tv.tv_usec = ts.tv_nsec / 1000;
log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv);
} else {
log.u_bbr.lt_epoch = 0;
}
if (m->m_flags & M_TSTMP_LRO) {
tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
log.u_bbr.flex5 = tcp_tv_to_usectick(&tv);
} else {
/* No arrival timestamp */
log.u_bbr.flex5 = 0;
}
log.u_bbr.pkts_out = tcp_get_usecs(&tv);
} else {
log.u_bbr.flex3 = 0;
log.u_bbr.flex5 = 0;
log.u_bbr.flex6 = 0;
log.u_bbr.pkts_out = 0;
}
log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex7 = bbr->r_wanted_output;
log.u_bbr.flex8 = bbr->rc_in_persist;
TCP_LOG_EVENTP(bbr->rc_tp, th,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
TCP_LOG_IN, 0,
tlen, &log, true, &bbr->rc_tv);
}
}
static void
bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = did_out;
log.u_bbr.flex2 = nxt_pkt;
log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val;
log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp;
log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes;
log.u_bbr.flex7 = bbr->r_wanted_output;
log.u_bbr.flex8 = bbr->rc_in_persist;
log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_DOSEG_DONE, 0,
0, &log, true, &bbr->rc_tv);
}
}
static void
bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts,
int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = o_len;
log.u_bbr.flex3 = segcnt;
log.u_bbr.flex4 = segsiz;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_ENOBUF_JMP, ENOBUFS,
len, &log, true, &bbr->rc_tv);
}
}
static void
bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = timers;
log.u_bbr.flex2 = ret;
log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
log.u_bbr.flex5 = cts;
log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex8 = hpts_calling;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_TO_PROCESS, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
uint64_t ar;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->bbr_timer_src;
log.u_bbr.flex2 = 0;
log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
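		/*
		 * rc_resend is a pointer; split it into its high and low
		 * 32 bits so it fits the flex4/flex5 log fields.
		 */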
ar = (uint64_t)(bbr->r_ctl.rc_resend);
ar >>= 32;
ar &= 0x00000000ffffffff;
log.u_bbr.flex4 = (uint32_t)ar;
ar = (uint64_t)bbr->r_ctl.rc_resend;
ar &= 0x00000000ffffffff;
log.u_bbr.flex5 = (uint32_t)ar;
log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.flex8 = to_num;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_RTO, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = flex1;
log.u_bbr.flex2 = flex2;
log.u_bbr.flex3 = flex3;
log.u_bbr.flex4 = 0;
log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
log.u_bbr.flex8 = reason;
log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_REDUCE, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = diag->p_nxt_slot;
log.u_bbr.flex2 = diag->p_cur_slot;
log.u_bbr.flex3 = diag->slot_req;
log.u_bbr.flex4 = diag->inp_hptsslot;
log.u_bbr.flex5 = diag->slot_remaining;
log.u_bbr.flex6 = diag->need_new_to;
log.u_bbr.flex7 = diag->p_hpts_active;
log.u_bbr.flex8 = diag->p_on_min_sleep;
/* Hijack other fields as needed */
log.u_bbr.epoch = diag->have_slept;
log.u_bbr.lt_epoch = diag->yet_to_sleep;
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
log.u_bbr.inflight = diag->p_runningtick;
log.u_bbr.bw_inuse = diag->wheel_tick;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.delRate = diag->maxticks;
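		/* Pack p_curtick (high 32 bits) and p_lasttick (low 32 bits) into cur_del_rate. */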
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_HPTSDIAG, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
uint32_t thresh, uint32_t to)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->rc_tp->t_rttvar;
log.u_bbr.flex2 = time_since_sent;
log.u_bbr.flex3 = srtt;
log.u_bbr.flex4 = thresh;
log.u_bbr.flex5 = to;
log.u_bbr.flex6 = bbr->rc_tp->t_srtt;
log.u_bbr.flex8 = mode;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_TIMERPREP, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = usecs;
log.u_bbr.flex2 = len;
log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff);
log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff);
if (override)
log.u_bbr.flex5 = (1 << 2);
else
log.u_bbr.flex5 = 0;
log.u_bbr.flex6 = override;
log.u_bbr.flex7 = gain;
log.u_bbr.flex8 = mod;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_HPTSI_CALC, 0,
len, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->bbr_timer_src;
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = slot;
log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
log.u_bbr.flex8 = which;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_TIMERSTAR, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = thresh;
log.u_bbr.flex2 = lro;
log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts;
log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.flex6 = srtt;
log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift;
log.u_bbr.flex8 = frm;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_THRESH_CALC, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = bbr->bbr_timer_src;
log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = bbr->rc_in_persist;
log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.flex8 = hpts_removed;
log.u_bbr.pkts_out = bbr->rc_pacer_started;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_TIMERCANC, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio;
log.u_bbr.flex2 = (peer_delta >> 32);
log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff);
log.u_bbr.flex4 = (delta >> 32);
log.u_bbr.flex5 = (delta & 0x00000000ffffffff);
log.u_bbr.flex7 = bbr->rc_ts_clock_set;
log.u_bbr.flex8 = bbr->rc_ts_cant_be_used;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_TSTMP_VAL, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = tsosz;
log.u_bbr.flex2 = tls;
log.u_bbr.flex3 = tcp_min_hptsi_time;
log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min;
log.u_bbr.flex5 = old_val;
log.u_bbr.flex6 = maxseg;
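		/* Pack two flags into flex7: rc_no_pacing (bit 1) and rc_past_init_win (bit 0). */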
log.u_bbr.flex7 = bbr->rc_no_pacing;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= bbr->rc_past_init_win;
if (hdwr)
log.u_bbr.flex8 = 0x80 | bbr->rc_use_google;
else
log.u_bbr.flex8 = bbr->rc_use_google;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRTSO, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm,
uint32_t flags, uint32_t line)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = rsm->r_start;
log.u_bbr.flex3 = rsm->r_end;
log.u_bbr.flex4 = rsm->r_delivered;
log.u_bbr.flex5 = rsm->r_rtr_cnt;
log.u_bbr.flex6 = rsm->r_dupack;
log.u_bbr.flex7 = rsm->r_tim_lastsent[0];
log.u_bbr.flex8 = rsm->r_flags;
		/* Hijack the applimited field for the rsm flags */
log.u_bbr.applimited = flags;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_RSM_CLEARED, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts,
uint32_t flex3, uint32_t flex2, uint32_t flex5,
uint32_t flex6, uint32_t pkts_out, int flex7,
uint32_t flex4, uint32_t flex1)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = flex1;
log.u_bbr.flex2 = flex2;
log.u_bbr.flex3 = flex3;
log.u_bbr.flex4 = flex4;
log.u_bbr.flex5 = flex5;
log.u_bbr.flex6 = flex6;
log.u_bbr.flex7 = flex7;
		/* Hijack the pkts_out field */
log.u_bbr.pkts_out = pkts_out;
log.u_bbr.flex8 = flex8;
if (bbr->rc_ack_was_delayed)
log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay;
else
log.u_bbr.epoch = 0;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRUPD, 0,
flex2, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
uint32_t newbw, uint32_t obw, uint32_t diff,
uint32_t tim)
{
if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = reason;
log.u_bbr.flex2 = newbw;
log.u_bbr.flex3 = obw;
log.u_bbr.flex4 = diff;
log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost;
log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del;
log.u_bbr.flex7 = bbr->rc_lt_is_sampling;
log.u_bbr.pkts_out = tim;
log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw;
if (bbr->rc_lt_use_bw == 0)
log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
else
log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_BWSAMP, 0,
0, &log, false, &bbr->rc_tv);
}
}
static inline void
bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = tick;
log.u_bbr.flex3 = tp->t_maxunacktime;
log.u_bbr.flex4 = tp->t_acktime;
log.u_bbr.flex8 = event;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_PROGRESS, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
uint64_t rate, uint64_t hw_rate, int line, uint32_t cts,
int error)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff);
log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
log.u_bbr.bw_inuse = rate;
log.u_bbr.flex5 = line;
log.u_bbr.flex6 = error;
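		/* Pack three flags into flex8: skip_gain, gain_is_limited, bbr_hdrw_pacing. */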
log.u_bbr.flex8 = bbr->skip_gain;
log.u_bbr.flex8 <<= 1;
log.u_bbr.flex8 |= bbr->gain_is_limited;
log.u_bbr.flex8 <<= 1;
log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing;
log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_HDWR_PACE, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = slot;
log.u_bbr.flex2 = del_by;
log.u_bbr.flex3 = prev_delay;
log.u_bbr.flex4 = line;
log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val;
log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay;
log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags);
log.u_bbr.flex8 = bbr->rc_in_persist;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRSND, 0,
len, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->r_ctl.rc_delivered;
log.u_bbr.flex2 = 0;
log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt;
log.u_bbr.flex4 = end;
log.u_bbr.flex5 = seq;
log.u_bbr.flex6 = t;
log.u_bbr.flex7 = match;
log.u_bbr.flex8 = flags;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRRTT, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method)
{
if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
log.u_bbr.flex3 = bbr->r_ctl.gain_epoch;
log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs;
log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight;
log.u_bbr.flex7 = 0;
log.u_bbr.flex8 = entry_method;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_EXIT_GAIN, 0,
0, &log, false, &bbr->rc_tv);
}
}
static void
bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired)
{
if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
/* R-HU */
log.u_bbr.flex1 = 0;
log.u_bbr.flex2 = 0;
log.u_bbr.flex3 = 0;
log.u_bbr.flex4 = 0;
log.u_bbr.flex7 = 0;
log.u_bbr.flex8 = settings_desired;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
BBR_LOG_SETTINGS_CHG, 0,
0, &log, false, &bbr->rc_tv);
}
}
/*
 * Returns the b/w from our delivery-rate filter.
*/
static inline uint64_t
bbr_get_full_bw(struct tcp_bbr *bbr)
{
uint64_t bw;
bw = get_filter_value(&bbr->r_ctl.rc_delrate);
return (bw);
}
static inline void
bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
{
uint64_t calclr;
uint32_t lost, del;
if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch)
lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch;
else
lost = 0;
del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del;
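	/*
	 * The loss rate is kept in units of 0.1% (per-mille):
	 * calclr = lost * 1000 / delivered, e.g. 196 == 19.6%.
	 */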
if (lost == 0) {
calclr = 0;
} else if (del) {
calclr = lost;
calclr *= (uint64_t)1000;
calclr /= (uint64_t)del;
} else {
/* Nothing delivered? 100.0% loss */
calclr = 1000;
}
bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr;
if (IN_RECOVERY(bbr->rc_tp->t_flags))
bbr->r_ctl.recovery_lr += (uint32_t)calclr;
bbr->r_ctl.rc_pkt_epoch++;
if (bbr->rc_no_pacing &&
(bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) {
bbr->rc_no_pacing = 0;
tcp_bbr_tso_size_check(bbr, cts);
}
bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time);
bbr->r_ctl.rc_pkt_epoch_time = cts;
/* What was our loss rate */
bbr_log_pkt_epoch(bbr, cts, line, lost, del);
bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered;
bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost;
}
static inline void
bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
{
uint32_t epoch_time;
/* Tick the RTT clock */
bbr->r_ctl.rc_rtt_epoch++;
epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start;
bbr_log_time_epoch(bbr, cts, line, epoch_time);
bbr->r_ctl.rc_rcv_epoch_start = cts;
}
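/*
 * Flag a new packet epoch once we ack a send that was transmitted at
 * or after the delivered-count mark recorded for the current epoch.
 */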
static inline void
bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked)
{
if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) {
bbr->rc_is_pkt_epoch_now = 1;
}
}
/*
* Returns the bw from either the b/w filter
* or from the lt_bw (if the connection is being
* policed).
*/
static inline uint64_t
__bbr_get_bw(struct tcp_bbr *bbr)
{
uint64_t bw, min_bw;
uint64_t rtt;
int gm_measure_cnt = 1;
/*
 * For startup we compute, like google, a
 * minimum b/w. It is generated from the
 * IW and the rttProp. We fall back to srtt
 * if for some reason (e.g. the initial handshake)
 * we don't have a rttProp, and in the worst case
 * we fall back to the configured min_bw (rc_initial_hptsi_bw).
*/
if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
/* Attempt first to use rttProp */
rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
if (rtt && (rtt < 0xffffffff)) {
measure:
min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
((uint64_t)1000000);
min_bw /= rtt;
if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) {
min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
}
} else if (bbr->rc_tp->t_srtt != 0) {
/* No rttProp, use srtt? */
rtt = bbr_get_rtt(bbr, BBR_SRTT);
goto measure;
} else {
min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
}
} else
min_bw = 0;
if ((bbr->rc_past_init_win == 0) &&
(bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp)))
bbr->rc_past_init_win = 1;
if ((bbr->rc_use_google) && (bbr->r_ctl.r_measurement_count >= 1))
gm_measure_cnt = 0;
if (gm_measure_cnt &&
((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) ||
(bbr->rc_past_init_win == 0))) {
/* For google we use our guess rate until we get 1 measurement */
use_initial_window:
rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
if (rtt && (rtt < 0xffffffff)) {
/*
 * We have an RTT measurement. Use that in
* combination with our initial window to calculate
* a b/w.
*/
bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
((uint64_t)1000000);
bw /= rtt;
if (bw < bbr->r_ctl.rc_initial_hptsi_bw) {
bw = bbr->r_ctl.rc_initial_hptsi_bw;
}
} else {
			/* No usable rttProp; punt to the configured default b/w */
bw = bbr->r_ctl.rc_initial_hptsi_bw;
}
if (bw < 1)
/* Probably should panic */
bw = 1;
if (bw > min_bw)
return (bw);
else
return (min_bw);
}
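	/*
	 * Pick the b/w source: the long-term (policer) b/w when in use,
	 * else the reduced recovery b/w (non-google mode), else the
	 * delivery-rate filter.
	 */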
if (bbr->rc_lt_use_bw)
bw = bbr->r_ctl.rc_lt_bw;
else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0))
bw = bbr->r_ctl.red_bw;
else
bw = get_filter_value(&bbr->r_ctl.rc_delrate);
	if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) {