/*-
* Copyright (c) 2016-2020 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
#define TCPSTATES /* for logging */
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netipsec/ipsec_support.h>
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"
uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;
#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif
VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;
#define CUM_ACKED 1
#define SACKED 2
/*
* The RACK module incorporates a number of
* TCP ideas that have been put out into the IETF
* over the last few years:
* - Matt Mathis's Rate Halving which slowly drops
* the congestion window so that the ack clock can
* be maintained during a recovery.
* - Yuchung Cheng's RACK TCP (for which it is named) that
* will stop us using the number of dup acks and instead
* use time as the gauge of when we retransmit.
* - Reorder Detection of RFC4737 and the Tail-Loss probe draft
* of Dukkipati et al.
* RACK depends on SACK, so if an endpoint arrives that
* cannot do SACK the state machine below will shuttle the
* connection back to using the "default" TCP stack that is
* in FreeBSD.
*
* To implement RACK the original TCP stack was first decomposed
* into a functional state machine with individual states
* for each of the possible TCP connection states. The do_segment
* function's role in life is to mandate that the connection supports SACK
* initially and then assure that the RACK state matches the connection
* state before calling the state's do_segment function. Each
* state is simplified due to the fact that the original do_segment
* has been decomposed and we *know* what state we are in (no
* switches on the state) and all tests for SACK are gone. This
* greatly simplifies what each state does.
*
* TCP output is also over-written with a new version since it
* must maintain the new rack scoreboard.
*
*/
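/*
 * A minimal userland sketch (all names hypothetical, not part of this
 * file's build) of the dispatch idea described above: one handler per
 * TCP connection state, selected through a table, so no handler needs
 * a switch on the state.
 */
#if 0	/* illustrative only, not compiled */
#include <stdio.h>

typedef int (*state_handler_t)(int seglen);

static int
ex_established(int seglen)
{
	printf("ESTABLISHED: %d bytes\n", seglen);
	return (0);
}

static int
ex_fin_wait_1(int seglen)
{
	printf("FIN_WAIT_1: %d bytes\n", seglen);
	return (0);
}

/* Indexed by a hypothetical connection-state enum (0, 1, ...). */
static state_handler_t ex_handlers[] = { ex_established, ex_fin_wait_1 };

static int
ex_do_segment(int state, int seglen)
{
	/* Dispatch straight to the per-state handler; no switch needed. */
	return (ex_handlers[state](seglen));
}
#endif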
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
* - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS worth of time_between gaps */
static int32_t rack_hw_rate_caps = 1; /* 1; */
static int32_t rack_hw_rate_min = 0; /* 1500000;*/
static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000; /* 250usec */
static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */
/*
* Regular TCP currently has a rto_min of 30ms;
* the backoff is applied 12 times, which ends up
* being a total of 122.850 seconds before a
* connection is killed.
*/
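/*
 * Worked out (assuming a pure doubling backoff from the 30ms base):
 *   30ms * (2^0 + 2^1 + ... + 2^11) = 30ms * (2^12 - 1) = 122.850 seconds.
 */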
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000; /* 10ms */
static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8; /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;
static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */
/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130; /* Clamp to 130% on exit prtt if non highly buffered path */
static uint32_t rack_max_drain_wait = 2; /* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decided a hbp */
/* Part of pacing */
static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */
/* Timely information */
/* Combine these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */
static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */
static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20; /* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2; /* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */
static int32_t rack_timely_max_push_drop = 3; /* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;
counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;
counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;
counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
counter_u64_t rack_sbsndptr_right;
counter_u64_t rack_sbsndptr_wrong;
/* Temp CPU counters */
counter_u64_t rack_find_high;
counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \
(tv) = (value) + slop; \
if ((u_long)(tv) < (u_long)(tvmin)) \
(tv) = (tvmin); \
if ((u_long)(tv) > (u_long)(tvmax)) \
(tv) = (tvmax); \
} while (0)
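/*
 * A minimal sketch of how RACK_TCPT_RANGESET behaves (values hypothetical):
 * slop is added to value first, then the result is pinned to [tvmin, tvmax].
 */
#if 0	/* illustrative only, not compiled */
	uint32_t tv;

	RACK_TCPT_RANGESET(tv, 500, 1000, 4000, 100);	/* 500 + 100 = 600, raised to 1000 */
	RACK_TCPT_RANGESET(tv, 5000, 1000, 4000, 100);	/* 5000 + 100 = 5100, capped at 4000 */
#endif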
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp,
uint32_t type, uint32_t ack);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
uint32_t flex1, uint32_t flex2,
uint32_t flex3, uint32_t flex4,
uint32_t flex5, uint32_t flex6,
uint16_t flex7, uint8_t mod);
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tcp_seq th_ack, int line, uint8_t quality);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);
int32_t rack_clear_counter = 0;
static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
struct sockopt sopt;
struct cc_newreno_opts opt;
struct newreno old, *ptr;
struct tcpcb *tp;
int error;
if (rack->rc_pacing_cc_set)
return;
tp = rack->rc_tp;
if (tp->cc_algo == NULL) {
/* Tcb is leaving */
printf("No cc algorithm?\n");
return;
}
rack->rc_pacing_cc_set = 1;
if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
/* Not new-reno we can't play games with beta! */
goto out;
}
ptr = ((struct newreno *)tp->ccv->cc_data);
if (CC_ALGO(tp)->ctl_output == NULL) {
/* Huh, why does new_reno no longer have a set function? */
printf("no ctl_output for algo:%s\n", tp->cc_algo->name);
goto out;
}
if (ptr == NULL) {
/* Just the default values */
old.beta = V_newreno_beta;
old.beta_ecn = V_newreno_beta_ecn;
old.newreno_flags = 0;
} else {
old.beta = ptr->beta;
old.beta_ecn = ptr->beta_ecn;
old.newreno_flags = ptr->newreno_flags;
}
sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
sopt.sopt_dir = SOPT_SET;
opt.name = CC_NEWRENO_BETA;
opt.val = rack->r_ctl.rc_saved_beta.beta;
error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
if (error) {
printf("Error returned by ctl_output %d\n", error);
goto out;
}
/*
 * Hack alert: we need to set our newreno_flags
 * so that ABE behavior is also applied.
 */
((struct newreno *)tp->ccv->cc_data)->newreno_flags = CC_NEWRENO_BETA_ECN;
opt.name = CC_NEWRENO_BETA_ECN;
opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
if (error) {
printf("Error returned by ctl_output %d\n", error);
goto out;
}
/* Save off the original values for restoral */
memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
struct timeval tv;
ptr = ((struct newreno *)tp->ccv->cc_data);
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
if (ptr) {
log.u_bbr.flex1 = ptr->beta;
log.u_bbr.flex2 = ptr->beta_ecn;
log.u_bbr.flex3 = ptr->newreno_flags;
}
log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
log.u_bbr.flex7 = rack->gp_ready;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->use_fixed_rate;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex8 = 3;
tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
0, &log, false, NULL, NULL, 0, &tv);
}
}
static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
struct newreno old, *ptr;
struct tcpcb *tp;
if (rack->rc_pacing_cc_set == 0)
return;
tp = rack->rc_tp;
rack->rc_pacing_cc_set = 0;
if (tp->cc_algo == NULL)
/* Tcb is leaving */
return;
if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
/* Not new-reno nothing to do! */
return;
}
ptr = ((struct newreno *)tp->ccv->cc_data);
if (ptr == NULL) {
/*
* This happens at rack_fini() if the
* cc module gets freed on us. In that
* case we lose our "new" settings but
* that's ok, since the tcb is going away anyway.
*/
return;
}
/* Grab out our set values */
memcpy(&old, ptr, sizeof(struct newreno));
/* Copy back in the original values */
memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
/* Now save back the values we had set in (for when pacing is restored) */
memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
struct timeval tv;
ptr = ((struct newreno *)tp->ccv->cc_data);
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = ptr->beta;
log.u_bbr.flex2 = ptr->beta_ecn;
log.u_bbr.flex3 = ptr->newreno_flags;
log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
log.u_bbr.flex7 = rack->gp_ready;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->use_fixed_rate;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex8 = 4;
tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
0, &log, false, NULL, NULL, 0, &tv);
}
}
#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
/* Keep in mind that t_maxpeakrate is in B/s. */
uint64_t peak;
peak = uqmax((tp->t_maxseg * 2),
(((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
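/*
 * Worked example (assuming t_srtt is carried in microseconds here, which
 * the division by HPTS_USEC_IN_SEC implies): t_maxpeakrate = 1,250,000 B/s
 * (10 Mb/s) and t_srtt = 100,000 usec gives
 * 1,250,000 * 100,000 / 1,000,000 = 125,000 bytes per srtt.
 */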
}
#endif
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
uint32_t stat;
int32_t error;
int i;
error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
if (error || req->newptr == NULL)
return error;
error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
if (error)
return (error);
if (stat == 1) {
#ifdef INVARIANTS
printf("Clearing RACK counters\n");
#endif
counter_u64_zero(rack_badfr);
counter_u64_zero(rack_badfr_bytes);
counter_u64_zero(rack_rtm_prr_retran);
counter_u64_zero(rack_rtm_prr_newdata);
counter_u64_zero(rack_timestamp_mismatch);
counter_u64_zero(rack_reorder_seen);
counter_u64_zero(rack_tlp_tot);
counter_u64_zero(rack_tlp_newdata);
counter_u64_zero(rack_tlp_retran);
counter_u64_zero(rack_tlp_retran_bytes);
counter_u64_zero(rack_tlp_retran_fail);
counter_u64_zero(rack_to_tot);
counter_u64_zero(rack_to_arm_rack);
counter_u64_zero(rack_to_arm_tlp);
counter_u64_zero(rack_paced_segments);
counter_u64_zero(rack_calc_zero);
counter_u64_zero(rack_calc_nonzero);
counter_u64_zero(rack_unpaced_segments);
counter_u64_zero(rack_saw_enobuf);
counter_u64_zero(rack_saw_enobuf_hw);
counter_u64_zero(rack_saw_enetunreach);
counter_u64_zero(rack_per_timer_hole);
counter_u64_zero(rack_large_ackcmp);
counter_u64_zero(rack_small_ackcmp);
#ifdef INVARIANTS
counter_u64_zero(rack_adjust_map_bw);
#endif
counter_u64_zero(rack_to_alloc_hard);
counter_u64_zero(rack_to_alloc_emerg);
counter_u64_zero(rack_sack_proc_all);
counter_u64_zero(rack_fto_send);
counter_u64_zero(rack_fto_rsm_send);
counter_u64_zero(rack_extended_rfo);
counter_u64_zero(rack_hw_pace_init_fail);
counter_u64_zero(rack_hw_pace_lost);
counter_u64_zero(rack_sbsndptr_wrong);
counter_u64_zero(rack_sbsndptr_right);
counter_u64_zero(rack_non_fto_send);
counter_u64_zero(rack_nfto_resend);
counter_u64_zero(rack_sack_proc_short);
counter_u64_zero(rack_sack_proc_restart);
counter_u64_zero(rack_to_alloc);
counter_u64_zero(rack_to_alloc_limited);
counter_u64_zero(rack_alloc_limited_conns);
counter_u64_zero(rack_split_limited);
for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
counter_u64_zero(rack_proc_comp_ack[i]);
}
counter_u64_zero(rack_multi_single_eq);
counter_u64_zero(rack_proc_non_comp_ack);
counter_u64_zero(rack_find_high);
counter_u64_zero(rack_sack_attacks_detected);
counter_u64_zero(rack_sack_attacks_reversed);
counter_u64_zero(rack_sack_used_next_merge);
counter_u64_zero(rack_sack_used_prev_merge);
counter_u64_zero(rack_sack_splits);
counter_u64_zero(rack_sack_skipped_acked);
counter_u64_zero(rack_ack_total);
counter_u64_zero(rack_express_sack);
counter_u64_zero(rack_sack_total);
counter_u64_zero(rack_move_none);
counter_u64_zero(rack_move_some);
counter_u64_zero(rack_used_tlpmethod);
counter_u64_zero(rack_used_tlpmethod2);
counter_u64_zero(rack_enter_tlp_calc);
counter_u64_zero(rack_progress_drops);
counter_u64_zero(rack_tlp_does_nada);
counter_u64_zero(rack_try_scwnd);
counter_u64_zero(rack_collapsed_win);
}
rack_clear_counter = 0;
return (0);
}
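/*
 * Example use (assuming this handler is attached as a leaf named "clear"
 * under the rack sysctl tree; the registration is outside this excerpt):
 *
 *     sysctl net.inet.tcp.<stack-name>.clear=1
 *
 * Writing 1 zeroes every counter above; a read returns rack_clear_counter.
 */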
static void
rack_init_sysctls(void)
{
int i;
struct sysctl_oid *rack_counters;
struct sysctl_oid *rack_attack;
struct sysctl_oid *rack_pacing;
struct sysctl_oid *rack_timely;
struct sysctl_oid *rack_timers;
struct sysctl_oid *rack_tlp;
struct sysctl_oid *rack_misc;
struct sysctl_oid *rack_measure;
struct sysctl_oid *rack_probertt;
struct sysctl_oid *rack_hw_pacing;
rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"sack_attack",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Rack Sack Attack Counters and Controls");
rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"stats",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Rack Counters");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "rate_sample_method", CTLFLAG_RW,
&rack_rate_sample_method, USE_RTT_LOW,
"What method should we use for rate sampling 0=high, 1=low");
/* Probe rtt related controls */
rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"probertt",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"ProbeRTT related Controls");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
&rack_atexit_prtt_hbp, 130,
"What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
&rack_atexit_prtt, 130,
"What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "gp_per_mul", CTLFLAG_RW,
&rack_per_of_gp_probertt, 60,
"What percentage of goodput do we pace at in probertt");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
&rack_per_of_gp_probertt_reduce, 10,
"What percentage of goodput do we reduce every gp_srtt");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "gp_per_low", CTLFLAG_RW,
&rack_per_of_gp_lowthresh, 40,
"What percentage of goodput do we allow the multiplier to fall to");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "time_between", CTLFLAG_RW,
&rack_time_between_probertt, 9600000,
"How many useconds must pass after the lowest rtt falls before we enter probertt");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "safety", CTLFLAG_RW,
&rack_probe_rtt_safety_val, 2000000,
"If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "sets_cwnd", CTLFLAG_RW,
&rack_probe_rtt_sets_cwnd, 0,
"Do we set the cwnd too (if always_lower is on)");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
&rack_max_drain_wait, 2,
"Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
&rack_must_drain, 1,
"We must drain this many gp_srtt's waiting for flight to reach goal");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
&rack_probertt_use_min_rtt_entry, 1,
"Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
&rack_probertt_use_min_rtt_exit, 0,
"How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "length_div", CTLFLAG_RW,
&rack_probertt_gpsrtt_cnt_div, 0,
"How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "length_mul", CTLFLAG_RW,
&rack_probertt_gpsrtt_cnt_mul, 0,
"How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
&rack_min_probertt_hold, 40000,
"What is the minimum time we hold probertt at target");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "filter_life", CTLFLAG_RW,
&rack_probertt_filter_life, 10000000,
"What is the time for the filters life in useconds");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "lower_within", CTLFLAG_RW,
&rack_probertt_lower_within, 10,
"If the rtt goes lower within this percentage of the time, go into probe-rtt");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "must_move", CTLFLAG_RW,
&rack_min_rtt_movement, 250000,
"Minimum movement in rtt (in microseconds) to count as a drop for probertt purposes");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
&rack_probertt_clear_is, 1,
"Do we clear I/S counts on exiting probe-rtt");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
&rack_max_drain_hbp, 1,
"How many extra drain gpsrtt's do we get in highly buffered paths");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "hbp_threshold", CTLFLAG_RW,
&rack_hbp_thresh, 3,
"We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
/* Pacing related sysctls */
rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"pacing",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Pacing related Controls");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "max_pace_over", CTLFLAG_RW,
&rack_max_per_above, 30,
"What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "pace_to_one", CTLFLAG_RW,
&rack_pace_one_seg, 0,
"Do we allow low b/w pacing of 1MSS instead of two");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
&rack_limit_time_with_srtt, 0,
"Do we limit pacing time based on srtt");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "init_win", CTLFLAG_RW,
&rack_default_init_window, 0,
"Do we have a rack initial window 0 = system default");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "gp_per_ss", CTLFLAG_RW,
&rack_per_of_gp_ss, 250,
"If non zero, what percentage of goodput to pace at in slow start");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "gp_per_ca", CTLFLAG_RW,
&rack_per_of_gp_ca, 200,
"If non zero, what percentage of goodput to pace at in congestion avoidance");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "gp_per_rec", CTLFLAG_RW,
&rack_per_of_gp_rec, 200,
"If non zero, what percentage of goodput to pace at in recovery");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "pace_max_seg", CTLFLAG_RW,
&rack_hptsi_segments, 40,
"What size is the max for TSO segments in pacing and burst mitigation");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "burst_reduces", CTLFLAG_RW,
&rack_slot_reduction, 4,
"When doing only burst mitigation what is the reduce divisor");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "use_pacing", CTLFLAG_RW,
&rack_pace_every_seg, 0,
"If set we use pacing, if clear we use only the original burst mitigation");
SYSCTL_ADD_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "rate_cap", CTLFLAG_RW,
&rack_bw_rate_cap, 0,
"If set we apply this value to the absolute rate cap used by pacing");
SYSCTL_ADD_U8(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
&rack_req_measurements, 1,
"If doing dynamic pacing, how many measurements must be in before we start pacing?");
/* Hardware pacing */
rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"hdwr_pacing",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Pacing related Controls");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "rwnd_factor", CTLFLAG_RW,
&rack_hw_rwnd_factor, 2,
"How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
&rack_enobuf_hw_boost_mult, 2,
"By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
&rack_enobuf_hw_max, 12000,
"What is the max boost of the pacing time if we see an ENOBUFS?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
&rack_enobuf_hw_min, 10000,
"What is the min boost of the pacing time if we see an ENOBUFS?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "enable", CTLFLAG_RW,
&rack_enable_hw_pacing, 0,
"Should RACK attempt to use hw pacing?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "rate_cap", CTLFLAG_RW,
&rack_hw_rate_caps, 1,
"Does the highest hardware pacing rate cap the rate we will send at??");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "rate_min", CTLFLAG_RW,
&rack_hw_rate_min, 0,
"Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "rate_to_low", CTLFLAG_RW,
&rack_hw_rate_to_low, 0,
"If we fall below this rate, dis-engage hw pacing?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "up_only", CTLFLAG_RW,
&rack_hw_up_only, 1,
"Do we allow hw pacing to lower the rate selected?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
&rack_hw_pace_extra_slots, 2,
"If the rates between software and hardware match precisely how many extra time_betweens do we get?");
rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"timely",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Rack Timely RTT Controls");
/* Timely based GP dynamics */
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "upper", CTLFLAG_RW,
&rack_gp_per_bw_mul_up, 2,
"Rack timely upper range for equal b/w (in percentage)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "lower", CTLFLAG_RW,
&rack_gp_per_bw_mul_down, 4,
"Rack timely lower range for equal b/w (in percentage)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
&rack_gp_rtt_maxmul, 3,
"Rack timely multipler of lowest rtt for rtt_max");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "rtt_min_div", CTLFLAG_RW,
&rack_gp_rtt_mindiv, 4,
"Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
&rack_gp_rtt_minmul, 1,
"Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "decrease", CTLFLAG_RW,
&rack_gp_decrease_per, 20,
"Rack timely decrease percentage of our GP multiplication factor");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "increase", CTLFLAG_RW,
&rack_gp_increase_per, 2,
"Rack timely increase perentage of our GP multiplication factor");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "lowerbound", CTLFLAG_RW,
&rack_per_lower_bound, 50,
"Rack timely lowest percentage we allow GP multiplier to fall to");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "upperboundss", CTLFLAG_RW,
&rack_per_upper_bound_ss, 0,
"Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "upperboundca", CTLFLAG_RW,
&rack_per_upper_bound_ca, 0,
"Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "dynamicgp", CTLFLAG_RW,
&rack_do_dyn_mul, 0,
"Rack timely do we enable dynmaic timely goodput by default");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "no_rec_red", CTLFLAG_RW,
&rack_gp_no_rec_chg, 1,
"Rack timely do we prohibit the recovery multiplier from being lowered");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
&rack_timely_dec_clear, 6,
"Rack timely what threshold do we count to before another boost during b/w decent");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "max_push_rise", CTLFLAG_RW,
&rack_timely_max_push_rise, 3,
"Rack timely how many times do we push up with b/w increase");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "max_push_drop", CTLFLAG_RW,
&rack_timely_max_push_drop, 3,
"Rack timely how many times do we push back on b/w decent");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "min_segs", CTLFLAG_RW,
&rack_timely_min_segs, 4,
"Rack timely when setting the cwnd what is the min num segments");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "noback_max", CTLFLAG_RW,
&rack_use_max_for_nobackoff, 0,
"Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "interim_timely_only", CTLFLAG_RW,
&rack_timely_int_timely_only, 0,
"Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "nonstop", CTLFLAG_RW,
&rack_timely_no_stopping, 0,
"Rack timely don't stop increase");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
&rack_down_raise_thresh, 100,
"If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timely),
OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
&rack_req_segs, 1,
"Bottom dragging if not these many segments outstanding and room");
/* TLP and Rack related parameters */
rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"tlp",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"TLP and Rack related Controls");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "use_rrr", CTLFLAG_RW,
&use_rack_rr, 1,
"Do we use Rack Rapid Recovery");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "post_rec_labc", CTLFLAG_RW,
&rack_max_abc_post_recovery, 2,
"Since we do early recovery, do we override the l_abc to a value, if so what?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
&rack_non_rxt_use_cr, 0,
"Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlpmethod", CTLFLAG_RW,
&rack_tlp_threshold_use, TLP_USE_TWO_ONE,
"What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "limit", CTLFLAG_RW,
&rack_tlp_limit, 2,
"How many TLP's can be sent without sending new data");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "use_greater", CTLFLAG_RW,
&rack_tlp_use_greater, 1,
"Should we use the rack_rtt time if its greater than srtt");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlpminto", CTLFLAG_RW,
&rack_tlp_min, 10000,
"TLP minimum timeout per the specification (in microseconds)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "send_oldest", CTLFLAG_RW,
&rack_always_send_oldest, 0,
"Should we always send the oldest TLP and RACK-TLP");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "rack_tlimit", CTLFLAG_RW,
&rack_limited_retran, 0,
"How many times can a rack timeout drive out sends");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
&rack_lower_cwnd_at_tlp, 0,
"When a TLP completes a retran should we enter recovery");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "reorder_thresh", CTLFLAG_RW,
&rack_reorder_thresh, 2,
"What factor for rack will be added when seeing reordering (shift right)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
&rack_tlp_thresh, 1,
"What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "reorder_fade", CTLFLAG_RW,
&rack_reorder_fade, 60000000,
"Does reorder detection fade, if so how many microseconds (0 means never)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "pktdelay", CTLFLAG_RW,
&rack_pkt_delay, 1000,
"Extra RACK time (in microseconds) besides reordering thresh");
/* Timer related controls */
rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"timers",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Timer related controls");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmin", CTLFLAG_RW,
&rack_persist_min, 250000,
"What is the minimum time in microseconds between persists");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmax", CTLFLAG_RW,
&rack_persist_max, 2000000,
"What is the largest delay in microseconds between persists");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "delayed_ack", CTLFLAG_RW,
&rack_delayed_ack_time, 40000,
"Delayed ack time (40ms in microseconds)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "minrto", CTLFLAG_RW,
&rack_rto_min, 30000,
"Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "maxrto", CTLFLAG_RW,
&rack_rto_max, 4000000,
"Maxiumum RTO in microseconds -- should be at least as large as min_rto");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "minto", CTLFLAG_RW,
&rack_min_to, 1000,
"Minimum rack timeout in microseconds");
/* Measure controls */
rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"measure",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Measure related controls");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "wma_divisor", CTLFLAG_RW,
&rack_wma_divisor, 8,
"When doing b/w calculation what is the divisor for the WMA");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "end_cwnd", CTLFLAG_RW,
&rack_cwnd_block_ends_measure, 0,
"Does a cwnd just-return end the measurement window (app limited)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "end_rwnd", CTLFLAG_RW,
&rack_rwnd_block_ends_measure, 0,
"Does an rwnd just-return end the measurement window (app limited -- not persists)");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "min_target", CTLFLAG_RW,
&rack_def_data_window, 20,
"What is the minimum target window (in mss) for a GP measurements");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "goal_bdp", CTLFLAG_RW,
&rack_goal_bdp, 2,
"What is the goal BDP to measure");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "min_srtts", CTLFLAG_RW,
&rack_min_srtts, 1,
"What is the goal BDP to measure");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_measure),
OID_AUTO, "min_measure_tim", CTLFLAG_RW,
&rack_min_measure_usec, 0,
"What is the Minimum time time for a measurement if 0, this is off");
/* Misc rack controls */
rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
"misc",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Misc related controls");
#ifdef TCP_ACCOUNTING
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "tcp_acct", CTLFLAG_RW,
&rack_tcp_accounting, 0,
"Should we turn on TCP accounting for all rack sessions?");
#endif
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
&rack_dsack_std_based, 3,
"How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "prr_addback_max", CTLFLAG_RW,
&rack_prr_addbackmax, 2,
"What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
&rack_stats_gets_ms_rtt, 1,
"What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "clientlowbuf", CTLFLAG_RW,
&rack_client_low_buf, 0,
"Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "defprofile", CTLFLAG_RW,
&rack_def_profile, 0,
"Should RACK use a default profile (0=no, num == profile num)?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "cmpack", CTLFLAG_RW,
&rack_use_cmp_acks, 1,
"Should RACK have LRO send compressed acks");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "fsb", CTLFLAG_RW,
&rack_use_fsb, 1,
"Should RACK use the fast send block?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rfo", CTLFLAG_RW,
&rack_use_rfo, 1,
"Should RACK use rack_fast_output()?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rsmrfo", CTLFLAG_RW,
&rack_use_rsm_rfo, 1,
"Should RACK use rack_fast_rsm_output()?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "shared_cwnd", CTLFLAG_RW,
&rack_enable_shared_cwnd, 1,
"Should RACK try to use the shared cwnd on connections where allowed");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
&rack_limits_scwnd, 1,
"Should RACK place low end time limits on the shared cwnd feature");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
&rack_enable_mqueue_for_nonpaced, 0,
"Should RACK use mbuf queuing for non-paced connections");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "iMac_dack", CTLFLAG_RW,
&rack_use_imac_dack, 0,
"Should RACK try to emulate iMac delayed ack");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "no_prr", CTLFLAG_RW,
&rack_disable_prr, 0,
"Should RACK not use prr and only pace (must have pacing on)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "bb_verbose", CTLFLAG_RW,
&rack_verbose_logging, 0,
"Should RACK black box logging be verbose");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "data_after_close", CTLFLAG_RW,
&rack_ignore_data_after_close, 1,
"Do we hold off sending a RST until all pending data is ack'd");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "no_sack_needed", CTLFLAG_RW,
&rack_sack_not_required, 1,
"Do we allow rack to run on connections not supporting SACK");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "prr_sendalot", CTLFLAG_RW,
&rack_send_a_lot_in_prr, 1,
"Send a lot in prr");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "autoscale", CTLFLAG_RW,
&rack_autosndbuf_inc, 20,
"What percentage should rack scale up its snd buffer by?");
/* Sack Attacker detection stuff */
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
&rack_highest_sack_thresh_seen, 0,
"Highest sack to ack ratio seen");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
&rack_highest_move_thresh_seen, 0,
"Highest move to non-move ratio seen");
rack_ack_total = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
OID_AUTO, "acktotal", CTLFLAG_RD,
&rack_ack_total,
"Total number of Ack's");
rack_express_sack = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
&rack_express_sack,
"Total number of express sack's seen");