Diffstat (limited to 'sys/netinet')
66 files changed, 2126 insertions, 1113 deletions
diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c index a2e72130fa88..b3e15009244d 100644 --- a/sys/netinet/cc/cc_cubic.c +++ b/sys/netinet/cc/cc_cubic.c @@ -38,7 +38,7 @@ /* * An implementation of the CUBIC congestion control algorithm for FreeBSD, - * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha. + * based on the Internet RFC9438 by Xu, Ha, Rhee, Goel, and Eggert. * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the @@ -81,7 +81,7 @@ static void cubic_conn_init(struct cc_var *ccv); static int cubic_mod_init(void); static void cubic_post_recovery(struct cc_var *ccv); static void cubic_record_rtt(struct cc_var *ccv); -static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg); +static uint32_t cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg); static void cubic_after_idle(struct cc_var *ccv); static size_t cubic_data_sz(void); static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt); @@ -236,10 +236,11 @@ static void cubic_ack_received(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; - unsigned long W_est, W_cubic; + uint32_t W_est, W_cubic, cwin, target, incr; int usecs_since_epoch; uint32_t mss = tcp_fixed_maxseg(ccv->tp); + cwin = CCV(ccv, snd_cwnd); cubic_data = ccv->cc_data; cubic_record_rtt(ccv); @@ -250,7 +251,7 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { /* Use the logic in NewReno ack_received() for slow start. */ - if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || + if (cwin <= CCV(ccv, snd_ssthresh) || cubic_data->min_rtt_usecs == TCPTV_SRTTBASE) { cubic_does_slow_start(ccv, cubic_data); } else { @@ -265,20 +266,32 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED; cubic_log_hystart_event(ccv, cubic_data, 11, CCV(ccv, snd_ssthresh)); } - if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) && - (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) { - /* RFC8312 Section 4.7 */ - cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT | - CUBICFLAG_IN_SLOWSTART); - cubic_data->W_max = CCV(ccv, snd_cwnd); - cubic_data->t_epoch = ticks; - cubic_data->K = 0; - } else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | + if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | + CUBICFLAG_CONG_EVENT | CUBICFLAG_IN_APPLIMIT)) { + /* + * At the beginning of the current congestion + * avoidance stage, The epoch variables + * (t_epoch, cwnd_epoch, K) are updated in the + * following three cases: + * 1) just exited the slow start + * 2) after a congestion event + * 3) application-limited + */ + cubic_data->t_epoch = ticks; + cubic_data->cwnd_epoch = cwin; + cubic_data->K = cubic_k(cubic_data->W_max / mss, + cubic_data->cwnd_epoch / mss); cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART | + CUBICFLAG_CONG_EVENT | CUBICFLAG_IN_APPLIMIT); - cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / mss); + + if (cubic_data->flags & CUBICFLAG_RTO_EVENT) { + /* RFC9438 Section 4.8: Timeout */ + cubic_data->flags &= ~CUBICFLAG_RTO_EVENT; + cubic_data->W_max = cwin; + cubic_data->K = 0; + } } usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick; if (usecs_since_epoch < 0) { @@ -288,12 +301,9 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) usecs_since_epoch = INT_MAX; cubic_data->t_epoch 
= ticks - INT_MAX; } - W_est = tf_cwnd(ccv); - /* - * The mean RTT is used to best reflect the equations in - * the I-D. + * The mean RTT is used to best reflect the equations. */ W_cubic = cubic_cwnd(usecs_since_epoch + cubic_data->mean_rtt_usecs, @@ -302,33 +312,24 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) cubic_data->K); if (W_cubic < W_est) { - /* - * TCP-friendly region, follow tf - * cwnd growth. - */ - CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX); + /* RFC9438 Section 4.3: Reno-friendly region */ + CCV(ccv, snd_cwnd) = W_est; cubic_data->flags |= CUBICFLAG_IN_TF; - } else if (CCV(ccv, snd_cwnd) < W_cubic) { + } else { /* - * Concave or convex region, follow CUBIC - * cwnd growth. - * Only update snd_cwnd, if it doesn't shrink. + * RFC9438 Section 4.4 or 4.5: + * Concave or Convex Region */ - CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX); - cubic_data->flags &= ~CUBICFLAG_IN_TF; - } - - /* - * If we're not in slow start and we're probing for a - * new cwnd limit at the start of a connection - * (happens when hostcache has a relevant entry), - * keep updating our current estimate of the - * W_max. - */ - if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) && - cubic_data->W_max < CCV(ccv, snd_cwnd)) { - cubic_data->W_max = CCV(ccv, snd_cwnd); - cubic_data->K = cubic_k(cubic_data->W_max / mss); + if (W_cubic < cwin) { + target = cwin; + } else if (W_cubic > ((cwin * 3) >> 1)) { + target = (cwin * 3) >> 1; + } else { + target = W_cubic; + } + incr = (((target - cwin) << CUBIC_SHIFT) / + cwin * mss) >> CUBIC_SHIFT; + CCV(ccv, snd_cwnd) = cwin + incr; } } } else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && @@ -345,12 +346,11 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) static void cubic_after_idle(struct cc_var *ccv) { - struct cubic *cubic_data; - - cubic_data = ccv->cc_data; + struct cubic *cubic_data = ccv->cc_data; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd)); - cubic_data->K = cubic_k(cubic_data->W_max / tcp_fixed_maxseg(ccv->tp)); + cubic_data->K = cubic_k(cubic_data->W_max / mss, cubic_data->cwnd_epoch / mss); if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. @@ -389,7 +389,9 @@ cubic_cb_init(struct cc_var *ccv, void *ptr) cubic_data = ptr; /* Init some key variables with sensible defaults. */ - cubic_data->t_epoch = ticks; + cubic_data->t_epoch = 0; + cubic_data->cwnd_epoch = 0; + cubic_data->K = 0; cubic_data->min_rtt_usecs = TCPTV_SRTTBASE; cubic_data->mean_rtt_usecs = 1; @@ -416,7 +418,7 @@ static void cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; - uint32_t mss, pipe; + uint32_t mss, pipe, ssthresh; cubic_data = ccv->cc_data; mss = tcp_fixed_maxseg(ccv->tp); @@ -431,10 +433,13 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) } if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { - cubic_ssthresh_update(ccv, mss); + ssthresh = cubic_get_ssthresh(ccv, mss); + CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss); + /* + * The congestion flag will recalculate K at the + * beginning of the congestion avoidance stage. 
+ */ cubic_data->flags |= CUBICFLAG_CONG_EVENT; - cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / mss); } ENTER_RECOVERY(CCV(ccv, t_flags)); } @@ -448,17 +453,20 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) cubic_log_hystart_event(ccv, cubic_data, 9, CCV(ccv, snd_ssthresh)); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { - cubic_ssthresh_update(ccv, mss); + ssthresh = cubic_get_ssthresh(ccv, mss); + CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss); + CCV(ccv, snd_cwnd) = max(ssthresh, mss); + /* + * The congestion flag will recalculate K at the + * beginning of the congestion avoidance stage. + */ cubic_data->flags |= CUBICFLAG_CONG_EVENT; - cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / mss); - CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: - /* RFC8312 Section 4.7 */ + /* RFC9438 Section 4.8: Timeout */ if (CCV(ccv, t_rxtshift) == 1) { /* * Remember the state only for the first RTO event. This @@ -475,12 +483,16 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) (((uint64_t)min(CCV(ccv, snd_wnd), pipe) * CUBIC_BETA) >> CUBIC_SHIFT) / mss) * mss; } - cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT; + /* + * The RTO flag will recalculate K at the + * beginning of the congestion avoidance stage. + */ + cubic_data->flags |= CUBICFLAG_RTO_EVENT; CCV(ccv, snd_cwnd) = mss; break; case CC_RTO_ERR: - cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT); + cubic_data->flags &= ~CUBICFLAG_RTO_EVENT; cubic_data->K = cubic_data->undo_K; cubic_data->W_max = cubic_data->undo_W_max; cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch; @@ -503,7 +515,7 @@ cubic_conn_init(struct cc_var *ccv) * this here bad things happen when entries from the TCP hostcache * get used. */ - cubic_data->W_max = CCV(ccv, snd_cwnd); + cubic_data->W_max = UINT_MAX; } static int @@ -603,44 +615,36 @@ cubic_record_rtt(struct cc_var *ccv) } /* - * Update the ssthresh in the event of congestion. + * Return the new value for ssthresh in the event of a congestion. */ -static void -cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg) +static uint32_t +cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg) { struct cubic *cubic_data; - uint32_t ssthresh; - uint32_t cwnd; + uint32_t cwnd, pipe; cubic_data = ccv->cc_data; cwnd = CCV(ccv, snd_cwnd); - /* Fast convergence heuristic. */ + /* RFC9438 Section 4.7: Fast convergence */ if (cwnd < cubic_data->W_max) { cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT; } - cubic_data->undo_W_max = cubic_data->W_max; cubic_data->W_max = cwnd; if (cubic_data->flags & CUBICFLAG_IN_TF) { - /* If in the TCP friendly region, follow what newreno does */ - ssthresh = newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg); + /* If in the TCP friendly region, follow what newreno does. */ + return (newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg)); - } else if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) { - /* - * On the first congestion event, set ssthresh to cwnd * 0.5 - * and reduce W_max to cwnd * beta. This aligns the cubic - * concave region appropriately. - */ - ssthresh = cwnd >> 1; - cubic_data->W_max = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT; } else { /* - * On subsequent congestion events, set ssthresh to cwnd * beta. + * RFC9438 Section 4.6: Multiplicative Decrease + * Outside the TCP friendly region, set ssthresh to the size of + * inflight_size * beta. 
*/ - ssthresh = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT; + pipe = tcp_compute_pipe(ccv->tp); + return ((pipe * CUBIC_BETA) >> CUBIC_SHIFT); } - CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg); } static void diff --git a/sys/netinet/cc/cc_cubic.h b/sys/netinet/cc/cc_cubic.h index c30128570ab0..c31506d26b00 100644 --- a/sys/netinet/cc/cc_cubic.h +++ b/sys/netinet/cc/cc_cubic.h @@ -88,14 +88,23 @@ /* Kernel only bits */ #ifdef _KERNEL struct cubic { - /* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. */ + /* + * CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. + * Also means the time period in seconds it takes to increase the + * congestion window size at the beginning of the current congestion + * avoidance stage to W_max. + */ int64_t K; /* Sum of RTT samples across an epoch in usecs. */ int64_t sum_rtt_usecs; - /* Size of cwnd just before cwnd was reduced in the last congestion event */ - uint64_t W_max; - /* The cwnd at the beginning of the current congestion avoidance stage */ - uint64_t cwnd_epoch; + /* Size of cwnd (in bytes) just before cwnd was reduced in the last congestion event. */ + uint32_t W_max; + /* An estimate (in bytes) for the congestion window in the Reno-friendly region */ + uint32_t W_est; + /* An estimate (in bytes) for the congestion window in the CUBIC region */ + uint32_t W_cubic; + /* The cwnd (in bytes) at the beginning of the current congestion avoidance stage. */ + uint32_t cwnd_epoch; /* various flags */ uint32_t flags; /* Minimum observed rtt in usecs. */ @@ -110,8 +119,8 @@ struct cubic { int undo_t_epoch; /* Few variables to restore the state after RTO_ERR */ int64_t undo_K; - uint64_t undo_W_max; - uint64_t undo_cwnd_epoch; + uint32_t undo_W_max; + uint32_t undo_cwnd_epoch; uint32_t css_baseline_minrtt; uint32_t css_current_round_minrtt; uint32_t css_lastround_minrtt; @@ -130,60 +139,103 @@ struct cubic { extern int hz; /* - * Implementation based on the formulae found in the CUBIC Internet Draft - * "draft-ietf-tcpm-cubic-04". + * Implementation based on the formulas in RFC9438. * */ -static __inline float -theoretical_cubic_k(double wmax_pkts) + +/* + * Returns K, the time period in seconds it takes to increase the congestion + * window size at the beginning of the current congestion avoidance stage to + * W_max. + */ +static inline float +theoretical_cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs) { double C; C = 0.4; + if (wmax_segs <= cwnd_epoch_segs) + return 0.0; - return (pow((wmax_pkts * 0.3) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT)); + /* + * Figure 2: K = ((W_max - cwnd_epoch) / C)^(1/3) + */ + return (pow((wmax_segs - cwnd_epoch_segs) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT)); } -static __inline unsigned long -theoretical_cubic_cwnd(int ticks_since_epoch, unsigned long wmax, uint32_t smss) +/* + * Returns the congestion window in segments at time t in seconds based on the + * cubic increase function, where t is the elapsed time in seconds from the + * beginning of the current congestion avoidance stage, as described in RFC9438 + * Section 4.2. 
+ */ +static inline unsigned long +theoretical_cubic_cwnd(int ticks_elapsed, uint32_t wmax_segs, uint32_t cwnd_epoch_segs) { - double C, wmax_pkts; + double C, t; + float K; C = 0.4; - wmax_pkts = wmax / (double)smss; + t = ticks_elapsed / (double)hz; + K = theoretical_cubic_k(wmax_segs, cwnd_epoch_segs); - return (smss * (wmax_pkts + - (C * pow(ticks_since_epoch / (double)hz - - theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0)))); + /* + * Figure 1: W_cubic(t) = C * (t - K)^3 + W_max + */ + return (C * pow(t - K / pow(2, CUBIC_SHIFT), 3.0) + wmax_segs); } -static __inline unsigned long -theoretical_reno_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax, - uint32_t smss) +/* + * Returns estimated Reno congestion window in segments. + */ +static inline unsigned long +theoretical_reno_cwnd(int ticks_elapsed, int rtt_ticks, uint32_t wmax_segs) { - return ((wmax * 0.5) + ((ticks_since_epoch / (float)rtt_ticks) * smss)); + return (wmax_segs * 0.5 + ticks_elapsed / (float)rtt_ticks); } -static __inline unsigned long -theoretical_tf_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax, - uint32_t smss) +/* + * Returns an estimate for the congestion window in segments in the + * Reno-friendly region -- that is, an estimate for the congestion window of + * Reno, as described in RFC9438 Section 4.3, where: + * cwnd: Current congestion window in segments. + * cwnd_prior: Size of cwnd in segments at the time of setting ssthresh most + * recently, either upon exiting the first slow start or just before + * cwnd was reduced in the last congestion event. + * W_est: An estimate for the congestion window in segments in the Reno-friendly + * region -- that is, an estimate for the congestion window of Reno. + */ +static inline unsigned long +theoretical_tf_cwnd(unsigned long W_est, unsigned long segs_acked, unsigned long cwnd, + unsigned long cwnd_prior) { + float cubic_alpha, cubic_beta; + + /* RFC9438 Section 4.6: The parameter β_cubic SHOULD be set to 0.7. */ + cubic_beta = 0.7; - return ((wmax * 0.7) + ((3 * 0.3) / (2 - 0.3) * - (ticks_since_epoch / (float)rtt_ticks) * smss)); + if (W_est >= cwnd_prior) + cubic_alpha = 1.0; + else + cubic_alpha = (3.0 * (1.0 - cubic_beta)) / (1.0 + cubic_beta); + + /* + * Figure 4: W_est = W_est + α_cubic * segments_acked / cwnd + */ + return (W_est + cubic_alpha * segs_acked / cwnd); } #endif /* !_KERNEL */ /* * Compute the CUBIC K value used in the cwnd calculation, using an - * implementation of eqn 2 in the I-D. The method used - * here is adapted from Apple Computer Technical Report #KT-32. + * implementation mentioned in Figure. 2 of RFC9438. + * The method used here is adapted from Apple Computer Technical Report #KT-32. */ -static __inline int64_t -cubic_k(unsigned long wmax_pkts) +static inline int64_t +cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs) { int64_t s, K; uint16_t p; @@ -191,8 +243,13 @@ cubic_k(unsigned long wmax_pkts) K = s = 0; p = 0; - /* (wmax * beta)/C with CUBIC_SHIFT worth of precision. */ - s = ((wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) / CUBIC_C_FACTOR; + /* Handle the corner case where W_max <= cwnd_epoch */ + if (wmax_segs <= cwnd_epoch_segs) { + return 0; + } + + /* (wmax - cwnd_epoch) / C with CUBIC_SHIFT worth of precision. */ + s = ((wmax_segs - cwnd_epoch_segs) << (2 * CUBIC_SHIFT)) / CUBIC_C_FACTOR; /* Rebase s to be between 1 and 1/8 with a shift of CUBIC_SHIFT. 
*/ while (s >= 256) { @@ -213,13 +270,14 @@ cubic_k(unsigned long wmax_pkts) } /* - * Compute the new cwnd value using an implementation of eqn 1 from the I-D. + * Compute and return the new cwnd value in bytes using an implementation + * mentioned in Figure. 1 of RFC9438. * Thanks to Kip Macy for help debugging this function. * * XXXLAS: Characterise bounds for overflow. */ -static __inline unsigned long -cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) +static inline uint32_t +cubic_cwnd(int usecs_since_epoch, uint32_t wmax, uint32_t smss, int64_t K) { int64_t cwnd; @@ -238,7 +296,7 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) cwnd *= (cwnd * cwnd); /* - * C(t - K)^3 + wmax + * Figure 1: C * (t - K)^3 + wmax * The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of * CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above, * and an extra from multiplying through by CUBIC_C_FACTOR. @@ -253,33 +311,9 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) } /* - * Compute an approximation of the NewReno cwnd some number of usecs after a - * congestion event. RTT should be the average RTT estimate for the path - * measured over the previous congestion epoch and wmax is the value of cwnd at - * the last congestion event. The "TCP friendly" concept in the CUBIC I-D is - * rather tricky to understand and it turns out this function is not required. - * It is left here for reference. - * - * XXX: Not used - */ -static __inline unsigned long -reno_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax, - uint32_t smss) -{ - - /* - * For NewReno, beta = 0.5, therefore: W_tcp(t) = wmax*0.5 + t/RTT - * W_tcp(t) deals with cwnd/wmax in pkts, so because our cwnd is in - * bytes, we have to multiply by smss. - */ - return (((wmax * RENO_BETA) + (((usecs_since_epoch * smss) - << CUBIC_SHIFT) / rtt_usecs)) >> CUBIC_SHIFT); -} - -/* * Compute the "TCP friendly" cwnd by newreno in congestion avoidance state. */ -static __inline unsigned long +static inline uint32_t tf_cwnd(struct cc_var *ccv) { /* newreno is "TCP friendly" */ diff --git a/sys/netinet/dccp.h b/sys/netinet/dccp.h index 4fb6a0d2ab3e..da83a1b06861 100644 --- a/sys/netinet/dccp.h +++ b/sys/netinet/dccp.h @@ -64,7 +64,7 @@ struct dccphdr { uint8_t seq[6]; } longseq; } d_seqno; -}; +} __packed; #define d_seqno_short d_seqno.shortseq; #define d_seqno_long d_seqno.longseq.seq; diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h index 7845b682f3e4..2ca5b3433e47 100644 --- a/sys/netinet/icmp6.h +++ b/sys/netinet/icmp6.h @@ -713,9 +713,6 @@ void icmp6_redirect_input(struct mbuf *, int); void icmp6_redirect_output(struct mbuf *, struct nhop_object *); int icmp6_ratelimit(const struct in6_addr *, const int, const int); -struct ip6ctlparam; -void icmp6_mtudisc_update(struct ip6ctlparam *, int); - /* XXX: is this the right place for these macros? 
*/ #define icmp6_ifstat_inc(ifp, tag) \ do { \ diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index b1f2b0ebf911..b39479565bd6 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -100,15 +100,13 @@ void kmod_icmpstat_inc(int statnum); SYSCTL_DECL(_net_inet_icmp); extern int badport_bandlim(int); -#define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 #define BANDLIM_ICMP_ECHO 1 #define BANDLIM_ICMP_TSTAMP 2 -#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ -#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ -#define BANDLIM_ICMP6_UNREACH 5 -#define BANDLIM_SCTP_OOTB 6 -#define BANDLIM_MAX 7 +#define BANDLIM_TCP_RST 3 +#define BANDLIM_ICMP6_UNREACH 4 +#define BANDLIM_SCTP_OOTB 5 +#define BANDLIM_MAX 6 #endif #endif diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 88da1b139b1f..dc6ef343662d 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -56,6 +56,7 @@ #include <net/if_dl.h> #include <net/if_private.h> #include <net/if_types.h> +#include <net/if_bridgevar.h> #include <net/netisr.h> #include <net/ethernet.h> #include <net/route.h> @@ -832,7 +833,7 @@ in_arpinput(struct mbuf *m) * when we have clusters of interfaces). */ CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { - if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || + if (((bridged && bridge_same_p(ia->ia_ifp->if_bridge, ifp->if_bridge)) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || @@ -842,7 +843,7 @@ in_arpinput(struct mbuf *m) } } CK_LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) - if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || + if (((bridged && bridge_same_p(ia->ia_ifp->if_bridge, ifp->if_bridge)) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); @@ -850,7 +851,7 @@ in_arpinput(struct mbuf *m) } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ - (ia->ia_ifp->if_bridge == ifp->if_softc && \ + (bridge_get_softc_p(ia->ia_ifp) == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c index 5082b6294ebb..299f3c2e02bb 100644 --- a/sys/netinet/igmp.c +++ b/sys/netinet/igmp.c @@ -402,32 +402,43 @@ out: static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { + struct epoch_tracker et; int error; int new; + struct igmp_ifsoftc *igi; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); - IGMP_LOCK(); - new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) - goto out_locked; + return (error); - if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { - error = EINVAL; - goto out_locked; - } + if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) + return (EINVAL); + + IN_MULTI_LIST_LOCK(); + IGMP_LOCK(); + NET_EPOCH_ENTER(et); - CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", - V_igmp_default_version, new); + if (V_igmp_default_version != new) { + CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", + V_igmp_default_version, new); - V_igmp_default_version = new; + V_igmp_default_version = new; -out_locked: + LIST_FOREACH(igi, &V_igi_head, igi_link) { + if (igi->igi_version > V_igmp_default_version){ + igmp_set_version(igi, V_igmp_default_version); + } + } + } + + NET_EPOCH_EXIT(et); + IN_MULTI_LIST_UNLOCK(); IGMP_UNLOCK(); return (error); } diff --git a/sys/netinet/in.c 
b/sys/netinet/in.c index 2fcbff8611ff..963449d4b4b1 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -57,6 +57,7 @@ #include <net/if_llatbl.h> #include <net/if_private.h> #include <net/if_types.h> +#include <net/if_bridgevar.h> #include <net/route.h> #include <net/route/nhop.h> #include <net/route/route_ctl.h> @@ -519,6 +520,13 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred #endif /* + * Check if bridge wants to allow adding addrs to member interfaces. + */ + if (ifp->if_bridge && bridge_member_ifaddrs_p && + !bridge_member_ifaddrs_p()) + return (EINVAL); + + /* * See whether address already exist. */ iaIsFirst = true; diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c index b889131b544b..538cd43a88a3 100644 --- a/sys/netinet/in_fib_dxr.c +++ b/sys/netinet/in_fib_dxr.c @@ -345,7 +345,7 @@ initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk) struct heap_entry *fhp = &da->heap[0]; struct rtentry *rt; struct route_nhop_data rnd; - + da->heap_index = 0; da->dst.sin_addr.s_addr = htonl(dst_u32); rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED, @@ -1143,7 +1143,7 @@ dxr_destroy(void *data) free(da, M_DXRAUX); } -static void +static void epoch_dxr_destroy(epoch_context_t ctx) { struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx); @@ -1202,7 +1202,7 @@ dxr_dump_end(void *data, struct fib_dp *dp) static enum flm_op_result dxr_dump_rib_item(struct rtentry *rt, void *data) { - + return (FLM_SUCCESS); } diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c index 7e0b9a6a9373..de2a98ce541c 100644 --- a/sys/netinet/in_kdtrace.c +++ b/sys/netinet/in_kdtrace.c @@ -286,6 +286,8 @@ MIB_PROBE_TCP(tcps_sc_unreach); MIB_PROBE_TCP(tcps_sc_zonefail); MIB_PROBE_TCP(tcps_sc_sendcookie); MIB_PROBE_TCP(tcps_sc_recvcookie); +MIB_PROBE_TCP(tcps_sc_spurcookie); +MIB_PROBE_TCP(tcps_sc_failcookie); MIB_PROBE_TCP(tcps_hc_added); MIB_PROBE_TCP(tcps_hc_bucketoverflow); diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h index 7b0d433c60d8..a203b660d777 100644 --- a/sys/netinet/in_kdtrace.h +++ b/sys/netinet/in_kdtrace.h @@ -278,6 +278,8 @@ SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_unreach); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_zonefail); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_sendcookie); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_recvcookie); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_spurcookie); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_failcookie); SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_added); SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_bucketoverflow); diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 3774f73a7a8f..dbe48242381d 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -235,7 +235,7 @@ VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in_pcbhashseed_init, NULL); #ifdef INET -VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1; +VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0; #define V_connect_inaddr_wild VNET(connect_inaddr_wild) SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, @@ -1745,6 +1745,23 @@ in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) } /* + * Dereference and rlock inp, for which the caller must own the + * reference. Returns true if inp no longer usable, false otherwise. 
+ */ +bool +in_pcbrele_rlock(struct inpcb *inp) +{ + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (true); + if ((inp->inp_flags & INP_FREED) != 0) { + INP_RUNLOCK(inp); + return (true); + } + return (false); +} + +/* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 5fe12c4f1e76..9e0618e87601 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -303,6 +303,30 @@ struct sockopt_parameters { char sop_optval[]; }; +#ifdef _SYS_KTLS_H_ +struct xktls_session { + uint32_t tsz; /* total sz of elm, next elm is at this+tsz */ + uint32_t fsz; /* size of the struct up to keys */ + uint64_t inp_gencnt; + kvaddr_t so_pcb; + struct in_conninfo coninf; + u_short rx_vlan_id; + struct xktls_session_onedir rcv; + struct xktls_session_onedir snd; +/* + * Next are + * - keydata for rcv, first cipher of length rcv.cipher_key_len, then + * authentication of length rcv.auth_key_len; + * - driver data (string) of length rcv.drv_st_len, if the rcv session is + * offloaded to ifnet rcv.ifnet; + * - keydata for snd, first cipher of length snd.cipher_key_len, then + * authentication of length snd.auth_key_len; + * - driver data (string) of length snd.drv_st_len, if the snd session is + * offloaded to ifnet snd.ifnet; + */ +}; +#endif /* _SYS_KTLS_H_ */ + #ifdef _KERNEL int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)); @@ -657,6 +681,7 @@ void in_pcbref(struct inpcb *); bool in_pcbrele(struct inpcb *, inp_lookup_t); bool in_pcbrele_rlocked(struct inpcb *); bool in_pcbrele_wlocked(struct inpcb *); +bool in_pcbrele_rlock(struct inpcb *inp); typedef bool inp_match_t(const struct inpcb *, void *); struct inpcb_iterator { diff --git a/sys/netinet/in_prot.c b/sys/netinet/in_prot.c index 204f4f60456e..69f0f3694096 100644 --- a/sys/netinet/in_prot.c +++ b/sys/netinet/in_prot.c @@ -26,21 +26,17 @@ */ /* - * System calls related to processes and protection + * Helpers related to visibility and protection of sockets and inpcb. */ -#include <sys/cdefs.h> -#include "opt_inet.h" -#include "opt_inet6.h" - -#include <sys/param.h> #include <sys/systm.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> +#include <sys/priv.h> #include <sys/proc.h> #include <sys/socket.h> -#include <sys/jail.h> #include <netinet/in.h> #include <netinet/in_pcb.h> @@ -72,3 +68,16 @@ cr_canseeinpcb(struct ucred *cred, struct inpcb *inp) return (0); } + +bool +cr_canexport_ktlskeys(struct thread *td, struct inpcb *inp) +{ + int error; + + if (cr_canseeinpcb(td->td_ucred, inp) == 0 && + cr_xids_subset(td->td_ucred, inp->inp_cred)) + return (true); + error = priv_check(td, PRIV_NETINET_KTLSKEYS); + return (error == 0); + +} diff --git a/sys/netinet/in_rss.c b/sys/netinet/in_rss.c index 698fd86dc7a5..f93a1d2bfd7b 100644 --- a/sys/netinet/in_rss.c +++ b/sys/netinet/in_rss.c @@ -285,7 +285,7 @@ rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval, } /* * Only allow 2-tuple for TCP frames if we don't also - * support 2-tuple for TCP. + * support 4-tuple for TCP. 
*/ if ((rss_gethashconfig() & RSS_HASHTYPE_RSS_IPV4) && ((rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) && diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h index 2750733335bb..e2f553ec461c 100644 --- a/sys/netinet/in_systm.h +++ b/sys/netinet/in_systm.h @@ -32,6 +32,8 @@ #ifndef _NETINET_IN_SYSTM_H_ #define _NETINET_IN_SYSTM_H_ +#include <sys/types.h> + /* * Miscellaneous internetwork * definitions for kernel. @@ -56,8 +58,10 @@ typedef u_int32_t n_time; /* ms since 00:00 UTC, byte rev */ #ifdef _KERNEL struct inpcb; struct ucred; +struct thread; int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp); +bool cr_canexport_ktlskeys(struct thread *td, struct inpcb *inp); uint32_t iptime(void); #endif diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h index 8d205ba07cf5..6de41a7e79fa 100644 --- a/sys/netinet/ip.h +++ b/sys/netinet/ip.h @@ -33,7 +33,8 @@ #ifndef _NETINET_IP_H_ #define _NETINET_IP_H_ -#include <sys/cdefs.h> +#include <sys/types.h> +#include <netinet/in.h> /* * Definitions for internet protocol version 4. @@ -66,7 +67,7 @@ struct ip { u_char ip_p; /* protocol */ u_short ip_sum; /* checksum */ struct in_addr ip_src,ip_dst; /* source and dest address */ -} __packed __aligned(2); +} __packed; #define IP_MAXPACKET 65535 /* maximum packet size */ @@ -186,7 +187,7 @@ struct ip_timestamp { uint32_t ipt_time; /* network format */ } ipt_ta[1]; } ipt_timestamp; -}; +} __packed; /* Flag bits for ipt_flg. */ #define IPOPT_TS_TSONLY 0 /* timestamps only */ diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index 0ead7149c1e2..d3d7957cf087 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -206,8 +206,6 @@ struct carpkreq { * * Known issues with locking: * - * - Sending ad, we put the pointer to the softc in an mtag, and no reference - * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ @@ -1688,6 +1686,7 @@ char * carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; + char *mac = NULL; NET_EPOCH_ASSERT(); @@ -1698,18 +1697,26 @@ carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_CARP, - sizeof(struct carp_softc *), M_NOWAIT); - if (mtag == NULL) - /* Better a bit than nothing. */ - return (sc->sc_addr); + sizeof(sc->sc_vhid) + sizeof(sc->sc_addr), + M_NOWAIT); + if (mtag == NULL) { + CARPSTATS_INC(carps_onomem); + break; + } + /* carp_output expects sc_vhid first. */ + bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid)); + /* + * Save sc_addr into mtag data after sc_vhid to avoid + * possible access to destroyed softc. 
+ */ + mac = (char *)(mtag + 1) + sizeof(sc->sc_vhid); + bcopy(sc->sc_addr, mac, sizeof(sc->sc_addr)); - bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); - - return (sc->sc_addr); + break; } - return (NULL); + return (mac); } #endif /* INET6 */ diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c index 9b81760e58f3..51e7c2fbc4b0 100644 --- a/sys/netinet/ip_fastfwd.c +++ b/sys/netinet/ip_fastfwd.c @@ -69,6 +69,7 @@ #include <sys/cdefs.h> #include "opt_ipstealth.h" +#include "opt_sctp.h" #include <sys/param.h> #include <sys/systm.h> @@ -102,6 +103,10 @@ #include <machine/in_cksum.h> +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include <netinet/sctp_crc32.h> +#endif + #define V_ipsendredirects VNET(ipsendredirects) static struct mbuf * @@ -460,6 +465,23 @@ passout: } else gw = (const struct sockaddr *)dst; + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. + */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & + ~nh->nh_ifp->if_hwassist)) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP_SCTP & + ~nh->nh_ifp->if_hwassist)) { + sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + m->m_pkthdr.csum_flags &= ~CSUM_IP_SCTP; + } +#endif + /* Handle redirect case. */ redest.s_addr = 0; if (V_ipsendredirects && osrc.s_addr == ip->ip_src.s_addr && diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index c440223b81f8..51e68c310915 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -167,149 +167,149 @@ typedef struct _ip_fw3_opheader { */ enum ipfw_opcodes { /* arguments (4 byte each) */ - O_NOP, - - O_IP_SRC, /* u32 = IP */ - O_IP_SRC_MASK, /* ip = IP/mask */ - O_IP_SRC_ME, /* none */ - O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ - - O_IP_DST, /* u32 = IP */ - O_IP_DST_MASK, /* ip = IP/mask */ - O_IP_DST_ME, /* none */ - O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ - - O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ - O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ - O_PROTO, /* arg1=protocol */ - - O_MACADDR2, /* 2 mac addr:mask */ - O_MAC_TYPE, /* same as srcport */ - - O_LAYER2, /* none */ - O_IN, /* none */ - O_FRAG, /* none */ - - O_RECV, /* none */ - O_XMIT, /* none */ - O_VIA, /* none */ - - O_IPOPT, /* arg1 = 2*u8 bitmap */ - O_IPLEN, /* arg1 = len */ - O_IPID, /* arg1 = id */ - - O_IPTOS, /* arg1 = id */ - O_IPPRECEDENCE, /* arg1 = precedence << 5 */ - O_IPTTL, /* arg1 = TTL */ - - O_IPVER, /* arg1 = version */ - O_UID, /* u32 = id */ - O_GID, /* u32 = id */ - O_ESTAB, /* none (tcp established) */ - O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ - O_TCPWIN, /* arg1 = desired win */ - O_TCPSEQ, /* u32 = desired seq. */ - O_TCPACK, /* u32 = desired seq. */ - O_ICMPTYPE, /* u32 = icmp bitmap */ - O_TCPOPTS, /* arg1 = 2*u8 bitmap */ - - O_VERREVPATH, /* none */ - O_VERSRCREACH, /* none */ - - O_PROBE_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ - O_KEEP_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ - O_LIMIT, /* ipfw_insn_limit */ - O_LIMIT_PARENT, /* dyn_type, not an opcode. 
*/ + O_NOP = 0, + + O_IP_SRC = 1, /* u32 = IP */ + O_IP_SRC_MASK = 2, /* ip = IP/mask */ + O_IP_SRC_ME = 3, /* none */ + O_IP_SRC_SET = 4, /* u32=base, arg1=len, bitmap */ + + O_IP_DST = 5, /* u32 = IP */ + O_IP_DST_MASK = 6, /* ip = IP/mask */ + O_IP_DST_ME = 7, /* none */ + O_IP_DST_SET = 8, /* u32=base, arg1=len, bitmap */ + + O_IP_SRCPORT = 9, /* (n)port list:mask 4 byte ea */ + O_IP_DSTPORT = 10, /* (n)port list:mask 4 byte ea */ + O_PROTO = 11, /* arg1=protocol */ + + O_MACADDR2 = 12, /* 2 mac addr:mask */ + O_MAC_TYPE = 13, /* same as srcport */ + + O_LAYER2 = 14, /* none */ + O_IN = 15, /* none */ + O_FRAG = 16, /* none */ + + O_RECV = 17, /* none */ + O_XMIT = 18, /* none */ + O_VIA = 19, /* none */ + + O_IPOPT = 20, /* arg1 = 2*u8 bitmap */ + O_IPLEN = 21, /* arg1 = len */ + O_IPID = 22, /* arg1 = id */ + + O_IPTOS = 23, /* arg1 = id */ + O_IPPRECEDENCE = 24, /* arg1 = precedence << 5 */ + O_IPTTL = 25, /* arg1 = TTL */ + + O_IPVER = 26, /* arg1 = version */ + O_UID = 27, /* u32 = id */ + O_GID = 28, /* u32 = id */ + O_ESTAB = 29, /* none (tcp established) */ + O_TCPFLAGS = 30, /* arg1 = 2*u8 bitmap */ + O_TCPWIN = 31, /* arg1 = desired win */ + O_TCPSEQ = 32, /* u32 = desired seq. */ + O_TCPACK = 33, /* u32 = desired seq. */ + O_ICMPTYPE = 34, /* u32 = icmp bitmap */ + O_TCPOPTS = 35, /* arg1 = 2*u8 bitmap */ + + O_VERREVPATH = 36, /* none */ + O_VERSRCREACH = 37, /* none */ + + O_PROBE_STATE = 38, /* v0:arg1=kidx, v1:kidx=kidx */ + O_KEEP_STATE = 39, /* v0:arg1=kidx, v1:kidx=kidx */ + O_LIMIT = 40, /* ipfw_insn_limit */ + O_LIMIT_PARENT = 41, /* dyn_type, not an opcode. */ /* * These are really 'actions'. */ - O_LOG, /* ipfw_insn_log */ - O_PROB, /* u32 = match probability */ + O_LOG = 42, /* ipfw_insn_log */ + O_PROB = 43, /* u32 = match probability */ - O_CHECK_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ - O_ACCEPT, /* none */ - O_DENY, /* none */ - O_REJECT, /* arg1=icmp arg (same as deny) */ - O_COUNT, /* none */ - O_SKIPTO, /* v0:arg1=next rule number */ + O_CHECK_STATE = 44, /* v0:arg1=kidx, v1:kidx=kidx */ + O_ACCEPT = 45, /* none */ + O_DENY = 46, /* none */ + O_REJECT = 47, /* arg1=icmp arg (same as deny) */ + O_COUNT = 48, /* none */ + O_SKIPTO = 49, /* v0:arg1=next rule number */ /* v1:kidx= next rule number */ - O_PIPE, /* arg1=pipe number */ - O_QUEUE, /* arg1=queue number */ - O_DIVERT, /* arg1=port number */ - O_TEE, /* arg1=port number */ - O_FORWARD_IP, /* fwd sockaddr */ - O_FORWARD_MAC, /* fwd mac */ - O_NAT, /* nope */ - O_REASS, /* none */ + O_PIPE = 50, /* arg1=pipe number */ + O_QUEUE = 51, /* arg1=queue number */ + O_DIVERT = 52, /* arg1=port number */ + O_TEE = 53, /* arg1=port number */ + O_FORWARD_IP = 54, /* fwd sockaddr */ + O_FORWARD_MAC = 55, /* fwd mac */ + O_NAT = 56, /* nope */ + O_REASS = 57, /* none */ /* * More opcodes. */ - O_IPSEC, /* has ipsec history */ - O_IP_SRC_LOOKUP, /* v0:arg1=table number, u32=value */ + O_IPSEC = 58, /* has ipsec history */ + O_IP_SRC_LOOKUP = 59, /* v0:arg1=table number, u32=value */ /* v1:kidx=name, u32=value, arg1=key */ - O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ + O_IP_DST_LOOKUP = 60, /* arg1=table number, u32=value */ /* v1:kidx=name, u32=value, arg1=key */ - O_ANTISPOOF, /* none */ - O_JAIL, /* u32 = id */ - O_ALTQ, /* u32 = altq classif. 
qid */ - O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ - O_TCPDATALEN, /* arg1 = tcp data len */ - O_IP6_SRC, /* address without mask */ - O_IP6_SRC_ME, /* my addresses */ - O_IP6_SRC_MASK, /* address with the mask */ - O_IP6_DST, - O_IP6_DST_ME, - O_IP6_DST_MASK, - O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ - O_ICMP6TYPE, /* icmp6 packet type filtering */ - O_EXT_HDR, /* filtering for ipv6 extension header */ - O_IP6, + O_ANTISPOOF = 61, /* none */ + O_JAIL = 62, /* u32 = id */ + O_ALTQ = 63, /* u32 = altq classif. qid */ + O_DIVERTED = 64, /* arg1=bitmap (1:loop, 2:out) */ + O_TCPDATALEN = 65, /* arg1 = tcp data len */ + O_IP6_SRC = 66, /* address without mask */ + O_IP6_SRC_ME = 67, /* my addresses */ + O_IP6_SRC_MASK = 68, /* address with the mask */ + O_IP6_DST = 69, + O_IP6_DST_ME = 70, + O_IP6_DST_MASK = 71, + O_FLOW6ID = 72, /* for flow id tag in the ipv6 pkt */ + O_ICMP6TYPE = 73, /* icmp6 packet type filtering */ + O_EXT_HDR = 74, /* filtering for ipv6 extension header */ + O_IP6 = 75, /* * actions for ng_ipfw */ - O_NETGRAPH, /* send to ng_ipfw */ - O_NGTEE, /* copy to ng_ipfw */ + O_NETGRAPH = 76, /* send to ng_ipfw */ + O_NGTEE = 77, /* copy to ng_ipfw */ - O_IP4, + O_IP4 = 78, - O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ + O_UNREACH6 = 79, /* arg1=icmpv6 code arg (deny) */ - O_TAG, /* arg1=tag number */ - O_TAGGED, /* arg1=tag number */ + O_TAG = 80, /* arg1=tag number */ + O_TAGGED = 81, /* arg1=tag number */ - O_SETFIB, /* arg1=FIB number */ - O_FIB, /* arg1=FIB desired fib number */ + O_SETFIB = 82, /* arg1=FIB number */ + O_FIB = 83, /* arg1=FIB desired fib number */ - O_SOCKARG, /* socket argument */ + O_SOCKARG = 84, /* socket argument */ - O_CALLRETURN, /* v0:arg1=called rule number */ + O_CALLRETURN = 85, /* v0:arg1=called rule number */ /* v1:kidx=called rule number */ - O_FORWARD_IP6, /* fwd sockaddr_in6 */ + O_FORWARD_IP6 = 86, /* fwd sockaddr_in6 */ - O_DSCP, /* 2 u32 = DSCP mask */ - O_SETDSCP, /* arg1=DSCP value */ - O_IP_FLOW_LOOKUP, /* v0:arg1=table number, u32=value */ + O_DSCP = 87, /* 2 u32 = DSCP mask */ + O_SETDSCP = 88, /* arg1=DSCP value */ + O_IP_FLOW_LOOKUP = 89, /* v0:arg1=table number, u32=value */ /* v1:kidx=name, u32=value */ - O_EXTERNAL_ACTION, /* v0:arg1=id of external action handler */ + O_EXTERNAL_ACTION = 90, /* v0:arg1=id of external action handler */ /* v1:kidx=id of external action handler */ - O_EXTERNAL_INSTANCE, /* v0:arg1=id of eaction handler instance */ + O_EXTERNAL_INSTANCE = 91, /* v0:arg1=id of eaction handler instance */ /* v1:kidx=id of eaction handler instance */ - O_EXTERNAL_DATA, /* variable length data */ + O_EXTERNAL_DATA = 92, /* variable length data */ - O_SKIP_ACTION, /* none */ - O_TCPMSS, /* arg1=MSS value */ + O_SKIP_ACTION = 93, /* none */ + O_TCPMSS = 94, /* arg1=MSS value */ - O_MAC_SRC_LOOKUP, /* kidx=name, u32=value, arg1=key */ - O_MAC_DST_LOOKUP, /* kidx=name, u32=value, arg1=key */ + O_MAC_SRC_LOOKUP = 95, /* kidx=name, u32=value, arg1=key */ + O_MAC_DST_LOOKUP = 96, /* kidx=name, u32=value, arg1=key */ - O_SETMARK, /* u32 = value */ - O_MARK, /* 2 u32 = value, bitmask */ + O_SETMARK = 97, /* u32 = value */ + O_MARK = 98, /* 2 u32 = value, bitmask */ O_LAST_OPCODE /* not an opcode! 
*/ }; diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index 17d15d7d9629..fc0848b2c944 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -391,7 +391,6 @@ stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; - nip->ip_off = 0; if (V_error_keeptags) m_tag_copy_chain(m, n, M_NOWAIT); @@ -872,6 +871,8 @@ match: mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; + /* ip->ip_tos will be reflected. */ + ip->ip_off = htons(0); ip->ip_ttl = V_ip_defttl; if (optlen > 0) { @@ -1090,15 +1091,14 @@ ip_next_mtu(int mtu, int dir) * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ -VNET_DEFINE_STATIC(struct counter_rate, icmp_rates[BANDLIM_MAX]); +VNET_DEFINE_STATIC(struct counter_rate *, icmp_rates[BANDLIM_MAX]); #define V_icmp_rates VNET(icmp_rates) static const char *icmp_rate_descrs[BANDLIM_MAX] = { [BANDLIM_ICMP_UNREACH] = "icmp unreach", [BANDLIM_ICMP_ECHO] = "icmp ping", [BANDLIM_ICMP_TSTAMP] = "icmp tstamp", - [BANDLIM_RST_CLOSEDPORT] = "closed port RST", - [BANDLIM_RST_OPENPORT] = "open port RST", + [BANDLIM_TCP_RST] = "tcp reset", [BANDLIM_ICMP6_UNREACH] = "icmp6 unreach", [BANDLIM_SCTP_OOTB] = "sctp ootb", }; @@ -1158,8 +1158,7 @@ icmp_bandlimit_init(void) { for (int i = 0; i < BANDLIM_MAX; i++) { - V_icmp_rates[i].cr_rate = counter_u64_alloc(M_WAITOK); - V_icmp_rates[i].cr_ticks = ticks; + V_icmp_rates[i] = counter_rate_alloc(M_WAITOK, 1); icmplim_new_jitter(i); } } @@ -1172,7 +1171,7 @@ icmp_bandlimit_uninit(void) { for (int i = 0; i < BANDLIM_MAX; i++) - counter_u64_free(V_icmp_rates[i].cr_rate); + counter_rate_free(V_icmp_rates[i]); } VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp_bandlimit_uninit, NULL); @@ -1183,13 +1182,13 @@ badport_bandlim(int which) { int64_t pps; - if (V_icmplim == 0 || which == BANDLIM_UNLIMITED) + if (V_icmplim == 0) return (0); KASSERT(which >= 0 && which < BANDLIM_MAX, ("%s: which %d", __func__, which)); - pps = counter_ratecheck(&V_icmp_rates[which], V_icmplim + + pps = counter_ratecheck(V_icmp_rates[which], V_icmplim + V_icmplim_curr_jitter[which]); if (pps > 0) { if (V_icmplim_output) diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index 18ca5861a40e..f782ebc53eb0 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -47,7 +47,7 @@ struct ipovly { u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ -}; +} __packed; #ifdef _KERNEL /* diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c index 7858e4d2b9f3..6758813f6a21 100644 --- a/sys/netinet/libalias/alias.c +++ b/sys/netinet/libalias/alias.c @@ -290,13 +290,14 @@ IcmpAliasIn1(struct libalias *la, struct ip *pip) { struct alias_link *lnk; struct icmp *ic; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); /* Get source address from ICMP data field and restore original data */ - lnk = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1); - if (lnk != NULL) { + ret = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1, &lnk); + if (ret == PKT_ALIAS_OK) { u_short original_id; int accumulate; @@ -319,10 +320,8 @@ IcmpAliasIn1(struct libalias *la, struct ip *pip) &original_address, &pip->ip_dst, 2); pip->ip_dst = original_address; } - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } /* @@ -337,6 +336,7 @@ IcmpAliasIn2(struct libalias *la, 
struct ip *pip) struct udphdr *ud; struct tcphdr *tc; struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); @@ -346,18 +346,26 @@ IcmpAliasIn2(struct libalias *la, struct ip *pip) tc = (struct tcphdr *)ip_next(ip); ic2 = (struct icmp *)ip_next(ip); - if (ip->ip_p == IPPROTO_UDP) - lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, + if (ip->ip_p == IPPROTO_UDP) { + ret = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, ud->uh_dport, ud->uh_sport, - IPPROTO_UDP, 0); - else if (ip->ip_p == IPPROTO_TCP) - lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, + IPPROTO_UDP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_TCP) { + ret = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, tc->th_dport, tc->th_sport, - IPPROTO_TCP, 0); - else if (ip->ip_p == IPPROTO_ICMP) { - if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) - lnk = FindIcmpIn(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); - else + IPPROTO_TCP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || + ic2->icmp_type == ICMP_TSTAMP) { + ret = FindIcmpIn(la, ip->ip_dst, ip->ip_src, + ic2->icmp_id, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else lnk = NULL; } else lnk = NULL; @@ -479,13 +487,15 @@ IcmpAliasOut1(struct libalias *la, struct ip *pip, int create) { struct alias_link *lnk; struct icmp *ic; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); /* Save overwritten data for when echo packet returns */ - lnk = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create); - if (lnk != NULL) { + ret = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create, + &lnk); + if (ret == PKT_ALIAS_OK) { u_short alias_id; int accumulate; @@ -508,10 +518,8 @@ IcmpAliasOut1(struct libalias *la, struct ip *pip, int create) &alias_address, &pip->ip_src, 2); pip->ip_src = alias_address; } - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } /* @@ -526,6 +534,7 @@ IcmpAliasOut2(struct libalias *la, struct ip *pip) struct udphdr *ud; struct tcphdr *tc; struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); @@ -535,18 +544,26 @@ IcmpAliasOut2(struct libalias *la, struct ip *pip) tc = (struct tcphdr *)ip_next(ip); ic2 = (struct icmp *)ip_next(ip); - if (ip->ip_p == IPPROTO_UDP) - lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, + if (ip->ip_p == IPPROTO_UDP) { + ret = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, ud->uh_dport, ud->uh_sport, - IPPROTO_UDP, 0); - else if (ip->ip_p == IPPROTO_TCP) - lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, + IPPROTO_UDP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_TCP) { + ret = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, tc->th_dport, tc->th_sport, - IPPROTO_TCP, 0); - else if (ip->ip_p == IPPROTO_ICMP) { - if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) - lnk = FindIcmpOut(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); - else + IPPROTO_TCP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || + ic2->icmp_type == ICMP_TSTAMP) { + ret = FindIcmpOut(la, ip->ip_dst, ip->ip_src, + ic2->icmp_id, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else lnk = NULL; } else lnk = NULL; @@ -661,14 +678,15 @@ ProtoAliasIn(struct libalias *la, struct in_addr ip_src, struct ip *pip, u_char ip_p, u_short *ip_sum) { struct alias_link *lnk; + int ret; 
LIBALIAS_LOCK_ASSERT(la); /* Return if proxy-only mode is enabled */ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) return (PKT_ALIAS_OK); - lnk = FindProtoIn(la, ip_src, pip->ip_dst, ip_p); - if (lnk != NULL) { + ret = FindProtoIn(la, ip_src, pip->ip_dst, ip_p, &lnk); + if (ret == PKT_ALIAS_OK) { struct in_addr original_address; original_address = GetOriginalAddress(lnk); @@ -677,10 +695,8 @@ ProtoAliasIn(struct libalias *la, struct in_addr ip_src, DifferentialChecksum(ip_sum, &original_address, &pip->ip_dst, 2); pip->ip_dst = original_address; - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } /* @@ -693,6 +709,7 @@ ProtoAliasOut(struct libalias *la, struct ip *pip, struct in_addr ip_dst, u_char ip_p, u_short *ip_sum, int create) { struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); @@ -703,8 +720,8 @@ ProtoAliasOut(struct libalias *la, struct ip *pip, if (!create) return (PKT_ALIAS_IGNORED); - lnk = FindProtoOut(la, pip->ip_src, ip_dst, ip_p); - if (lnk != NULL) { + ret = FindProtoOut(la, pip->ip_src, ip_dst, ip_p, &lnk); + if (ret == PKT_ALIAS_OK) { struct in_addr alias_address; alias_address = GetAliasAddress(lnk); @@ -713,10 +730,8 @@ ProtoAliasOut(struct libalias *la, struct ip *pip, DifferentialChecksum(ip_sum, &alias_address, &pip->ip_src, 2); pip->ip_src = alias_address; - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } #define MF_ISSET(_pip) (ntohs((_pip)->ip_off) & IP_MF) @@ -745,6 +760,7 @@ UdpAliasIn(struct libalias *la, struct ip *pip) { struct udphdr *ud; struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); @@ -752,10 +768,12 @@ UdpAliasIn(struct libalias *la, struct ip *pip) if (ud == NULL) return (PKT_ALIAS_IGNORED); - lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, ud->uh_sport, ud->uh_dport, - IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)); - if (lnk != NULL) { + IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY), &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + { struct in_addr alias_address; struct in_addr original_address; struct in_addr proxy_address; @@ -828,7 +846,6 @@ UdpAliasIn(struct libalias *la, struct ip *pip) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); } static int @@ -840,7 +857,7 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) struct in_addr proxy_server_address; u_short dest_port; u_short proxy_server_port; - int proxy_type; + int proxy_type, ret; LIBALIAS_LOCK_ASSERT(la); @@ -877,10 +894,12 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) pip->ip_dst = proxy_server_address; ud->uh_dport = proxy_server_port; } - lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, ud->uh_sport, ud->uh_dport, - IPPROTO_UDP, create); - if (lnk != NULL) { + IPPROTO_UDP, create, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + { u_short alias_port; struct in_addr alias_address; struct alias_data ad = { @@ -930,7 +949,6 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); } static int @@ -939,6 +957,7 @@ TcpAliasIn(struct libalias *la, struct ip *pip) struct tcphdr *tc; struct alias_link *lnk; size_t dlen; + int ret; LIBALIAS_LOCK_ASSERT(la); @@ -947,11 +966,12 @@ TcpAliasIn(struct libalias *la, struct ip *pip) return (PKT_ALIAS_IGNORED); tc = (struct tcphdr *)ip_next(pip); - lnk = FindUdpTcpIn(la, pip->ip_src, 
pip->ip_dst, + ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, tc->th_sport, tc->th_dport, IPPROTO_TCP, - !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)); - if (lnk != NULL) { + !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY), + &lnk); + if (ret == PKT_ALIAS_OK) { struct in_addr alias_address; struct in_addr original_address; struct in_addr proxy_address; @@ -1057,13 +1077,13 @@ TcpAliasIn(struct libalias *la, struct ip *pip) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } static int TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) { - int proxy_type; + int proxy_type, ret; u_short dest_port; u_short proxy_server_port; size_t dlen; @@ -1108,12 +1128,12 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) accumulate -= twowords(&pip->ip_dst); ADJUST_CHECKSUM(accumulate, pip->ip_sum); } - lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, tc->th_sport, tc->th_dport, - IPPROTO_TCP, create); - if (lnk == NULL) - return (PKT_ALIAS_IGNORED); - if (lnk != NULL) { + IPPROTO_TCP, create, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + { u_short alias_port; struct in_addr alias_address; int accumulate; @@ -1177,7 +1197,6 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); } /* Fragment Handling @@ -1581,17 +1600,24 @@ LibAliasUnaliasOut(struct libalias *la, ic = (struct icmp *)ip_next(pip); /* Find a link */ - if (pip->ip_p == IPPROTO_UDP) - lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, + if (pip->ip_p == IPPROTO_UDP) { + iresult = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, ud->uh_dport, ud->uh_sport, - IPPROTO_UDP, 0); - else if (pip->ip_p == IPPROTO_TCP) - lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, + IPPROTO_UDP, 0, &lnk); + if (iresult != PKT_ALIAS_OK) + goto getout; + } else if (pip->ip_p == IPPROTO_TCP) { + iresult = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, tc->th_dport, tc->th_sport, - IPPROTO_TCP, 0); - else if (pip->ip_p == IPPROTO_ICMP) - lnk = FindIcmpIn(la, pip->ip_dst, pip->ip_src, ic->icmp_id, 0); - else + IPPROTO_TCP, 0, &lnk); + if (iresult != PKT_ALIAS_OK) + goto getout; + } else if (pip->ip_p == IPPROTO_ICMP) { + iresult = FindIcmpIn(la, pip->ip_dst, pip->ip_src, + ic->icmp_id, 0, &lnk); + if (iresult != PKT_ALIAS_OK) + goto getout; + } else lnk = NULL; /* Change it from an aliased packet to an unaliased packet */ diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index b09e41935d93..c143d74a2f45 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -28,13 +28,13 @@ #include <sys/cdefs.h> #ifdef _KERNEL -#include <machine/stdarg.h> #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/lock.h> #include <sys/module.h> #include <sys/rwlock.h> +#include <sys/stdarg.h> #include <sys/syslog.h> #else #include <stdarg.h> @@ -1049,15 +1049,19 @@ FindLinkByInternalEndpoint(struct libalias *la, struct in_addr src_addr, (prototypes in alias_local.h) */ -struct alias_link * +int FindIcmpIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short id_alias, - int create) + int create, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, id_alias, LINK_ICMP, 0); @@ -1068,19 +1072,26 @@ FindIcmpIn(struct libalias *la, struct in_addr dst_addr, lnk = 
AddLink(la, target_addr, dst_addr, alias_addr, id_alias, NO_DEST_PORT, id_alias, LINK_ICMP); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindIcmpOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short id, - int create) + int create, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkOut(la, src_addr, dst_addr, id, NO_DEST_PORT, LINK_ICMP, 0); @@ -1091,8 +1102,11 @@ FindIcmpOut(struct libalias *la, struct in_addr src_addr, lnk = AddLink(la, src_addr, dst_addr, alias_addr, id, NO_DEST_PORT, GET_ALIAS_ID, LINK_ICMP); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } struct alias_link * @@ -1146,18 +1160,21 @@ FindFragmentPtr(struct libalias *la, struct in_addr dst_addr, LINK_FRAGMENT_PTR, 0); } -struct alias_link * +int FindProtoIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, - u_char proto) + u_char proto, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, 0, proto, 1); - if (lnk == NULL && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { struct in_addr target_addr; @@ -1165,22 +1182,28 @@ FindProtoIn(struct libalias *la, struct in_addr dst_addr, lnk = AddLink(la, target_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindProtoOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, - u_char proto) + u_char proto, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkOut(la, src_addr, dst_addr, NO_SRC_PORT, NO_DEST_PORT, proto, 1); - if (lnk == NULL) { struct in_addr alias_addr; @@ -1188,22 +1211,29 @@ FindProtoOut(struct libalias *la, struct in_addr src_addr, lnk = AddLink(la, src_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short dst_port, u_short alias_port, u_char proto, - int create) + int create, + struct alias_link **lnkp) { int link_type; struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; @@ -1212,8 +1242,7 @@ FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, link_type = LINK_TCP; break; default: - return (NULL); - break; + return (PKT_ALIAS_IGNORED); } lnk = FindLinkIn(la, dst_addr, alias_addr, @@ -1227,22 +1256,30 @@ FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, lnk = AddLink(la, target_addr, dst_addr, alias_addr, alias_port, dst_port, alias_port, link_type); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); + } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? 
PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short dst_port, u_char proto, - int create) + int create, + struct alias_link **lnkp) { int link_type; struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; @@ -1251,12 +1288,10 @@ FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, link_type = LINK_TCP; break; default: - return (NULL); - break; + return (PKT_ALIAS_IGNORED); } lnk = FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, create); - if (lnk == NULL && create) { struct in_addr alias_addr; @@ -1264,8 +1299,11 @@ FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_port, dst_port, GET_ALIAS_PORT, link_type); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } struct alias_link * diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c index e063a67c2902..30cee74fff21 100644 --- a/sys/netinet/libalias/alias_irc.c +++ b/sys/netinet/libalias/alias_irc.c @@ -360,9 +360,9 @@ AliasHandleIrcOut(struct libalias *la, * matter, and this would probably allow it through * at least _some_ firewalls. */ - dcc_lnk = FindUdpTcpOut(la, true_addr, destaddr, + (void)FindUdpTcpOut(la, true_addr, destaddr, true_port, 0, - IPPROTO_TCP, 1); + IPPROTO_TCP, 1, &dcc_lnk); DBprintf(("Got a DCC link\n")); if (dcc_lnk) { struct in_addr alias_address; /* Address from aliasing */ diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h index ef6c89e675d6..7c1dcb0c8eb0 100644 --- a/sys/netinet/libalias/alias_local.h +++ b/sys/netinet/libalias/alias_local.h @@ -239,12 +239,12 @@ struct alias_link * AddLink(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_short src_port, u_short dst_port, int alias_param, int link_type); -struct alias_link * +int FindIcmpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, - u_short _id_alias, int _create); -struct alias_link * + u_short _id_alias, int _create, struct alias_link **_lnkp); +int FindIcmpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, - u_short _id, int _create); + u_short _id, int _create, struct alias_link **_lnkp); struct alias_link * FindFragmentIn1(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _ip_id); @@ -255,18 +255,20 @@ struct alias_link * AddFragmentPtrLink(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); struct alias_link * FindFragmentPtr(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); -struct alias_link * +int FindProtoIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, - u_char _proto); -struct alias_link * + u_char _proto, struct alias_link **_lnkp); +int FindProtoOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, - u_char _proto); -struct alias_link * + u_char _proto, struct alias_link **_lnkp); +int FindUdpTcpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, - u_short _dst_port, u_short _alias_port, u_char _proto, int _create); -struct alias_link * + u_short _dst_port, u_short _alias_port, u_char _proto, int _create, + struct alias_link **_lnkp); +int FindUdpTcpOut(struct libalias *la, struct 
in_addr _src_addr, struct in_addr _dst_addr, - u_short _src_port, u_short _dst_port, u_char _proto, int _create); + u_short _src_port, u_short _dst_port, u_char _proto, int _create, + struct alias_link **_lnkp); struct alias_link * AddPptp(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, struct in_addr _alias_addr, u_int16_t _src_call_id); diff --git a/sys/netinet/libalias/alias_sctp.c b/sys/netinet/libalias/alias_sctp.c index 6781c33f5edb..5ccf31697b42 100644 --- a/sys/netinet/libalias/alias_sctp.c +++ b/sys/netinet/libalias/alias_sctp.c @@ -72,12 +72,12 @@ #ifdef _KERNEL -#include <machine/stdarg.h> #include <sys/param.h> #include <sys/gsb_crc32.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/module.h> +#include <sys/stdarg.h> #include <sys/syslog.h> #include <netinet/libalias/alias_sctp.h> #include <netinet/libalias/alias.h> diff --git a/sys/netinet/libalias/alias_skinny.c b/sys/netinet/libalias/alias_skinny.c index d12046d7953f..fd9e15d3ad40 100644 --- a/sys/netinet/libalias/alias_skinny.c +++ b/sys/netinet/libalias/alias_skinny.c @@ -279,9 +279,9 @@ alias_skinny_opnrcvch_ack(struct libalias *la, struct OpenReceiveChannelAck *opn *localIpAddr = (u_int32_t)opnrcvch_ack->ipAddr; null_addr.s_addr = INADDR_ANY; - opnrcv_lnk = FindUdpTcpOut(la, pip->ip_src, null_addr, + (void)FindUdpTcpOut(la, pip->ip_src, null_addr, htons((u_short) opnrcvch_ack->port), 0, - IPPROTO_UDP, 1); + IPPROTO_UDP, 1, &opnrcv_lnk); opnrcvch_ack->ipAddr = (u_int32_t)GetAliasAddress(opnrcv_lnk).s_addr; opnrcvch_ack->port = (u_int32_t)ntohs(GetAliasPort(opnrcv_lnk)); diff --git a/sys/netinet/libalias/alias_smedia.c b/sys/netinet/libalias/alias_smedia.c index 1c4ee0970a53..badd75a45c61 100644 --- a/sys/netinet/libalias/alias_smedia.c +++ b/sys/netinet/libalias/alias_smedia.c @@ -435,8 +435,8 @@ alias_pna_out(struct libalias *la, struct ip *pip, if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) { memcpy(&port, work, 2); - pna_links = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk), - port, 0, IPPROTO_UDP, 1); + (void)FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk), + port, 0, IPPROTO_UDP, 1, &pna_links); if (pna_links != NULL) { #ifndef NO_FW_PUNCH /* Punch hole in firewall */ diff --git a/sys/netinet/pim.h b/sys/netinet/pim.h index 98230fc6ae2d..4744ffc7e9d8 100644 --- a/sys/netinet/pim.h +++ b/sys/netinet/pim.h @@ -71,7 +71,7 @@ struct pim { #endif /* ! 
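The switch from returning a struct alias_link pointer to returning an int status, with the link handed back through a pointer-to-pointer argument, lets callers tell an allocation failure (PKT_ALIAS_ERROR) apart from a plain lookup miss (PKT_ALIAS_IGNORED), which a bare NULL could not express. A minimal sketch of the new calling convention as seen from a protocol handler (variable names follow the TcpAliasOut() hunk above):

	struct alias_link *lnk;
	int ret;

	ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
	    tc->th_sport, tc->th_dport, IPPROTO_TCP, create, &lnk);
	if (ret != PKT_ALIAS_OK)
		return (ret);	/* PKT_ALIAS_ERROR or PKT_ALIAS_IGNORED */
	/* On PKT_ALIAS_OK, lnk is guaranteed to be non-NULL. */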
_PIM_VT */ uint8_t pim_reserved; /* Reserved */ uint16_t pim_cksum; /* IP-style checksum */ -}; +} __packed; /* KAME-related name backward compatibility */ #define pim_ver pim_vers #define pim_rsv pim_reserved diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 7b6104da5402..66070faf97e9 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -49,6 +49,7 @@ #include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/stdarg.h> #include <sys/sx.h> #include <sys/sysctl.h> #include <sys/systm.h> @@ -74,7 +75,6 @@ #include <netipsec/ipsec_support.h> -#include <machine/stdarg.h> #include <security/mac/mac_framework.h> extern ipproto_input_t *ip_protox[]; diff --git a/sys/netinet/sctp_bsd_addr.c b/sys/netinet/sctp_bsd_addr.c index a10fbcc5ca40..ac715d8298ec 100644 --- a/sys/netinet/sctp_bsd_addr.c +++ b/sys/netinet/sctp_bsd_addr.c @@ -117,25 +117,26 @@ sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa) { struct in6_ifaddr *ifa6; + KASSERT(ifa->address.sa.sa_family == AF_INET6, + ("sctp_gather_internal_ifa_flags() called with address family %u", + ifa->address.sa.sa_family)); ifa6 = (struct in6_ifaddr *)ifa->ifa; ifa->flags = ifa6->ia6_flags; - if (!MODULE_GLOBAL(ip6_use_deprecated)) { - if (ifa->flags & - IN6_IFF_DEPRECATED) { + if (MODULE_GLOBAL(ip6_use_deprecated)) { + ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; + } else { + if (ifa->flags & IN6_IFF_DEPRECATED) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } - } else { - ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } - if (ifa->flags & - (IN6_IFF_DETACHED | - IN6_IFF_ANYCAST | - IN6_IFF_NOTREADY)) { + if (ifa->flags & (IN6_IFF_DETACHED | IN6_IFF_DUPLICATED)) { + ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; + } + /* Right now, do not support IPv6 anycast addresses */ + if (ifa->flags & IN6_IFF_ANYCAST) { + ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; - } else { - ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } #endif /* INET6 */ diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index dc31ffbc2161..5f637cc63df5 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -5780,7 +5780,11 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) goto out; } ecn_bits = ip->ip_tos; - if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { + if (m->m_pkthdr.csum_flags & (CSUM_SCTP_VALID | CSUM_IP_SCTP)) { + /* + * Packets with CSUM_IP_SCTP were sent from the local host using + * checksum offloading. Checksum not required. + */ SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index 1a8a514fbf5f..2092f20e3c22 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -453,6 +453,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); sctp_ifap->localifa_flags = SCTP_ADDR_VALID; +#ifdef INET6 + if (sctp_ifap->address.sa.sa_family == AF_INET6) { + sctp_gather_internal_ifa_flags(sctp_ifap); + } +#endif sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); } @@ -475,6 +480,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, } else { /* Repair ifn_p, which was NULL...
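The reworked sctp_gather_internal_ifa_flags() marks an IPv6 address SCTP_ADDR_IFA_UNUSEABLE when it is deprecated (unless ip6_use_deprecated allows it), detached, duplicated, or anycast. A standalone distillation of that decision table; the flag bits here are illustrative, not the kernel's:

	#include <stdbool.h>

	#define DEPRECATED 0x01	/* illustrative bit values */
	#define DETACHED   0x02
	#define DUPLICATED 0x04
	#define ANYCAST    0x08

	static bool
	addr_unuseable(int flags, bool use_deprecated)
	{
		if (!use_deprecated && (flags & DEPRECATED))
			return (true);
		if (flags & (DETACHED | DUPLICATED))
			return (true);
		/* IPv6 anycast addresses are not supported right now. */
		if (flags & ANYCAST)
			return (true);
		return (false);
	}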
*/ sctp_ifap->localifa_flags = SCTP_ADDR_VALID; +#ifdef INET6 + if (sctp_ifap->address.sa.sa_family == AF_INET6) { + sctp_gather_internal_ifa_flags(sctp_ifap); + } +#endif SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", (void *)sctp_ifnp, (void *)sctp_ifap); @@ -500,6 +510,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, memcpy(&sctp_ifap->address, addr, addr->sa_len); sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; sctp_ifap->flags = ifa_flags; +#ifdef INET6 + if (addr->sa_family == AF_INET6) { + sctp_gather_internal_ifa_flags(sctp_ifap); + } +#endif /* Set scope */ switch (sctp_ifap->address.sa.sa_family) { #ifdef INET @@ -635,7 +650,7 @@ sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, } } SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap); - sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; + sctp_ifap->localifa_flags &= ~SCTP_ADDR_VALID; /* * We don't set the flag. This means that the structure will * hang around in EP's that have bound specific to it until @@ -3050,7 +3065,7 @@ continue_anyway: /* GAK, more FIXME IFA lock? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. */ - error = EINVAL; + error = EADDRNOTAVAIL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } diff --git a/sys/netinet/sctp_sysctl.c b/sys/netinet/sctp_sysctl.c index a39429ec046e..bd2f23f40727 100644 --- a/sys/netinet/sctp_sysctl.c +++ b/sys/netinet/sctp_sysctl.c @@ -265,6 +265,10 @@ sctp_sysctl_copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *st if (sctp_is_addr_restricted(stcb, sctp_ifa)) { continue; } + } else { + if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + continue; + } } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c index 66af716eea52..7d8cb965ab09 100644 --- a/sys/netinet/sctp_timer.c +++ b/sys/netinet/sctp_timer.c @@ -35,7 +35,6 @@ #define _IP_VHL #include <netinet/sctp_os.h> #include <netinet/sctp_pcb.h> - #include <netinet/sctp_var.h> #include <netinet/sctp_sysctl.h> #include <netinet/sctp_timer.h> diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 94d41ff67836..41a49b318cd5 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -77,7 +77,7 @@ struct tcphdr { u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ -}; +} __packed; static __inline uint16_t __tcp_get_flags(const struct tcphdr *th) diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 5b39c94e0e58..b77ebc928809 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -170,6 +170,50 @@ #define NUM_OF_HPTSI_SLOTS 102400 +/* The number of connections after which the dynamic sleep logic kicks in. */ +#define DEFAULT_CONNECTION_THRESHOLD 100 + +/* + * When using the hpts, a TCP stack must make sure + * that once a INP_DROPPED flag is applied to a INP + * that it does not expect tcp_output() to ever be + * called by the hpts. The hpts will *not* call + * any output (or input) functions on a TCB that + * is in the DROPPED state. + * + * This implies final ACK's and RST's that might + * be sent when a TCB is still around must be + * sent from a routine like tcp_respond(). + */ +#define LOWEST_SLEEP_ALLOWED 50 +#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep + * this determines min granularity of the + * hpts. If 1, granularity is 10useconds at + * the cost of more CPU (context switching). 
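The sctp_del_addr_from_vrf() change above is a one-character bitmask fix worth spelling out: "flags &= SCTP_ADDR_VALID" keeps only the VALID bit and wipes every other flag, whereas the intended "flags &= ~SCTP_ADDR_VALID" clears just the VALID bit. A tiny standalone demonstration, using made-up flag values:

	#include <assert.h>
	#include <stdint.h>

	#define ADDR_VALID     0x01	/* illustrative values */
	#define ADDR_DEFER_USE 0x02

	int
	main(void)
	{
		uint32_t flags = ADDR_VALID | ADDR_DEFER_USE;

		/* Buggy: keeps only ADDR_VALID, losing every other bit. */
		assert((flags & ADDR_VALID) == ADDR_VALID);

		/* Fixed: clears only ADDR_VALID, preserving the rest. */
		assert((flags & ~ADDR_VALID) == ADDR_DEFER_USE);
		return (0);
	}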
+ * Note do not set this to 0. + */ +#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP +#define DYNAMIC_MAX_SLEEP 5000 /* 5ms */ + +/* Thresholds for raising/lowering sleep */ +#define SLOTS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */ +#define SLOTS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */ +/* + * Dynamic adjustment of sleeping times is done in "new" mode, + * where we depend mainly on syscall returns and lro returns + * to push the hpts forward, and the timer is only a backstop. + * + * When we are in the "new" mode, i.e. conn_cnt > conn_cnt_thresh, + * we dynamically adjust the time we sleep. If the number of slots + * processed on a wakeup is greater than or equal to + * slots_indicate_less_sleep (1000 slots, i.e. 10ms of wheel time), + * the actual sleep time is cut in half. If it is less than + * slots_indicate_more_sleep (100 slots, i.e. 1ms), the sleep time + * is doubled, bounded by dynamic_min_sleep and dynamic_max_sleep. + */ + /* Each hpts has its own p_mtx which is used for locking */ #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) #define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) @@ -244,11 +288,10 @@ static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); static void tcp_hpts_thread(void *ctx); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; -static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD; +static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD; static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP; - SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Hpts controls"); SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, @@ -366,7 +409,7 @@ sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS) new = hpts_sleep_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { - if ((new < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) || + if ((new < (dynamic_min_sleep/HPTS_USECS_PER_SLOT)) || (new > HPTS_MAX_SLEEP_ALLOWED)) error = EINVAL; else @@ -404,15 +447,15 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep, &sysctl_net_inet_tcp_hpts_min_sleep, "IU", "The minimum time the hpts must sleep before processing more slots"); -static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP; -static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP; +static int slots_indicate_more_sleep = SLOTS_INDICATE_MORE_SLEEP; +static int slots_indicate_less_sleep = SLOTS_INDICATE_LESS_SLEEP; static int tcp_hpts_no_wake_over_thresh = 1; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW, - &ticks_indicate_more_sleep, 0, + &slots_indicate_more_sleep, 0, "If we only process this many or less on a timeout, we need longer sleep on the next callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW, - &ticks_indicate_less_sleep, 0, + &slots_indicate_less_sleep, 0, "If we process this many or more on a timeout, we need less sleep on the next callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, &tcp_hpts_no_wake_over_thresh, 0, @@ -433,38 +476,40 @@ static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, int slots_to_run, int idx, bool from_callout) { - union tcp_log_stackspecific log; - /* - * Unused logs are - * 64 bit - delRate, rttProp, bw_inuse - * 16 bit - cwnd_gain - * 8 bit - bbr_state, bbr_substate, inhpts; - */ - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.flex1 = hpts->p_nxt_slot; -
log.u_bbr.flex2 = hpts->p_cur_slot; - log.u_bbr.flex3 = hpts->p_prev_slot; - log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; - log.u_bbr.flex6 = hpts->p_on_queue_cnt; - log.u_bbr.flex7 = hpts->p_cpu; - log.u_bbr.flex8 = (uint8_t)from_callout; - log.u_bbr.inflight = slots_to_run; - log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; - log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); - log.u_bbr.epoch = hpts->saved_curslot; - log.u_bbr.lt_epoch = hpts->saved_prev_slot; - log.u_bbr.pkts_out = hpts->p_delayed_by; - log.u_bbr.lost = hpts->p_hpts_sleep_time; - log.u_bbr.pacing_gain = hpts->p_cpu; - log.u_bbr.pkt_epoch = hpts->p_runningslot; - log.u_bbr.use_lt_bw = 1; - TCP_LOG_EVENTP(tp, NULL, - &tptosocket(tp)->so_rcv, - &tptosocket(tp)->so_snd, - BBR_LOG_HPTSDIAG, 0, - 0, &log, false, tv); + if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { + union tcp_log_stackspecific log; + /* + * Unused logs are + * 64 bit - delRate, rttProp, bw_inuse + * 16 bit - cwnd_gain + * 8 bit - bbr_state, bbr_substate, inhpts; + */ + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.flex7 = hpts->p_cpu; + log.u_bbr.flex8 = (uint8_t)from_callout; + log.u_bbr.inflight = slots_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usec(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.pacing_gain = hpts->p_cpu; + log.u_bbr.pkt_epoch = hpts->p_runningslot; + log.u_bbr.use_lt_bw = 1; + TCP_LOG_EVENTP(tp, NULL, + &tptosocket(tp)->so_rcv, + &tptosocket(tp)->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); + } } static void @@ -875,7 +920,7 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ return (slot_on); } /* Get the current time relative to the wheel */ - wheel_cts = tcp_tv_to_hptstick(&tv); + wheel_cts = tcp_tv_to_hpts_slot(&tv); /* Map it onto the wheel */ wheel_slot = tick_to_wheel(wheel_cts); /* Now what's the max we can place it at? */ @@ -947,7 +992,7 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ * We need to reschedule the hpts's time-out. */ hpts->p_hpts_sleep_time = slot; - need_new_to = slot * HPTS_TICKS_PER_SLOT; + need_new_to = slot * HPTS_USECS_PER_SLOT; } } /* @@ -1102,7 +1147,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) hpts->p_lasttick = hpts->p_curtick; hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); + tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); if ((hpts->p_on_queue_cnt == 0) || (hpts->p_lasttick == hpts->p_curtick)) { @@ -1118,8 +1163,7 @@ again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); - if (((hpts->p_curtick - hpts->p_lasttick) > - ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) && + if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) && (hpts->p_on_queue_cnt != 0)) { /* * Wheel wrap is occuring, basically we @@ -1200,7 +1244,7 @@ again: * was not any (i.e. if slots_to_run == 1, no delay). 
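The tick-to-slot renames above make the units explicit: one wheel slot spans HPTS_USECS_PER_SLOT (10) microseconds, so a timeval maps onto the wheel as seconds times 100000 plus microseconds divided by 10, exactly as tcp_tv_to_hpts_slot() does. A standalone sketch of the conversion:

	#include <assert.h>
	#include <stdint.h>
	#include <sys/time.h>

	#define USECS_PER_SLOT 10	/* mirrors HPTS_USECS_PER_SLOT */

	static uint32_t
	tv_to_slot(const struct timeval *tv)
	{
		/* 100000 slots per second at 10 usec per slot. */
		return ((tv->tv_sec * 100000) +
		    (tv->tv_usec / USECS_PER_SLOT));
	}

	int
	main(void)
	{
		struct timeval tv = { .tv_sec = 1, .tv_usec = 250 };

		assert(tv_to_slot(&tv) == 100025);
		return (0);
	}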
*/ hpts->p_delayed_by = (slots_to_run - (i + 1)) * - HPTS_TICKS_PER_SLOT; + HPTS_USECS_PER_SLOT; runningslot = hpts->p_runningslot; hptsh = &hpts->p_hptss[runningslot]; @@ -1353,10 +1397,7 @@ again: } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ - if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { - tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, - from_callout); - } + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); @@ -1447,7 +1488,7 @@ no_one: goto again; } no_run: - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); + tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); /* * Set flag to tell that we are done for * any slot input that happens during @@ -1487,7 +1528,7 @@ no_run: } void -__tcp_set_hpts(struct tcpcb *tp, int32_t line) +tcp_set_hpts(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; @@ -1570,7 +1611,7 @@ __tcp_run_hpts(void) ticks_ran = tcp_hptsi(hpts, false); /* We may want to adjust the sleep values here */ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { - if (ticks_ran > ticks_indicate_less_sleep) { + if (ticks_ran > slots_indicate_less_sleep) { struct timeval tv; sbintime_t sb; @@ -1580,7 +1621,7 @@ __tcp_run_hpts(void) /* Reschedule with new to value */ tcp_hpts_set_max_sleep(hpts, 0); tv.tv_sec = 0; - tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; + tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; /* Validate its in the right ranges */ if (tv.tv_usec < hpts->p_mysleep.tv_usec) { hpts->overidden_sleep = tv.tv_usec; @@ -1602,7 +1643,7 @@ __tcp_run_hpts(void) callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } else if (ticks_ran < ticks_indicate_more_sleep) { + } else if (ticks_ran < slots_indicate_more_sleep) { /* For the further sleep, don't reschedule hpts */ hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) @@ -1684,7 +1725,7 @@ tcp_hpts_thread(void *ctx) hpts->p_hpts_active = 1; ticks_ran = tcp_hptsi(hpts, true); tv.tv_sec = 0; - tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; + tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { hpts->hit_callout_thresh = 1; atomic_add_int(&hpts_that_need_softclock, 1); @@ -1698,11 +1739,11 @@ tcp_hpts_thread(void *ctx) * Only adjust sleep time if we were * called from the callout i.e. direct_wake == 0. 
*/ - if (ticks_ran < ticks_indicate_more_sleep) { + if (ticks_ran < slots_indicate_more_sleep) { hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; - } else if (ticks_ran > ticks_indicate_less_sleep) { + } else if (ticks_ran > slots_indicate_less_sleep) { hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) hpts->p_mysleep.tv_usec = dynamic_min_sleep; @@ -1949,7 +1990,7 @@ tcp_hpts_mod_load(void) hpts->p_hpts_sleep_time = hpts_sleep_max; hpts->p_num = i; hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[i] = tcp_tv_to_usectick(&tv); + tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv); hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); hpts->p_cpu = 0xffff; hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1); @@ -1996,7 +2037,7 @@ tcp_hpts_mod_load(void) } } tv.tv_sec = 0; - tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; + tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; hpts->sleeping = tv.tv_usec; sb = tvtosbt(tv); callout_reset_sbt_on(&hpts->co, sb, 0, diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index b097a2b98db9..6172baf2a062 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -26,14 +26,38 @@ #ifndef __tcp_hpts_h__ #define __tcp_hpts_h__ -/* Number of useconds in a hpts tick */ -#define HPTS_TICKS_PER_SLOT 10 +/* Number of useconds represented by an hpts slot */ +#define HPTS_USECS_PER_SLOT 10 #define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 +static inline uint32_t +tcp_tv_to_hpts_slot(const struct timeval *sv) +{ + return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT)); +} + +static inline uint32_t +tcp_tv_to_usec(const struct timeval *sv) +{ + return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); +} + +static inline uint32_t +tcp_tv_to_msec(const struct timeval *sv) +{ + return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); +} + +static inline uint64_t +tcp_tv_to_lusec(const struct timeval *sv) +{ + return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); +} + struct hpts_diag { uint32_t p_hpts_active; /* bbr->flex7 x */ uint32_t p_nxt_slot; /* bbr->flex1 x */ @@ -66,52 +90,16 @@ struct hpts_diag { #define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ #define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) -#define DEFAULT_CONNECTION_THESHOLD 100 +#ifdef _KERNEL /* - * When using the hpts, a TCP stack must make sure - * that once a INP_DROPPED flag is applied to a INP - * that it does not expect tcp_output() to ever be - * called by the hpts. The hpts will *not* call - * any output (or input) functions on a TCB that - * is in the DROPPED state. - * - * This implies final ACK's and RST's that might - * be sent when a TCB is still around must be - * sent from a routine like tcp_respond(). - */ -#define LOWEST_SLEEP_ALLOWED 50 -#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep - * this determines min granularity of the - * hpts. If 1, granularity is 10useconds at - * the cost of more CPU (context switching). - * Note do not set this to 0. 
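Taken together, the hunks above in __tcp_run_hpts() and tcp_hpts_thread() implement a simple multiplicative sleep policy: a wakeup that processed more than slots_indicate_less_sleep slots halves the next sleep, one that processed fewer than slots_indicate_more_sleep doubles it, clamped to [dynamic_min_sleep, dynamic_max_sleep]. A standalone sketch of that policy with the commit's default values:

	#include <assert.h>
	#include <stdint.h>

	#define MORE_SLEEP_THRESH 100	/* ran fewer slots: sleep longer */
	#define LESS_SLEEP_THRESH 1000	/* ran more slots: sleep less */
	#define MIN_SLEEP_USEC    250
	#define MAX_SLEEP_USEC    5000

	static uint32_t
	adjust_sleep(uint32_t sleep_usec, int slots_ran)
	{
		if (slots_ran < MORE_SLEEP_THRESH) {
			sleep_usec *= 2;
			if (sleep_usec > MAX_SLEEP_USEC)
				sleep_usec = MAX_SLEEP_USEC;
		} else if (slots_ran > LESS_SLEEP_THRESH) {
			sleep_usec /= 2;
			if (sleep_usec < MIN_SLEEP_USEC)
				sleep_usec = MIN_SLEEP_USEC;
		}
		return (sleep_usec);
	}

	int
	main(void)
	{
		assert(adjust_sleep(250, 50) == 500);	  /* idle: back off */
		assert(adjust_sleep(4000, 2000) == 2000); /* busy: wake sooner */
		return (0);
	}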
- */ -#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP -#define DYNAMIC_MAX_SLEEP 5000 /* 5ms */ - -/* Thresholds for raising/lowering sleep */ -#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */ -#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */ -/** - * - * Dynamic adjustment of sleeping times is done in "new" mode - * where we are depending on syscall returns and lro returns - * to push hpts forward mainly and the timer is only a backstop. - * - * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh - * then we do a dynamic adjustment on the time we sleep. - * Our threshold is if the lateness of the first client served (in ticks) is - * greater than or equal too ticks_indicate_more_sleep (10ms - * or 10000 ticks). If we were that late, the actual sleep time - * is adjusted down by 50%. If the ticks_ran is less than - * ticks_indicate_more_sleep (100 ticks or 1000usecs). - * - */ + * The following are the definitions for the kernel HPTS interface for managing + * the HPTS ring and the TCBs on it. +*/ -#ifdef _KERNEL void tcp_hpts_init(struct tcpcb *); void tcp_hpts_remove(struct tcpcb *); + static inline bool tcp_in_hpts(struct tcpcb *tp) { @@ -149,57 +137,17 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, #define tcp_hpts_insert(inp, slot) \ tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) -void __tcp_set_hpts(struct tcpcb *tp, int32_t line); -#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) - -void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason); - -void tcp_lro_hpts_init(void); -void tcp_lro_hpts_uninit(void); - -extern int32_t tcp_min_hptsi_time; - -#endif /* _KERNEL */ - -/* - * The following functions should also be available - * to userspace as well. - */ -static __inline uint32_t -tcp_tv_to_hptstick(const struct timeval *sv) -{ - return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT)); -} - -static __inline uint32_t -tcp_tv_to_usectick(const struct timeval *sv) -{ - return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); -} - -static __inline uint32_t -tcp_tv_to_mssectick(const struct timeval *sv) -{ - return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); -} - -static __inline uint64_t -tcp_tv_to_lusectick(const struct timeval *sv) -{ - return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); -} - -#ifdef _KERNEL +void tcp_set_hpts(struct tcpcb *tp); extern int32_t tcp_min_hptsi_time; static inline int32_t get_hpts_min_sleep_time(void) { - return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT); + return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT); } -static __inline uint32_t +static inline uint32_t tcp_gethptstick(struct timeval *sv) { struct timeval tv; @@ -207,10 +155,10 @@ tcp_gethptstick(struct timeval *sv) if (sv == NULL) sv = &tv; microuptime(sv); - return (tcp_tv_to_hptstick(sv)); + return (tcp_tv_to_hpts_slot(sv)); } -static __inline uint64_t +static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { struct timeval tvd; @@ -218,10 +166,10 @@ tcp_get_u64_usecs(struct timeval *tv) if (tv == NULL) tv = &tvd; microuptime(tv); - return (tcp_tv_to_lusectick(tv)); + return (tcp_tv_to_lusec(tv)); } -static __inline uint32_t +static inline uint32_t tcp_get_usecs(struct timeval *tv) { struct timeval tvd; @@ -229,8 +177,15 @@ tcp_get_usecs(struct timeval *tv) if (tv == NULL) tv = &tvd; microuptime(tv); - return (tcp_tv_to_usectick(tv)); + return (tcp_tv_to_usec(tv)); } +/* + * LRO HPTS initialization and uninitialization, only for 
internal use by the + * HPTS code. + */ +void tcp_lro_hpts_init(void); +void tcp_lro_hpts_uninit(void); + #endif /* _KERNEL */ #endif /* __tcp_hpts_h__ */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 29a6b431f311..6492495dc583 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -383,7 +383,7 @@ cc_conn_init(struct tcpcb *tp) } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, tcp_rexmit_max); } if (metrics.hc_ssthresh) { /* @@ -567,8 +567,6 @@ int tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m; - struct in6_ifaddr *ia6; - struct ip6_hdr *ip6; m = *mp; if (m->m_len < *offp + sizeof(struct tcphdr)) { @@ -580,19 +578,6 @@ tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) } } - /* - * draft-itojun-ipv6-tcp-to-anycast - * better place to put this in? - */ - ip6 = mtod(m, struct ip6_hdr *); - ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); - if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { - icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, - (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); - *mp = NULL; - return (IPPROTO_DONE); - } - *mp = m; return (tcp_input_with_port(mp, offp, proto, port)); } @@ -624,7 +609,6 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) int tlen = 0, off; int drop_hdrlen; int thflags; - int rstreason = 0; /* For badport_bandlim accounting purposes */ int lookupflag; uint8_t iptos; struct m_tag *fwd_tag = NULL; @@ -636,6 +620,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ + bool closed_port = false; /* segment is hitting a closed port */ NET_EPOCH_ASSERT(); @@ -664,6 +649,12 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + th->th_sum = 0; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { @@ -724,6 +715,12 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + th->th_sum = 0; } else { struct ipovly *ipov = (struct ipovly *)ip; @@ -907,22 +904,22 @@ findpcb: * XXX MRT Send RST using which routing table? */ if (inp == NULL) { - if (rstreason != 0) { + if ((lookupflag & INPLOOKUP_WILDCARD) == 0) { /* We came here after second (safety) lookup. */ - MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0); - goto dropwithreset; - } - /* - * Log communication attempts to ports that are not - * in use. - */ - if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || - V_tcp_log_in_vain == 2) { - if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) + MPASS(!closed_port); + } else { + /* + * Log communication attempts to ports that are not + * in use. 
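The CSUM_IP_TCP/CSUM_IP6_TCP cases added above cover segments that originated on the local host (for instance from a VM) with transmit checksum offload requested: no checksum was ever computed, so verification is skipped by zeroing th_sum instead of failing the packet. Distilled into a standalone predicate (bit values illustrative):

	#include <stdbool.h>

	#define CSUM_DATA_VALID 0x1	/* illustrative bit values */
	#define CSUM_IP_TCP     0x2	/* tx offload was requested */

	/* Does a received segment still need a software checksum pass? */
	static bool
	needs_sw_cksum(int csum_flags)
	{
		if (csum_flags & CSUM_DATA_VALID)
			return (false);	/* hardware already verified it */
		if (csum_flags & CSUM_IP_TCP)
			return (false);	/* local origin, never checksummed */
		return (true);
	}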
+ */ + if (((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || + V_tcp_log_in_vain == 2) && + (s = tcp_log_vain(NULL, th, (void *)ip, ip6))) { log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); + } + closed_port = true; } - rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK_ASSERT(inp); @@ -1013,12 +1010,12 @@ findpcb: * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ - rstreason = BANDLIM_RST_CLOSEDPORT; + closed_port = true; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { - rstreason = BANDLIM_RST_CLOSEDPORT; + closed_port = true; goto dropwithreset; } @@ -1070,6 +1067,8 @@ findpcb: * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + int result; + /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1079,8 +1078,8 @@ findpcb: /* * NB: syncache_expand() doesn't unlock inp. */ - rstreason = syncache_expand(&inc, &to, th, &so, m, port); - if (rstreason < 0) { + result = syncache_expand(&inc, &to, th, &so, m, port); + if (result < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped @@ -1088,7 +1087,7 @@ findpcb: * to the sender. */ goto dropunlock; - } else if (rstreason == 0) { + } else if (result == 0) { /* * No syncache entry, or ACK was not for our * SYN/ACK. Do our protection against double @@ -1099,7 +1098,8 @@ findpcb: * don't want to sent RST for the second ACK, * so we perform second lookup without wildcard * match, hoping to find the new socket. If - * the ACK is stray indeed, rstreason would + * the ACK is stray indeed, the missing + * INPLOOKUP_WILDCARD flag in lookupflag would * hint the above code that the lookup was a * second attempt. * @@ -1107,7 +1107,6 @@ findpcb: * of the failure cause. */ INP_WUNLOCK(inp); - rstreason = BANDLIM_RST_OPENPORT; lookupflag &= ~INPLOOKUP_WILDCARD; goto findpcb; } @@ -1131,7 +1130,6 @@ tfo_socket_result: V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { - rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; @@ -1198,7 +1196,6 @@ tfo_socket_result: s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); - rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* @@ -1274,7 +1271,6 @@ tfo_socket_result: "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); - rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } @@ -1395,12 +1391,12 @@ dropwithreset: * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ - if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) || - (rstreason == BANDLIM_RST_CLOSEDPORT && - ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && + if (((!closed_port && V_blackhole == 3) || + (closed_port && + ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && (V_blackhole_local || ( #ifdef INET6 - isipv6 ? !in6_localaddr(&ip6->ip6_src) : + isipv6 ? !in6_localip(&ip6->ip6_src) : #endif #ifdef INET !in_localip(ip->ip_src) @@ -1410,7 +1406,7 @@ dropwithreset: ))) goto dropunlock; TCP_PROBE5(receive, NULL, tp, m, tp, th); - tcp_dropwithreset(m, th, tp, tlen, rstreason); + tcp_dropwithreset(m, th, tp, tlen); m = NULL; /* mbuf chain got consumed. */ dropunlock: @@ -1453,7 +1449,7 @@ drop: * is at least 3/8 of the current socket buffer size. * 3. 
receive buffer size has not hit maximal automatic size; * - * If all of the criteria are met we increaset the socket buffer + * If all of the criteria are met, we increase the socket buffer * by a 1/2 (bounded by the max). This allows us to keep ahead * of slow-start but also makes it so our peer never gets limited * by our rwnd which we then open up causing a burst. @@ -1519,7 +1515,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, uint16_t thflags; int acked, ourfinisacked, needoutput = 0; sackstatus_t sack_changed; - int rstreason, todrop, win, incforsyn = 0; + int todrop, win, incforsyn = 0; uint32_t tiwin; uint16_t nsegs; char *s; @@ -1530,7 +1526,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct tcpopt to; int tfo_syn; u_int maxseg = 0; + bool no_data; + no_data = (tlen == 0); thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; sack_changed = SACK_NOCHANGE; @@ -1562,7 +1560,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rstreason = BANDLIM_UNLIMITED; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -1769,7 +1766,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tp->ts_recent = to.to_tsval; } - if (tlen == 0) { + if (no_data) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && @@ -1978,7 +1975,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -1991,7 +1987,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { - rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } else if (thflags & TH_SYN) { @@ -2212,7 +2207,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); tp = tcp_drop(tp, ECONNRESET); - rstreason = BANDLIM_UNLIMITED; } else { tcp_ecn_input_syn_sent(tp, thflags, iptos); tcp_send_challenge_ack(tp, th, m); @@ -2259,7 +2253,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * for the "LAND" DoS attack. 
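The corrected autoscaling comment describes growth by half of the current limit, bounded by the administrative maximum. As a sketch of the arithmetic only, not a copy of the kernel's code:

	#include <stdint.h>

	static uint32_t
	autorcvbuf_grow(uint32_t hiwat, uint32_t max)
	{
		/* Grow the limit by 1/2 of its current size, capped. */
		uint32_t newsize = hiwat + hiwat / 2;

		return (newsize > max ? max : newsize);
	}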
*/ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { - rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -2341,7 +2334,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); - rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } @@ -2572,7 +2564,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); - if (tlen == 0 && + if (no_data && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* @@ -2815,9 +2807,11 @@ enter_recovery: KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && - tp->t_flags & TF_SENTFIN), - ("%s: sent too much", - __func__)); + tp->t_flags & TF_SENTFIN) || + (sent < 2 * maxseg && + tp->t_flags & TF_NODELAY), + ("%s: sent too much: %u>%u", + __func__, sent, maxseg)); tp->snd_limited = 2; } else if (sent > 0) { ++tp->snd_limited; @@ -3126,8 +3120,7 @@ step6: (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ - if (tlen == 0 && - tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; @@ -3437,7 +3430,6 @@ dropafterack: if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { - rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -3449,11 +3441,10 @@ dropafterack: return; dropwithreset: + tcp_dropwithreset(m, th, NULL, tlen); if (tp != NULL) { - tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); - } else - tcp_dropwithreset(m, th, NULL, tlen, rstreason); + } return; drop: @@ -3473,8 +3464,7 @@ drop: * tp may be NULL. */ void -tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, - int tlen, int rstreason) +tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen) { #ifdef INET struct ip *ip; @@ -3514,7 +3504,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, #endif /* Perform bandwidth limiting. */ - if (badport_bandlim(rstreason) < 0) + if (badport_bandlim(BANDLIM_TCP_RST) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ @@ -3745,7 +3735,7 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) * the minimum feasible timer (which is 2 ticks). 
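With the per-cause rstreason gone, the only remaining consumer of "why" is the blackhole logic, which now keys off the closed_port flag, while every emitted RST is rate limited under the single BANDLIM_TCP_RST class. The blackhole decision from the earlier dropwithreset hunk, restated as a standalone predicate (semantics as in the diff: blackhole=1 drops SYNs to closed ports, 2 drops anything to a closed port, 3 also drops segments that would RST an open port):

	#include <stdbool.h>

	static bool
	blackhole_segment(bool closed_port, bool is_syn, int blackhole)
	{
		if (!closed_port)
			return (blackhole == 3);
		return ((blackhole == 1 && is_syn) || blackhole > 1);
	}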
*/ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + max(tp->t_rttmin, rtt + 2), tcp_rexmit_max); /* * We received an ack for a packet that wasn't retransmitted; diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c index e9ad05382b81..e24790ece43d 100644 --- a/sys/netinet/tcp_log_buf.c +++ b/sys/netinet/tcp_log_buf.c @@ -29,6 +29,7 @@ #include <sys/cdefs.h> #include "opt_inet.h" +#include "opt_ddb.h" #include <sys/param.h> #include <sys/arb.h> #include <sys/hash.h> @@ -43,11 +44,18 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> +#ifdef DDB +#include <sys/time.h> +#endif #include <sys/tree.h> #include <sys/stats.h> /* Must come after qmath.h and tree.h */ #include <sys/counter.h> #include <dev/tcp_log/tcp_log_dev.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + #include <net/if.h> #include <net/if_var.h> #include <net/vnet.h> @@ -1840,40 +1848,36 @@ retry: log_buf->tlb_txbuf.tls_sb_ccc = 0; } /* Copy values from tp to the log entry. */ -#define COPY_STAT(f) log_buf->tlb_ ## f = tp->f -#define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f - COPY_STAT_T(state); - COPY_STAT_T(starttime); - COPY_STAT(iss); - COPY_STAT_T(flags); - COPY_STAT(snd_una); - COPY_STAT(snd_max); - COPY_STAT(snd_cwnd); - COPY_STAT(snd_nxt); - COPY_STAT(snd_recover); - COPY_STAT(snd_wnd); - COPY_STAT(snd_ssthresh); - COPY_STAT_T(srtt); - COPY_STAT_T(rttvar); - COPY_STAT(rcv_up); - COPY_STAT(rcv_adv); - COPY_STAT_T(flags2); - COPY_STAT(rcv_nxt); - COPY_STAT(rcv_wnd); - COPY_STAT_T(dupacks); - COPY_STAT_T(segqlen); - COPY_STAT(snd_numholes); + log_buf->tlb_state = tp->t_state; + log_buf->tlb_starttime = tp->t_starttime; + log_buf->tlb_iss = tp->iss; + log_buf->tlb_flags = tp->t_flags; + log_buf->tlb_snd_una = tp->snd_una; + log_buf->tlb_snd_max = tp->snd_max; + log_buf->tlb_snd_cwnd = tp->snd_cwnd; + log_buf->tlb_snd_nxt = tp->snd_nxt; + log_buf->tlb_snd_recover = tp->snd_recover; + log_buf->tlb_snd_wnd = tp->snd_wnd; + log_buf->tlb_snd_ssthresh = tp->snd_ssthresh; + log_buf->tlb_srtt = tp->t_srtt; + log_buf->tlb_rttvar = tp->t_rttvar; + log_buf->tlb_rcv_up = tp->rcv_up; + log_buf->tlb_rcv_adv = tp->rcv_adv; + log_buf->tlb_flags2 = tp->t_flags2; + log_buf->tlb_rcv_nxt = tp->rcv_nxt; + log_buf->tlb_rcv_wnd = tp->rcv_wnd; + log_buf->tlb_dupacks = tp->t_dupacks; + log_buf->tlb_segqlen = tp->t_segqlen; + log_buf->tlb_snd_numholes = tp->snd_numholes; log_buf->tlb_flex1 = 0; log_buf->tlb_flex2 = 0; - COPY_STAT_T(fbyte_in); - COPY_STAT_T(fbyte_out); - COPY_STAT(snd_scale); - COPY_STAT(rcv_scale); + log_buf->tlb_fbyte_in = tp->t_fbyte_in; + log_buf->tlb_fbyte_out = tp->t_fbyte_out; + log_buf->tlb_snd_scale = tp->snd_scale; + log_buf->tlb_rcv_scale = tp->rcv_scale; log_buf->_pad[0] = 0; log_buf->_pad[1] = 0; log_buf->_pad[2] = 0; -#undef COPY_STAT -#undef COPY_STAT_T /* Copy stack-specific info. 
*/ if (stackinfo != NULL) { memcpy(&log_buf->tlb_stackinfo, stackinfo, @@ -2874,10 +2878,11 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags) /* double check log state now that we have the lock */ if (inp->inp_flags & INP_DROPPED) goto done; - if (tp->_t_logstate != TCP_LOG_STATE_OFF) { + if (tcp_bblogging_on(tp)) { struct timeval tv; tcp_log_eventspecific_t log; + memset(&log, 0, sizeof(log)); microuptime(&tv); log.u_sf.offset = offset; log.u_sf.length = nbytes; @@ -2975,3 +2980,370 @@ skip_closed_req: done: INP_WUNLOCK(inp); } + +#ifdef DDB +static void +db_print_indent(int indent) +{ + int i; + + for (i = 0; i < indent; i++) + db_printf(" "); +} + +static void +db_print_tcphdr(struct tcp_log_buffer *tlm_buf) +{ + struct sackblk sack; + struct tcphdr *th; + int cnt, i, j, opt, optlen, num_sacks; + uint32_t val, ecr; + uint16_t mss; + uint16_t flags; + + if ((tlm_buf->tlb_eventflags & TLB_FLAG_HDR) == 0) { + return; + } + th = &tlm_buf->tlb_th; + flags = tcp_get_flags(th); + if (flags & TH_FIN) { + db_printf("F"); + } + if (flags & TH_SYN) { + db_printf("S"); + } + if (flags & TH_RST) { + db_printf("R"); + } + if (flags & TH_PUSH) { + db_printf("P"); + } + if (flags & TH_ACK) { + db_printf("."); + } + if (flags & TH_URG) { + db_printf("U"); + } + if (flags & TH_ECE) { + db_printf("E"); + } + if (flags & TH_CWR) { + db_printf("W"); + } + if (flags & TH_AE) { + db_printf("A"); + } + db_printf(" %u:%u(%u)", ntohl(th->th_seq), + ntohl(th->th_seq) + tlm_buf->tlb_len, tlm_buf->tlb_len); + if (flags & TH_ACK) { + db_printf(" ack %u", ntohl(th->th_ack)); + } + db_printf(" win %u", ntohs(th->th_win)); + if (flags & TH_URG) { + db_printf(" urg %u", ntohs(th->th_urp)); + } + cnt = (th->th_off << 2) - sizeof(struct tcphdr); + if (cnt > 0) { + db_printf(" <"); + for (i = 0; i < cnt; i += optlen) { + opt = tlm_buf->tlb_opts[i]; + if (opt == TCPOPT_EOL || opt == TCPOPT_NOP) { + optlen = 1; + } else { + if (cnt - i < 2) { + break; + } + optlen = tlm_buf->tlb_opts[i + 1]; + if (optlen < 2 || optlen > cnt - i) { + break; + } + } + if (i > 0) { + db_printf(","); + } + switch (opt) { + case TCPOPT_EOL: + db_printf("eol"); + break; + case TCPOPT_NOP: + db_printf("nop"); + break; + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) { + break; + } + bcopy(tlm_buf->tlb_opts + i + 2, &mss, + sizeof(uint16_t)); + db_printf("mss %u", ntohs(mss)); + break; + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) { + break; + } + db_printf("wscale %u", + tlm_buf->tlb_opts[i + 2]); + break; + case TCPOPT_SACK_PERMITTED: + if (optlen != TCPOLEN_SACK_PERMITTED) { + break; + } + db_printf("sackOK"); + break; + case TCPOPT_SACK: + if (optlen == TCPOLEN_SACKHDR || + (optlen - 2) % TCPOLEN_SACK != 0) { + break; + } + num_sacks = (optlen - 2) / TCPOLEN_SACK; + db_printf("sack"); + for (j = 0; j < num_sacks; j++) { + bcopy(tlm_buf->tlb_opts + i + 2 + + j * TCPOLEN_SACK, &sack, + TCPOLEN_SACK); + db_printf(" %u:%u", ntohl(sack.start), + ntohl(sack.end)); + } + break; + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) { + break; + } + bcopy(tlm_buf->tlb_opts + i + 2, &val, + sizeof(uint32_t)); + bcopy(tlm_buf->tlb_opts + i + 6, &ecr, + sizeof(uint32_t)); + db_printf("TS val %u ecr %u", ntohl(val), + ntohl(ecr)); + break; + case TCPOPT_SIGNATURE: + db_printf("md5"); + if (optlen > 2) { + db_printf(" "); + } + for (j = 0; j < optlen - 2; j++) { + db_printf("%02x", + tlm_buf->tlb_opts[i + 2 + j]); + } + break; + case TCPOPT_FAST_OPEN: + db_printf("FO"); + if (optlen > 2) { + db_printf(" "); + } 
+ for (j = 0; j < optlen - 2; j++) { + db_printf("%02x", + tlm_buf->tlb_opts[i + 2 + j]); + } + break; + default: + db_printf("opt=%u len=%u", opt, optlen); + break; + } + } + db_printf(">"); + } +} +static void +db_print_pru(struct tcp_log_buffer *tlm_buf) +{ + switch (tlm_buf->tlb_flex1) { + case PRU_ATTACH: + db_printf("ATTACH"); + break; + case PRU_DETACH: + db_printf("DETACH"); + break; + case PRU_BIND: + db_printf("BIND"); + break; + case PRU_LISTEN: + db_printf("LISTEN"); + break; + case PRU_CONNECT: + db_printf("CONNECT"); + break; + case PRU_ACCEPT: + db_printf("ACCEPT"); + break; + case PRU_DISCONNECT: + db_printf("DISCONNECT"); + break; + case PRU_SHUTDOWN: + db_printf("SHUTDOWN"); + break; + case PRU_RCVD: + db_printf("RCVD"); + break; + case PRU_SEND: + db_printf("SEND"); + break; + case PRU_ABORT: + db_printf("ABORT"); + break; + case PRU_CONTROL: + db_printf("CONTROL"); + break; + case PRU_SENSE: + db_printf("SENSE"); + break; + case PRU_RCVOOB: + db_printf("RCVOOB"); + break; + case PRU_SENDOOB: + db_printf("SENDOOB"); + break; + case PRU_SOCKADDR: + db_printf("SOCKADDR"); + break; + case PRU_PEERADDR: + db_printf("PEERADDR"); + break; + case PRU_CONNECT2: + db_printf("CONNECT2"); + break; + case PRU_FASTTIMO: + db_printf("FASTTIMO"); + break; + case PRU_SLOWTIMO: + db_printf("SLOWTIMO"); + break; + case PRU_PROTORCV: + db_printf("PROTORCV"); + break; + case PRU_PROTOSEND: + db_printf("PROTOSEND"); + break; + case PRU_SEND_EOF: + db_printf("SEND_EOF"); + break; + case PRU_SOSETLABEL: + db_printf("SOSETLABEL"); + break; + case PRU_CLOSE: + db_printf("CLOSE"); + break; + case PRU_FLUSH: + db_printf("FLUSH"); + break; + default: + db_printf("Unknown PRU (%u)", tlm_buf->tlb_flex1); + break; + } + if (tlm_buf->tlb_errno >= 0) { + db_printf(", error: %d", tlm_buf->tlb_errno); + } +} + +static void +db_print_rto(struct tcp_log_buffer *tlm_buf) +{ + tt_what what; + tt_which which; + + what = (tlm_buf->tlb_flex1 & 0xffffff00) >> 8; + which = tlm_buf->tlb_flex1 & 0x000000ff; + switch (what) { + case TT_PROCESSING: + db_printf("Processing "); + break; + case TT_PROCESSED: + db_printf("Processed "); + break; + case TT_STARTING: + db_printf("Starting "); + break; + case TT_STOPPING: + db_printf("Stopping "); + break; + default: + db_printf("Unknown operation (%u) for ", what); + break; + } + switch (which) { + case TT_REXMT: + db_printf("Retransmission "); + break; + case TT_PERSIST: + db_printf("Persist "); + break; + case TT_KEEP: + db_printf("Keepalive "); + break; + case TT_2MSL: + db_printf("2 MSL "); + break; + case TT_DELACK: + db_printf("Delayed ACK "); + break; + default: + db_printf("Unknown (%u) ", which); + break; + } + db_printf("timer"); + if (what == TT_STARTING) { + db_printf(": %u ms", tlm_buf->tlb_flex2); + } +} + +static void +db_print_usersend(struct tcp_log_buffer *tlm_buf) +{ + if ((tlm_buf->tlb_eventflags & TLB_FLAG_RXBUF) == 0) { + return; + } + if ((tlm_buf->tlb_eventflags & TLB_FLAG_TXBUF) == 0) { + return; + } + db_printf("usersend: rcv.acc: %u rcv.ccc: %u snd.acc: %u snd.ccc: %u", + tlm_buf->tlb_rxbuf.tls_sb_acc, tlm_buf->tlb_rxbuf.tls_sb_ccc, + tlm_buf->tlb_txbuf.tls_sb_acc, tlm_buf->tlb_txbuf.tls_sb_ccc); +} + +void +db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent) +{ + struct tcp_log_mem *log_entry; + struct tcp_log_buffer *tlm_buf, *prev_tlm_buf; + int64_t delta_t; + + indent += 2; + prev_tlm_buf = NULL; + STAILQ_FOREACH(log_entry, log_entries, tlm_queue) { + db_print_indent(indent); + tlm_buf = &log_entry->tlm_buf; + if 
(prev_tlm_buf == NULL) { + db_printf(" 0.000 "); + } else { + delta_t = sbttoms(tvtosbt(tlm_buf->tlb_tv) - + tvtosbt(prev_tlm_buf->tlb_tv)); + db_printf("+%u.%03u ", (uint32_t)(delta_t / 1000), + (uint32_t)(delta_t % 1000)); + } + switch (tlm_buf->tlb_eventid) { + case TCP_LOG_IN: + db_printf("< "); + db_print_tcphdr(tlm_buf); + break; + case TCP_LOG_OUT: + db_printf("> "); + db_print_tcphdr(tlm_buf); + break; + case TCP_LOG_RTO: + db_print_rto(tlm_buf); + break; + case TCP_LOG_PRU: + db_print_pru(tlm_buf); + break; + case TCP_LOG_USERSEND: + db_print_usersend(tlm_buf); + break; + default: + break; + } + db_printf("\n"); + prev_tlm_buf = tlm_buf; + if (db_pager_quit) + break; + } +} +#endif diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h index 9ee2d97d47c2..f8c064b6a104 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -60,14 +60,6 @@ struct tcp_log_verbose uint8_t _pad[4]; } ALIGN_TCP_LOG; -/* Internal RACK state variables. */ -struct tcp_log_rack -{ - uint32_t tlr_rack_rtt; /* rc_rack_rtt */ - uint8_t tlr_state; /* Internal RACK state */ - uint8_t _pad[3]; /* Padding */ -}; - struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; @@ -126,7 +118,6 @@ struct tcp_log_sendfile { */ union tcp_log_stackspecific { - struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; struct tcp_log_sendfile u_sf; struct tcp_log_raw u_raw; /* "raw" log access */ @@ -185,7 +176,6 @@ struct tcp_log_buffer uint8_t _pad[3]; /* Padding */ /* Per-stack info */ union tcp_log_stackspecific tlb_stackinfo; -#define tlb_rack tlb_stackinfo.u_rack /* The packet */ uint32_t tlb_len; /* The packet's data length */ @@ -387,12 +377,12 @@ extern int32_t tcp_trace_point_count; /* * Returns true if any sort of BB logging is enabled, - * commonly used throughout the codebase. + * commonly used throughout the codebase. */ static inline int tcp_bblogging_on(struct tcpcb *tp) { - if (tp->_t_logstate <= TCP_LOG_STATE_OFF) + if (tp->_t_logstate <= TCP_LOG_STATE_OFF) return (0); if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) return (0); @@ -437,7 +427,7 @@ tcp_set_bblog_state(struct tcpcb *tp, uint8_t ls, uint8_t bbpoint) } } -static inline uint32_t +static inline uint32_t tcp_get_bblog_state(struct tcpcb *tp) { return (tp->_t_logstate); @@ -549,12 +539,12 @@ struct tcpcb; NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ +/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). 
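db_print_tcphdr() renders each logged segment in a tcpdump-like shape: flag letters, seq:seq+len(len), the ACK and window, then the decoded options, and db_print_bblog_entries() prefixes every entry with its millisecond delta from the previous one plus a direction marker ("<" for TCP_LOG_IN, ">" for TCP_LOG_OUT). With hypothetical values, a logged SYN exchange would come out roughly as:

	 0.000 > S 1000:1000(0) win 65535 <mss 1460,wscale 6,sackOK,TS val 1 ecr 0>
	+0.042 < S. 7000:7000(0) ack 1001 win 65535 <mss 1460,wscale 7,sackOK,TS val 9 ecr 1>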
*/ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ - if (tcp_bblogging_on(tp)) \ - tcp_log_event(tp, th, rxbuf, txbuf, eventid, \ - errornum, len, stackinfo, th_hostorder, \ - NULL, NULL, 0, tv); \ + KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \ + tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \ + stackinfo, th_hostorder, NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX @@ -580,6 +570,9 @@ void tcp_log_flowend(struct tcpcb *tp); void tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags); int tcp_log_apply_ratio(struct tcpcb *tp, int ratio); +#ifdef DDB +void db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent); +#endif #else /* !TCP_BLACKBOX */ #define tcp_log_verbose (false) diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 10afed17bf3b..7512679bd4e9 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1301,9 +1301,9 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h return (TCP_LRO_CANNOT); #endif if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != - ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || + ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || (m->m_pkthdr.csum_data != 0xffff)) { - /* + /* * The checksum either did not have hardware offload * or it was a bad checksum. We can't LRO such * a packet. @@ -1334,7 +1334,7 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h #endif /* If no hardware or arrival stamp on the packet add timestamp */ if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) { - m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); + m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); m->m_flags |= M_TSTMP_LRO; } @@ -1429,9 +1429,9 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) int error; if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != - ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || + ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || (m->m_pkthdr.csum_data != 0xffff)) { - /* + /* * The checksum either did not have hardware offload * or it was a bad checksum. We can't LRO such * a packet. @@ -1481,7 +1481,7 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) ((mb->m_flags & M_TSTMP) == 0)) { /* Add in an LRO time since no hardware */ binuptime(&lc->lro_last_queue_time); - mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); + mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); mb->m_flags |= M_TSTMP_LRO; } diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c index 7e756285da45..43587285fe26 100644 --- a/sys/netinet/tcp_lro_hpts.c +++ b/sys/netinet/tcp_lro_hpts.c @@ -188,7 +188,7 @@ tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc, log.u_bbr.cur_del_rate = (uintptr_t)m; log.u_bbr.bw_inuse = (uintptr_t)le->m_head; bintime2timeval(&lc->lro_last_queue_time, &btv); - log.u_bbr.flex6 = tcp_tv_to_usectick(&btv); + log.u_bbr.flex6 = tcp_tv_to_usec(&btv); log.u_bbr.flex7 = le->compressed; log.u_bbr.pacing_gain = le->uncompressed; if (in_epoch(net_epoch_preempt)) diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index bc5b42ee6f2c..2dfb7faf56e3 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1250,7 +1250,7 @@ send: * fack acks recoverypoint. 
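Since TCP_LOG_EVENTP now asserts rather than checks, the tcp_bblogging_on() test (and the zeroing of the stack-specific union) moves to the call sites, exactly as the tcp_hpts_log() rework earlier in this commit does. The expected shape of a call site, sketched from that hunk:

	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		microuptime(&tv);
		log.u_bbr.timeStamp = tcp_tv_to_usec(&tv);
		/* ... fill in whatever fields the stack logs ... */
		TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
		    &tptosocket(tp)->so_snd, BBR_LOG_HPTSDIAG, 0, 0,
		    &log, false, &tv);
	}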
*/ if ((tp->t_flags & TF_LRD) && SEQ_GEQ(p->rxmit, p->end)) - p->rxmit = tp->snd_recover; + p->rxmit = SEQ_MAX(p->rxmit, tp->snd_recover); tp->sackhint.sack_bytes_rexmit += len; } if (IN_RECOVERY(tp->t_flags)) { diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index 90d789f0e224..b6c55fac50b3 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -283,7 +283,7 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) INP_WLOCK_ASSERT(tptoinpcb(tp)); /* Check arguments. */ - KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end")); + KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("SEQ_GT(rcv_start, rcv_end)")); if ((rcv_start == rcv_end) && (tp->rcv_numsacks >= 1) && @@ -498,8 +498,8 @@ tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) tp->snd_numholes--; atomic_subtract_int(&V_tcp_sack_globalholes, 1); - KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); - KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); + KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes < 0")); + KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes < 0")); } /* @@ -583,6 +583,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) */ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { left_edge_delta = th_ack - tp->snd_una; + delivered_data += left_edge_delta; sack_blocks[num_sack_blks].start = tp->snd_una; sack_blocks[num_sack_blks++].end = th_ack; /* @@ -590,7 +591,6 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) * due to DSACK blocks */ if (SEQ_LT(tp->snd_fack, th_ack)) { - delivered_data += th_ack - tp->snd_una; tp->snd_fack = th_ack; sack_changed = SACK_CHANGE; } @@ -684,7 +684,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) delivered_data += sblkp->end - sblkp->start; tp->sackhint.hole_bytes += temp->end - temp->start; KASSERT(tp->sackhint.hole_bytes >= 0, - ("sackhint hole bytes >= 0")); + ("sackhint hole bytes < 0")); tp->snd_fack = sblkp->end; sblkp--; sack_changed = SACK_NEWLOSS; @@ -744,7 +744,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) while (cur != NULL) { if (!(sblkp >= sack_blocks)) { if (((loss_sblks >= tcprexmtthresh) || - (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) + (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) break; loss_thresh += loss_hiack - cur->end; loss_hiack = cur->start; @@ -783,7 +783,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) tp->sackhint.sack_bytes_rexmit -= (SEQ_MIN(cur->rxmit, cur->end) - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, - ("sackhint bytes rtx >= 0")); + ("sackhint bytes rtx < 0")); sack_changed = SACK_CHANGE; if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole.
*/ @@ -816,7 +816,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end)) - cur->rxmit = tp->snd_recover; + cur->rxmit = SEQ_MAX(cur->rxmit, tp->snd_recover); } else { /* * ACKs some data in middle of a hole; need @@ -843,7 +843,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end)) - cur->rxmit = tp->snd_recover; + cur->rxmit = SEQ_MAX(cur->rxmit, tp->snd_recover); delivered_data += (sblkp->end - sblkp->start); } } @@ -874,13 +874,13 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) if (TAILQ_EMPTY(&tp->snd_holes)) { KASSERT(tp->sackhint.hole_bytes == 0, - ("SACK scoreboard empty, but accounting non-zero\n")); + ("SACK scoreboard empty, but sackhint hole bytes != 0")); tp->sackhint.sack_bytes_rexmit = 0; tp->sackhint.sacked_bytes = 0; tp->sackhint.lost_bytes = 0; } else { KASSERT(tp->sackhint.hole_bytes > 0, - ("SACK scoreboard not empty, but has no bytes\n")); + ("SACK scoreboard not empty, but sackhint hole bytes <= 0")); tp->sackhint.delivered_data = delivered_data; tp->sackhint.sacked_bytes += delivered_data - left_edge_delta; KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0")); @@ -918,9 +918,9 @@ tcp_free_sackholes(struct tcpcb *tp) tp->sackhint.hole_bytes = 0; tp->sackhint.lost_bytes = 0; - KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); + KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes != 0")); KASSERT(tp->sackhint.nexthole == NULL, - ("tp->sackhint.nexthole == NULL")); + ("tp->sackhint.nexthole != NULL")); } /* @@ -1061,11 +1061,15 @@ tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) } } } - KASSERT(SEQ_LT(hole->start, hole->end), ("%s: hole.start >= hole.end", __func__)); + KASSERT(SEQ_LT(hole->start, hole->end), + ("%s: SEQ_GEQ(hole.start, hole.end)", __func__)); if (!(V_tcp_do_newsack)) { - KASSERT(SEQ_LT(hole->start, tp->snd_fack), ("%s: hole.start >= snd.fack", __func__)); - KASSERT(SEQ_LT(hole->end, tp->snd_fack), ("%s: hole.end >= snd.fack", __func__)); - KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack), ("%s: hole.rxmit >= snd.fack", __func__)); + KASSERT(SEQ_LT(hole->start, tp->snd_fack), + ("%s: SEQ_GEQ(hole.start, snd.fack)", __func__)); + KASSERT(SEQ_LT(hole->end, tp->snd_fack), + ("%s: SEQ_GEQ(hole.end, snd.fack)", __func__)); + KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack), + ("%s: SEQ_GEQ(hole.rxmit, snd.fack)", __func__)); if (SEQ_GEQ(hole->start, hole->end) || SEQ_GEQ(hole->start, tp->snd_fack) || SEQ_GEQ(hole->end, tp->snd_fack) || diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 17a0744961ce..fed259f4d8e1 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -2173,7 +2173,7 @@ bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin) log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay; log.u_bbr.flex4 = bbr->rc_tp->ts_offset; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; - log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv); + log.u_bbr.pkts_out = tcp_tv_to_msec(&bbr->rc_tv); log.u_bbr.flex6 = tsin; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = bbr->rc_ack_was_delayed; @@ -2241,13 +2241,13 @@ bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uin mbuf_tstmp2timespec(m, &ts); tv.tv_sec = ts.tv_sec; tv.tv_usec = ts.tv_nsec / 1000; - log.u_bbr.lt_epoch =
tcp_tv_to_usectick(&tv); + log.u_bbr.lt_epoch = tcp_tv_to_usec(&tv); } else { log.u_bbr.lt_epoch = 0; } if (m->m_flags & M_TSTMP_LRO) { mbuf_tstmp2timeval(m, &tv); - log.u_bbr.flex5 = tcp_tv_to_usectick(&tv); + log.u_bbr.flex5 = tcp_tv_to_usec(&tv); } else { /* No arrival timestamp */ log.u_bbr.flex5 = 0; @@ -5126,8 +5126,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) tp->t_maxseg = tp->t_pmtud_saved_maxseg; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; @@ -6792,7 +6792,7 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, (ack_type == BBR_CUM_ACKED) && (to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { - t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr; + t = tcp_tv_to_msec(&bbr->rc_tv) - to->to_tsecr; if (t < 1) t = 1; t *= MS_IN_USEC; @@ -7330,7 +7330,7 @@ bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, uint32_t ts, now, rtt; ts = bbr_ts_convert(to->to_tsecr); - now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv)); + now = bbr_ts_convert(tcp_tv_to_msec(&bbr->rc_tv)); rtt = now - ts; if (rtt < 1) rtt = 1; @@ -7863,7 +7863,7 @@ nothing_left: /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); - ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); BBR_STAT_INC(bbr_dropped_af_data); return (1); } @@ -8461,7 +8461,7 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, } if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -8893,7 +8893,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((to->to_flags & TOF_TS) != 0) { uint32_t t, rtt; - t = tcp_tv_to_mssectick(&bbr->rc_tv); + t = tcp_tv_to_msec(&bbr->rc_tv); if (TSTMP_GEQ(t, to->to_tsecr)) { rtt = t - to->to_tsecr; if (rtt == 0) { @@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - 
ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -9034,7 +9034,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } tp->snd_wnd = tiwin; @@ -9067,7 +9067,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((to->to_flags & TOF_TS) != 0) { uint32_t t, rtt; - t = tcp_tv_to_mssectick(&bbr->rc_tv); + t = tcp_tv_to_msec(&bbr->rc_tv); if (TSTMP_GEQ(t, to->to_tsecr)) { rtt = t - to->to_tsecr; if (rtt == 0) { @@ -9258,7 +9258,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -9355,7 +9355,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -9405,7 +9405,7 @@ close_now: tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); KMOD_TCPSTAT_INC(tcps_rcvafterclose); - ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); + ctf_do_dropwithreset(m, tp, th, *tlen); return (1); } if (sbavail(&so->so_snd) == 0) @@ -9486,7 +9486,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -9602,7 +9602,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) 
!= 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -9704,7 +9704,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -9818,7 +9818,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* @@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -10141,7 +10141,7 @@ bbr_init(struct tcpcb *tp, void **ptr) * flags. */ bbr_stop_all_timers(tp, bbr); - /* + /* * Validate the timers are not in usec, if they are convert. * BBR should in theory move to USEC and get rid of a * lot of the TICKS_2 calls.. but for now we stay @@ -10150,7 +10150,7 @@ bbr_init(struct tcpcb *tp, void **ptr) tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, tcp_rexmit_max); bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); return (0); } @@ -11327,7 +11327,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, mbuf_tstmp2timespec(m, &ts); bbr->rc_tv.tv_sec = ts.tv_sec; bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; - bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); + bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usec(&bbr->rc_tv); } else if (m->m_flags & M_TSTMP_LRO) { /* Next the arrival timestamp */ struct timespec ts; @@ -11335,7 +11335,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, mbuf_tstmp2timespec(m, &ts); bbr->rc_tv.tv_sec = ts.tv_sec; bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; - bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); + bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usec(&bbr->rc_tv); } else { /* * Ok just get the current time. 
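Note: the bbr.c hunks above and below are a mechanical rename of the timestamp conversion helpers (tcp_tv_to_usectick() -> tcp_tv_to_usec(), tcp_tv_to_mssectick() -> tcp_tv_to_msec()); the arithmetic itself is unchanged. For reference, a minimal standalone sketch of the conversion pattern they rely on — an mbuf nanosecond timestamp reduced to a timeval (tv_usec = tv_nsec / 1000) and then to a 32-bit microsecond count — is given here; ns_tstmp_to_usec32() is an illustrative name, not a kernel API:

#include <stdint.h>

/*
 * Sketch under assumed semantics: mbuf_tstmp2timespec() splits a
 * nanosecond timestamp into seconds and nanoseconds, the hunks above
 * divide tv_nsec by 1000 to form a timeval, and tcp_tv_to_usec() then
 * collapses that to 32 bits; wraparound mod 2^32 is harmless because
 * only deltas of these values are ever used.
 */
static uint32_t
ns_tstmp_to_usec32(uint64_t tstmp_ns)
{
	uint64_t sec = tstmp_ns / 1000000000ULL;
	uint64_t usec = (tstmp_ns % 1000000000ULL) / 1000ULL;

	return ((uint32_t)(sec * 1000000ULL + usec));
}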
@@ -11376,7 +11376,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; - if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv))) + if (TSTMP_GT(to.to_tsecr, tcp_tv_to_msec(&bbr->rc_tv))) to.to_tsecr = 0; } /* @@ -11414,7 +11414,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, (tp->t_flags & TF_REQ_TSTMP)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; - tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv); } else tp->t_flags &= ~TF_REQ_TSTMP; if (to.to_flags & TOF_MSS) @@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } if (tiwin > bbr->r_ctl.rc_high_rwnd) @@ -11544,7 +11544,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (nxt_pkt == 0) { if ((bbr->r_wanted_output != 0) || - (tp->t_flags & TF_ACKNOW)) { + (tp->t_flags & TF_ACKNOW)) { bbr->rc_output_starts_timer = 0; did_out = 1; @@ -11870,7 +11870,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* We take a cache hit here */ memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); - cts = tcp_tv_to_usectick(&bbr->rc_tv); + cts = tcp_tv_to_usec(&bbr->rc_tv); inp = bbr->rc_inp; hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); tp->t_flags2 &= ~TF2_HPTS_CALLS; @@ -12885,7 +12885,7 @@ send: /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { - to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset; + to.to_tsval = tcp_tv_to_msec(&bbr->rc_tv) + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; local_options += TCPOLEN_TIMESTAMP + 2; @@ -12893,7 +12893,7 @@ send: /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) - tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->rfbuf_ts = tcp_tv_to_msec(&bbr->rc_tv); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; @@ -13172,11 +13172,7 @@ send: mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, - ((rsm == NULL) ? hw_tls : 0) -#ifdef NETFLIX_COPY_ARGS - , NULL, NULL -#endif - ); + ((rsm == NULL) ? hw_tls : 0)); if (len <= maxseg) { /* * Must have ran out of mbufs for the copy @@ -13806,8 +13802,8 @@ nomore: tp->t_maxseg = old_maxseg - 40; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. 
*/ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; @@ -14127,17 +14123,17 @@ bbr_switch_failed(struct tcpcb *tp) toval = bbr->rc_pacer_started - cts; } else { /* one slot please */ - toval = HPTS_TICKS_PER_SLOT; + toval = HPTS_USECS_PER_SLOT; } } else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) { toval = bbr->r_ctl.rc_timer_exp - cts; } else { /* one slot please */ - toval = HPTS_TICKS_PER_SLOT; + toval = HPTS_USECS_PER_SLOT; } } else - toval = HPTS_TICKS_PER_SLOT; + toval = HPTS_USECS_PER_SLOT; (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), __LINE__, &diag); bbr_log_hpts_diag(bbr, cts, &diag); diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index f5bc435890e7..71dd4de6baf9 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -40,7 +40,6 @@ #endif #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/lock.h> #include <sys/mutex.h> #include <sys/mbuf.h> #include <sys/proc.h> /* for proc0 declaration */ @@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0; static uint32_t rack_pcm_is_enabled = 1; static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */ -static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */ +static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */ static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */ @@ -605,7 +604,7 @@ rack_get_lt_bw(struct tcp_rack *rack) /* Include all the current bytes too */ microuptime(&tv); bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); - tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); + tim += (tcp_tv_to_lusec(&tv) - rack->r_ctl.lt_timemark); } if ((bytes != 0) && (tim != 0)) return ((bytes * (uint64_t)1000000) / tim); @@ -621,7 +620,7 @@ rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) struct tcpcb *tp; uint32_t old_beta; uint32_t old_beta_ecn; - int error, failed = 0; + int error = 0, failed = 0; tp = rack->rc_tp; if (tp->t_cc == NULL) { @@ -684,7 +683,7 @@ out: struct newreno *ptr; ptr = ((struct newreno *)tp->t_ccv.cc_data); - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = ptr->beta; log.u_bbr.flex2 = ptr->beta_ecn; @@ -938,7 +937,7 @@ rack_init_sysctls(void) SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "time_between", CTLFLAG_RW, - & rack_time_between_probertt, 96000000, + &rack_time_between_probertt, 96000000, "How many useconds between the lowest rtt falling must past before we enter probertt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), @@ -2246,7 +2245,7 @@ rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) ent = rack->r_ctl.rc_last_sft; microuptime(&tv); - timenow = tcp_tv_to_lusectick(&tv); + timenow = tcp_tv_to_lusec(&tv); if (timenow >= ent->deadline) { /* No time left we do DGP only */ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, @@ -2678,7 +2677,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t */ return; } - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = tsused; log.u_bbr.flex2 = thresh; log.u_bbr.flex3 = rsm->r_flags; @@ -2709,7 +2708,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot 
union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rack->rc_tp->t_srtt; log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; @@ -2752,7 +2751,7 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; @@ -2792,7 +2791,7 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex8 = flag; log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.cur_del_rate = (uintptr_t)prev; @@ -2840,7 +2839,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = t; log.u_bbr.flex2 = len; @@ -2889,7 +2888,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; - log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + log.u_bbr.bw_inuse = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); log.u_bbr.bw_inuse <<= 32; if (rsm) log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); @@ -3013,7 +3012,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; @@ -3042,7 +3041,7 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = slot; if (rack->rack_no_prr) @@ -3149,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; @@ -3185,7 +3184,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32 if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; @@ -3230,7 +3229,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack, /* No you can't use 1, its for the real to cancel */ return; } - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; @@ -3255,7 +3254,7 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t 
ret, int32_t union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; @@ -3285,7 +3284,7 @@ rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; if (rack->rack_no_prr) @@ -3480,16 +3479,16 @@ static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_flags & RACK_APP_LIMITED) { - if (rack->r_ctl.rc_app_limited_cnt > 0) { - rack->r_ctl.rc_app_limited_cnt--; - } + KASSERT((rack->r_ctl.rc_app_limited_cnt > 0), + ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm)); + rack->r_ctl.rc_app_limited_cnt--; } if (rsm->r_limit_type) { /* currently there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } if (rsm == rack->r_ctl.rc_first_appl) { - rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start); + rack->r_ctl.cleared_app_ack_seq = rsm->r_end; rack->r_ctl.cleared_app_ack = 1; if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_first_appl = NULL; @@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) * earlier. * * So lets calculate the BDP with the "known" b/w using - * the SRTT has our rtt and then multiply it by the - * goal. + * the SRTT as our rtt and then multiply it by the goal. */ bw = rack_get_bw(rack); srtt = (uint64_t)tp->t_srtt; @@ -3646,7 +3644,7 @@ rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ } /* Now what about time? 
*/ srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); - tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; + tim = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - tp->gput_ts; if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { /* * We do not allow a measurement if we are in recovery @@ -4118,7 +4116,7 @@ rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; @@ -4864,7 +4862,7 @@ rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, ui union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = add_part; log.u_bbr.flex2 = sub_part; @@ -4893,7 +4891,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, uint64_t resid_bw, subpart = 0, addpart = 0, srtt; int did_add = 0; - us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); if (TSTMP_GEQ(us_cts, tp->gput_ts)) tim = us_cts - tp->gput_ts; @@ -5214,7 +5212,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.current_round; log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; @@ -5250,7 +5248,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.current_round; log.u_bbr.flex2 = (uint32_t)gp_est; @@ -5357,7 +5355,7 @@ skip_measurement: rack->r_ctl.rc_gp_lowrtt = 0xffffffff; rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; - tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); rack->app_limited_needs_set = 0; tp->gput_seq = th_ack; if (rack->in_probe_rtt) @@ -5492,7 +5490,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); rack->r_ctl.lt_seq = tp->snd_max; - tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); if (tmark >= rack->r_ctl.lt_timemark) { rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); } @@ -5533,7 +5531,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = th_ack; log.u_bbr.flex2 = tp->t_ccv.flags; @@ -5648,7 +5646,7 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = th_ack; log.u_bbr.flex2 = tp->t_ccv.flags; @@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t 
ack, int line) tp->t_badrxtwin = 0; break; } - if ((CC_ALGO(tp)->cong_signal != NULL) && + if ((CC_ALGO(tp)->cong_signal != NULL) && (type != CC_RTO)){ tp->t_ccv.curack = ack; CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); @@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as - * passed we no longer consider reordering has occuring. + * passed we no longer consider reordering as occurring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro @@ -6347,7 +6345,7 @@ activate_tlp: if (to < rack_tlp_min) { to = rack_tlp_min; } - if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { + if (to > TICKS_2_USEC(tcp_rexmit_max)) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. @@ -6392,7 +6390,7 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_se rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); rack->r_ctl.lt_seq = snd_una; - tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); if (tmark >= rack->r_ctl.lt_timemark) { rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); } @@ -6481,7 +6479,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = diag->p_nxt_slot; log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; @@ -6520,7 +6518,7 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = sb->sb_flags; log.u_bbr.flex2 = len; log.u_bbr.flex3 = sb->sb_state; @@ -6594,22 +6592,22 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * on the clock. We always have a min * 10 slots (10 x 10 i.e. 100 usecs). 
*/ - if (slot <= HPTS_TICKS_PER_SLOT) { + if (slot <= HPTS_USECS_PER_SLOT) { /* We gain delay */ - rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); - slot = HPTS_TICKS_PER_SLOT; + rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot); + slot = HPTS_USECS_PER_SLOT; } else { /* We take off some */ - rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); - slot = HPTS_TICKS_PER_SLOT; + rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT); + slot = HPTS_USECS_PER_SLOT; } } else { slot -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; /* Make sure we have 100 useconds at minimum */ - if (slot < HPTS_TICKS_PER_SLOT) { - rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; - slot = HPTS_TICKS_PER_SLOT; + if (slot < HPTS_USECS_PER_SLOT) { + rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot; + slot = HPTS_USECS_PER_SLOT; } if (rack->r_ctl.rc_agg_delayed == 0) rack->r_late = 0; @@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, /* Push bit must go to the right edge as well */ if (rsm->r_flags & RACK_HAD_PUSH) rsm->r_flags &= ~RACK_HAD_PUSH; + /* Update the count if app limited */ + if (nrsm->r_flags & RACK_APP_LIMITED) + rack->r_ctl.rc_app_limited_cnt++; /* Clone over the state of the hw_tls flag */ nrsm->r_hw_tls = rsm->r_hw_tls; /* @@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack, l_rsm->r_flags |= RACK_TLP; if (r_rsm->r_flags & RACK_RWND_COLLAPSED) l_rsm->r_flags |= RACK_RWND_COLLAPSED; - if ((r_rsm->r_flags & RACK_APP_LIMITED) && + if ((r_rsm->r_flags & RACK_APP_LIMITED) && ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { /* * If both are app-limited then let the @@ -7887,8 +7888,8 @@ drop_it: tp->t_maxseg = tp->t_pmtud_saved_maxseg; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; @@ -8032,6 +8033,7 @@ skip_time_check: ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { rack->r_ctl.rc_tlp_rxt_last_time = cts; + rack->r_fast_output = 0; ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); } else if (timers & PACE_TMR_RXT) { rack->r_ctl.rc_tlp_rxt_last_time = cts; @@ -8136,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, * remove the lost desgination and reduce the * bytes considered lost. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -8778,7 +8780,7 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) } stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); #endif - rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); + rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); /* * the retransmit should happen at rtt + 4 * rttvar. 
Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 @@ -8831,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts val = rack_probertt_lower_within * rack_time_between_probertt; val /= 100; - if ((rack->in_probe_rtt == 0) && + if ((rack->in_probe_rtt == 0) && (rack->rc_skip_timely == 0) && ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { rack_enter_probertt(rack, us_cts); @@ -8884,8 +8886,8 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_rack_min_rtt = 1; } } - if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) - us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) + us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; else us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; if (us_rtt == 0) @@ -8894,7 +8896,7 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, /* Kick the RTT to the CC */ CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); } - rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); + rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usec(&rack->r_ctl.act_rcv_time)); if (ack_type == SACKED) { rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); @@ -8989,8 +8991,8 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, * we retransmitted. This is because * we match the timestamps. */ - if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) - us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; + if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) + us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; else us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); @@ -9183,7 +9185,7 @@ rack_need_set_test(struct tcpcb *tp, seq = tp->gput_seq; ts = tp->gput_ts; rack->app_limited_needs_set = 0; - tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); /* Do we start at a new end? */ if ((use_which == RACK_USE_BEG) && SEQ_GEQ(rsm->r_start, tp->gput_seq)) { @@ -10368,7 +10370,7 @@ more: * and yet before retransmitting we get an ack * which can happen due to reordering. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -10818,7 +10820,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered changed = th_ack - rsm->r_start; if (changed) { rack_process_to_cumack(tp, rack, th_ack, cts, to, - tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); + tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time)); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done nothing left and no sack. 
*/ @@ -11064,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) * We need to skip anything already set * to be retransmitted. */ - if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || (rsm->r_flags & RACK_MUST_RXT)) { rsm = TAILQ_NEXT(rsm, r_tnext); continue; @@ -11696,7 +11698,7 @@ rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) rack_log_hybrid_sends(rack, ent, __LINE__); /* calculate the time based on the ack arrival */ data = ent->end - ent->start; - laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + laa = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); if (ent->flags & TCP_TRK_TRACK_FLG_FSND) { if (ent->first_send > ent->localtime) ftim = ent->first_send; @@ -11842,7 +11844,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, * less than and we have not closed our window. */ if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { - rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); if (rack->r_ctl.rc_reorder_ts == 0) rack->r_ctl.rc_reorder_ts = 1; } @@ -12036,7 +12038,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); - ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } } @@ -12874,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -13088,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -13101,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -13135,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -13398,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -13494,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if 
(ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -13516,7 +13518,7 @@ rack_check_data_after_close(struct mbuf *m, tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); KMOD_TCPSTAT_INC(tcps_rcvafterclose); - ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); + ctf_do_dropwithreset(m, tp, th, *tlen); return (1); } if (sbavail(&so->so_snd) == 0) @@ -13644,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -13745,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -13847,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -13951,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, tlen); return (1); } } @@ -14227,7 +14229,7 @@ rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex8 = mod; log.u_bbr.flex1 = flex1; @@ -14366,17 +14368,17 @@ rack_switch_failed(struct tcpcb *tp) toval = rack->r_ctl.rc_last_output_to - cts; } else { /* one slot please */ - toval = HPTS_TICKS_PER_SLOT; + toval = HPTS_USECS_PER_SLOT; } } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { toval = rack->r_ctl.rc_timer_exp - cts; } else { /* one slot please */ - toval = HPTS_TICKS_PER_SLOT; + toval = HPTS_USECS_PER_SLOT; } } else - toval = HPTS_TICKS_PER_SLOT; + toval = HPTS_USECS_PER_SLOT; (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), __LINE__, &diag); rack_log_hpts_diag(rack, cts, &diag, &tv); @@ -14636,9 +14638,6 @@ rack_init(struct tcpcb *tp, void **ptr) if (rack->r_ctl.pcm_s == NULL) { rack->r_ctl.pcm_i.cnt_alloc = 0; } -#ifdef NETFLIX_STATS - rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask; -#endif rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; if (rack_enable_shared_cwnd) @@ -14744,12 +14743,12 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_ctl.rack_per_of_gp_ss = 250; } rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; - 
rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); - rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); + rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); + rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, rack_probertt_filter_life); - us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_of_last_probertt = us_cts; rack->r_ctl.rc_went_idle_time = us_cts; @@ -14958,7 +14957,7 @@ rack_init(struct tcpcb *tp, void **ptr) if (TSTMP_GT(qr.timer_pacing_to, us_cts)) tov = qr.timer_pacing_to - us_cts; else - tov = HPTS_TICKS_PER_SLOT; + tov = HPTS_USECS_PER_SLOT; } if (qr.timer_hpts_flags & PACE_TMR_MASK) { rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; @@ -14966,7 +14965,7 @@ rack_init(struct tcpcb *tp, void **ptr) if (TSTMP_GT(qr.timer_timer_exp, us_cts)) tov = qr.timer_timer_exp - us_cts; else - tov = HPTS_TICKS_PER_SLOT; + tov = HPTS_USECS_PER_SLOT; } } rack_log_chg_info(tp, rack, 4, @@ -15117,7 +15116,7 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex8 = 10; log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; log.u_bbr.flex2 = rack->rc_free_cnt; @@ -15361,7 +15360,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); } #endif - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; @@ -15386,7 +15385,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent ts.tv_nsec = ae->timestamp % 1000000000; ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; - log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); + log.u_bbr.lt_epoch = tcp_tv_to_usec(<v); } else if (ae->flags & TSTMP_LRO) { /* Record the LRO the arrival timestamp */ log.u_bbr.flex3 = M_TSTMP_LRO; @@ -15394,7 +15393,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent ts.tv_nsec = ae->timestamp % 1000000000; ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; - log.u_bbr.flex5 = tcp_tv_to_usectick(<v); + log.u_bbr.flex5 = tcp_tv_to_usec(<v); } log.u_bbr.timeStamp = tcp_get_usecs(<v); /* Log the rcv time */ @@ -15562,10 +15561,10 @@ rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; struct timeval tv; - + (void)tcp_get_usecs(&tv); - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); + memset(&log, 0, sizeof(log)); + log.u_bbr.timeStamp = tcp_tv_to_usec(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex8 = mod; log.u_bbr.flex1 = flex1; @@ -15647,7 +15646,7 @@ rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.current_round; log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; @@ -15748,8 +15747,8 @@ 
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb the_win = tp->snd_wnd; win_seq = tp->snd_wl1; win_upd_ack = tp->snd_wl2; - cts = tcp_tv_to_usectick(tv); - ms_cts = tcp_tv_to_mssectick(tv); + cts = tcp_tv_to_usec(tv); + ms_cts = tcp_tv_to_msec(tv); rack->r_ctl.rc_rcvtime = cts; segsiz = ctf_fixed_maxseg(tp); if ((rack->rc_gp_dyn_mul) && @@ -15865,7 +15864,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb * or it could be a keep-alive or persists */ if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { - rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); if (rack->r_ctl.rc_reorder_ts == 0) rack->r_ctl.rc_reorder_ts = 1; } @@ -15884,7 +15883,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb } if (rack->forced_ack) { rack_handle_probe_response(rack, tiwin, - tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); + tcp_tv_to_usec(&rack->r_ctl.act_rcv_time)); } #ifdef TCP_ACCOUNTING win_up_req = 1; @@ -15931,7 +15930,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb rack->r_ctl.act_rcv_time = *tv; } rack_process_to_cumack(tp, rack, ae->ack, cts, to, - tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); + tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time)); #ifdef TCP_REQUEST_TRK rack_req_check_for_comp(rack, high_seq); #endif @@ -16399,7 +16398,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * must process the ack coming in but need to defer sending * anything becase a pacing timer is running. */ - us_cts = tcp_tv_to_usectick(tv); + us_cts = tcp_tv_to_usec(tv); if (m->m_flags & M_ACKCMP) { /* * All compressed ack's are ack's by definition so @@ -16467,8 +16466,8 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if (m->m_flags & M_ACKCMP) { panic("Impossible reach m has ackcmp? 
m:%p tp:%p", m, tp); } - cts = tcp_tv_to_usectick(tv); - ms_cts = tcp_tv_to_mssectick(tv); + cts = tcp_tv_to_usec(tv); + ms_cts = tcp_tv_to_msec(tv); nsegs = m->m_pkthdr.lro_nsegs; counter_u64_add(rack_proc_non_comp_ack, 1); #ifdef TCP_ACCOUNTING @@ -16570,7 +16569,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); } #endif - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; @@ -16596,13 +16595,13 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, mbuf_tstmp2timespec(m, &ts); ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; - log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); + log.u_bbr.lt_epoch = tcp_tv_to_usec(<v); } else if (m->m_flags & M_TSTMP_LRO) { /* Record the LRO the arrival timestamp */ mbuf_tstmp2timespec(m, &ts); ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; - log.u_bbr.flex5 = tcp_tv_to_usectick(<v); + log.u_bbr.flex5 = tcp_tv_to_usec(<v); } log.u_bbr.timeStamp = tcp_get_usecs(<v); /* Log the rcv time */ @@ -16654,7 +16653,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); #ifdef TCP_ACCOUNTING sched_unpin(); #endif @@ -16820,7 +16819,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, } if (thflags & TH_FIN) tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); - us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); if ((rack->rc_gp_dyn_mul) && (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { @@ -16918,7 +16917,7 @@ do_output_now: } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { goto do_output_now; } else if ((no_output == 1) && - (nxt_pkt == 0) && + (nxt_pkt == 0) && (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use @@ -17178,6 +17177,12 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->use_fixed_rate; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_always_pace; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->gp_ready; log.u_bbr.bbr_substate = quality; log.u_bbr.bbr_state = rack->dgp_on; log.u_bbr.bbr_state <<= 1; @@ -17344,7 +17349,7 @@ at_lt_bw: union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack_bw_multipler; log.u_bbr.flex2 = len; @@ -17539,8 +17544,8 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str rack->r_ctl.rc_last_us_rtt, 88, __LINE__, NULL, gain); } - if ((bw_est == 0) || (rate_wanted == 0) || - ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { + if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && + (rack->use_fixed_rate == 0)) { /* * No way yet to make a b/w estimate or * our raise is set incorrectly. 
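Note: the rack_get_pacing_delay() hunk just above reshapes the fallback guard so that a configured fixed rate is honored even before a goodput estimate exists; previously bw_est == 0 or rate_wanted == 0 forced the fallback regardless of use_fixed_rate. A small standalone sketch of the new predicate (names illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: take the "no usable rate yet" fallback only when no fixed
 * rate is configured.  With use_fixed_rate set, the operator-supplied
 * rate is used even while gp_ready is still 0.
 */
static bool
rate_fallback_needed(uint64_t bw_est, uint64_t rate_wanted, int gp_ready,
    int use_fixed_rate)
{
	if (use_fixed_rate)
		return (false);
	return (bw_est == 0 || rate_wanted == 0 || gp_ready == 0);
}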
@@ -17979,7 +17984,7 @@ start_set: tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); rack->r_ctl.rc_gp_cumack_ts = 0; if ((rack->r_ctl.cleared_app_ack == 1) && - (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) { + (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) { /* * We just cleared an application limited period * so the next seq out needs to skip the first @@ -18102,7 +18107,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_ union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = error; log.u_bbr.flex2 = flags; @@ -18367,7 +18372,7 @@ rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); #endif - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = p_rate; log.u_bbr.flex2 = p_queue; @@ -18820,7 +18825,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma counter_u64_add(rack_collapsed_win_rxt, 1); counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); } - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; @@ -19039,7 +19044,7 @@ rack_sndbuf_autoscale(struct tcp_rack *rack) static int rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, - uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) + uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line) { /* * Enter to do fast output. 
We are given that the sched_pin is @@ -19212,7 +19217,7 @@ again: } if (rack->r_ctl.fsb.rfo_apply_push && (len == rack->r_ctl.fsb.left_to_send)) { - tcp_set_flags(th, flags | TH_PUSH); + flags |= TH_PUSH; add_flag |= RACK_HAD_PUSH; } if ((m->m_next == NULL) || (len <= 0)){ @@ -19369,7 +19374,7 @@ again: if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; @@ -19391,11 +19396,11 @@ again: log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex5 = log.u_bbr.inflight; log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; - log.u_bbr.delivered = 0; + log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send; log.u_bbr.rttProp = 0; log.u_bbr.delRate = rack->r_must_retran; log.u_bbr.delRate <<= 1; - log.u_bbr.pkt_epoch = __LINE__; + log.u_bbr.pkt_epoch = line; /* For fast output no retrans so just inflight and how many mss we send */ log.u_bbr.flex5 = log.u_bbr.inflight; log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); @@ -19437,7 +19442,7 @@ again: } if ((error == 0) && (rack->lt_bw_up == 0)) { /* Unlikely */ - rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv); + rack->r_ctl.lt_timemark = tcp_tv_to_lusec(tv); rack->r_ctl.lt_seq = tp->snd_una; rack->lt_bw_up = 1; } else if ((error == 0) && @@ -19468,7 +19473,7 @@ again: tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); rack->forced_ack = 0; /* If we send something zap the FA flag */ - tot_len += len; + *tot_len += len; if ((tp->t_flags & TF_GPUTINPROG) == 0) rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); tp->snd_max += len; @@ -19504,6 +19509,7 @@ again: } if ((rack->r_ctl.fsb.left_to_send >= segsiz) && (max_val > len) && + (*tot_len < rack->r_ctl.rc_pace_max_segs) && (tso == 0)) { max_val -= len; len = segsiz; @@ -19515,14 +19521,14 @@ again: } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); counter_u64_add(rack_fto_send, 1); - slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__); - rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); + slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); + rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); - tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); + tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz); } sched_unpin(); #endif @@ -19779,7 +19785,7 @@ rack_output(struct tcpcb *tp) #endif early = 0; cts = tcp_get_usecs(&tv); - ms_cts = tcp_tv_to_mssectick(&tv); + ms_cts = tcp_tv_to_msec(&tv); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && tcp_in_hpts(rack->rc_tp)) { /* @@ -19884,20 +19890,36 @@ rack_output(struct tcpcb *tp) TCPS_HAVEESTABLISHED(tp->t_state)) { rack_set_state(tp, rack); } + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + minseg = segsiz; + if (rack->r_ctl.rc_pace_max_segs == 0) + pace_max_seg = rack->rc_user_set_max_segs * segsiz; + else + pace_max_seg = rack->r_ctl.rc_pace_max_segs; if ((rack->r_fast_output) && (doing_tlp == 0) && (tp->rcv_numsacks == 0)) { int ret; error = 0; - ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); - if (ret >= 0) + ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, 
&tot_len_this_send, &error, __LINE__); + if (ret > 0) return(ret); else if (error) { inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; goto nomore; + } else { + /* ret == 0: if tot_len still leaves room to send, fall through and send more */ + if (tot_len_this_send >= pace_max_seg) + return (ret); +#ifdef TCP_ACCOUNTING + /* We need to re-pin since fast_output un-pinned */ + sched_pin(); + ts_val = get_cyclecount(); +#endif + /* Fall back out so we can send any more that may bring us to pace_max_seg */ } } inp = rack->rc_inp; @@ -20001,15 +20023,9 @@ again: sendalot = 0; cts = tcp_get_usecs(&tv); - ms_cts = tcp_tv_to_mssectick(&tv); + ms_cts = tcp_tv_to_msec(&tv); tso = 0; mtu = 0; - segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); - minseg = segsiz; - if (rack->r_ctl.rc_pace_max_segs == 0) - pace_max_seg = rack->rc_user_set_max_segs * segsiz; - else - pace_max_seg = rack->r_ctl.rc_pace_max_segs; if (TCPS_HAVEESTABLISHED(tp->t_state) && (rack->r_ctl.pcm_max_seg == 0)) { /* @@ -20025,7 +20041,7 @@ again: rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; } } - if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { + if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { uint32_t rw_avail, cwa; if (tp->snd_wnd > ctf_outstanding(tp)) @@ -20871,6 +20887,7 @@ just_return_nolock: rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && ((IN_RECOVERY(tp->t_flags)) == 0) && + (doing_tlp == 0) && (rack->r_must_retran == 0) && ((tp->t_flags & TF_NEEDFIN) == 0) && (len > 0) && (orig_len > 0) && @@ -21012,7 +21029,7 @@ just_return_nolock: } else log = 1; } - /* Mark the last packet has app limited */ + /* Mark the last packet as app limited */ rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) @@ -21364,7 +21381,8 @@ send: if (max_len <= 0) { len = 0; } else if (len > max_len) { - sendalot = 1; + if (doing_tlp == 0) + sendalot = 1; len = max_len; mark = 2; } @@ -21535,11 +21553,7 @@ send: m->m_next = tcp_m_copym( mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, - ((rsm == NULL) ? hw_tls : 0) -#ifdef NETFLIX_COPY_ARGS - , &s_mb, &s_moff -#endif - ); + ((rsm == NULL) ? hw_tls : 0)); if (len <= (tp->t_maxseg - optlen)) { /* * Must have run out of mbufs for the copy @@ -21593,7 +21607,6 @@ send: flags |= TH_PUSH; add_flag |= RACK_HAD_PUSH; } - SOCK_SENDBUF_UNLOCK(so); } else { SOCK_SENDBUF_UNLOCK(so); @@ -21886,7 +21899,7 @@ send: if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; @@ -22062,6 +22075,8 @@ out: * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. */
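rack_fast_output() now takes tot_len by reference and adds what it transmitted, so rack_output() can tell whether the pacing budget (pace_max_seg) still has room before falling through to the slow path. A toy model of that contract under those assumptions; the names are invented:

/* Emit full segments while the pacing budget allows, accumulating into
 * *tot_len; the caller compares *tot_len against the budget afterwards. */
static long
fast_output_model(long budget, long segsiz, long *tot_len)
{
	long sent = 0;

	while (*tot_len + segsiz <= budget) {
		/* a real implementation builds and transmits a segment here */
		*tot_len += segsiz;
		sent += segsiz;
	}
	return (sent);
}

This mirrors the new caller logic: if tot_len_this_send has reached pace_max_seg it returns, otherwise it re-pins for accounting and falls through to send more.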
+ if ((rsm == NULL) && doing_tlp) + add_flag |= RACK_TLP; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, rack_to_usec_ts(&tv), rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); @@ -22075,7 +22090,7 @@ out: } if (rsm == NULL) { if (rack->lt_bw_up == 0) { - rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); + rack->r_ctl.lt_timemark = tcp_tv_to_lusec(&tv); rack->r_ctl.lt_seq = tp->snd_una; rack->lt_bw_up = 1; } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { @@ -22148,15 +22163,14 @@ out: rack->r_ctl.rc_prr_sndcnt = 0; } sub_from_prr = 0; - if (doing_tlp) { - /* Make sure the TLP is added */ - add_flag |= RACK_TLP; - } else if (rsm) { - /* If its a resend without TLP then it must not have the flag */ - rsm->r_flags &= ~RACK_TLP; - } - - + if (rsm != NULL) { + if (doing_tlp) + /* Make sure the TLP is added */ + rsm->r_flags |= RACK_TLP; + else + /* If it's a resend without TLP then it must not have the flag */ + rsm->r_flags &= ~RACK_TLP; + } if ((error == 0) && (len > 0) && (tp->snd_una == tp->snd_max)) @@ -22494,6 +22508,7 @@ enobufs: ((flags & (TH_SYN|TH_FIN)) == 0) && (rsm == NULL) && (ipoptlen == 0) && + (doing_tlp == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && ((IN_RECOVERY(tp->t_flags)) == 0) && @@ -22520,6 +22535,7 @@ enobufs: rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && (rsm == NULL) && + (doing_tlp == 0) && (ipoptlen == 0) && (rack->r_must_retran == 0) && rack->r_fsb_inited && @@ -22536,7 +22552,7 @@ enobufs: segsiz, pace_max_seg, hw_tls, flags); if (rack->r_fast_output) { error = 0; - ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); + ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); if (ret >= 0) return (ret); else if (error) @@ -22822,7 +22838,7 @@ process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) rack->r_ctl.rc_fixed_pacing_rate_ca = 0; rack->r_ctl.rc_fixed_pacing_rate_ss = 0; /* Now allocate or find our entry that will have these settings */ - sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0); + sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusec(&tv), 0); if (sft == NULL) { rack->rc_tp->tcp_hybrid_error++; /* no space, where would it have gone? 
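The lt_timemark/lt_seq pair set here feeds RACK's long-term bandwidth accounting: record a 64-bit microsecond timestamp and snd_una when accounting starts, then derive bytes per second from the deltas. A minimal sketch, assuming that bookkeeping; field and function names are invented:

#include <stdint.h>

struct lt_state {
	uint64_t timemark_us;	/* when accounting started */
	uint32_t seq;		/* snd_una at that point */
};

static void
lt_start(struct lt_state *lt, uint64_t now_us, uint32_t snd_una)
{
	lt->timemark_us = now_us;
	lt->seq = snd_una;
}

static uint64_t
lt_bytes_per_sec(const struct lt_state *lt, uint64_t now_us,
    uint32_t snd_una)
{
	uint64_t bytes = (uint32_t)(snd_una - lt->seq);	/* wrap-safe delta */
	uint64_t elapsed = now_us - lt->timemark_us;

	return (elapsed != 0 ? (bytes * 1000000) / elapsed : 0);
}

The 0x7fffffff comparison in the hunk above is the matching guard: the 32-bit sequence delta is folded back into the accounting before it can exceed the signed range.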
*/ diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index da26b8cb1f9b..fc12672a45f7 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -507,13 +507,11 @@ ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) void ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, - int32_t rstreason, int32_t tlen) + int32_t tlen) { - if (tp != NULL) { - tcp_dropwithreset(m, th, tp, tlen, rstreason); + tcp_dropwithreset(m, th, tp, tlen); + if (tp != NULL) INP_WUNLOCK(tptoinpcb(tp)); - } else - tcp_dropwithreset(m, th, NULL, tlen, rstreason); } void @@ -672,7 +670,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, tlen); return; } else *ret_val = 0; @@ -866,10 +864,10 @@ ctf_calc_rwin(struct socket *so, struct tcpcb *tp) void ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, - int32_t rstreason, int32_t tlen) + int32_t tlen) { - tcp_dropwithreset(m, th, tp, tlen, rstreason); + tcp_dropwithreset(m, th, tp, tlen); tp = tcp_drop(tp, ETIMEDOUT); if (tp) INP_WUNLOCK(tptoinpcb(tp)); diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h index 6a8a056d89b0..cd33cb8ce50b 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.h +++ b/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -101,7 +101,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, void ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t rstreason, int32_t tlen); + struct tcphdr *th, int32_t tlen); void ctf_do_drop(struct mbuf *m, struct tcpcb *tp); @@ -125,7 +125,7 @@ ctf_calc_rwin(struct socket *so, struct tcpcb *tp); void ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, - int32_t rstreason, int32_t tlen); + int32_t tlen); uint32_t ctf_fixed_maxseg(struct tcpcb *tp); diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c index 09e90da88895..759bfda98357 100644 --- a/sys/netinet/tcp_stacks/rack_pcm.c +++ b/sys/netinet/tcp_stacks/rack_pcm.c @@ -172,9 +172,9 @@ rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, uint32_t start, uint3 goto skip_ack_accounting; } /* - * Record ACK data. + * Record ACK data. */ - ack_arrival = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + ack_arrival = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); if (SEQ_GT(end, rack->r_ctl.pcm_i.eseq)) { /* Trim the end to the end of our range if it is beyond */ end = rack->r_ctl.pcm_i.eseq; @@ -241,8 +241,8 @@ skip_ack_accounting: for (i=0; i<rack->r_ctl.pcm_i.cnt; i++) { e = &rack->r_ctl.pcm_s[i]; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); + memset(&log, 0, sizeof(log)); + log.u_bbr.timeStamp = tcp_tv_to_usec(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex8 = 1; log.u_bbr.flex1 = e->sseq; @@ -286,7 +286,7 @@ skip_ack_accounting: * Prev time holds the last ack arrival time. 
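Comparisons such as SEQ_LT(ae->ack, tp->snd_una) and SEQ_GT(end, rack->r_ctl.pcm_i.eseq) in these hunks rely on wraparound-safe serial arithmetic: the difference is computed modulo 2^32 and interpreted as signed. A self-contained sketch in the spirit of the SEQ_* macros:

#include <stdint.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

Argument order matters, which is what the gput_seq hunk earlier corrects: SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq) asks whether gput_seq has reached the cleared point, a different predicate from its operand-swapped form.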
*/ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); + log.u_bbr.timeStamp = tcp_tv_to_usec(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex8 = 2; log.u_bbr.flex1 = rack->r_ctl.pcm_i.sseq; @@ -305,7 +305,7 @@ skip_ack_accounting: 0, &log, false, NULL, NULL, 0, &tv); } } - /* + /* * Here we need a lot to be added including: * 1) Some form of measurement, where if we think the measurement * is valid we iterate over the PCM data and come up with a path diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c index fc9ee8454a1e..2b70548f3cc6 100644 --- a/sys/netinet/tcp_stacks/sack_filter.c +++ b/sys/netinet/tcp_stacks/sack_filter.c @@ -400,7 +400,7 @@ sack_filter_run(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq break; } /* Copy it out to the outbound */ - memcpy(&in[at], &blkboard[i], sizeof(struct sackblk)); + memcpy(&in[at], &blkboard[i], sizeof(struct sackblk)); at++; room--; /* now lets add it to our sack-board */ @@ -588,7 +588,7 @@ sack_filter_blks(struct tcpcb *tp, struct sack_filter *sf, struct sackblk *in, i sf->sf_ack = th_ack; for(i=0, sf->sf_cur=0; i<numblks; i++) { - if ((in[i].end != tp->snd_max) && + if ((in[i].end != tp->snd_max) && ((in[i].end - in[i].start) < segmax)) { /* * We do not accept blocks less than a MSS minus all @@ -707,7 +707,7 @@ main(int argc, char **argv) out = stdout; memset(&tp, 0, sizeof(tp)); tp.t_maxseg = 1460; - + while ((i = getopt(argc, argv, "dIi:o:?hS:")) != -1) { switch (i) { case 'S': @@ -883,7 +883,7 @@ main(int argc, char **argv) } else { printf("can't open sack_setup.bin -- sorry no load\n"); } - + } else if (strncmp(buffer, "help", 4) == 0) { help: fprintf(out, "You can input:\n"); diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h index b12fcf84567c..a1c0684a4359 100644 --- a/sys/netinet/tcp_stacks/sack_filter.h +++ b/sys/netinet/tcp_stacks/sack_filter.h @@ -42,7 +42,7 @@ * previously processed sack information. * * The second thing that the sack filter does is help protect against malicious - * attackers that are trying to attack any linked lists (or other data structures) + * attackers that are trying to attack any linked lists (or other data structures) * that are used in sack processing. Consider an attacker sending in sacks for * every other byte of data outstanding. This could in theory drastically split * up any scoreboard you are maintaining and make you search through a very large diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h index f88efe3c9ef9..10ddd12bda75 100644 --- a/sys/netinet/tcp_stacks/tcp_bbr.h +++ b/sys/netinet/tcp_stacks/tcp_bbr.h @@ -347,8 +347,6 @@ struct bbr_log_sysctl_out { /* * Locking for the rack control block. * a) Locked by INP_WLOCK - * b) Locked by the hpts-mutex - * */ #define BBR_STATE_STARTUP 0x01 #define BBR_STATE_DRAIN 0x02 diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index 4374594a1d82..144b4fabf7eb 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -327,8 +327,6 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; /* * Locking for the rack control block. * a) Locked by INP_WLOCK - * b) Locked by the hpts-mutex - * */ #define RACK_GP_HIST 4 /* How much goodput history do we maintain? 
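The segmax test in sack_filter_blks() is the concrete form of the defense the sack_filter.h comment describes: a SACK block smaller than a full segment is only trusted when it ends at snd_max, since a flood of tiny blocks could shred the scoreboard. A sketch of that acceptance test with stand-in types:

#include <stdint.h>

typedef uint32_t tcp_seq;
struct sackblk { tcp_seq start, end; };

/* Accept a block if it ends at snd_max or spans at least segmax bytes. */
static int
sack_blk_acceptable(const struct sackblk *b, tcp_seq snd_max,
    uint32_t segmax)
{
	if (b->end == snd_max)
		return (1);
	return ((uint32_t)(b->end - b->start) >= segmax);
}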
*/ #define RETRAN_CNT_SIZE 16 @@ -614,7 +612,6 @@ struct rack_control { struct tcp_rack { /* First cache line 0x00 */ - TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 34964ed8283c..2e039ebbfdd2 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -82,6 +82,7 @@ #include <netinet/ip.h> #include <netinet/ip_icmp.h> #include <netinet/ip_var.h> +#include <netinet/icmp_var.h> #ifdef INET6 #include <netinet/icmp6.h> #include <netinet/ip6.h> @@ -1032,10 +1033,6 @@ tcp_default_fb_init(struct tcpcb *tp, void **ptr) /* We don't use the pointer */ *ptr = NULL; - KASSERT(tp->t_state < TCPS_TIME_WAIT, - ("%s: connection %p in unexpected state %d", __func__, tp, - tp->t_state)); - /* Make sure we get no interesting mbuf queuing behavior */ /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); @@ -1052,7 +1049,8 @@ tcp_default_fb_init(struct tcpcb *tp, void **ptr) if (tp->t_rxtshift == 0) tp->t_rxtcur = rexmt; else - TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, + tcp_rexmit_max); /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't @@ -1454,6 +1452,7 @@ tcp_vnet_init(void *arg __unused) VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; + V_tcp_msl_local = TCPTV_MSL_LOCAL; arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, @@ -1473,11 +1472,8 @@ tcp_init(void *arg __unused) tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; - if (tcp_rexmit_initial < 1) - tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; - if (tcp_rexmit_min < 1) - tcp_rexmit_min = 1; + tcp_rexmit_max = TCPTV_REXMTMAX; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; @@ -2086,7 +2082,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; @@ -2161,6 +2157,13 @@ tcp_send_challenge_ack(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m) sbintime_t now; bool send_challenge_ack; + /* + * The sending of a challenge ACK could be triggered by a blind attacker + * to detect an existing TCP connection. To mitigate that, increment + * also the global counter which would be incremented if the attacker + * would have guessed wrongly. + */ + (void)badport_bandlim(BANDLIM_TCP_RST); if (V_tcp_ack_war_time_window == 0 || V_tcp_ack_war_cnt == 0) { /* ACK war protection is disabled. 
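The pair of knobs tcp_send_challenge_ack() consults (V_tcp_ack_war_time_window and V_tcp_ack_war_cnt) amounts to a fixed-window rate limit on challenge ACKs. A toy model of that logic; the state layout and names are illustrative:

#include <stdbool.h>
#include <stdint.h>

struct ack_war {
	uint64_t window_start_ms;	/* start of the current window */
	uint32_t sent;			/* challenge ACKs sent in it */
};

static bool
challenge_ack_allowed(struct ack_war *aw, uint64_t now_ms,
    uint64_t window_ms, uint32_t max_cnt)
{
	if (window_ms == 0 || max_cnt == 0)
		return (true);			/* protection disabled */
	if (now_ms - aw->window_start_ms >= window_ms) {
		aw->window_start_ms = now_ms;	/* open a fresh window */
		aw->sent = 0;
	}
	if (aw->sent < max_cnt) {
		aw->sent++;
		return (true);
	}
	return (false);
}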
*/ send_challenge_ack = true; @@ -2664,6 +2667,272 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); +#define SND_TAG_STATUS_MAXLEN 128 + +#ifdef KERN_TLS + +static struct sx ktlslist_lock; +SX_SYSINIT(ktlslistlock, &ktlslist_lock, "ktlslist"); +static uint64_t ktls_glob_gen = 1; + +static int +tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) +{ + struct xinpgen xig; + struct inpcb *inp; + struct socket *so; + struct ktls_session *ksr, *kss; + char *buf; + struct xktls_session *xktls; + uint64_t ipi_gencnt; + size_t buflen, len, sz; + u_int cnt; + int error; + bool ek, p; + + sx_assert(&ktlslist_lock, SA_XLOCKED); + if (req->newptr != NULL) + return (EPERM); + + len = 0; + cnt = 0; + ipi_gencnt = V_tcbinfo.ipi_gencnt; + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof(xig); + xig.xig_gen = ktls_glob_gen++; + xig.xig_sogen = so_gencnt; + + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_RLOCKPCB); + while ((inp = inp_next(&inpi)) != NULL) { + if (inp->inp_gencnt > ipi_gencnt || + cr_canseeinpcb(req->td->td_ucred, inp) != 0) + continue; + + so = inp->inp_socket; + if (so != NULL && so->so_gencnt <= xig.xig_sogen) { + p = false; + ek = export_keys && cr_canexport_ktlskeys( + req->td, inp); + ksr = so->so_rcv.sb_tls_info; + if (ksr != NULL) { + ksr->gen = xig.xig_gen; + p = true; + if (ek) { + sz = SIZE_T_MAX; + ktls_session_copy_keys(ksr, + NULL, &sz); + len += sz; + } + if (ksr->snd_tag != NULL && + ksr->snd_tag->sw->snd_tag_status_str != + NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = ksr->snd_tag->sw-> + snd_tag_status_str( + ksr->snd_tag, NULL, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) + len += sz; + } + } + kss = so->so_snd.sb_tls_info; + if (kss != NULL) { + kss->gen = xig.xig_gen; + p = true; + if (ek) { + sz = SIZE_T_MAX; + ktls_session_copy_keys(kss, + NULL, &sz); + len += sz; + } + if (kss->snd_tag != NULL && + kss->snd_tag->sw->snd_tag_status_str != + NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = kss->snd_tag->sw-> + snd_tag_status_str( + kss->snd_tag, NULL, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) + len += sz; + } + } + if (p) { + len += sizeof(*xktls); + len = roundup2(len, __alignof(struct + xktls_session)); + } + } + } + if (req->oldptr == NULL) { + len += 2 * sizeof(xig); + len += 3 * len / 4; + req->oldidx = len; + return (0); + } + + if ((error = sysctl_wire_old_buffer(req, 0)) != 0) + return (error); + + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error != 0) + return (error); + + buflen = roundup2(sizeof(*xktls) + 2 * TLS_MAX_PARAM_SIZE + + 2 * SND_TAG_STATUS_MAXLEN, __alignof(struct xktls_session)); + buf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + struct inpcb_iterator inpi1 = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_RLOCKPCB); + while ((inp = inp_next(&inpi1)) != NULL) { + if (inp->inp_gencnt > ipi_gencnt || + cr_canseeinpcb(req->td->td_ucred, inp) != 0) + continue; + + so = inp->inp_socket; + if (so == NULL) + continue; + + p = false; + ek = export_keys && cr_canexport_ktlskeys(req->td, inp); + ksr = so->so_rcv.sb_tls_info; + kss = so->so_snd.sb_tls_info; + xktls = (struct xktls_session *)buf; + if (ksr != NULL && ksr->gen == xig.xig_gen) { + p = true; + ktls_session_to_xktls_onedir(ksr, ek, &xktls->rcv); + } + if (kss != NULL && kss->gen == xig.xig_gen) { + p = true; + ktls_session_to_xktls_onedir(kss, ek, 
&xktls->snd); + } + if (!p) + continue; + + xktls->inp_gencnt = inp->inp_gencnt; + xktls->so_pcb = (kvaddr_t)inp; + memcpy(&xktls->coninf, &inp->inp_inc, sizeof(xktls->coninf)); + len = sizeof(*xktls); + if (ksr != NULL && ksr->gen == xig.xig_gen) { + if (ek) { + sz = buflen - len; + ktls_session_copy_keys(ksr, buf + len, &sz); + len += sz; + } else { + xktls->rcv.cipher_key_len = 0; + xktls->rcv.auth_key_len = 0; + } + if (ksr->snd_tag != NULL && + ksr->snd_tag->sw->snd_tag_status_str != NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = ksr->snd_tag->sw->snd_tag_status_str( + ksr->snd_tag, buf + len, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) { + xktls->rcv.drv_st_len = sz; + len += sz; + } + } + } + if (kss != NULL && kss->gen == xig.xig_gen) { + if (ek) { + sz = buflen - len; + ktls_session_copy_keys(kss, buf + len, &sz); + len += sz; + } else { + xktls->snd.cipher_key_len = 0; + xktls->snd.auth_key_len = 0; + } + if (kss->snd_tag != NULL && + kss->snd_tag->sw->snd_tag_status_str != NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = kss->snd_tag->sw->snd_tag_status_str( + kss->snd_tag, buf + len, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) { + xktls->snd.drv_st_len = sz; + len += sz; + } + } + } + len = roundup2(len, __alignof(*xktls)); + xktls->tsz = len; + xktls->fsz = sizeof(*xktls); + + error = SYSCTL_OUT(req, xktls, len); + if (error != 0) { + INP_RUNLOCK(inp); + break; + } + cnt++; + } + + if (error == 0) { + xig.xig_sogen = so_gencnt; + xig.xig_count = cnt; + error = SYSCTL_OUT(req, &xig, sizeof(xig)); + } + + zfree(buf, M_TEMP); + return (error); +} + +static int +tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys) +{ + int repeats, error; + + for (repeats = 0; repeats < 100; repeats++) { + if (sx_xlock_sig(&ktlslist_lock)) + return (EINTR); + error = tcp_ktlslist_locked(oidp, arg1, arg2, req, + export_keys); + sx_xunlock(&ktlslist_lock); + if (error != EDEADLK) + break; + if (sig_intr() != 0) { + error = EINTR; + break; + } + req->oldidx = 0; + } + return (error); +} + +static int +tcp_ktlslist_nokeys(SYSCTL_HANDLER_ARGS) +{ + return (tcp_ktlslist1(oidp, arg1, arg2, req, false)); +} + +static int +tcp_ktlslist_wkeys(SYSCTL_HANDLER_ARGS) +{ + return (tcp_ktlslist1(oidp, arg1, arg2, req, true)); +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST, ktlslist, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, tcp_ktlslist_nokeys, "S,xktls_session", + "List of active kTLS sessions for TCP connections"); +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST_WKEYS, ktlslist_wkeys, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, tcp_ktlslist_wkeys, "S,xktls_session", + "List of active kTLS sessions for TCP connections with keys"); +#endif /* KERN_TLS */ + #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) @@ -2936,7 +3205,7 @@ tcp6_next_pmtu(const struct icmp6_hdr *icmp6) * small, set to the min. */ if (mtu < IPV6_MMTU) - mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for? 
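tcp_ktlslist1() retries the entire collection pass whenever tcp_ktlslist_locked() fails with EDEADLK, the signal that an inpcb was freed while its lock was dropped around a send-tag status call. The retry idiom reduced to its shape; collect() is a stand-in for the locked worker:

#include <errno.h>

static int
collect_with_retry(int (*collect)(void *), void *arg)
{
	int error;

	for (int tries = 0; tries < 100; tries++) {
		error = collect(arg);
		if (error != EDEADLK)
			break;		/* success or a real error */
		/* the kernel version also gives up on a pending signal */
	}
	return (error);
}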
*/ + mtu = IPV6_MMTU; return (mtu); } @@ -4276,7 +4545,7 @@ tcp_change_time_units(struct tcpcb *tp, int granularity) panic("Unknown granularity:%d tp:%p", granularity, tp); } -#endif +#endif } void @@ -4364,7 +4633,7 @@ tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = val; log.u_bbr.rttProp = req->timestamp; diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 606d808676e1..80e6b53d10df 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -131,17 +131,18 @@ static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); -static struct syncache - *syncookie_lookup(struct in_conninfo *, struct syncache_head *, - struct syncache *, struct tcphdr *, struct tcpopt *, - struct socket *, uint16_t); +static bool syncookie_expand(struct in_conninfo *, + const struct syncache_head *, struct syncache *, + struct tcphdr *, struct tcpopt *, struct socket *, + uint16_t); static void syncache_pause(struct in_conninfo *); static void syncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS -static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso, uint16_t port); +static void syncookie_cmp(struct in_conninfo *, + const struct syncache_head *, struct syncache *, + struct tcphdr *, struct tcpopt *, struct socket *, + uint16_t); #endif /* @@ -442,7 +443,7 @@ syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) else TCPT_RANGESET(rexmt, tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits], - tcp_rexmit_min, TCPTV_REXMTMAX); + tcp_rexmit_min, tcp_rexmit_max); sc->sc_rxttime = ticks + rexmt; sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { @@ -1096,6 +1097,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ if (locked && !V_tcp_syncookies) { SCH_UNLOCK(sch); + TCPSTAT_INC(tcps_sc_spurcookie); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", @@ -1105,17 +1107,21 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if (locked && !V_tcp_syncookiesonly && sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { SCH_UNLOCK(sch); + TCPSTAT_INC(tcps_sc_spurcookie); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (no syncache entry)\n", s, __func__); goto failed; } - bzero(&scs, sizeof(scs)); - sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port); if (locked) SCH_UNLOCK(sch); - if (sc == NULL) { + bzero(&scs, sizeof(scs)); + if (syncookie_expand(inc, sch, &scs, th, to, *lsop, port)) { + sc = &scs; + TCPSTAT_INC(tcps_sc_recvcookie); + } else { + TCPSTAT_INC(tcps_sc_failcookie); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " @@ -2251,8 +2257,8 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc) return (iss); } -static struct syncache * -syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, +static bool +syncookie_expand(struct in_conninfo *inc, const 
struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { @@ -2282,7 +2288,7 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) - return (NULL); + return (false); /* Fill in the syncache values. */ sc->sc_flags = 0; @@ -2342,47 +2348,47 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, sc->sc_port = port; - TCPSTAT_INC(tcps_sc_recvcookie); - return (sc); + return (true); } #ifdef INVARIANTS -static int -syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, +static void +syncookie_cmp(struct in_conninfo *inc, const struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { - struct syncache scs, *scx; + struct syncache scs; char *s; bzero(&scs, sizeof(scs)); - scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port); + if (syncookie_expand(inc, sch, &scs, th, to, lso, port) && + (sc->sc_peer_mss != scs.sc_peer_mss || + sc->sc_requested_r_scale != scs.sc_requested_r_scale || + sc->sc_requested_s_scale != scs.sc_requested_s_scale || + (sc->sc_flags & SCF_SACK) != (scs.sc_flags & SCF_SACK))) { - if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) - return (0); + if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) + return; - if (scx != NULL) { - if (sc->sc_peer_mss != scx->sc_peer_mss) + if (sc->sc_peer_mss != scs.sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", - s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); + s, __func__, sc->sc_peer_mss, scs.sc_peer_mss); - if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) + if (sc->sc_requested_r_scale != scs.sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, - scx->sc_requested_r_scale); + scs.sc_requested_r_scale); - if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) + if (sc->sc_requested_s_scale != scs.sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, - scx->sc_requested_s_scale); + scs.sc_requested_s_scale); - if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) + if ((sc->sc_flags & SCF_SACK) != (scs.sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); - } - if (s != NULL) free(s, M_TCPLOG); - return (0); + } } #endif /* INVARIANTS */ diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 4d8dafaec31d..3b9fe7a317b0 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -74,39 +74,33 @@ #include <netinet/tcpip.h> int tcp_persmin; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT | CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT | CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; 
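These timer tunables are stored in ticks but exported in milliseconds, with sysctl_msec_to_ticks converting at the boundary. A userland model of that round trip, assuming hz is the tick rate:

#include <stdint.h>

static const int hz = 1000;	/* illustrative tick rate */

static inline int
msec_to_ticks(int ms)
{
	return ((int)(((int64_t)ms * hz) / 1000));
}

static inline int
ticks_to_msec(int t)
{
	return ((int)(((int64_t)t * 1000) / hz));
}

The MSEC_2_TICKS() constants introduced in tcp_timer.h below express the same conversion for the compile-time defaults.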
-SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, - &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", + CTLTYPE_INT | CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, - &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", + CTLTYPE_INT | CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); VNET_DEFINE(int, tcp_msl); @@ -115,21 +109,29 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); +VNET_DEFINE(int, tcp_msl_local); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl_local, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(tcp_msl_local), 0, sysctl_msec_to_ticks, "I", + "Maximum segment lifetime for local communication"); + int tcp_rexmit_initial; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT | CTLFLAG_RW, &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", "Initial Retransmission Timeout"); int tcp_rexmit_min; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT | CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); +int tcp_rexmit_max; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_max, CTLTYPE_INT | CTLFLAG_RW, + &tcp_rexmit_max, 0, sysctl_msec_to_ticks, "I", + "Maximum Retransmission Timeout"); + int tcp_rexmit_slop; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT | CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); @@ -144,8 +146,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT | CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); @@ -162,8 +163,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, "Drop TCP options from 3rd and later retransmitted SYN"); int tcp_maxunacktime = TCPTV_MAXUNACKTIME; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, - CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, CTLTYPE_INT | CTLFLAG_RW, &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", "Maximum time (in ms) that a session can linger without making progress"); @@ -629,8 +629,7 @@ tcp_timer_rexmt(struct tcpcb *tp) rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; - TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, tcp_rexmit_max); /* * We enter the path for PLMTUD if connection is 
established or, if @@ -758,8 +757,8 @@ tcp_timer_rexmt(struct tcpcb *tp) tp->t_maxseg = tp->t_pmtud_saved_maxseg; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h index a3ca268417ba..34a0f1375463 100644 --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -32,6 +32,8 @@ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ +#ifdef _KERNEL + /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments @@ -71,21 +73,22 @@ /* * Time constants. */ -#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ +#define TCPTV_MSL MSEC_2_TICKS(30000) /* max seg lifetime (hah!) */ +#define TCPTV_MSL_LOCAL MSEC_2_TICKS(10) /* max seg lifetime for local comm */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ -#define TCPTV_RTOBASE ( 1*hz) /* assumed RTO if no info */ +#define TCPTV_RTOBASE MSEC_2_TICKS(1000) /* assumed RTO if no info */ -#define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ -#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ +#define TCPTV_PERSMIN MSEC_2_TICKS(5000) /* minimum persist interval */ +#define TCPTV_PERSMAX MSEC_2_TICKS(60000) /* maximum persist interval */ -#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ -#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ -#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ +#define TCPTV_KEEP_INIT MSEC_2_TICKS(75000) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE MSEC_2_TICKS(120*60*1000) /* dflt time before probing */ +#define TCPTV_KEEPINTVL MSEC_2_TICKS(75000) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_MAXUNACKTIME 0 /* max time without making progress */ -#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ +#define TCPTV_FINWAIT2_TIMEOUT MSEC_2_TICKS(60000) /* FIN_WAIT_2 timeout if no receiver */ /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. @@ -107,15 +110,13 @@ * The prior minimum of 1*hz (1 second) badly breaks throughput on any * networks faster then a modem that has minor (e.g. 1%) packet loss. */ -#define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ -#define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ -#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ - -#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ +#define TCPTV_MIN MSEC_2_TICKS(30) /* minimum allowable value */ +#define TCPTV_CPU_VAR MSEC_2_TICKS(200) /* cpu variance allowed (200ms) */ +#define TCPTV_REXMTMAX MSEC_2_TICKS(64000) /* max allowable REXMT value */ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ -#define TCPTV_DELACK ( hz/25 ) /* 40ms timeout */ +#define TCPTV_DELACK MSEC_2_TICKS(40) /* 40ms timeout */ /* * If we exceed this number of retransmits for a single segment, we'll consider @@ -135,8 +136,6 @@ (tv) = (tvmax); \ } while(0) -#ifdef _KERNEL - #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? 
(tp)->t_keepintvl : tcp_keepintvl) @@ -165,6 +164,7 @@ extern int tcp_maxunacktime; /* max time without making progress */ extern int tcp_maxpersistidle; extern int tcp_rexmit_initial; extern int tcp_rexmit_min; +extern int tcp_rexmit_max; extern int tcp_rexmit_slop; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; @@ -184,6 +184,8 @@ VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss); #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) VNET_DECLARE(int, tcp_msl); #define V_tcp_msl VNET(tcp_msl) +VNET_DECLARE(int, tcp_msl_local); +#define V_tcp_msl_local VNET(tcp_msl_local) #endif /* _KERNEL */ diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 8d77db275310..ce63fcf9ffc0 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -87,12 +87,52 @@ #include <security/mac/mac_framework.h> -VNET_DEFINE_STATIC(bool, nolocaltimewait) = true; +VNET_DEFINE_STATIC(bool, nolocaltimewait) = false; #define V_nolocaltimewait VNET(nolocaltimewait) -SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), true, + +static int +sysctl_net_inet_tcp_nolocaltimewait(SYSCTL_HANDLER_ARGS) +{ + int error; + bool new; + + new = V_nolocaltimewait; + error = sysctl_handle_bool(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + V_nolocaltimewait = new; + gone_in(16, "net.inet.tcp.nolocaltimewait is obsolete." + " Use net.inet.tcp.msl_local instead.\n"); + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, nolocaltimewait, + CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_U8, + &VNET_NAME(nolocaltimewait), 0, sysctl_net_inet_tcp_nolocaltimewait, "CU", "Do not create TCP TIME_WAIT state for local connections"); +static u_int +tcp_eff_msl(struct tcpcb *tp) +{ + struct inpcb *inp = tptoinpcb(tp); +#ifdef INET6 + bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; +#endif + + if ( +#ifdef INET6 + isipv6 ? in6_localip(&inp->in6p_faddr) : +#endif +#ifdef INET + in_localip(inp->inp_faddr)) +#else + false) +#endif + return (V_tcp_msl_local); + else + return (V_tcp_msl); +} + /* * Move a TCP connection into TIME_WAIT state. * inp is locked, and is unlocked before returning. @@ -127,7 +167,7 @@ tcp_twstart(struct tcpcb *tp) if (V_nolocaltimewait && ( #ifdef INET6 - isipv6 ? in6_localaddr(&inp->in6p_faddr) : + isipv6 ? in6_localip(&inp->in6p_faddr) : #endif #ifdef INET in_localip(inp->inp_faddr) @@ -140,7 +180,7 @@ tcp_twstart(struct tcpcb *tp) return; } - tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl); + tcp_timer_activate(tp, TT_2MSL, 2 * tcp_eff_msl(tp)); INP_WUNLOCK(inp); } @@ -283,7 +323,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tp->rcv_nxt) - tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl); + tcp_timer_activate(tp, TT_2MSL, 2 * tcp_eff_msl(tp)); } /* diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index fbc204097b25..98c934955121 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -146,7 +146,7 @@ tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error) } /* - * TCP attaches to socket via pru_attach(), reserving space, + * TCP attaches to socket via pr_attach(), reserving space, * and an internet control block. 
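tcp_eff_msl() above selects the short MSL when the peer address is local, and TIME_WAIT then runs for twice the effective MSL. A sketch of that selection; the millisecond values are the defaults this diff establishes (TCPTV_MSL = 30000 ms, TCPTV_MSL_LOCAL = 10 ms):

static int tcp_msl = 30000;	/* ms, TCPTV_MSL default */
static int tcp_msl_local = 10;	/* ms, TCPTV_MSL_LOCAL default */

static int
eff_msl_ms(int peer_is_local)
{
	return (peer_is_local ? tcp_msl_local : tcp_msl);
}

static int
timewait_ms(int peer_is_local)
{
	return (2 * eff_msl_ms(peer_is_local));	/* 2*MSL in TIME_WAIT */
}

This is what lets nolocaltimewait be deprecated: a 10 ms local MSL makes local TIME_WAIT effectively vanish without skipping the state entirely.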
*/ static int @@ -164,7 +164,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) goto out; so->so_rcv.sb_flags |= SB_AUTOSIZE; - so->so_snd.sb_flags |= SB_AUTOSIZE; + so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE); error = in_pcballoc(so, &V_tcbinfo); if (error) goto out; @@ -523,7 +523,7 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) goto out; - if (SOLISTENING(so) || so->so_options & SO_REUSEPORT_LB) { + if (SOLISTENING(so)) { error = EOPNOTSUPP; goto out; } @@ -590,7 +590,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) error = EAFNOSUPPORT; goto out; } - if (SOLISTENING(so) || so->so_options & SO_REUSEPORT_LB) { + if (SOLISTENING(so)) { error = EOPNOTSUPP; goto out; } @@ -907,8 +907,8 @@ out: /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other - * pru_*() routines, the mbuf chains are our responsibility. We - * must either enqueue them or free them. The other pru_* routines + * pr_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pr_*() routines * generally are caller-frees. */ static int @@ -1419,6 +1419,7 @@ struct protosw tcp_protosw = { .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in_getsockaddr, @@ -1447,6 +1448,7 @@ struct protosw tcp6_protosw = { .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in6_mapped_sockaddr, @@ -1476,6 +1478,8 @@ tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td) (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) != 0)) return (EISCONN); + if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0)) + return (EOPNOTSUPP); INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbconnect(inp, sin, td->td_ucred); @@ -1516,8 +1520,11 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td) INP_WLOCK_ASSERT(inp); if (__predict_false((so->so_state & - (SS_ISCONNECTING | SS_ISCONNECTED)) != 0)) + (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | + SS_ISDISCONNECTED)) != 0)) return (EISCONN); + if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0)) + return (EOPNOTSUPP); INP_HASH_WLOCK(&V_tcbinfo); error = in6_pcbconnect(inp, sin6, td->td_ucred, true); @@ -1761,9 +1768,9 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) /* * Release the ref count the lookup * acquired. - */ + */ refcount_release(&blk->tfb_refcnt); - /* + /* * Now there is a chance that the * init() function mucked with some * things before it failed, such as @@ -1793,7 +1800,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) * new one already. */ refcount_release(&tp->t_fb->tfb_refcnt); - /* + /* * Set in the new stack. */ tp->t_fb = blk; @@ -1927,7 +1934,7 @@ tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt) CC_LIST_RUNLOCK(); return(ESRCH); } - /* + /* * With a reference the algorithm cannot be removed * so we hold a reference through the change process. 
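The refcount choreography in tcp_ctloutput_set() keeps whichever function block is installed pinned at all times: the lookup takes a reference, a failed init gives it back, and on success the displaced block releases its own. A rough sketch of the hand-off, using stand-in atomics rather than the kernel's refcount(9) API:

#include <stdatomic.h>

struct fblock { atomic_uint refcnt; };

/* Install blk, which already carries the reference the lookup
 * acquired, and drop the reference held on the block it replaces. */
static void
fblock_switch(struct fblock **cur, struct fblock *blk)
{
	struct fblock *old = *cur;

	*cur = blk;
	atomic_fetch_sub_explicit(&old->refcnt, 1, memory_order_release);
}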
*/ @@ -3044,7 +3051,44 @@ db_print_toobflags(char t_oobflags) } static void -db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) +db_print_bblog_state(int state) +{ + switch (state) { + case TCP_LOG_STATE_RATIO_OFF: + db_printf("TCP_LOG_STATE_RATIO_OFF"); + break; + case TCP_LOG_STATE_CLEAR: + db_printf("TCP_LOG_STATE_CLEAR"); + break; + case TCP_LOG_STATE_OFF: + db_printf("TCP_LOG_STATE_OFF"); + break; + case TCP_LOG_STATE_TAIL: + db_printf("TCP_LOG_STATE_TAIL"); + break; + case TCP_LOG_STATE_HEAD: + db_printf("TCP_LOG_STATE_HEAD"); + break; + case TCP_LOG_STATE_HEAD_AUTO: + db_printf("TCP_LOG_STATE_HEAD_AUTO"); + break; + case TCP_LOG_STATE_CONTINUAL: + db_printf("TCP_LOG_STATE_CONTINUAL"); + break; + case TCP_LOG_STATE_TAIL_AUTO: + db_printf("TCP_LOG_STATE_TAIL_AUTO"); + break; + case TCP_LOG_VIA_BBPOINTS: + db_printf("TCP_LOG_STATE_BBPOINTS"); + break; + default: + db_printf("UNKNOWN(%d)", state); + break; + } +} + +static void +db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog) { db_print_indent(indent); @@ -3154,18 +3198,68 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); + + db_print_indent(indent); + db_printf("t_fb.tfb_tcp_block_name: %s\n", tp->t_fb->tfb_tcp_block_name); + + db_print_indent(indent); + db_printf("t_cc.name: %s\n", tp->t_cc->name); + + db_print_indent(indent); + db_printf("_t_logstate: %d (", tp->_t_logstate); + db_print_bblog_state(tp->_t_logstate); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("t_lognum: %d t_loglimit: %d t_logsn: %u\n", + tp->t_lognum, tp->t_loglimit, tp->t_logsn); + + if (show_bblog) { +#ifdef TCP_BLACKBOX + db_print_bblog_entries(&tp->t_logs, indent); +#else + db_print_indent(indent); + db_printf("BBLog not supported\n"); +#endif + } } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; + bool show_bblog; if (!have_addr) { db_printf("usage: show tcpcb <addr>\n"); return; } + show_bblog = strchr(modif, 'b') != NULL; tp = (struct tcpcb *)addr; - db_print_tcpcb(tp, "tcpcb", 0); + db_print_tcpcb(tp, "tcpcb", 0, show_bblog); +} + +DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct inpcb *inp; + bool only_locked, show_bblog; + + only_locked = strchr(modif, 'l') != NULL; + show_bblog = strchr(modif, 'b') != NULL; + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) { + if (only_locked && + inp->inp_lock.rw_lock == RW_UNLOCKED) + continue; + db_print_tcpcb(intotcpcb(inp), "tcpcb", 0, show_bblog); + if (db_pager_quit) + break; + } + CURVNET_RESTORE(); + if (db_pager_quit) + break; + } } #endif diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 5be024ededc7..53856bae9a66 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -182,7 +182,7 @@ struct tcp_sendfile_track { * snd_una). When the response comes back indicating * that there was data (return value 1), then the caller * can build a sendmap entry based on the range and the - * times. The next query would then be done at the + * times. The next query would then be done at the * newly created sendmap_end. Repeated until sendmap_end == snd_max. * * Flags in sendmap_flags are defined below as well. @@ -197,7 +197,7 @@ struct tcp_sendfile_track { * The rack_times are a misc collection of information that * the old stack might possibly fill in. 
Of course its possible * that an old stack may not have a piece of information. If so - * then setting that value to zero is advised. Setting any + * then setting that value to zero is advised. Setting any * timestamp passed should only place a zero in it when it * is unfilled. This may mean that a time is off by a micro-second * but this is ok in the grand scheme of things. @@ -205,13 +205,13 @@ struct tcp_sendfile_track { * When switching stacks it is desireable to get as much information * from the old stack to the new stack as possible. Though not always * will the stack be compatible in the types of information. The - * init() function needs to take care when it begins changing + * init() function needs to take care when it begins changing * things such as inp_flags2 and the timer units to position these * changes at a point where it is unlikely they will fail after * making such changes. A stack optionally can have an "undo" - * function + * function * - * To transfer information to the old stack from the new in + * To transfer information to the old stack from the new in * respect to LRO and the inp_flags2, the new stack should set * the inp_flags2 to what it supports. The old stack in its * fini() function should call the tcp_handle_orphaned_packets() @@ -528,15 +528,6 @@ typedef enum { /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 -/* - * TODO: We yet need to brave plowing in - * to tcp_input() and the pru_usrreq() block. - * Right now these go to the old standards which - * are somewhat ok, but in the long term may - * need to be changed. If we do tackle tcp_input() - * then we need to get rid of the tcp_do_segment() - * function below. - */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ @@ -553,13 +544,13 @@ typedef enum { * do is: * a) Make sure that the inp_flags2 is setup correctly * for LRO. There are two flags that the previous - * stack may have set INP_MBUF_ACKCMP and + * stack may have set INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. If the new stack does not * support these it *should* clear the flags. * b) Make sure that the timers are in the proper * granularity that the stack wants. The stack * should check the t_tmr_granularity field. Currently - * there are two values that it may hold + * there are two values that it may hold * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC. * Use the functions tcp_timer_convert(tp, granularity); * to move the timers to the correct format for your stack. @@ -567,14 +558,14 @@ typedef enum { * The new stack may also optionally query the tfb_chg_query * function if the old stack has one. The new stack may ask * for one of three entries and can also state to the old - * stack its support for the INP_MBUF_ACKCMP and + * stack its support for the INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. This is important since if there are * queued ack's without that statement the old stack will * be forced to discard the queued acks. The requests that * can be made for information by the new stacks are: * * Note also that the tfb_tcp_fb_init() when called can - * determine if a query is needed by looking at the + * determine if a query is needed by looking at the * value passed in the ptr. 
The ptr is designed to be * set in with any allocated memory, but the address * of the condtion (ptr == &tp->t_fb_ptr) will be @@ -582,17 +573,17 @@ typedef enum { * setup of a tcb (which means no query would be needed). * If, however, the value is not t_fb_ptr, then the caller * is in the middle of a stack switch and is the new stack. - * A query would be appropriate (if the new stack support + * A query would be appropriate (if the new stack support * the query mechanism). * * TCP_QUERY_SENDMAP - Query of outstanding data. * TCP_QUERY_TIMERS_UP - Query about running timers. - * TCP_SUPPORTED_LRO - Declaration in req_param of - * the inp_flags2 supported by + * TCP_SUPPORTED_LRO - Declaration in req_param of + * the inp_flags2 supported by * the new stack. * TCP_QUERY_RACK_TIMES - Enquire about various timestamps * and states the old stack may be in. - * + * * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being @@ -936,9 +927,12 @@ struct in_conninfo; + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* - * TCP statistics. - * Many of these should be kept per connection, - * but that's inconvenient at the moment. + * Global (per-VNET) TCP statistics. The below structure represents what we + * export to the userland, but in the kernel we have an array of counter_u64_t + * with as many elements as there are members in the structure. The counters + * shall be increased by TCPSTAT_INC() or KMOD_TCPSTAT_INC(). Adding a new + * counter also requires adding corresponding SDT probes into in_kdtrace.h and + * into in_kdtrace.c. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ @@ -1024,6 +1018,8 @@ struct tcpstat { uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ + uint64_t tcps_sc_spurcookie; /* SYN cookie spurious, rejected */ + uint64_t tcps_sc_failcookie; /* SYN cookie failed, rejected */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ @@ -1243,6 +1239,9 @@ struct tcp_function_info { #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ +#define TCPCTL_KTLSLIST 17 /* connections with active ktls + session */ +#define TCPCTL_KTLSLIST_WKEYS 18 /* KTLSLIST with key data exported */ #ifdef _KERNEL #ifdef SYSCTL_DECL @@ -1380,8 +1379,7 @@ int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); -void tcp_dropwithreset(struct mbuf *, struct tcphdr *, - struct tcpcb *, int, int); +void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c index 76aadad9a3b9..4203029ff7c3 100644 --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -525,7 +525,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err) /* * Temporary failure during offload, take this PCB back. * Detach from the TOE driver and do the rest of what - * TCP's pru_connect would have done if the connection + * TCP's pr_connect() would have done if the connection * wasn't offloaded. 
*/ diff --git a/sys/netinet/toecore.h b/sys/netinet/toecore.h index 612c2fe1caf5..843b261ec162 100644 --- a/sys/netinet/toecore.h +++ b/sys/netinet/toecore.h @@ -66,7 +66,7 @@ struct toedev { void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *); /* - * This is called by the kernel during pru_rcvd for an offloaded TCP + * This is called by the kernel during pr_rcvd() for an offloaded TCP * connection and provides an opportunity for the TOE driver to manage * its rx window and credits. */ diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h index edff456ba70e..010f2210b516 100644 --- a/sys/netinet/udp.h +++ b/sys/netinet/udp.h @@ -44,7 +44,7 @@ struct udphdr { u_short uh_dport; /* destination port */ u_short uh_ulen; /* udp length */ u_short uh_sum; /* udp checksum */ -}; +} __packed; /* * User-settable options (used with setsockopt). diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index dafbaf6dc672..3e6519118a40 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -243,7 +243,6 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in6 udp_in6; #endif struct udpcb *up; - bool filtered; INP_LOCK_ASSERT(inp); @@ -252,13 +251,19 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { + bool filtered; + in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0], up->u_tun_ctx); INP_RLOCK(inp); - if (filtered) - return (in_pcbrele_rlocked(inp)); + if (in_pcbrele_rlocked(inp)) + return (1); + if (filtered) { + INP_RUNLOCK(inp); + return (1); + } } off += sizeof(struct udphdr); @@ -443,7 +448,7 @@ udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast - * or multicast datgram.) + * or multicast datagram.) */ UDPSTAT_INC(udps_noport); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) @@ -555,6 +560,12 @@ udp_input(struct mbuf **mp, int *offp, int proto) ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_UDP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + uh_sum = 0; } else { char b[offsetof(struct ipovly, ih_src)]; struct ipovly *ipov = (struct ipovly *)ip; @@ -643,7 +654,11 @@ udp_input(struct mbuf **mp, int *offp, int proto) else UDP_PROBE(receive, NULL, NULL, ip, NULL, uh); UDPSTAT_INC(udps_noport); - if (m->m_flags & (M_BCAST | M_MCAST)) { + if (m->m_flags & M_MCAST) { + UDPSTAT_INC(udps_noportmcast); + goto badunlocked; + } + if (m->m_flags & M_BCAST) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } |
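The udp_append() change above is the canonical drop-the-lock-around-a-callback pattern: pin the pcb with a reference before unlocking for the tunnel function, then on return release the pin and check both whether the pcb died in the interim and whether the callback consumed the packet. Its shape, with all names invented:

#include <stdbool.h>

struct pcb { int refs; bool dead; };

static void pcb_lock(struct pcb *p)	{ (void)p; /* stand-in */ }
static void pcb_unlock(struct pcb *p)	{ (void)p; /* stand-in */ }

/* Drop one reference; report whether the pcb is gone and unusable. */
static bool
pcb_rele(struct pcb *p)
{
	return (--p->refs == 0 && p->dead);
}

/* Returns 1 when delivery must stop: pcb freed or packet consumed. */
static int
tunnel_deliver(struct pcb *p, bool (*tun_func)(void *), void *pkt)
{
	bool filtered;

	p->refs++;			/* in_pcbref() analogue: pin it */
	pcb_unlock(p);			/* the callback runs unlocked */
	filtered = tun_func(pkt);
	pcb_lock(p);
	if (pcb_rele(p))		/* always drop the pin first */
		return (1);
	if (filtered) {
		pcb_unlock(p);
		return (1);
	}
	return (0);			/* continue normal delivery */
}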