Diffstat (limited to 'sys/netinet')
-rw-r--r--  sys/netinet/cc/cc_cubic.c | 164
-rw-r--r--  sys/netinet/cc/cc_cubic.h | 160
-rw-r--r--  sys/netinet/dccp.h | 2
-rw-r--r--  sys/netinet/icmp6.h | 3
-rw-r--r--  sys/netinet/icmp_var.h | 10
-rw-r--r--  sys/netinet/if_ether.c | 7
-rw-r--r--  sys/netinet/igmp.c | 33
-rw-r--r--  sys/netinet/in.c | 8
-rw-r--r--  sys/netinet/in_fib_dxr.c | 6
-rw-r--r--  sys/netinet/in_kdtrace.c | 2
-rw-r--r--  sys/netinet/in_kdtrace.h | 2
-rw-r--r--  sys/netinet/in_pcb.c | 19
-rw-r--r--  sys/netinet/in_pcb.h | 25
-rw-r--r--  sys/netinet/in_prot.c | 23
-rw-r--r--  sys/netinet/in_rss.c | 2
-rw-r--r--  sys/netinet/in_systm.h | 4
-rw-r--r--  sys/netinet/ip.h | 7
-rw-r--r--  sys/netinet/ip_carp.c | 27
-rw-r--r--  sys/netinet/ip_fastfwd.c | 22
-rw-r--r--  sys/netinet/ip_fw.h | 220
-rw-r--r--  sys/netinet/ip_icmp.c | 17
-rw-r--r--  sys/netinet/ip_var.h | 2
-rw-r--r--  sys/netinet/libalias/alias.c | 164
-rw-r--r--  sys/netinet/libalias/alias_db.c | 90
-rw-r--r--  sys/netinet/libalias/alias_irc.c | 4
-rw-r--r--  sys/netinet/libalias/alias_local.h | 26
-rw-r--r--  sys/netinet/libalias/alias_sctp.c | 2
-rw-r--r--  sys/netinet/libalias/alias_skinny.c | 4
-rw-r--r--  sys/netinet/libalias/alias_smedia.c | 4
-rw-r--r--  sys/netinet/pim.h | 2
-rw-r--r--  sys/netinet/raw_ip.c | 2
-rw-r--r--  sys/netinet/sctp_bsd_addr.c | 23
-rw-r--r--  sys/netinet/sctp_input.c | 6
-rw-r--r--  sys/netinet/sctp_pcb.c | 19
-rw-r--r--  sys/netinet/sctp_sysctl.c | 4
-rw-r--r--  sys/netinet/sctp_timer.c | 1
-rw-r--r--  sys/netinet/tcp.h | 2
-rw-r--r--  sys/netinet/tcp_hpts.c | 159
-rw-r--r--  sys/netinet/tcp_hpts.h | 137
-rw-r--r--  sys/netinet/tcp_input.c | 124
-rw-r--r--  sys/netinet/tcp_log_buf.c | 432
-rw-r--r--  sys/netinet/tcp_log_buf.h | 27
-rw-r--r--  sys/netinet/tcp_lro.c | 12
-rw-r--r--  sys/netinet/tcp_lro_hpts.c | 2
-rw-r--r--  sys/netinet/tcp_output.c | 2
-rw-r--r--  sys/netinet/tcp_sack.c | 38
-rw-r--r--  sys/netinet/tcp_stacks/bbr.c | 96
-rw-r--r--  sys/netinet/tcp_stacks/rack.c | 340
-rw-r--r--  sys/netinet/tcp_stacks/rack_bbr_common.c | 14
-rw-r--r--  sys/netinet/tcp_stacks/rack_bbr_common.h | 4
-rw-r--r--  sys/netinet/tcp_stacks/rack_pcm.c | 12
-rw-r--r--  sys/netinet/tcp_stacks/sack_filter.c | 8
-rw-r--r--  sys/netinet/tcp_stacks/sack_filter.h | 2
-rw-r--r--  sys/netinet/tcp_stacks/tcp_bbr.h | 2
-rw-r--r--  sys/netinet/tcp_stacks/tcp_rack.h | 3
-rw-r--r--  sys/netinet/tcp_subr.c | 295
-rw-r--r--  sys/netinet/tcp_syncache.c | 72
-rw-r--r--  sys/netinet/tcp_timer.c | 51
-rw-r--r--  sys/netinet/tcp_timer.h | 34
-rw-r--r--  sys/netinet/tcp_timewait.c | 52
-rw-r--r--  sys/netinet/tcp_usrreq.c | 120
-rw-r--r--  sys/netinet/tcp_var.h | 52
-rw-r--r--  sys/netinet/toecore.c | 2
-rw-r--r--  sys/netinet/toecore.h | 2
-rw-r--r--  sys/netinet/udp.h | 2
-rw-r--r--  sys/netinet/udp_usrreq.c | 25
66 files changed, 2126 insertions, 1113 deletions
diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c
index a2e72130fa88..b3e15009244d 100644
--- a/sys/netinet/cc/cc_cubic.c
+++ b/sys/netinet/cc/cc_cubic.c
@@ -38,7 +38,7 @@
/*
* An implementation of the CUBIC congestion control algorithm for FreeBSD,
- * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
+ * based on RFC9438 by Xu, Ha, Rhee, Goel, and Eggert.
* Originally released as part of the NewTCP research project at Swinburne
* University of Technology's Centre for Advanced Internet Architectures,
* Melbourne, Australia, which was made possible in part by a grant from the
@@ -81,7 +81,7 @@ static void cubic_conn_init(struct cc_var *ccv);
static int cubic_mod_init(void);
static void cubic_post_recovery(struct cc_var *ccv);
static void cubic_record_rtt(struct cc_var *ccv);
-static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg);
+static uint32_t cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg);
static void cubic_after_idle(struct cc_var *ccv);
static size_t cubic_data_sz(void);
static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt);
@@ -236,10 +236,11 @@ static void
cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
{
struct cubic *cubic_data;
- unsigned long W_est, W_cubic;
+ uint32_t W_est, W_cubic, cwin, target, incr;
int usecs_since_epoch;
uint32_t mss = tcp_fixed_maxseg(ccv->tp);
+ cwin = CCV(ccv, snd_cwnd);
cubic_data = ccv->cc_data;
cubic_record_rtt(ccv);
@@ -250,7 +251,7 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
(ccv->flags & CCF_CWND_LIMITED)) {
/* Use the logic in NewReno ack_received() for slow start. */
- if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
+ if (cwin <= CCV(ccv, snd_ssthresh) ||
cubic_data->min_rtt_usecs == TCPTV_SRTTBASE) {
cubic_does_slow_start(ccv, cubic_data);
} else {
@@ -265,20 +266,32 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED;
cubic_log_hystart_event(ccv, cubic_data, 11, CCV(ccv, snd_ssthresh));
}
- if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) &&
- (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) {
- /* RFC8312 Section 4.7 */
- cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT |
- CUBICFLAG_IN_SLOWSTART);
- cubic_data->W_max = CCV(ccv, snd_cwnd);
- cubic_data->t_epoch = ticks;
- cubic_data->K = 0;
- } else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
+ if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
+ CUBICFLAG_CONG_EVENT |
CUBICFLAG_IN_APPLIMIT)) {
+ /*
+ * At the beginning of the current congestion
+ * avoidance stage, the epoch variables
+ * (t_epoch, cwnd_epoch, K) are updated in the
+ * following three cases:
+ * 1) just exited slow start
+ * 2) after a congestion event
+ * 3) application-limited
+ */
+ cubic_data->t_epoch = ticks;
+ cubic_data->cwnd_epoch = cwin;
+ cubic_data->K = cubic_k(cubic_data->W_max / mss,
+ cubic_data->cwnd_epoch / mss);
cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
+ CUBICFLAG_CONG_EVENT |
CUBICFLAG_IN_APPLIMIT);
- cubic_data->t_epoch = ticks;
- cubic_data->K = cubic_k(cubic_data->W_max / mss);
+
+ if (cubic_data->flags & CUBICFLAG_RTO_EVENT) {
+ /* RFC9438 Section 4.8: Timeout */
+ cubic_data->flags &= ~CUBICFLAG_RTO_EVENT;
+ cubic_data->W_max = cwin;
+ cubic_data->K = 0;
+ }
}
usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick;
if (usecs_since_epoch < 0) {
@@ -288,12 +301,9 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
usecs_since_epoch = INT_MAX;
cubic_data->t_epoch = ticks - INT_MAX;
}
-
W_est = tf_cwnd(ccv);
-
/*
- * The mean RTT is used to best reflect the equations in
- * the I-D.
+ * The mean RTT is used to best reflect the equations.
*/
W_cubic = cubic_cwnd(usecs_since_epoch +
cubic_data->mean_rtt_usecs,
@@ -302,33 +312,24 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
cubic_data->K);
if (W_cubic < W_est) {
- /*
- * TCP-friendly region, follow tf
- * cwnd growth.
- */
- CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX);
+ /* RFC9438 Section 4.3: Reno-friendly region */
+ CCV(ccv, snd_cwnd) = W_est;
cubic_data->flags |= CUBICFLAG_IN_TF;
- } else if (CCV(ccv, snd_cwnd) < W_cubic) {
+ } else {
/*
- * Concave or convex region, follow CUBIC
- * cwnd growth.
- * Only update snd_cwnd, if it doesn't shrink.
+ * RFC9438 Section 4.4 or 4.5:
+ * Concave or Convex Region
*/
- CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX);
- cubic_data->flags &= ~CUBICFLAG_IN_TF;
- }
-
- /*
- * If we're not in slow start and we're probing for a
- * new cwnd limit at the start of a connection
- * (happens when hostcache has a relevant entry),
- * keep updating our current estimate of the
- * W_max.
- */
- if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) &&
- cubic_data->W_max < CCV(ccv, snd_cwnd)) {
- cubic_data->W_max = CCV(ccv, snd_cwnd);
- cubic_data->K = cubic_k(cubic_data->W_max / mss);
+ if (W_cubic < cwin) {
+ target = cwin;
+ } else if (W_cubic > ((cwin * 3) >> 1)) {
+ target = (cwin * 3) >> 1;
+ } else {
+ target = W_cubic;
+ }
+ incr = (((target - cwin) << CUBIC_SHIFT) /
+ cwin * mss) >> CUBIC_SHIFT;
+ CCV(ccv, snd_cwnd) = cwin + incr;
}
}
} else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
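For illustration, the clamped growth step above follows RFC9438 Section 4.5: per ACK the target window is capped at 1.5 * cwnd, and cwnd grows by (target - cwnd) / cwnd segments. A minimal userspace sketch of that arithmetic, assuming CUBIC_SHIFT is 8 as in cc_cubic.h (all values invented):

#include <stdint.h>
#include <stdio.h>

#define CUBIC_SHIFT	8	/* assumed to match cc_cubic.h */

int
main(void)
{
	uint32_t cwin = 100000, W_cubic = 180000, mss = 1460;
	uint32_t target, incr;

	if (W_cubic < cwin)
		target = cwin;			/* never shrink cwnd */
	else if (W_cubic > ((cwin * 3) >> 1))
		target = (cwin * 3) >> 1;	/* cap at 1.5 * cwnd */
	else
		target = W_cubic;
	incr = (((target - cwin) << CUBIC_SHIFT) / cwin * mss) >> CUBIC_SHIFT;
	printf("cwnd %u -> %u (+%u bytes)\n", cwin, cwin + incr, incr);
	return (0);
}

With these numbers target is clamped to 150000 and incr comes out to 730 bytes, i.e. half an MSS per ACK while the cubic function sits far above the current window.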
@@ -345,12 +346,11 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
static void
cubic_after_idle(struct cc_var *ccv)
{
- struct cubic *cubic_data;
-
- cubic_data = ccv->cc_data;
+ struct cubic *cubic_data = ccv->cc_data;
+ uint32_t mss = tcp_fixed_maxseg(ccv->tp);
cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd));
- cubic_data->K = cubic_k(cubic_data->W_max / tcp_fixed_maxseg(ccv->tp));
+ cubic_data->K = cubic_k(cubic_data->W_max / mss, cubic_data->cwnd_epoch / mss);
if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) {
/*
* Re-enable hystart if we have been idle.
@@ -389,7 +389,9 @@ cubic_cb_init(struct cc_var *ccv, void *ptr)
cubic_data = ptr;
/* Init some key variables with sensible defaults. */
- cubic_data->t_epoch = ticks;
+ cubic_data->t_epoch = 0;
+ cubic_data->cwnd_epoch = 0;
+ cubic_data->K = 0;
cubic_data->min_rtt_usecs = TCPTV_SRTTBASE;
cubic_data->mean_rtt_usecs = 1;
@@ -416,7 +418,7 @@ static void
cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
{
struct cubic *cubic_data;
- uint32_t mss, pipe;
+ uint32_t mss, pipe, ssthresh;
cubic_data = ccv->cc_data;
mss = tcp_fixed_maxseg(ccv->tp);
@@ -431,10 +433,13 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
}
if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
- cubic_ssthresh_update(ccv, mss);
+ ssthresh = cubic_get_ssthresh(ccv, mss);
+ CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss);
+ /*
+ * The congestion flag causes K to be recalculated at the
+ * beginning of the next congestion avoidance stage.
+ */
cubic_data->flags |= CUBICFLAG_CONG_EVENT;
- cubic_data->t_epoch = ticks;
- cubic_data->K = cubic_k(cubic_data->W_max / mss);
}
ENTER_RECOVERY(CCV(ccv, t_flags));
}
@@ -448,17 +453,20 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
cubic_log_hystart_event(ccv, cubic_data, 9, CCV(ccv, snd_ssthresh));
}
if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
- cubic_ssthresh_update(ccv, mss);
+ ssthresh = cubic_get_ssthresh(ccv, mss);
+ CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss);
+ CCV(ccv, snd_cwnd) = max(ssthresh, mss);
+ /*
+ * The congestion flag causes K to be recalculated at the
+ * beginning of the next congestion avoidance stage.
+ */
cubic_data->flags |= CUBICFLAG_CONG_EVENT;
- cubic_data->t_epoch = ticks;
- cubic_data->K = cubic_k(cubic_data->W_max / mss);
- CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
ENTER_CONGRECOVERY(CCV(ccv, t_flags));
}
break;
case CC_RTO:
- /* RFC8312 Section 4.7 */
+ /* RFC9438 Section 4.8: Timeout */
if (CCV(ccv, t_rxtshift) == 1) {
/*
* Remember the state only for the first RTO event. This
@@ -475,12 +483,16 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
(((uint64_t)min(CCV(ccv, snd_wnd), pipe) *
CUBIC_BETA) >> CUBIC_SHIFT) / mss) * mss;
}
- cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT;
+ /*
+ * The RTO flag causes K to be recalculated at the
+ * beginning of the next congestion avoidance stage.
+ */
+ cubic_data->flags |= CUBICFLAG_RTO_EVENT;
CCV(ccv, snd_cwnd) = mss;
break;
case CC_RTO_ERR:
- cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT);
+ cubic_data->flags &= ~CUBICFLAG_RTO_EVENT;
cubic_data->K = cubic_data->undo_K;
cubic_data->W_max = cubic_data->undo_W_max;
cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch;
@@ -503,7 +515,7 @@ cubic_conn_init(struct cc_var *ccv)
* this here bad things happen when entries from the TCP hostcache
* get used.
*/
- cubic_data->W_max = CCV(ccv, snd_cwnd);
+ cubic_data->W_max = UINT_MAX;
}
static int
@@ -603,44 +615,36 @@ cubic_record_rtt(struct cc_var *ccv)
}
/*
- * Update the ssthresh in the event of congestion.
+ * Return the new value for ssthresh in the event of congestion.
*/
-static void
-cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg)
+static uint32_t
+cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg)
{
struct cubic *cubic_data;
- uint32_t ssthresh;
- uint32_t cwnd;
+ uint32_t cwnd, pipe;
cubic_data = ccv->cc_data;
cwnd = CCV(ccv, snd_cwnd);
- /* Fast convergence heuristic. */
+ /* RFC9438 Section 4.7: Fast convergence */
if (cwnd < cubic_data->W_max) {
cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT;
}
- cubic_data->undo_W_max = cubic_data->W_max;
cubic_data->W_max = cwnd;
if (cubic_data->flags & CUBICFLAG_IN_TF) {
- /* If in the TCP friendly region, follow what newreno does */
- ssthresh = newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg);
+ /* If in the TCP friendly region, follow what newreno does. */
+ return (newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg));
- } else if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) {
- /*
- * On the first congestion event, set ssthresh to cwnd * 0.5
- * and reduce W_max to cwnd * beta. This aligns the cubic
- * concave region appropriately.
- */
- ssthresh = cwnd >> 1;
- cubic_data->W_max = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT;
} else {
/*
- * On subsequent congestion events, set ssthresh to cwnd * beta.
+ * RFC9438 Section 4.6: Multiplicative Decrease
+ * Outside the TCP friendly region, set ssthresh to
+ * inflight_size * beta.
*/
- ssthresh = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT;
+ pipe = tcp_compute_pipe(ccv->tp);
+ return ((pipe * CUBIC_BETA) >> CUBIC_SHIFT);
}
- CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg);
}
static void
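A minimal sketch of the new decrease rule above (RFC9438 Section 4.6), where ssthresh is derived from the amount of data actually in flight rather than from cwnd. CUBIC_BETA is assumed to be 0.7 in CUBIC_SHIFT fixed point as in cc_cubic.h, and the pipe value is invented:

#include <stdint.h>
#include <stdio.h>

#define CUBIC_SHIFT	8	/* assumed */
#define CUBIC_BETA	179	/* ~0.7 * 2^CUBIC_SHIFT, assumed */

int
main(void)
{
	uint32_t pipe = 120000;	/* bytes estimated in flight */
	uint32_t mss = 1460;
	uint32_t ssthresh;

	/* RFC9438 Section 4.6: ssthresh = beta * inflight_size */
	ssthresh = ((uint64_t)pipe * CUBIC_BETA) >> CUBIC_SHIFT;
	if (ssthresh < 2 * mss)		/* floor applied by the caller above */
		ssthresh = 2 * mss;
	printf("ssthresh = %u bytes\n", ssthresh);
	return (0);
}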
diff --git a/sys/netinet/cc/cc_cubic.h b/sys/netinet/cc/cc_cubic.h
index c30128570ab0..c31506d26b00 100644
--- a/sys/netinet/cc/cc_cubic.h
+++ b/sys/netinet/cc/cc_cubic.h
@@ -88,14 +88,23 @@
/* Kernel only bits */
#ifdef _KERNEL
struct cubic {
- /* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. */
+ /*
+ * CUBIC K in fixed point form with CUBIC_SHIFT worth of precision.
+ * This also equals the time period in seconds it takes for the
+ * congestion window to grow from its size at the beginning of the
+ * current congestion avoidance stage up to W_max.
+ */
int64_t K;
/* Sum of RTT samples across an epoch in usecs. */
int64_t sum_rtt_usecs;
- /* Size of cwnd just before cwnd was reduced in the last congestion event */
- uint64_t W_max;
- /* The cwnd at the beginning of the current congestion avoidance stage */
- uint64_t cwnd_epoch;
+ /* Size of cwnd (in bytes) just before cwnd was reduced in the last congestion event. */
+ uint32_t W_max;
+ /* An estimate (in bytes) for the congestion window in the Reno-friendly region */
+ uint32_t W_est;
+ /* An estimate (in bytes) for the congestion window in the CUBIC region */
+ uint32_t W_cubic;
+ /* The cwnd (in bytes) at the beginning of the current congestion avoidance stage. */
+ uint32_t cwnd_epoch;
/* various flags */
uint32_t flags;
/* Minimum observed rtt in usecs. */
@@ -110,8 +119,8 @@ struct cubic {
int undo_t_epoch;
/* Few variables to restore the state after RTO_ERR */
int64_t undo_K;
- uint64_t undo_W_max;
- uint64_t undo_cwnd_epoch;
+ uint32_t undo_W_max;
+ uint32_t undo_cwnd_epoch;
uint32_t css_baseline_minrtt;
uint32_t css_current_round_minrtt;
uint32_t css_lastround_minrtt;
@@ -130,60 +139,103 @@ struct cubic {
extern int hz;
/*
- * Implementation based on the formulae found in the CUBIC Internet Draft
- * "draft-ietf-tcpm-cubic-04".
+ * Implementation based on the formulas in RFC9438.
*
*/
-static __inline float
-theoretical_cubic_k(double wmax_pkts)
+
+/*
+ * Returns K, the time period in seconds it takes for the congestion
+ * window to grow from its size at the beginning of the current
+ * congestion avoidance stage up to W_max.
+ */
+static inline float
+theoretical_cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs)
{
double C;
C = 0.4;
+ if (wmax_segs <= cwnd_epoch_segs)
+ return 0.0;
- return (pow((wmax_pkts * 0.3) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT));
+ /*
+ * Figure 2: K = ((W_max - cwnd_epoch) / C)^(1/3)
+ */
+ return (pow((wmax_segs - cwnd_epoch_segs) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT));
}
-static __inline unsigned long
-theoretical_cubic_cwnd(int ticks_since_epoch, unsigned long wmax, uint32_t smss)
+/*
+ * Returns the congestion window in segments at time t in seconds based on the
+ * cubic increase function, where t is the elapsed time in seconds from the
+ * beginning of the current congestion avoidance stage, as described in RFC9438
+ * Section 4.2.
+ */
+static inline unsigned long
+theoretical_cubic_cwnd(int ticks_elapsed, uint32_t wmax_segs, uint32_t cwnd_epoch_segs)
{
- double C, wmax_pkts;
+ double C, t;
+ float K;
C = 0.4;
- wmax_pkts = wmax / (double)smss;
+ t = ticks_elapsed / (double)hz;
+ K = theoretical_cubic_k(wmax_segs, cwnd_epoch_segs);
- return (smss * (wmax_pkts +
- (C * pow(ticks_since_epoch / (double)hz -
- theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0))));
+ /*
+ * Figure 1: W_cubic(t) = C * (t - K)^3 + W_max
+ */
+ return (C * pow(t - K / pow(2, CUBIC_SHIFT), 3.0) + wmax_segs);
}
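A quick sanity check on Figure 1: at t = K the cubic term vanishes and W_cubic(t) returns exactly to W_max, while at t = 0 it starts from cwnd_epoch. A small standalone sketch (compile with -lm; segment counts invented):

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double C = 0.4;
	double wmax = 100.0, cwnd_epoch = 70.0;		/* segments */
	double K = pow((wmax - cwnd_epoch) / C, 1.0 / 3.0);
	double t;

	for (t = 0.0; t <= 2.0 * K; t += K / 2.0)
		/* Figure 1: W_cubic(t) = C * (t - K)^3 + W_max */
		printf("t = %5.2f s, W_cubic = %6.2f segments\n",
		    t, C * pow(t - K, 3.0) + wmax);
	return (0);
}

With these numbers K is about 4.22 s; the output starts at 70 segments (cwnd_epoch), passes exactly through 100 (W_max) at t = K, and continues convexly beyond it.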
-static __inline unsigned long
-theoretical_reno_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax,
- uint32_t smss)
+/*
+ * Returns estimated Reno congestion window in segments.
+ */
+static inline unsigned long
+theoretical_reno_cwnd(int ticks_elapsed, int rtt_ticks, uint32_t wmax_segs)
{
- return ((wmax * 0.5) + ((ticks_since_epoch / (float)rtt_ticks) * smss));
+ return (wmax_segs * 0.5 + ticks_elapsed / (float)rtt_ticks);
}
-static __inline unsigned long
-theoretical_tf_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax,
- uint32_t smss)
+/*
+ * Returns an estimate for the congestion window in segments in the
+ * Reno-friendly region -- that is, an estimate for the congestion window of
+ * Reno, as described in RFC9438 Section 4.3, where:
+ * cwnd: Current congestion window in segments.
+ * cwnd_prior: Size of cwnd in segments at the time of setting ssthresh most
+ * recently, either upon exiting the first slow start or just before
+ * cwnd was reduced in the last congestion event.
+ * W_est: An estimate for the congestion window in segments in the Reno-friendly
+ * region -- that is, an estimate for the congestion window of Reno.
+ */
+static inline unsigned long
+theoretical_tf_cwnd(unsigned long W_est, unsigned long segs_acked, unsigned long cwnd,
+ unsigned long cwnd_prior)
{
+ float cubic_alpha, cubic_beta;
+
+ /* RFC9438 Section 4.6: The parameter β_cubic SHOULD be set to 0.7. */
+ cubic_beta = 0.7;
- return ((wmax * 0.7) + ((3 * 0.3) / (2 - 0.3) *
- (ticks_since_epoch / (float)rtt_ticks) * smss));
+ if (W_est >= cwnd_prior)
+ cubic_alpha = 1.0;
+ else
+ cubic_alpha = (3.0 * (1.0 - cubic_beta)) / (1.0 + cubic_beta);
+
+ /*
+ * Figure 4: W_est = W_est + α_cubic * segments_acked / cwnd
+ */
+ return (W_est + cubic_alpha * segs_acked / cwnd);
}
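For intuition, worked numbers for Figure 4 with the beta above: while W_est < cwnd_prior, alpha_cubic = 3 * (1 - 0.7) / (1 + 0.7) = 0.9 / 1.7, roughly 0.529, so the Reno estimate grows by only about 0.53 segments per RTT when a full window is acknowledged (segs_acked = cwnd). Once W_est reaches cwnd_prior, alpha_cubic becomes 1 and growth reverts to the classic one segment per RTT.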
#endif /* !_KERNEL */
/*
* Compute the CUBIC K value used in the cwnd calculation, using an
- * implementation of eqn 2 in the I-D. The method used
- * here is adapted from Apple Computer Technical Report #KT-32.
+ * implementation of the formula in Figure 2 of RFC9438.
+ * The method used here is adapted from Apple Computer Technical Report #KT-32.
*/
-static __inline int64_t
-cubic_k(unsigned long wmax_pkts)
+static inline int64_t
+cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs)
{
int64_t s, K;
uint16_t p;
@@ -191,8 +243,13 @@ cubic_k(unsigned long wmax_pkts)
K = s = 0;
p = 0;
- /* (wmax * beta)/C with CUBIC_SHIFT worth of precision. */
- s = ((wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) / CUBIC_C_FACTOR;
+ /* Handle the corner case where W_max <= cwnd_epoch */
+ if (wmax_segs <= cwnd_epoch_segs) {
+ return 0;
+ }
+
+ /* (wmax - cwnd_epoch) / C with CUBIC_SHIFT worth of precision. */
+ s = ((wmax_segs - cwnd_epoch_segs) << (2 * CUBIC_SHIFT)) / CUBIC_C_FACTOR;
/* Rebase s to be between 1 and 1/8 with a shift of CUBIC_SHIFT. */
while (s >= 256) {
@@ -213,13 +270,14 @@ cubic_k(unsigned long wmax_pkts)
}
/*
- * Compute the new cwnd value using an implementation of eqn 1 from the I-D.
+ * Compute and return the new cwnd value in bytes using an implementation
+ * of the formula in Figure 1 of RFC9438.
* Thanks to Kip Macy for help debugging this function.
*
* XXXLAS: Characterise bounds for overflow.
*/
-static __inline unsigned long
-cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K)
+static inline uint32_t
+cubic_cwnd(int usecs_since_epoch, uint32_t wmax, uint32_t smss, int64_t K)
{
int64_t cwnd;
@@ -238,7 +296,7 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K)
cwnd *= (cwnd * cwnd);
/*
- * C(t - K)^3 + wmax
+ * Figure 1: C * (t - K)^3 + wmax
* The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of
* CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above,
* and an extra from multiplying through by CUBIC_C_FACTOR.
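To make that shift bookkeeping concrete: (t - K) carries one CUBIC_SHIFT of fraction, cubing it yields three, and the multiplication by CUBIC_C_FACTOR (C scaled by CUBIC_SHIFT) adds a fourth, so the product holds 4 * CUBIC_SHIFT fraction bits. A single right shift by CUBIC_SHIFT_4 (assumed to be 4 * CUBIC_SHIFT, as the name suggests) therefore restores a plain byte count before W_max is added.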
@@ -253,33 +311,9 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K)
}
/*
- * Compute an approximation of the NewReno cwnd some number of usecs after a
- * congestion event. RTT should be the average RTT estimate for the path
- * measured over the previous congestion epoch and wmax is the value of cwnd at
- * the last congestion event. The "TCP friendly" concept in the CUBIC I-D is
- * rather tricky to understand and it turns out this function is not required.
- * It is left here for reference.
- *
- * XXX: Not used
- */
-static __inline unsigned long
-reno_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax,
- uint32_t smss)
-{
-
- /*
- * For NewReno, beta = 0.5, therefore: W_tcp(t) = wmax*0.5 + t/RTT
- * W_tcp(t) deals with cwnd/wmax in pkts, so because our cwnd is in
- * bytes, we have to multiply by smss.
- */
- return (((wmax * RENO_BETA) + (((usecs_since_epoch * smss)
- << CUBIC_SHIFT) / rtt_usecs)) >> CUBIC_SHIFT);
-}
-
-/*
* Compute the "TCP friendly" cwnd by newreno in congestion avoidance state.
*/
-static __inline unsigned long
+static inline uint32_t
tf_cwnd(struct cc_var *ccv)
{
/* newreno is "TCP friendly" */
diff --git a/sys/netinet/dccp.h b/sys/netinet/dccp.h
index 4fb6a0d2ab3e..da83a1b06861 100644
--- a/sys/netinet/dccp.h
+++ b/sys/netinet/dccp.h
@@ -64,7 +64,7 @@ struct dccphdr {
uint8_t seq[6];
} longseq;
} d_seqno;
-};
+} __packed;
#define d_seqno_short d_seqno.shortseq;
#define d_seqno_long d_seqno.longseq.seq;
diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h
index 7845b682f3e4..2ca5b3433e47 100644
--- a/sys/netinet/icmp6.h
+++ b/sys/netinet/icmp6.h
@@ -713,9 +713,6 @@ void icmp6_redirect_input(struct mbuf *, int);
void icmp6_redirect_output(struct mbuf *, struct nhop_object *);
int icmp6_ratelimit(const struct in6_addr *, const int, const int);
-struct ip6ctlparam;
-void icmp6_mtudisc_update(struct ip6ctlparam *, int);
-
/* XXX: is this the right place for these macros? */
#define icmp6_ifstat_inc(ifp, tag) \
do { \
diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h
index b1f2b0ebf911..b39479565bd6 100644
--- a/sys/netinet/icmp_var.h
+++ b/sys/netinet/icmp_var.h
@@ -100,15 +100,13 @@ void kmod_icmpstat_inc(int statnum);
SYSCTL_DECL(_net_inet_icmp);
extern int badport_bandlim(int);
-#define BANDLIM_UNLIMITED -1
#define BANDLIM_ICMP_UNREACH 0
#define BANDLIM_ICMP_ECHO 1
#define BANDLIM_ICMP_TSTAMP 2
-#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
-#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */
-#define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_SCTP_OOTB 6
-#define BANDLIM_MAX 7
+#define BANDLIM_TCP_RST 3
+#define BANDLIM_ICMP6_UNREACH 4
+#define BANDLIM_SCTP_OOTB 5
+#define BANDLIM_MAX 6
#endif
#endif
diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c
index 88da1b139b1f..dc6ef343662d 100644
--- a/sys/netinet/if_ether.c
+++ b/sys/netinet/if_ether.c
@@ -56,6 +56,7 @@
#include <net/if_dl.h>
#include <net/if_private.h>
#include <net/if_types.h>
+#include <net/if_bridgevar.h>
#include <net/netisr.h>
#include <net/ethernet.h>
#include <net/route.h>
@@ -832,7 +833,7 @@ in_arpinput(struct mbuf *m)
* when we have clusters of interfaces).
*/
CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
- if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
+ if (((bridged && bridge_same_p(ia->ia_ifp->if_bridge, ifp->if_bridge)) ||
ia->ia_ifp == ifp) &&
itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
(ia->ia_ifa.ifa_carp == NULL ||
@@ -842,7 +843,7 @@ in_arpinput(struct mbuf *m)
}
}
CK_LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
- if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
+ if (((bridged && bridge_same_p(ia->ia_ifp->if_bridge, ifp->if_bridge)) ||
ia->ia_ifp == ifp) &&
isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
ifa_ref(&ia->ia_ifa);
@@ -850,7 +851,7 @@ in_arpinput(struct mbuf *m)
}
#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \
- (ia->ia_ifp->if_bridge == ifp->if_softc && \
+ (bridge_get_softc_p(ia->ia_ifp) == ifp->if_softc && \
!bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \
addr == ia->ia_addr.sin_addr.s_addr)
/*
diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c
index 5082b6294ebb..299f3c2e02bb 100644
--- a/sys/netinet/igmp.c
+++ b/sys/netinet/igmp.c
@@ -402,32 +402,43 @@ out:
static int
sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
{
+ struct epoch_tracker et;
int error;
int new;
+ struct igmp_ifsoftc *igi;
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error)
return (error);
- IGMP_LOCK();
-
new = V_igmp_default_version;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error || !req->newptr)
- goto out_locked;
+ return (error);
- if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
- error = EINVAL;
- goto out_locked;
- }
+ if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3)
+ return (EINVAL);
+
+ IN_MULTI_LIST_LOCK();
+ IGMP_LOCK();
+ NET_EPOCH_ENTER(et);
- CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
- V_igmp_default_version, new);
+ if (V_igmp_default_version != new) {
+ CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
+ V_igmp_default_version, new);
- V_igmp_default_version = new;
+ V_igmp_default_version = new;
-out_locked:
+ LIST_FOREACH(igi, &V_igi_head, igi_link) {
+ if (igi->igi_version > V_igmp_default_version) {
+ igmp_set_version(igi, V_igmp_default_version);
+ }
+ }
+ }
+
+ NET_EPOCH_EXIT(et);
+ IN_MULTI_LIST_UNLOCK();
IGMP_UNLOCK();
return (error);
}
diff --git a/sys/netinet/in.c b/sys/netinet/in.c
index 2fcbff8611ff..963449d4b4b1 100644
--- a/sys/netinet/in.c
+++ b/sys/netinet/in.c
@@ -57,6 +57,7 @@
#include <net/if_llatbl.h>
#include <net/if_private.h>
#include <net/if_types.h>
+#include <net/if_bridgevar.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
@@ -519,6 +520,13 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred
#endif
/*
+ * Check whether the bridge allows adding addresses to member interfaces.
+ */
+ if (ifp->if_bridge && bridge_member_ifaddrs_p &&
+ !bridge_member_ifaddrs_p())
+ return (EINVAL);
+
+ /*
* See whether address already exist.
*/
iaIsFirst = true;
diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c
index b889131b544b..538cd43a88a3 100644
--- a/sys/netinet/in_fib_dxr.c
+++ b/sys/netinet/in_fib_dxr.c
@@ -345,7 +345,7 @@ initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
struct heap_entry *fhp = &da->heap[0];
struct rtentry *rt;
struct route_nhop_data rnd;
-
+
da->heap_index = 0;
da->dst.sin_addr.s_addr = htonl(dst_u32);
rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
@@ -1143,7 +1143,7 @@ dxr_destroy(void *data)
free(da, M_DXRAUX);
}
-static void
+static void
epoch_dxr_destroy(epoch_context_t ctx)
{
struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
@@ -1202,7 +1202,7 @@ dxr_dump_end(void *data, struct fib_dp *dp)
static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{
-
+
return (FLM_SUCCESS);
}
diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c
index 7e0b9a6a9373..de2a98ce541c 100644
--- a/sys/netinet/in_kdtrace.c
+++ b/sys/netinet/in_kdtrace.c
@@ -286,6 +286,8 @@ MIB_PROBE_TCP(tcps_sc_unreach);
MIB_PROBE_TCP(tcps_sc_zonefail);
MIB_PROBE_TCP(tcps_sc_sendcookie);
MIB_PROBE_TCP(tcps_sc_recvcookie);
+MIB_PROBE_TCP(tcps_sc_spurcookie);
+MIB_PROBE_TCP(tcps_sc_failcookie);
MIB_PROBE_TCP(tcps_hc_added);
MIB_PROBE_TCP(tcps_hc_bucketoverflow);
diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h
index 7b0d433c60d8..a203b660d777 100644
--- a/sys/netinet/in_kdtrace.h
+++ b/sys/netinet/in_kdtrace.h
@@ -278,6 +278,8 @@ SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_unreach);
SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_zonefail);
SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_sendcookie);
SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_recvcookie);
+SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_spurcookie);
+SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_failcookie);
SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_added);
SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_bucketoverflow);
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 3774f73a7a8f..dbe48242381d 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -235,7 +235,7 @@ VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
in_pcbhashseed_init, NULL);
#ifdef INET
-VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1;
+VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
#define V_connect_inaddr_wild VNET(connect_inaddr_wild)
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
@@ -1745,6 +1745,23 @@ in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
}
/*
+ * Dereference and rlock inp, for which the caller must own the
+ * reference. Returns true if inp no longer usable, false otherwise.
+ */
+bool
+in_pcbrele_rlock(struct inpcb *inp)
+{
+ INP_RLOCK(inp);
+ if (in_pcbrele_rlocked(inp))
+ return (true);
+ if ((inp->inp_flags & INP_FREED) != 0) {
+ INP_RUNLOCK(inp);
+ return (true);
+ }
+ return (false);
+}
+
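A hedged sketch of the intended caller pattern for in_pcbrele_rlock(): the function always consumes the caller's reference and either leaves the inpcb read-locked (returning false) or reports it unusable with the lock released (returning true), so the caller never unlocks on the true path. Names other than those in the diff are illustrative:

/* Sketch only, not from this commit. */
static void
example_use(struct inpcb *inp)	/* caller holds a reference on inp */
{
	if (in_pcbrele_rlock(inp))
		return;		/* reference consumed; inp freed or dying */
	/* inp is valid and read-locked; the reference is already released */
	example_inspect(inp);	/* hypothetical consumer */
	INP_RUNLOCK(inp);
}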
+/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 5fe12c4f1e76..9e0618e87601 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -303,6 +303,30 @@ struct sockopt_parameters {
char sop_optval[];
};
+#ifdef _SYS_KTLS_H_
+struct xktls_session {
+ uint32_t tsz; /* total size of element; next element is at this + tsz */
+ uint32_t fsz; /* size of the struct up to keys */
+ uint64_t inp_gencnt;
+ kvaddr_t so_pcb;
+ struct in_conninfo coninf;
+ u_short rx_vlan_id;
+ struct xktls_session_onedir rcv;
+ struct xktls_session_onedir snd;
+/*
+ * Following the fixed part of the structure are
+ * - keydata for rcv, first cipher of length rcv.cipher_key_len, then
+ * authentication of length rcv.auth_key_len;
+ * - driver data (string) of length rcv.drv_st_len, if the rcv session is
+ * offloaded to ifnet rcv.ifnet;
+ * - keydata for snd, first cipher of length snd.cipher_key_len, then
+ * authentication of length snd.auth_key_len;
+ * - driver data (string) of length snd.drv_st_len, if the snd session is
+ * offloaded to ifnet snd.ifnet;
+ */
+};
+#endif /* _SYS_KTLS_H_ */
+
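Since each exported record is variable length (keys and driver data follow the fixed part), a consumer advances through the buffer by tsz. A hedged userspace sketch, assuming only the layout above is in scope (the struct requires sys/ktls.h) and that buf/len came from whatever interface exports these records:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void
walk_ktls_sessions(const char *buf, size_t len)
{
	const struct xktls_session *xk;
	size_t off;

	for (off = 0; off + sizeof(*xk) <= len; off += xk->tsz) {
		xk = (const struct xktls_session *)(buf + off);
		if (xk->tsz < xk->fsz || off + xk->tsz > len)
			break;		/* malformed record, stop */
		/* fixed part is fsz bytes; key/driver data follow it */
		printf("gen %ju vlan %u\n",
		    (uintmax_t)xk->inp_gencnt, xk->rx_vlan_id);
	}
}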
#ifdef _KERNEL
int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
int (*ctloutput_set)(struct inpcb *, struct sockopt *));
@@ -657,6 +681,7 @@ void in_pcbref(struct inpcb *);
bool in_pcbrele(struct inpcb *, inp_lookup_t);
bool in_pcbrele_rlocked(struct inpcb *);
bool in_pcbrele_wlocked(struct inpcb *);
+bool in_pcbrele_rlock(struct inpcb *inp);
typedef bool inp_match_t(const struct inpcb *, void *);
struct inpcb_iterator {
diff --git a/sys/netinet/in_prot.c b/sys/netinet/in_prot.c
index 204f4f60456e..69f0f3694096 100644
--- a/sys/netinet/in_prot.c
+++ b/sys/netinet/in_prot.c
@@ -26,21 +26,17 @@
*/
/*
- * System calls related to processes and protection
+ * Helpers related to visibility and protection of sockets and inpcb.
*/
-#include <sys/cdefs.h>
-#include "opt_inet.h"
-#include "opt_inet6.h"
-
-#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
-#include <sys/jail.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
@@ -72,3 +68,16 @@ cr_canseeinpcb(struct ucred *cred, struct inpcb *inp)
return (0);
}
+
+bool
+cr_canexport_ktlskeys(struct thread *td, struct inpcb *inp)
+{
+ int error;
+
+ if (cr_canseeinpcb(td->td_ucred, inp) == 0 &&
+ cr_xids_subset(td->td_ucred, inp->inp_cred))
+ return (true);
+ error = priv_check(td, PRIV_NETINET_KTLSKEYS);
+ return (error == 0);
+}
diff --git a/sys/netinet/in_rss.c b/sys/netinet/in_rss.c
index 698fd86dc7a5..f93a1d2bfd7b 100644
--- a/sys/netinet/in_rss.c
+++ b/sys/netinet/in_rss.c
@@ -285,7 +285,7 @@ rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval,
}
/*
* Only allow 2-tuple for TCP frames if we don't also
- * support 2-tuple for TCP.
+ * support 4-tuple for TCP.
*/
if ((rss_gethashconfig() & RSS_HASHTYPE_RSS_IPV4) &&
((rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) &&
diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h
index 2750733335bb..e2f553ec461c 100644
--- a/sys/netinet/in_systm.h
+++ b/sys/netinet/in_systm.h
@@ -32,6 +32,8 @@
#ifndef _NETINET_IN_SYSTM_H_
#define _NETINET_IN_SYSTM_H_
+#include <sys/types.h>
+
/*
* Miscellaneous internetwork
* definitions for kernel.
@@ -56,8 +58,10 @@ typedef u_int32_t n_time; /* ms since 00:00 UTC, byte rev */
#ifdef _KERNEL
struct inpcb;
struct ucred;
+struct thread;
int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp);
+bool cr_canexport_ktlskeys(struct thread *td, struct inpcb *inp);
uint32_t iptime(void);
#endif
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
index 8d205ba07cf5..6de41a7e79fa 100644
--- a/sys/netinet/ip.h
+++ b/sys/netinet/ip.h
@@ -33,7 +33,8 @@
#ifndef _NETINET_IP_H_
#define _NETINET_IP_H_
-#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <netinet/in.h>
/*
* Definitions for internet protocol version 4.
@@ -66,7 +67,7 @@ struct ip {
u_char ip_p; /* protocol */
u_short ip_sum; /* checksum */
struct in_addr ip_src,ip_dst; /* source and dest address */
-} __packed __aligned(2);
+} __packed;
#define IP_MAXPACKET 65535 /* maximum packet size */
@@ -186,7 +187,7 @@ struct ip_timestamp {
uint32_t ipt_time; /* network format */
} ipt_ta[1];
} ipt_timestamp;
-};
+} __packed;
/* Flag bits for ipt_flg. */
#define IPOPT_TS_TSONLY 0 /* timestamps only */
diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c
index 0ead7149c1e2..d3d7957cf087 100644
--- a/sys/netinet/ip_carp.c
+++ b/sys/netinet/ip_carp.c
@@ -206,8 +206,6 @@ struct carpkreq {
*
* Known issues with locking:
*
- * - Sending ad, we put the pointer to the softc in an mtag, and no reference
- * counting is done on the softc.
* - On module unload we may race (?) with packet processing thread
* dereferencing our function pointers.
*/
@@ -1688,6 +1686,7 @@ char *
carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
{
struct ifaddr *ifa;
+ char *mac = NULL;
NET_EPOCH_ASSERT();
@@ -1698,18 +1697,26 @@ carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
struct m_tag *mtag;
mtag = m_tag_get(PACKET_TAG_CARP,
- sizeof(struct carp_softc *), M_NOWAIT);
- if (mtag == NULL)
- /* Better a bit than nothing. */
- return (sc->sc_addr);
+ sizeof(sc->sc_vhid) + sizeof(sc->sc_addr),
+ M_NOWAIT);
+ if (mtag == NULL) {
+ CARPSTATS_INC(carps_onomem);
+ break;
+ }
+ /* carp_output expects sc_vhid first. */
+ bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid));
+ /*
+ * Save sc_addr into mtag data after sc_vhid to avoid
+ * possible access to destroyed softc.
+ */
+ mac = (char *)(mtag + 1) + sizeof(sc->sc_vhid);
+ bcopy(sc->sc_addr, mac, sizeof(sc->sc_addr));
- bcopy(&sc, mtag + 1, sizeof(sc));
m_tag_prepend(m, mtag);
-
- return (sc->sc_addr);
+ break;
}
- return (NULL);
+ return (mac);
}
#endif /* INET6 */
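For context, a hedged sketch of the consuming side implied by the "carp_output expects sc_vhid first" comment: the tag body is now plain data (the vhid, then the 6-byte address), so the reader copies values out instead of dereferencing a softc pointer that may already be destroyed. The m_tag_find() usage is an assumption about the consumer, not part of this hunk:

	struct m_tag *mtag;
	int vhid;
	uint8_t lladdr[6];

	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
	if (mtag != NULL) {
		bcopy(mtag + 1, &vhid, sizeof(vhid));
		bcopy((char *)(mtag + 1) + sizeof(vhid), lladdr,
		    sizeof(lladdr));
		/* both values are self-contained; no softc lifetime issue */
	}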
diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c
index 9b81760e58f3..51e7c2fbc4b0 100644
--- a/sys/netinet/ip_fastfwd.c
+++ b/sys/netinet/ip_fastfwd.c
@@ -69,6 +69,7 @@
#include <sys/cdefs.h>
#include "opt_ipstealth.h"
+#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -102,6 +103,10 @@
#include <machine/in_cksum.h>
+#if defined(SCTP) || defined(SCTP_SUPPORT)
+#include <netinet/sctp_crc32.h>
+#endif
+
#define V_ipsendredirects VNET(ipsendredirects)
static struct mbuf *
@@ -460,6 +465,23 @@ passout:
} else
gw = (const struct sockaddr *)dst;
+ /*
+ * If the TCP/UDP header still needs a valid checksum and the interface
+ * will not calculate it for us, do it here.
+ */
+ if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &
+ ~nh->nh_ifp->if_hwassist)) {
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+#if defined(SCTP) || defined(SCTP_SUPPORT)
+ if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP_SCTP &
+ ~nh->nh_ifp->if_hwassist)) {
+ sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
+ m->m_pkthdr.csum_flags &= ~CSUM_IP_SCTP;
+ }
+#endif
+
/* Handle redirect case. */
redest.s_addr = 0;
if (V_ipsendredirects && osrc.s_addr == ip->ip_src.s_addr &&
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index c440223b81f8..51e68c310915 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -167,149 +167,149 @@ typedef struct _ip_fw3_opheader {
*/
enum ipfw_opcodes { /* arguments (4 bytes each) */
- O_NOP,
-
- O_IP_SRC, /* u32 = IP */
- O_IP_SRC_MASK, /* ip = IP/mask */
- O_IP_SRC_ME, /* none */
- O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */
-
- O_IP_DST, /* u32 = IP */
- O_IP_DST_MASK, /* ip = IP/mask */
- O_IP_DST_ME, /* none */
- O_IP_DST_SET, /* u32=base, arg1=len, bitmap */
-
- O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */
- O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */
- O_PROTO, /* arg1=protocol */
-
- O_MACADDR2, /* 2 mac addr:mask */
- O_MAC_TYPE, /* same as srcport */
-
- O_LAYER2, /* none */
- O_IN, /* none */
- O_FRAG, /* none */
-
- O_RECV, /* none */
- O_XMIT, /* none */
- O_VIA, /* none */
-
- O_IPOPT, /* arg1 = 2*u8 bitmap */
- O_IPLEN, /* arg1 = len */
- O_IPID, /* arg1 = id */
-
- O_IPTOS, /* arg1 = id */
- O_IPPRECEDENCE, /* arg1 = precedence << 5 */
- O_IPTTL, /* arg1 = TTL */
-
- O_IPVER, /* arg1 = version */
- O_UID, /* u32 = id */
- O_GID, /* u32 = id */
- O_ESTAB, /* none (tcp established) */
- O_TCPFLAGS, /* arg1 = 2*u8 bitmap */
- O_TCPWIN, /* arg1 = desired win */
- O_TCPSEQ, /* u32 = desired seq. */
- O_TCPACK, /* u32 = desired seq. */
- O_ICMPTYPE, /* u32 = icmp bitmap */
- O_TCPOPTS, /* arg1 = 2*u8 bitmap */
-
- O_VERREVPATH, /* none */
- O_VERSRCREACH, /* none */
-
- O_PROBE_STATE, /* v0:arg1=kidx, v1:kidx=kidx */
- O_KEEP_STATE, /* v0:arg1=kidx, v1:kidx=kidx */
- O_LIMIT, /* ipfw_insn_limit */
- O_LIMIT_PARENT, /* dyn_type, not an opcode. */
+ O_NOP = 0,
+
+ O_IP_SRC = 1, /* u32 = IP */
+ O_IP_SRC_MASK = 2, /* ip = IP/mask */
+ O_IP_SRC_ME = 3, /* none */
+ O_IP_SRC_SET = 4, /* u32=base, arg1=len, bitmap */
+
+ O_IP_DST = 5, /* u32 = IP */
+ O_IP_DST_MASK = 6, /* ip = IP/mask */
+ O_IP_DST_ME = 7, /* none */
+ O_IP_DST_SET = 8, /* u32=base, arg1=len, bitmap */
+
+ O_IP_SRCPORT = 9, /* (n)port list:mask 4 byte ea */
+ O_IP_DSTPORT = 10, /* (n)port list:mask 4 byte ea */
+ O_PROTO = 11, /* arg1=protocol */
+
+ O_MACADDR2 = 12, /* 2 mac addr:mask */
+ O_MAC_TYPE = 13, /* same as srcport */
+
+ O_LAYER2 = 14, /* none */
+ O_IN = 15, /* none */
+ O_FRAG = 16, /* none */
+
+ O_RECV = 17, /* none */
+ O_XMIT = 18, /* none */
+ O_VIA = 19, /* none */
+
+ O_IPOPT = 20, /* arg1 = 2*u8 bitmap */
+ O_IPLEN = 21, /* arg1 = len */
+ O_IPID = 22, /* arg1 = id */
+
+ O_IPTOS = 23, /* arg1 = id */
+ O_IPPRECEDENCE = 24, /* arg1 = precedence << 5 */
+ O_IPTTL = 25, /* arg1 = TTL */
+
+ O_IPVER = 26, /* arg1 = version */
+ O_UID = 27, /* u32 = id */
+ O_GID = 28, /* u32 = id */
+ O_ESTAB = 29, /* none (tcp established) */
+ O_TCPFLAGS = 30, /* arg1 = 2*u8 bitmap */
+ O_TCPWIN = 31, /* arg1 = desired win */
+ O_TCPSEQ = 32, /* u32 = desired seq. */
+ O_TCPACK = 33, /* u32 = desired seq. */
+ O_ICMPTYPE = 34, /* u32 = icmp bitmap */
+ O_TCPOPTS = 35, /* arg1 = 2*u8 bitmap */
+
+ O_VERREVPATH = 36, /* none */
+ O_VERSRCREACH = 37, /* none */
+
+ O_PROBE_STATE = 38, /* v0:arg1=kidx, v1:kidx=kidx */
+ O_KEEP_STATE = 39, /* v0:arg1=kidx, v1:kidx=kidx */
+ O_LIMIT = 40, /* ipfw_insn_limit */
+ O_LIMIT_PARENT = 41, /* dyn_type, not an opcode. */
/*
* These are really 'actions'.
*/
- O_LOG, /* ipfw_insn_log */
- O_PROB, /* u32 = match probability */
+ O_LOG = 42, /* ipfw_insn_log */
+ O_PROB = 43, /* u32 = match probability */
- O_CHECK_STATE, /* v0:arg1=kidx, v1:kidx=kidx */
- O_ACCEPT, /* none */
- O_DENY, /* none */
- O_REJECT, /* arg1=icmp arg (same as deny) */
- O_COUNT, /* none */
- O_SKIPTO, /* v0:arg1=next rule number */
+ O_CHECK_STATE = 44, /* v0:arg1=kidx, v1:kidx=kidx */
+ O_ACCEPT = 45, /* none */
+ O_DENY = 46, /* none */
+ O_REJECT = 47, /* arg1=icmp arg (same as deny) */
+ O_COUNT = 48, /* none */
+ O_SKIPTO = 49, /* v0:arg1=next rule number */
/* v1:kidx= next rule number */
- O_PIPE, /* arg1=pipe number */
- O_QUEUE, /* arg1=queue number */
- O_DIVERT, /* arg1=port number */
- O_TEE, /* arg1=port number */
- O_FORWARD_IP, /* fwd sockaddr */
- O_FORWARD_MAC, /* fwd mac */
- O_NAT, /* nope */
- O_REASS, /* none */
+ O_PIPE = 50, /* arg1=pipe number */
+ O_QUEUE = 51, /* arg1=queue number */
+ O_DIVERT = 52, /* arg1=port number */
+ O_TEE = 53, /* arg1=port number */
+ O_FORWARD_IP = 54, /* fwd sockaddr */
+ O_FORWARD_MAC = 55, /* fwd mac */
+ O_NAT = 56, /* nope */
+ O_REASS = 57, /* none */
/*
* More opcodes.
*/
- O_IPSEC, /* has ipsec history */
- O_IP_SRC_LOOKUP, /* v0:arg1=table number, u32=value */
+ O_IPSEC = 58, /* has ipsec history */
+ O_IP_SRC_LOOKUP = 59, /* v0:arg1=table number, u32=value */
/* v1:kidx=name, u32=value, arg1=key */
- O_IP_DST_LOOKUP, /* arg1=table number, u32=value */
+ O_IP_DST_LOOKUP = 60, /* arg1=table number, u32=value */
/* v1:kidx=name, u32=value, arg1=key */
- O_ANTISPOOF, /* none */
- O_JAIL, /* u32 = id */
- O_ALTQ, /* u32 = altq classif. qid */
- O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */
- O_TCPDATALEN, /* arg1 = tcp data len */
- O_IP6_SRC, /* address without mask */
- O_IP6_SRC_ME, /* my addresses */
- O_IP6_SRC_MASK, /* address with the mask */
- O_IP6_DST,
- O_IP6_DST_ME,
- O_IP6_DST_MASK,
- O_FLOW6ID, /* for flow id tag in the ipv6 pkt */
- O_ICMP6TYPE, /* icmp6 packet type filtering */
- O_EXT_HDR, /* filtering for ipv6 extension header */
- O_IP6,
+ O_ANTISPOOF = 61, /* none */
+ O_JAIL = 62, /* u32 = id */
+ O_ALTQ = 63, /* u32 = altq classif. qid */
+ O_DIVERTED = 64, /* arg1=bitmap (1:loop, 2:out) */
+ O_TCPDATALEN = 65, /* arg1 = tcp data len */
+ O_IP6_SRC = 66, /* address without mask */
+ O_IP6_SRC_ME = 67, /* my addresses */
+ O_IP6_SRC_MASK = 68, /* address with the mask */
+ O_IP6_DST = 69,
+ O_IP6_DST_ME = 70,
+ O_IP6_DST_MASK = 71,
+ O_FLOW6ID = 72, /* for flow id tag in the ipv6 pkt */
+ O_ICMP6TYPE = 73, /* icmp6 packet type filtering */
+ O_EXT_HDR = 74, /* filtering for ipv6 extension header */
+ O_IP6 = 75,
/*
* actions for ng_ipfw
*/
- O_NETGRAPH, /* send to ng_ipfw */
- O_NGTEE, /* copy to ng_ipfw */
+ O_NETGRAPH = 76, /* send to ng_ipfw */
+ O_NGTEE = 77, /* copy to ng_ipfw */
- O_IP4,
+ O_IP4 = 78,
- O_UNREACH6, /* arg1=icmpv6 code arg (deny) */
+ O_UNREACH6 = 79, /* arg1=icmpv6 code arg (deny) */
- O_TAG, /* arg1=tag number */
- O_TAGGED, /* arg1=tag number */
+ O_TAG = 80, /* arg1=tag number */
+ O_TAGGED = 81, /* arg1=tag number */
- O_SETFIB, /* arg1=FIB number */
- O_FIB, /* arg1=FIB desired fib number */
+ O_SETFIB = 82, /* arg1=FIB number */
+ O_FIB = 83, /* arg1=FIB desired fib number */
- O_SOCKARG, /* socket argument */
+ O_SOCKARG = 84, /* socket argument */
- O_CALLRETURN, /* v0:arg1=called rule number */
+ O_CALLRETURN = 85, /* v0:arg1=called rule number */
/* v1:kidx=called rule number */
- O_FORWARD_IP6, /* fwd sockaddr_in6 */
+ O_FORWARD_IP6 = 86, /* fwd sockaddr_in6 */
- O_DSCP, /* 2 u32 = DSCP mask */
- O_SETDSCP, /* arg1=DSCP value */
- O_IP_FLOW_LOOKUP, /* v0:arg1=table number, u32=value */
+ O_DSCP = 87, /* 2 u32 = DSCP mask */
+ O_SETDSCP = 88, /* arg1=DSCP value */
+ O_IP_FLOW_LOOKUP = 89, /* v0:arg1=table number, u32=value */
/* v1:kidx=name, u32=value */
- O_EXTERNAL_ACTION, /* v0:arg1=id of external action handler */
+ O_EXTERNAL_ACTION = 90, /* v0:arg1=id of external action handler */
/* v1:kidx=id of external action handler */
- O_EXTERNAL_INSTANCE, /* v0:arg1=id of eaction handler instance */
+ O_EXTERNAL_INSTANCE = 91, /* v0:arg1=id of eaction handler instance */
/* v1:kidx=id of eaction handler instance */
- O_EXTERNAL_DATA, /* variable length data */
+ O_EXTERNAL_DATA = 92, /* variable length data */
- O_SKIP_ACTION, /* none */
- O_TCPMSS, /* arg1=MSS value */
+ O_SKIP_ACTION = 93, /* none */
+ O_TCPMSS = 94, /* arg1=MSS value */
- O_MAC_SRC_LOOKUP, /* kidx=name, u32=value, arg1=key */
- O_MAC_DST_LOOKUP, /* kidx=name, u32=value, arg1=key */
+ O_MAC_SRC_LOOKUP = 95, /* kidx=name, u32=value, arg1=key */
+ O_MAC_DST_LOOKUP = 96, /* kidx=name, u32=value, arg1=key */
- O_SETMARK, /* u32 = value */
- O_MARK, /* 2 u32 = value, bitmask */
+ O_SETMARK = 97, /* u32 = value */
+ O_MARK = 98, /* 2 u32 = value, bitmask */
O_LAST_OPCODE /* not an opcode! */
};
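Pinning explicit values makes the opcode numbering part of the visible contract: rules compiled by userland ipfw(8) embed these numbers, so an accidental mid-enum insertion must now show up in review as a renumbering diff. A hedged sketch of the kind of compile-time guard that could accompany the enum (not part of this change):

_Static_assert(O_NOP == 0 && O_LIMIT_PARENT == 41 && O_REASS == 57 &&
    O_MARK == 98 && O_LAST_OPCODE == 99,
    "ipfw opcode values are ABI; renumbering breaks compiled rules");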
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index 17d15d7d9629..fc0848b2c944 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -391,7 +391,6 @@ stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) -
nip->ip_hl = 5;
nip->ip_p = IPPROTO_ICMP;
nip->ip_tos = 0;
- nip->ip_off = 0;
if (V_error_keeptags)
m_tag_copy_chain(m, n, M_NOWAIT);
@@ -872,6 +871,8 @@ match:
mac_netinet_icmp_replyinplace(m);
#endif
ip->ip_src = t;
+ /* ip->ip_tos will be reflected. */
+ ip->ip_off = htons(0);
ip->ip_ttl = V_ip_defttl;
if (optlen > 0) {
@@ -1090,15 +1091,14 @@ ip_next_mtu(int mtu, int dir)
* the 'final' error, but it doesn't make sense to solve the printing
* delay with more complex code.
*/
-VNET_DEFINE_STATIC(struct counter_rate, icmp_rates[BANDLIM_MAX]);
+VNET_DEFINE_STATIC(struct counter_rate *, icmp_rates[BANDLIM_MAX]);
#define V_icmp_rates VNET(icmp_rates)
static const char *icmp_rate_descrs[BANDLIM_MAX] = {
[BANDLIM_ICMP_UNREACH] = "icmp unreach",
[BANDLIM_ICMP_ECHO] = "icmp ping",
[BANDLIM_ICMP_TSTAMP] = "icmp tstamp",
- [BANDLIM_RST_CLOSEDPORT] = "closed port RST",
- [BANDLIM_RST_OPENPORT] = "open port RST",
+ [BANDLIM_TCP_RST] = "tcp reset",
[BANDLIM_ICMP6_UNREACH] = "icmp6 unreach",
[BANDLIM_SCTP_OOTB] = "sctp ootb",
};
@@ -1158,8 +1158,7 @@ icmp_bandlimit_init(void)
{
for (int i = 0; i < BANDLIM_MAX; i++) {
- V_icmp_rates[i].cr_rate = counter_u64_alloc(M_WAITOK);
- V_icmp_rates[i].cr_ticks = ticks;
+ V_icmp_rates[i] = counter_rate_alloc(M_WAITOK, 1);
icmplim_new_jitter(i);
}
}
@@ -1172,7 +1171,7 @@ icmp_bandlimit_uninit(void)
{
for (int i = 0; i < BANDLIM_MAX; i++)
- counter_u64_free(V_icmp_rates[i].cr_rate);
+ counter_rate_free(V_icmp_rates[i]);
}
VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
icmp_bandlimit_uninit, NULL);
@@ -1183,13 +1182,13 @@ badport_bandlim(int which)
{
int64_t pps;
- if (V_icmplim == 0 || which == BANDLIM_UNLIMITED)
+ if (V_icmplim == 0)
return (0);
KASSERT(which >= 0 && which < BANDLIM_MAX,
("%s: which %d", __func__, which));
- pps = counter_ratecheck(&V_icmp_rates[which], V_icmplim +
+ pps = counter_ratecheck(V_icmp_rates[which], V_icmplim +
V_icmplim_curr_jitter[which]);
if (pps > 0) {
if (V_icmplim_output)
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
index 18ca5861a40e..f782ebc53eb0 100644
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -47,7 +47,7 @@ struct ipovly {
u_short ih_len; /* protocol length */
struct in_addr ih_src; /* source internet address */
struct in_addr ih_dst; /* destination internet address */
-};
+} __packed;
#ifdef _KERNEL
/*
diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c
index 7858e4d2b9f3..6758813f6a21 100644
--- a/sys/netinet/libalias/alias.c
+++ b/sys/netinet/libalias/alias.c
@@ -290,13 +290,14 @@ IcmpAliasIn1(struct libalias *la, struct ip *pip)
{
struct alias_link *lnk;
struct icmp *ic;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
ic = (struct icmp *)ip_next(pip);
/* Get source address from ICMP data field and restore original data */
- lnk = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1);
- if (lnk != NULL) {
+ ret = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1, &lnk);
+ if (ret == PKT_ALIAS_OK) {
u_short original_id;
int accumulate;
@@ -319,10 +320,8 @@ IcmpAliasIn1(struct libalias *la, struct ip *pip)
&original_address, &pip->ip_dst, 2);
pip->ip_dst = original_address;
}
-
- return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
+ return (ret);
}
/*
@@ -337,6 +336,7 @@ IcmpAliasIn2(struct libalias *la, struct ip *pip)
struct udphdr *ud;
struct tcphdr *tc;
struct alias_link *lnk;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
ic = (struct icmp *)ip_next(pip);
@@ -346,18 +346,26 @@ IcmpAliasIn2(struct libalias *la, struct ip *pip)
tc = (struct tcphdr *)ip_next(ip);
ic2 = (struct icmp *)ip_next(ip);
- if (ip->ip_p == IPPROTO_UDP)
- lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src,
+ if (ip->ip_p == IPPROTO_UDP) {
+ ret = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src,
ud->uh_dport, ud->uh_sport,
- IPPROTO_UDP, 0);
- else if (ip->ip_p == IPPROTO_TCP)
- lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src,
+ IPPROTO_UDP, 0, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ } else if (ip->ip_p == IPPROTO_TCP) {
+ ret = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src,
tc->th_dport, tc->th_sport,
- IPPROTO_TCP, 0);
- else if (ip->ip_p == IPPROTO_ICMP) {
- if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP)
- lnk = FindIcmpIn(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0);
- else
+ IPPROTO_TCP, 0, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ } else if (ip->ip_p == IPPROTO_ICMP) {
+ if (ic2->icmp_type == ICMP_ECHO ||
+ ic2->icmp_type == ICMP_TSTAMP) {
+ ret = FindIcmpIn(la, ip->ip_dst, ip->ip_src,
+ ic2->icmp_id, 0, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ } else
lnk = NULL;
} else
lnk = NULL;
@@ -479,13 +487,15 @@ IcmpAliasOut1(struct libalias *la, struct ip *pip, int create)
{
struct alias_link *lnk;
struct icmp *ic;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
ic = (struct icmp *)ip_next(pip);
/* Save overwritten data for when echo packet returns */
- lnk = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create);
- if (lnk != NULL) {
+ ret = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create,
+ &lnk);
+ if (ret == PKT_ALIAS_OK) {
u_short alias_id;
int accumulate;
@@ -508,10 +518,8 @@ IcmpAliasOut1(struct libalias *la, struct ip *pip, int create)
&alias_address, &pip->ip_src, 2);
pip->ip_src = alias_address;
}
-
- return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
+ return (ret);
}
/*
@@ -526,6 +534,7 @@ IcmpAliasOut2(struct libalias *la, struct ip *pip)
struct udphdr *ud;
struct tcphdr *tc;
struct alias_link *lnk;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
ic = (struct icmp *)ip_next(pip);
@@ -535,18 +544,26 @@ IcmpAliasOut2(struct libalias *la, struct ip *pip)
tc = (struct tcphdr *)ip_next(ip);
ic2 = (struct icmp *)ip_next(ip);
- if (ip->ip_p == IPPROTO_UDP)
- lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src,
+ if (ip->ip_p == IPPROTO_UDP) {
+ ret = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src,
ud->uh_dport, ud->uh_sport,
- IPPROTO_UDP, 0);
- else if (ip->ip_p == IPPROTO_TCP)
- lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src,
+ IPPROTO_UDP, 0, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ } else if (ip->ip_p == IPPROTO_TCP) {
+ ret = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src,
tc->th_dport, tc->th_sport,
- IPPROTO_TCP, 0);
- else if (ip->ip_p == IPPROTO_ICMP) {
- if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP)
- lnk = FindIcmpOut(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0);
- else
+ IPPROTO_TCP, 0, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ } else if (ip->ip_p == IPPROTO_ICMP) {
+ if (ic2->icmp_type == ICMP_ECHO ||
+ ic2->icmp_type == ICMP_TSTAMP) {
+ ret = FindIcmpOut(la, ip->ip_dst, ip->ip_src,
+ ic2->icmp_id, 0, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ } else
lnk = NULL;
} else
lnk = NULL;
@@ -661,14 +678,15 @@ ProtoAliasIn(struct libalias *la, struct in_addr ip_src,
struct ip *pip, u_char ip_p, u_short *ip_sum)
{
struct alias_link *lnk;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
/* Return if proxy-only mode is enabled */
if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)
return (PKT_ALIAS_OK);
- lnk = FindProtoIn(la, ip_src, pip->ip_dst, ip_p);
- if (lnk != NULL) {
+ ret = FindProtoIn(la, ip_src, pip->ip_dst, ip_p, &lnk);
+ if (ret == PKT_ALIAS_OK) {
struct in_addr original_address;
original_address = GetOriginalAddress(lnk);
@@ -677,10 +695,8 @@ ProtoAliasIn(struct libalias *la, struct in_addr ip_src,
DifferentialChecksum(ip_sum,
&original_address, &pip->ip_dst, 2);
pip->ip_dst = original_address;
-
- return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
+ return (ret);
}
/*
@@ -693,6 +709,7 @@ ProtoAliasOut(struct libalias *la, struct ip *pip,
struct in_addr ip_dst, u_char ip_p, u_short *ip_sum, int create)
{
struct alias_link *lnk;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
@@ -703,8 +720,8 @@ ProtoAliasOut(struct libalias *la, struct ip *pip,
if (!create)
return (PKT_ALIAS_IGNORED);
- lnk = FindProtoOut(la, pip->ip_src, ip_dst, ip_p);
- if (lnk != NULL) {
+ ret = FindProtoOut(la, pip->ip_src, ip_dst, ip_p, &lnk);
+ if (ret == PKT_ALIAS_OK) {
struct in_addr alias_address;
alias_address = GetAliasAddress(lnk);
@@ -713,10 +730,8 @@ ProtoAliasOut(struct libalias *la, struct ip *pip,
DifferentialChecksum(ip_sum,
&alias_address, &pip->ip_src, 2);
pip->ip_src = alias_address;
-
- return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
+ return (ret);
}
#define MF_ISSET(_pip) (ntohs((_pip)->ip_off) & IP_MF)
@@ -745,6 +760,7 @@ UdpAliasIn(struct libalias *la, struct ip *pip)
{
struct udphdr *ud;
struct alias_link *lnk;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
@@ -752,10 +768,12 @@ UdpAliasIn(struct libalias *la, struct ip *pip)
if (ud == NULL)
return (PKT_ALIAS_IGNORED);
- lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
+ ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
ud->uh_sport, ud->uh_dport,
- IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY));
- if (lnk != NULL) {
+ IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY), &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ {
struct in_addr alias_address;
struct in_addr original_address;
struct in_addr proxy_address;
@@ -828,7 +846,6 @@ UdpAliasIn(struct libalias *la, struct ip *pip)
return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
}
static int
@@ -840,7 +857,7 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
struct in_addr proxy_server_address;
u_short dest_port;
u_short proxy_server_port;
- int proxy_type;
+ int proxy_type, ret;
LIBALIAS_LOCK_ASSERT(la);
@@ -877,10 +894,12 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
pip->ip_dst = proxy_server_address;
ud->uh_dport = proxy_server_port;
}
- lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
+ ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
ud->uh_sport, ud->uh_dport,
- IPPROTO_UDP, create);
- if (lnk != NULL) {
+ IPPROTO_UDP, create, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ {
u_short alias_port;
struct in_addr alias_address;
struct alias_data ad = {
@@ -930,7 +949,6 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
}
static int
@@ -939,6 +957,7 @@ TcpAliasIn(struct libalias *la, struct ip *pip)
struct tcphdr *tc;
struct alias_link *lnk;
size_t dlen;
+ int ret;
LIBALIAS_LOCK_ASSERT(la);
@@ -947,11 +966,12 @@ TcpAliasIn(struct libalias *la, struct ip *pip)
return (PKT_ALIAS_IGNORED);
tc = (struct tcphdr *)ip_next(pip);
- lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
+ ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
tc->th_sport, tc->th_dport,
IPPROTO_TCP,
- !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY));
- if (lnk != NULL) {
+ !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY),
+ &lnk);
+ if (ret == PKT_ALIAS_OK) {
struct in_addr alias_address;
struct in_addr original_address;
struct in_addr proxy_address;
@@ -1057,13 +1077,13 @@ TcpAliasIn(struct libalias *la, struct ip *pip)
return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
+ return (ret);
}
static int
TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
{
- int proxy_type;
+ int proxy_type, ret;
u_short dest_port;
u_short proxy_server_port;
size_t dlen;
@@ -1108,12 +1128,12 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
accumulate -= twowords(&pip->ip_dst);
ADJUST_CHECKSUM(accumulate, pip->ip_sum);
}
- lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
+ ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst,
tc->th_sport, tc->th_dport,
- IPPROTO_TCP, create);
- if (lnk == NULL)
- return (PKT_ALIAS_IGNORED);
- if (lnk != NULL) {
+ IPPROTO_TCP, create, &lnk);
+ if (ret != PKT_ALIAS_OK)
+ return (ret);
+ {
u_short alias_port;
struct in_addr alias_address;
int accumulate;
@@ -1177,7 +1197,6 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create)
return (PKT_ALIAS_OK);
}
- return (PKT_ALIAS_IGNORED);
}
/* Fragment Handling
@@ -1581,17 +1600,24 @@ LibAliasUnaliasOut(struct libalias *la,
ic = (struct icmp *)ip_next(pip);
/* Find a link */
- if (pip->ip_p == IPPROTO_UDP)
- lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src,
+ if (pip->ip_p == IPPROTO_UDP) {
+ iresult = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src,
ud->uh_dport, ud->uh_sport,
- IPPROTO_UDP, 0);
- else if (pip->ip_p == IPPROTO_TCP)
- lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src,
+ IPPROTO_UDP, 0, &lnk);
+ if (iresult != PKT_ALIAS_OK)
+ goto getout;
+ } else if (pip->ip_p == IPPROTO_TCP) {
+ iresult = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src,
tc->th_dport, tc->th_sport,
- IPPROTO_TCP, 0);
- else if (pip->ip_p == IPPROTO_ICMP)
- lnk = FindIcmpIn(la, pip->ip_dst, pip->ip_src, ic->icmp_id, 0);
- else
+ IPPROTO_TCP, 0, &lnk);
+ if (iresult != PKT_ALIAS_OK)
+ goto getout;
+ } else if (pip->ip_p == IPPROTO_ICMP) {
+ iresult = FindIcmpIn(la, pip->ip_dst, pip->ip_src,
+ ic->icmp_id, 0, &lnk);
+ if (iresult != PKT_ALIAS_OK)
+ goto getout;
+ } else
lnk = NULL;
/* Change it from an aliased packet to an unaliased packet */
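For context, a minimal sketch (not part of the change) of the calling convention these conversions introduce: the lookup routines now report status through the return value and hand the link back through an out-parameter, so an AddLink() allocation failure (PKT_ALIAS_ERROR) is distinguishable from a plain lookup miss (PKT_ALIAS_IGNORED). The caller below is hypothetical:
	static int
	ExampleUdpIn(struct libalias *la, struct ip *pip)
	{
		struct udphdr *ud = (struct udphdr *)ip_next(pip);
		struct alias_link *lnk;
		int ret;

		ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
		    ud->uh_sport, ud->uh_dport, IPPROTO_UDP, 1, &lnk);
		if (ret != PKT_ALIAS_OK)
			return (ret);	/* PKT_ALIAS_IGNORED or PKT_ALIAS_ERROR */
		/* lnk is guaranteed non-NULL here; rewrite the packet. */
		return (PKT_ALIAS_OK);
	}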
diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c
index b09e41935d93..c143d74a2f45 100644
--- a/sys/netinet/libalias/alias_db.c
+++ b/sys/netinet/libalias/alias_db.c
@@ -28,13 +28,13 @@
#include <sys/cdefs.h>
#ifdef _KERNEL
-#include <machine/stdarg.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/rwlock.h>
+#include <sys/stdarg.h>
#include <sys/syslog.h>
#else
#include <stdarg.h>
@@ -1049,15 +1049,19 @@ FindLinkByInternalEndpoint(struct libalias *la, struct in_addr src_addr,
(prototypes in alias_local.h)
*/
-struct alias_link *
+int
FindIcmpIn(struct libalias *la, struct in_addr dst_addr,
struct in_addr alias_addr,
u_short id_alias,
- int create)
+ int create,
+ struct alias_link **lnkp)
{
struct alias_link *lnk;
LIBALIAS_LOCK_ASSERT(la);
+
+ *lnkp = NULL;
+
lnk = FindLinkIn(la, dst_addr, alias_addr,
NO_DEST_PORT, id_alias,
LINK_ICMP, 0);
@@ -1068,19 +1072,26 @@ FindIcmpIn(struct libalias *la, struct in_addr dst_addr,
lnk = AddLink(la, target_addr, dst_addr, alias_addr,
id_alias, NO_DEST_PORT, id_alias,
LINK_ICMP);
+ if (lnk == NULL)
+ return (PKT_ALIAS_ERROR);
}
- return (lnk);
+ *lnkp = lnk;
+ return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED);
}
-struct alias_link *
+int
FindIcmpOut(struct libalias *la, struct in_addr src_addr,
struct in_addr dst_addr,
u_short id,
- int create)
+ int create,
+ struct alias_link **lnkp)
{
struct alias_link *lnk;
LIBALIAS_LOCK_ASSERT(la);
+
+ *lnkp = NULL;
+
lnk = FindLinkOut(la, src_addr, dst_addr,
id, NO_DEST_PORT,
LINK_ICMP, 0);
@@ -1091,8 +1102,11 @@ FindIcmpOut(struct libalias *la, struct in_addr src_addr,
lnk = AddLink(la, src_addr, dst_addr, alias_addr,
id, NO_DEST_PORT, GET_ALIAS_ID,
LINK_ICMP);
+ if (lnk == NULL)
+ return (PKT_ALIAS_ERROR);
}
- return (lnk);
+ *lnkp = lnk;
+ return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED);
}
struct alias_link *
@@ -1146,18 +1160,21 @@ FindFragmentPtr(struct libalias *la, struct in_addr dst_addr,
LINK_FRAGMENT_PTR, 0);
}
-struct alias_link *
+int
FindProtoIn(struct libalias *la, struct in_addr dst_addr,
struct in_addr alias_addr,
- u_char proto)
+ u_char proto,
+ struct alias_link **lnkp)
{
struct alias_link *lnk;
LIBALIAS_LOCK_ASSERT(la);
+
+ *lnkp = NULL;
+
lnk = FindLinkIn(la, dst_addr, alias_addr,
NO_DEST_PORT, 0,
proto, 1);
-
if (lnk == NULL && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) {
struct in_addr target_addr;
@@ -1165,22 +1182,28 @@ FindProtoIn(struct libalias *la, struct in_addr dst_addr,
lnk = AddLink(la, target_addr, dst_addr, alias_addr,
NO_SRC_PORT, NO_DEST_PORT, 0,
proto);
+ if (lnk == NULL)
+ return (PKT_ALIAS_ERROR);
}
- return (lnk);
+ *lnkp = lnk;
+ return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED);
}
-struct alias_link *
+int
FindProtoOut(struct libalias *la, struct in_addr src_addr,
struct in_addr dst_addr,
- u_char proto)
+ u_char proto,
+ struct alias_link **lnkp)
{
struct alias_link *lnk;
LIBALIAS_LOCK_ASSERT(la);
+
+ *lnkp = NULL;
+
lnk = FindLinkOut(la, src_addr, dst_addr,
NO_SRC_PORT, NO_DEST_PORT,
proto, 1);
-
if (lnk == NULL) {
struct in_addr alias_addr;
@@ -1188,22 +1211,29 @@ FindProtoOut(struct libalias *la, struct in_addr src_addr,
lnk = AddLink(la, src_addr, dst_addr, alias_addr,
NO_SRC_PORT, NO_DEST_PORT, 0,
proto);
+ if (lnk == NULL)
+ return (PKT_ALIAS_ERROR);
}
- return (lnk);
+ *lnkp = lnk;
+ return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED);
}
-struct alias_link *
+int
FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr,
struct in_addr alias_addr,
u_short dst_port,
u_short alias_port,
u_char proto,
- int create)
+ int create,
+ struct alias_link **lnkp)
{
int link_type;
struct alias_link *lnk;
LIBALIAS_LOCK_ASSERT(la);
+
+ *lnkp = NULL;
+
switch (proto) {
case IPPROTO_UDP:
link_type = LINK_UDP;
@@ -1212,8 +1242,7 @@ FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr,
link_type = LINK_TCP;
break;
default:
- return (NULL);
- break;
+ return (PKT_ALIAS_IGNORED);
}
lnk = FindLinkIn(la, dst_addr, alias_addr,
@@ -1227,22 +1256,30 @@ FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr,
lnk = AddLink(la, target_addr, dst_addr, alias_addr,
alias_port, dst_port, alias_port,
link_type);
+ if (lnk == NULL)
+ return (PKT_ALIAS_ERROR);
+
}
- return (lnk);
+ *lnkp = lnk;
+ return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED);
}
-struct alias_link *
+int
FindUdpTcpOut(struct libalias *la, struct in_addr src_addr,
struct in_addr dst_addr,
u_short src_port,
u_short dst_port,
u_char proto,
- int create)
+ int create,
+ struct alias_link **lnkp)
{
int link_type;
struct alias_link *lnk;
LIBALIAS_LOCK_ASSERT(la);
+
+ *lnkp = NULL;
+
switch (proto) {
case IPPROTO_UDP:
link_type = LINK_UDP;
@@ -1251,12 +1288,10 @@ FindUdpTcpOut(struct libalias *la, struct in_addr src_addr,
link_type = LINK_TCP;
break;
default:
- return (NULL);
- break;
+ return (PKT_ALIAS_IGNORED);
}
lnk = FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, create);
-
if (lnk == NULL && create) {
struct in_addr alias_addr;
@@ -1264,8 +1299,11 @@ FindUdpTcpOut(struct libalias *la, struct in_addr src_addr,
lnk = AddLink(la, src_addr, dst_addr, alias_addr,
src_port, dst_port, GET_ALIAS_PORT,
link_type);
+ if (lnk == NULL)
+ return (PKT_ALIAS_ERROR);
}
- return (lnk);
+ *lnkp = lnk;
+ return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED);
}
struct alias_link *
diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c
index e063a67c2902..30cee74fff21 100644
--- a/sys/netinet/libalias/alias_irc.c
+++ b/sys/netinet/libalias/alias_irc.c
@@ -360,9 +360,9 @@ AliasHandleIrcOut(struct libalias *la,
* matter, and this would probably allow it through
* at least _some_ firewalls.
*/
- dcc_lnk = FindUdpTcpOut(la, true_addr, destaddr,
+ (void)FindUdpTcpOut(la, true_addr, destaddr,
true_port, 0,
- IPPROTO_TCP, 1);
+ IPPROTO_TCP, 1, &dcc_lnk);
DBprintf(("Got a DCC link\n"));
if (dcc_lnk) {
struct in_addr alias_address; /* Address from aliasing */
diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h
index ef6c89e675d6..7c1dcb0c8eb0 100644
--- a/sys/netinet/libalias/alias_local.h
+++ b/sys/netinet/libalias/alias_local.h
@@ -239,12 +239,12 @@ struct alias_link *
AddLink(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr,
struct in_addr alias_addr, u_short src_port, u_short dst_port,
int alias_param, int link_type);
-struct alias_link *
+int
FindIcmpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
- u_short _id_alias, int _create);
-struct alias_link *
+ u_short _id_alias, int _create, struct alias_link **_lnkp);
+int
FindIcmpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
- u_short _id, int _create);
+ u_short _id, int _create, struct alias_link **_lnkp);
struct alias_link *
FindFragmentIn1(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
u_short _ip_id);
@@ -255,18 +255,20 @@ struct alias_link *
AddFragmentPtrLink(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id);
struct alias_link *
FindFragmentPtr(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id);
-struct alias_link *
+int
FindProtoIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
- u_char _proto);
-struct alias_link *
+ u_char _proto, struct alias_link **_lnkp);
+int
FindProtoOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
- u_char _proto);
-struct alias_link *
+ u_char _proto, struct alias_link **_lnkp);
+int
FindUdpTcpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr,
- u_short _dst_port, u_short _alias_port, u_char _proto, int _create);
-struct alias_link *
+ u_short _dst_port, u_short _alias_port, u_char _proto, int _create,
+ struct alias_link **_lnkp);
+int
FindUdpTcpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
- u_short _src_port, u_short _dst_port, u_char _proto, int _create);
+ u_short _src_port, u_short _dst_port, u_char _proto, int _create,
+ struct alias_link **_lnkp);
struct alias_link *
AddPptp(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr,
struct in_addr _alias_addr, u_int16_t _src_call_id);
diff --git a/sys/netinet/libalias/alias_sctp.c b/sys/netinet/libalias/alias_sctp.c
index 6781c33f5edb..5ccf31697b42 100644
--- a/sys/netinet/libalias/alias_sctp.c
+++ b/sys/netinet/libalias/alias_sctp.c
@@ -72,12 +72,12 @@
#ifdef _KERNEL
-#include <machine/stdarg.h>
#include <sys/param.h>
#include <sys/gsb_crc32.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
+#include <sys/stdarg.h>
#include <sys/syslog.h>
#include <netinet/libalias/alias_sctp.h>
#include <netinet/libalias/alias.h>
diff --git a/sys/netinet/libalias/alias_skinny.c b/sys/netinet/libalias/alias_skinny.c
index d12046d7953f..fd9e15d3ad40 100644
--- a/sys/netinet/libalias/alias_skinny.c
+++ b/sys/netinet/libalias/alias_skinny.c
@@ -279,9 +279,9 @@ alias_skinny_opnrcvch_ack(struct libalias *la, struct OpenReceiveChannelAck *opn
*localIpAddr = (u_int32_t)opnrcvch_ack->ipAddr;
null_addr.s_addr = INADDR_ANY;
- opnrcv_lnk = FindUdpTcpOut(la, pip->ip_src, null_addr,
+ (void)FindUdpTcpOut(la, pip->ip_src, null_addr,
htons((u_short) opnrcvch_ack->port), 0,
- IPPROTO_UDP, 1);
+ IPPROTO_UDP, 1, &opnrcv_lnk);
opnrcvch_ack->ipAddr = (u_int32_t)GetAliasAddress(opnrcv_lnk).s_addr;
opnrcvch_ack->port = (u_int32_t)ntohs(GetAliasPort(opnrcv_lnk));
diff --git a/sys/netinet/libalias/alias_smedia.c b/sys/netinet/libalias/alias_smedia.c
index 1c4ee0970a53..badd75a45c61 100644
--- a/sys/netinet/libalias/alias_smedia.c
+++ b/sys/netinet/libalias/alias_smedia.c
@@ -435,8 +435,8 @@ alias_pna_out(struct libalias *la, struct ip *pip,
if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) {
memcpy(&port, work, 2);
- pna_links = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk),
- port, 0, IPPROTO_UDP, 1);
+ (void)FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk),
+ port, 0, IPPROTO_UDP, 1, &pna_links);
if (pna_links != NULL) {
#ifndef NO_FW_PUNCH
/* Punch hole in firewall */
diff --git a/sys/netinet/pim.h b/sys/netinet/pim.h
index 98230fc6ae2d..4744ffc7e9d8 100644
--- a/sys/netinet/pim.h
+++ b/sys/netinet/pim.h
@@ -71,7 +71,7 @@ struct pim {
#endif /* ! _PIM_VT */
uint8_t pim_reserved; /* Reserved */
uint16_t pim_cksum; /* IP-style checksum */
-};
+} __packed;
/* KAME-related name backward compatibility */
#define pim_ver pim_vers
#define pim_rsv pim_reserved
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 7b6104da5402..66070faf97e9 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -49,6 +49,7 @@
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/stdarg.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
@@ -74,7 +75,6 @@
#include <netipsec/ipsec_support.h>
-#include <machine/stdarg.h>
#include <security/mac/mac_framework.h>
extern ipproto_input_t *ip_protox[];
diff --git a/sys/netinet/sctp_bsd_addr.c b/sys/netinet/sctp_bsd_addr.c
index a10fbcc5ca40..ac715d8298ec 100644
--- a/sys/netinet/sctp_bsd_addr.c
+++ b/sys/netinet/sctp_bsd_addr.c
@@ -117,25 +117,26 @@ sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa)
{
struct in6_ifaddr *ifa6;
+ KASSERT(ifa->address.sa.sa_family == AF_INET6,
+ ("sctp_gather_internal_ifa_flags() called with address family %u",
+ ifa->address.sa.sa_family));
ifa6 = (struct in6_ifaddr *)ifa->ifa;
ifa->flags = ifa6->ia6_flags;
- if (!MODULE_GLOBAL(ip6_use_deprecated)) {
- if (ifa->flags &
- IN6_IFF_DEPRECATED) {
+ if (MODULE_GLOBAL(ip6_use_deprecated)) {
+ ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
+ } else {
+ if (ifa->flags & IN6_IFF_DEPRECATED) {
ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
} else {
ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
}
- } else {
- ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
}
- if (ifa->flags &
- (IN6_IFF_DETACHED |
- IN6_IFF_ANYCAST |
- IN6_IFF_NOTREADY)) {
+ if (ifa->flags & (IN6_IFF_DETACHED | IN6_IFF_DUPLICATED)) {
+ ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
+ }
+ /* Right now, we do not support IPv6 anycast addresses */
+ if (ifa->flags & IN6_IFF_ANYCAST) {
ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
- } else {
- ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE;
}
}
#endif /* INET6 */
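Condensed into a single predicate, the usability rules above amount to the following sketch (helper name hypothetical; logic restated from the change):
	/* An IPv6 ifa is usable for SCTP unless it is deprecated (while
	 * net.inet6.ip6.use_deprecated is off), detached, duplicated,
	 * or anycast. */
	static bool
	sctp_ifa6_usable(uint32_t ia6_flags, bool use_deprecated)
	{
		if ((ia6_flags & IN6_IFF_DEPRECATED) != 0 && !use_deprecated)
			return (false);
		return ((ia6_flags & (IN6_IFF_DETACHED | IN6_IFF_DUPLICATED |
		    IN6_IFF_ANYCAST)) == 0);
	}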
diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c
index dc31ffbc2161..5f637cc63df5 100644
--- a/sys/netinet/sctp_input.c
+++ b/sys/netinet/sctp_input.c
@@ -5780,7 +5780,11 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
goto out;
}
ecn_bits = ip->ip_tos;
- if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) {
+ if (m->m_pkthdr.csum_flags & (CSUM_SCTP_VALID | CSUM_IP_SCTP)) {
+ /*
+ * Packets with CSUM_IP_SCTP set were sent from the local host
+ * using checksum offloading. Checksum not required.
+ */
SCTP_STAT_INCR(sctps_recvhwcrc);
compute_crc = 0;
} else {
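For illustration, a sketch of how the transmit side requests CRC offload (an assumption about the send path, not part of this diff); when such a packet is delivered locally instead of reaching a NIC, the flag survives and the check above accepts it without software verification:
	/* Mark an outbound SCTP packet for CRC32c offload (sketch). */
	m->m_pkthdr.csum_flags = CSUM_IP_SCTP;
	m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum);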
diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c
index 1a8a514fbf5f..2092f20e3c22 100644
--- a/sys/netinet/sctp_pcb.c
+++ b/sys/netinet/sctp_pcb.c
@@ -453,6 +453,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
SCTPDBG(SCTP_DEBUG_PCB4,
"Clearing deleted ifa flag\n");
sctp_ifap->localifa_flags = SCTP_ADDR_VALID;
+#ifdef INET6
+ if (sctp_ifap->address.sa.sa_family == AF_INET6) {
+ sctp_gather_internal_ifa_flags(sctp_ifap);
+ }
+#endif
sctp_ifap->ifn_p = sctp_ifnp;
atomic_add_int(&sctp_ifap->ifn_p->refcount, 1);
}
@@ -475,6 +480,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
} else {
/* Repair ifn_p, which was NULL... */
sctp_ifap->localifa_flags = SCTP_ADDR_VALID;
+#ifdef INET6
+ if (sctp_ifap->address.sa.sa_family == AF_INET6) {
+ sctp_gather_internal_ifa_flags(sctp_ifap);
+ }
+#endif
SCTPDBG(SCTP_DEBUG_PCB4,
"Repairing ifn %p for ifa %p\n",
(void *)sctp_ifnp, (void *)sctp_ifap);
@@ -500,6 +510,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
memcpy(&sctp_ifap->address, addr, addr->sa_len);
sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE;
sctp_ifap->flags = ifa_flags;
+#ifdef INET6
+ if (addr->sa_family == AF_INET6) {
+ sctp_gather_internal_ifa_flags(sctp_ifap);
+ }
+#endif
/* Set scope */
switch (sctp_ifap->address.sa.sa_family) {
#ifdef INET
@@ -635,7 +650,7 @@ sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr,
}
}
SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap);
- sctp_ifap->localifa_flags &= SCTP_ADDR_VALID;
+ sctp_ifap->localifa_flags &= ~SCTP_ADDR_VALID;
/*
* We don't set the flag. This means that the structure will
* hang around in EP's that have bound specific to it until
@@ -3050,7 +3065,7 @@ continue_anyway:
/* GAK, more FIXME IFA lock? */
if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
/* Can't bind a non-existent addr. */
- error = EINVAL;
+ error = EADDRNOTAVAIL;
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
goto out;
}
diff --git a/sys/netinet/sctp_sysctl.c b/sys/netinet/sctp_sysctl.c
index a39429ec046e..bd2f23f40727 100644
--- a/sys/netinet/sctp_sysctl.c
+++ b/sys/netinet/sctp_sysctl.c
@@ -265,6 +265,10 @@ sctp_sysctl_copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *st
if (sctp_is_addr_restricted(stcb, sctp_ifa)) {
continue;
}
+ } else {
+ if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
+ continue;
+ }
}
switch (sctp_ifa->address.sa.sa_family) {
#ifdef INET
diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c
index 66af716eea52..7d8cb965ab09 100644
--- a/sys/netinet/sctp_timer.c
+++ b/sys/netinet/sctp_timer.c
@@ -35,7 +35,6 @@
#define _IP_VHL
#include <netinet/sctp_os.h>
#include <netinet/sctp_pcb.h>
-
#include <netinet/sctp_var.h>
#include <netinet/sctp_sysctl.h>
#include <netinet/sctp_timer.h>
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 94d41ff67836..41a49b318cd5 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -77,7 +77,7 @@ struct tcphdr {
u_short th_win; /* window */
u_short th_sum; /* checksum */
u_short th_urp; /* urgent pointer */
-};
+} __packed;
static __inline uint16_t
__tcp_get_flags(const struct tcphdr *th)
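As an illustration of why __packed matters here, a compile-time check of the wire-format size (a sketch, not an assertion taken from the tree):
	#include <sys/types.h>
	#include <netinet/tcp.h>

	/* The fixed TCP header is 20 bytes on the wire; __packed keeps
	 * the struct free of compiler-inserted padding on any ABI. */
	_Static_assert(sizeof(struct tcphdr) == 20,
	    "struct tcphdr must match the TCP wire format");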
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 5b39c94e0e58..b77ebc928809 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -170,6 +170,50 @@
#define NUM_OF_HPTSI_SLOTS 102400
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define DEFAULT_CONNECTION_THRESHOLD 100
+
+/*
+ * When using the hpts, a TCP stack must make sure
+ * that once an INP_DROPPED flag is applied to an INP,
+ * it does not expect tcp_output() to ever be
+ * called by the hpts. The hpts will *not* call
+ * any output (or input) functions on a TCB that
+ * is in the DROPPED state.
+ *
+ * This implies final ACK's and RST's that might
+ * be sent when a TCB is still around must be
+ * sent from a routine like tcp_respond().
+ */
+#define LOWEST_SLEEP_ALLOWED 50
+#define DEFAULT_MIN_SLEEP 250 /* The default hpts sleep time in usecs;
+ * this determines the minimum granularity
+ * of the hpts. If set to 1, granularity is
+ * 10 usecs at the cost of more CPU
+ * (context switching).
+ * Note: do not set this to 0.
+ */
+#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
+#define DYNAMIC_MAX_SLEEP 5000 /* 5ms */
+
+/* Thresholds for raising/lowering sleep */
+#define SLOTS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
+#define SLOTS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
+/**
+ *
+ * Dynamic adjustment of sleeping times is done in "new" mode,
+ * where we depend on syscall returns and LRO returns to push
+ * the hpts forward, and the timer is only a backstop.
+ *
+ * When we are in this "new" mode, i.e. conn_cnt > conn_cnt_thresh,
+ * we dynamically adjust the time we sleep.
+ * If the number of slots processed in a pass (ticks_ran) is
+ * greater than slots_indicate_less_sleep (1000 slots, i.e. 10ms),
+ * the sleep time is halved, bounded below by dynamic_min_sleep.
+ * If it is less than slots_indicate_more_sleep (100 slots or
+ * 1000 usecs), the sleep time is doubled, bounded above by
+ * dynamic_max_sleep (5ms).
+ *
+ */
+
/* Each hpts has its own p_mtx which is used for locking */
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
@@ -244,11 +288,10 @@ static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
static void tcp_hpts_thread(void *ctx);
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
-static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
+static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
-
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"TCP Hpts controls");
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -366,7 +409,7 @@ sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
new = hpts_sleep_max;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
- if ((new < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) ||
+ if ((new < (dynamic_min_sleep/HPTS_USECS_PER_SLOT)) ||
(new > HPTS_MAX_SLEEP_ALLOWED))
error = EINVAL;
else
@@ -404,15 +447,15 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
&sysctl_net_inet_tcp_hpts_min_sleep, "IU",
"The minimum time the hpts must sleep before processing more slots");
-static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
-static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
+static int slots_indicate_more_sleep = SLOTS_INDICATE_MORE_SLEEP;
+static int slots_indicate_less_sleep = SLOTS_INDICATE_LESS_SLEEP;
static int tcp_hpts_no_wake_over_thresh = 1;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
- &ticks_indicate_more_sleep, 0,
+ &slots_indicate_more_sleep, 0,
"If we only process this many or less on a timeout, we need longer sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
- &ticks_indicate_less_sleep, 0,
+ &slots_indicate_less_sleep, 0,
"If we process this many or more on a timeout, we need less sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
@@ -433,38 +476,40 @@ static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
int slots_to_run, int idx, bool from_callout)
{
- union tcp_log_stackspecific log;
- /*
- * Unused logs are
- * 64 bit - delRate, rttProp, bw_inuse
- * 16 bit - cwnd_gain
- * 8 bit - bbr_state, bbr_substate, inhpts;
- */
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.flex1 = hpts->p_nxt_slot;
- log.u_bbr.flex2 = hpts->p_cur_slot;
- log.u_bbr.flex3 = hpts->p_prev_slot;
- log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
- log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.flex7 = hpts->p_cpu;
- log.u_bbr.flex8 = (uint8_t)from_callout;
- log.u_bbr.inflight = slots_to_run;
- log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
- log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
- log.u_bbr.epoch = hpts->saved_curslot;
- log.u_bbr.lt_epoch = hpts->saved_prev_slot;
- log.u_bbr.pkts_out = hpts->p_delayed_by;
- log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.pacing_gain = hpts->p_cpu;
- log.u_bbr.pkt_epoch = hpts->p_runningslot;
- log.u_bbr.use_lt_bw = 1;
- TCP_LOG_EVENTP(tp, NULL,
- &tptosocket(tp)->so_rcv,
- &tptosocket(tp)->so_snd,
- BBR_LOG_HPTSDIAG, 0,
- 0, &log, false, tv);
+ if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts;
+ */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usec(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
+ }
}
static void
@@ -875,7 +920,7 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
return (slot_on);
}
/* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hptstick(&tv);
+ wheel_cts = tcp_tv_to_hpts_slot(&tv);
/* Map it onto the wheel */
wheel_slot = tick_to_wheel(wheel_cts);
/* Now what's the max we can place it at? */
@@ -947,7 +992,7 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
* We need to reschedule the hpts's time-out.
*/
hpts->p_hpts_sleep_time = slot;
- need_new_to = slot * HPTS_TICKS_PER_SLOT;
+ need_new_to = slot * HPTS_USECS_PER_SLOT;
}
}
/*
@@ -1102,7 +1147,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
hpts->p_lasttick = hpts->p_curtick;
hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
if ((hpts->p_on_queue_cnt == 0) ||
(hpts->p_lasttick == hpts->p_curtick)) {
@@ -1118,8 +1163,7 @@ again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) >
- ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
+ if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) &&
(hpts->p_on_queue_cnt != 0)) {
/*
* Wheel wrap is occurring, basically we
@@ -1200,7 +1244,7 @@ again:
* was not any (i.e. if slots_to_run == 1, no delay).
*/
hpts->p_delayed_by = (slots_to_run - (i + 1)) *
- HPTS_TICKS_PER_SLOT;
+ HPTS_USECS_PER_SLOT;
runningslot = hpts->p_runningslot;
hptsh = &hpts->p_hptss[runningslot];
@@ -1353,10 +1397,7 @@ again:
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
- if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
- tcp_hpts_log(hpts, tp, &tv, slots_to_run, i,
- from_callout);
- }
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
@@ -1447,7 +1488,7 @@ no_one:
goto again;
}
no_run:
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1487,7 +1528,7 @@ no_run:
}
void
-__tcp_set_hpts(struct tcpcb *tp, int32_t line)
+tcp_set_hpts(struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
@@ -1570,7 +1611,7 @@ __tcp_run_hpts(void)
ticks_ran = tcp_hptsi(hpts, false);
/* We may want to adjust the sleep values here */
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
- if (ticks_ran > ticks_indicate_less_sleep) {
+ if (ticks_ran > slots_indicate_less_sleep) {
struct timeval tv;
sbintime_t sb;
@@ -1580,7 +1621,7 @@ __tcp_run_hpts(void)
/* Reschedule with new to value */
tcp_hpts_set_max_sleep(hpts, 0);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
/* Validate it's in the right ranges */
if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
hpts->overidden_sleep = tv.tv_usec;
@@ -1602,7 +1643,7 @@ __tcp_run_hpts(void)
callout_reset_sbt_on(&hpts->co, sb, 0,
hpts_timeout_swi, hpts, hpts->p_cpu,
(C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else if (ticks_ran < ticks_indicate_more_sleep) {
+ } else if (ticks_ran < slots_indicate_more_sleep) {
/* For the further sleep, don't reschedule hpts */
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
@@ -1684,7 +1725,7 @@ tcp_hpts_thread(void *ctx)
hpts->p_hpts_active = 1;
ticks_ran = tcp_hptsi(hpts, true);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) {
hpts->hit_callout_thresh = 1;
atomic_add_int(&hpts_that_need_softclock, 1);
@@ -1698,11 +1739,11 @@ tcp_hpts_thread(void *ctx)
* Only adjust sleep time if we were
* called from the callout i.e. direct_wake == 0.
*/
- if (ticks_ran < ticks_indicate_more_sleep) {
+ if (ticks_ran < slots_indicate_more_sleep) {
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
- } else if (ticks_ran > ticks_indicate_less_sleep) {
+ } else if (ticks_ran > slots_indicate_less_sleep) {
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
@@ -1949,7 +1990,7 @@ tcp_hpts_mod_load(void)
hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[i] = tcp_tv_to_usectick(&tv);
+ tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv);
hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
@@ -1996,7 +2037,7 @@ tcp_hpts_mod_load(void)
}
}
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
hpts->sleeping = tv.tv_usec;
sb = tvtosbt(tv);
callout_reset_sbt_on(&hpts->co, sb, 0,
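Restated in one place, the dynamic sleep adjustment described by the comment block near the top of this file is the following (sketch condensed from the tcp_hpts_thread() logic above):
	/* Adapt the sleep interval when conn_cnt > conn_cnt_thresh. */
	if (ticks_ran < slots_indicate_more_sleep) {
		/* Little work done: sleep longer, capped at 5 ms. */
		hpts->p_mysleep.tv_usec *= 2;
		if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
			hpts->p_mysleep.tv_usec = dynamic_max_sleep;
	} else if (ticks_ran > slots_indicate_less_sleep) {
		/* Falling behind: halve the sleep, floored at 250 us. */
		hpts->p_mysleep.tv_usec /= 2;
		if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
			hpts->p_mysleep.tv_usec = dynamic_min_sleep;
	}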
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index b097a2b98db9..6172baf2a062 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -26,14 +26,38 @@
#ifndef __tcp_hpts_h__
#define __tcp_hpts_h__
-/* Number of useconds in a hpts tick */
-#define HPTS_TICKS_PER_SLOT 10
+/* Number of useconds represented by an hpts slot */
+#define HPTS_USECS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
+static inline uint32_t
+tcp_tv_to_hpts_slot(const struct timeval *sv)
+{
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT));
+}
+
+static inline uint32_t
+tcp_tv_to_usec(const struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+static inline uint32_t
+tcp_tv_to_msec(const struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
+}
+
+static inline uint64_t
+tcp_tv_to_lusec(const struct timeval *sv)
+{
+ return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
struct hpts_diag {
uint32_t p_hpts_active; /* bbr->flex7 x */
uint32_t p_nxt_slot; /* bbr->flex1 x */
@@ -66,52 +90,16 @@ struct hpts_diag {
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
-#define DEFAULT_CONNECTION_THESHOLD 100
+#ifdef _KERNEL
/*
- * When using the hpts, a TCP stack must make sure
- * that once a INP_DROPPED flag is applied to a INP
- * that it does not expect tcp_output() to ever be
- * called by the hpts. The hpts will *not* call
- * any output (or input) functions on a TCB that
- * is in the DROPPED state.
- *
- * This implies final ACK's and RST's that might
- * be sent when a TCB is still around must be
- * sent from a routine like tcp_respond().
- */
-#define LOWEST_SLEEP_ALLOWED 50
-#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
- * this determines min granularity of the
- * hpts. If 1, granularity is 10useconds at
- * the cost of more CPU (context switching).
- * Note do not set this to 0.
- */
-#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
-#define DYNAMIC_MAX_SLEEP 5000 /* 5ms */
-
-/* Thresholds for raising/lowering sleep */
-#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
-#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
-/**
- *
- * Dynamic adjustment of sleeping times is done in "new" mode
- * where we are depending on syscall returns and lro returns
- * to push hpts forward mainly and the timer is only a backstop.
- *
- * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
- * then we do a dynamic adjustment on the time we sleep.
- * Our threshold is if the lateness of the first client served (in ticks) is
- * greater than or equal too ticks_indicate_more_sleep (10ms
- * or 10000 ticks). If we were that late, the actual sleep time
- * is adjusted down by 50%. If the ticks_ran is less than
- * ticks_indicate_more_sleep (100 ticks or 1000usecs).
- *
- */
+ * The following are the definitions for the kernel HPTS interface for managing
+ * the HPTS ring and the TCBs on it.
+ */
-#ifdef _KERNEL
void tcp_hpts_init(struct tcpcb *);
void tcp_hpts_remove(struct tcpcb *);
+
static inline bool
tcp_in_hpts(struct tcpcb *tp)
{
@@ -149,57 +137,17 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
#define tcp_hpts_insert(inp, slot) \
tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
-void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
-#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
-
-void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
-
-void tcp_lro_hpts_init(void);
-void tcp_lro_hpts_uninit(void);
-
-extern int32_t tcp_min_hptsi_time;
-
-#endif /* _KERNEL */
-
-/*
- * The following functions should also be available
- * to userspace as well.
- */
-static __inline uint32_t
-tcp_tv_to_hptstick(const struct timeval *sv)
-{
- return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
-}
-
-static __inline uint32_t
-tcp_tv_to_usectick(const struct timeval *sv)
-{
- return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
-}
-
-static __inline uint32_t
-tcp_tv_to_mssectick(const struct timeval *sv)
-{
- return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
-}
-
-static __inline uint64_t
-tcp_tv_to_lusectick(const struct timeval *sv)
-{
- return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
-}
-
-#ifdef _KERNEL
+void tcp_set_hpts(struct tcpcb *tp);
extern int32_t tcp_min_hptsi_time;
static inline int32_t
get_hpts_min_sleep_time(void)
{
- return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
+ return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT);
}
-static __inline uint32_t
+static inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
struct timeval tv;
@@ -207,10 +155,10 @@ tcp_gethptstick(struct timeval *sv)
if (sv == NULL)
sv = &tv;
microuptime(sv);
- return (tcp_tv_to_hptstick(sv));
+ return (tcp_tv_to_hpts_slot(sv));
}
-static __inline uint64_t
+static inline uint64_t
tcp_get_u64_usecs(struct timeval *tv)
{
struct timeval tvd;
@@ -218,10 +166,10 @@ tcp_get_u64_usecs(struct timeval *tv)
if (tv == NULL)
tv = &tvd;
microuptime(tv);
- return (tcp_tv_to_lusectick(tv));
+ return (tcp_tv_to_lusec(tv));
}
-static __inline uint32_t
+static inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
struct timeval tvd;
@@ -229,8 +177,15 @@ tcp_get_usecs(struct timeval *tv)
if (tv == NULL)
tv = &tvd;
microuptime(tv);
- return (tcp_tv_to_usectick(tv));
+ return (tcp_tv_to_usec(tv));
}
+/*
+ * LRO HPTS initialization and uninitialization, only for internal use by the
+ * HPTS code.
+ */
+void tcp_lro_hpts_init(void);
+void tcp_lro_hpts_uninit(void);
+
#endif /* _KERNEL */
#endif /* __tcp_hpts_h__ */
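A quick worked example of the renamed conversion helpers (illustrative values only; fragment assumes kernel context):
	/* With HPTS_USECS_PER_SLOT == 10, a 1500 us timeval maps as: */
	struct timeval tv = { .tv_sec = 0, .tv_usec = 1500 };

	uint32_t slot = tcp_tv_to_hpts_slot(&tv);	/* 150 wheel slots */
	uint32_t usec = tcp_tv_to_usec(&tv);		/* 1500 us */
	uint32_t msec = tcp_tv_to_msec(&tv);		/* 1 ms */
	uint64_t lusec = tcp_tv_to_lusec(&tv);		/* 1500 us, 64-bit */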
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 29a6b431f311..6492495dc583 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -383,7 +383,7 @@ cc_conn_init(struct tcpcb *tp)
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ tp->t_rttmin, tcp_rexmit_max);
}
if (metrics.hc_ssthresh) {
/*
@@ -567,8 +567,6 @@ int
tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
struct mbuf *m;
- struct in6_ifaddr *ia6;
- struct ip6_hdr *ip6;
m = *mp;
if (m->m_len < *offp + sizeof(struct tcphdr)) {
@@ -580,19 +578,6 @@ tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
}
}
- /*
- * draft-itojun-ipv6-tcp-to-anycast
- * better place to put this in?
- */
- ip6 = mtod(m, struct ip6_hdr *);
- ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
- if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
- icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
- (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
- *mp = NULL;
- return (IPPROTO_DONE);
- }
-
*mp = m;
return (tcp_input_with_port(mp, offp, proto, port));
}
@@ -624,7 +609,6 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
int tlen = 0, off;
int drop_hdrlen;
int thflags;
- int rstreason = 0; /* For badport_bandlim accounting purposes */
int lookupflag;
uint8_t iptos;
struct m_tag *fwd_tag = NULL;
@@ -636,6 +620,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
#endif /* INET6 */
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
+ bool closed_port = false; /* segment is hitting a closed port */
NET_EPOCH_ASSERT();
@@ -664,6 +649,12 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
th->th_sum = in6_cksum_pseudo(ip6, tlen,
IPPROTO_TCP, m->m_pkthdr.csum_data);
th->th_sum ^= 0xffff;
+ } else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) {
+ /*
+ * Packet from local host (maybe from a VM).
+ * Checksum not required.
+ */
+ th->th_sum = 0;
} else
th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
if (th->th_sum) {
@@ -724,6 +715,12 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
htonl(m->m_pkthdr.csum_data + tlen +
IPPROTO_TCP));
th->th_sum ^= 0xffff;
+ } else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) {
+ /*
+ * Packet from local host (maybe from a VM).
+ * Checksum not required.
+ */
+ th->th_sum = 0;
} else {
struct ipovly *ipov = (struct ipovly *)ip;
@@ -907,22 +904,22 @@ findpcb:
* XXX MRT Send RST using which routing table?
*/
if (inp == NULL) {
- if (rstreason != 0) {
+ if ((lookupflag & INPLOOKUP_WILDCARD) == 0) {
/* We came here after second (safety) lookup. */
- MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0);
- goto dropwithreset;
- }
- /*
- * Log communication attempts to ports that are not
- * in use.
- */
- if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
- V_tcp_log_in_vain == 2) {
- if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
+ MPASS(!closed_port);
+ } else {
+ /*
+ * Log communication attempts to ports that are not
+ * in use.
+ */
+ if (((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
+ V_tcp_log_in_vain == 2) &&
+ (s = tcp_log_vain(NULL, th, (void *)ip, ip6))) {
log(LOG_INFO, "%s; %s: Connection attempt "
"to closed port\n", s, __func__);
+ }
+ closed_port = true;
}
- rstreason = BANDLIM_RST_CLOSEDPORT;
goto dropwithreset;
}
INP_LOCK_ASSERT(inp);
@@ -1013,12 +1010,12 @@ findpcb:
* down or it is in the CLOSED state. Either way we drop the
* segment and send an appropriate response.
*/
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ closed_port = true;
goto dropwithreset;
}
if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ closed_port = true;
goto dropwithreset;
}
@@ -1070,6 +1067,8 @@ findpcb:
* socket appended to the listen queue in SYN_RECEIVED state.
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+ int result;
+
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1079,8 +1078,8 @@ findpcb:
/*
* NB: syncache_expand() doesn't unlock inp.
*/
- rstreason = syncache_expand(&inc, &to, th, &so, m, port);
- if (rstreason < 0) {
+ result = syncache_expand(&inc, &to, th, &so, m, port);
+ if (result < 0) {
/*
* A failing TCP MD5 signature comparison
* must result in the segment being dropped
@@ -1088,7 +1087,7 @@ findpcb:
* to the sender.
*/
goto dropunlock;
- } else if (rstreason == 0) {
+ } else if (result == 0) {
/*
* No syncache entry, or ACK was not for our
* SYN/ACK. Do our protection against double
@@ -1099,7 +1098,8 @@ findpcb:
* don't want to sent RST for the second ACK,
* so we perform second lookup without wildcard
* match, hoping to find the new socket. If
- * the ACK is stray indeed, rstreason would
+ * the ACK is stray indeed, the missing
+ * INPLOOKUP_WILDCARD flag in lookupflag would
* hint the above code that the lookup was a
* second attempt.
*
@@ -1107,7 +1107,6 @@ findpcb:
* of the failure cause.
*/
INP_WUNLOCK(inp);
- rstreason = BANDLIM_RST_OPENPORT;
lookupflag &= ~INPLOOKUP_WILDCARD;
goto findpcb;
}
@@ -1131,7 +1130,6 @@ tfo_socket_result:
V_tcp_sc_rst_sock_fail ?
"sending RST" : "try again");
if (V_tcp_sc_rst_sock_fail) {
- rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
} else
goto dropunlock;
@@ -1198,7 +1196,6 @@ tfo_socket_result:
s, __func__);
syncache_badack(&inc, port); /* XXX: Not needed! */
TCPSTAT_INC(tcps_badsyn);
- rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
/*
@@ -1274,7 +1271,6 @@ tfo_socket_result:
"Connection attempt to deprecated "
"IPv6 address rejected\n",
s, __func__);
- rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
}
@@ -1395,12 +1391,12 @@ dropwithreset:
* When blackholing do not respond with a RST but
* completely ignore the segment and drop it.
*/
- if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) ||
- (rstreason == BANDLIM_RST_CLOSEDPORT &&
- ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
+ if (((!closed_port && V_blackhole == 3) ||
+ (closed_port &&
+ ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
(V_blackhole_local || (
#ifdef INET6
- isipv6 ? !in6_localaddr(&ip6->ip6_src) :
+ isipv6 ? !in6_localip(&ip6->ip6_src) :
#endif
#ifdef INET
!in_localip(ip->ip_src)
@@ -1410,7 +1406,7 @@ dropwithreset:
)))
goto dropunlock;
TCP_PROBE5(receive, NULL, tp, m, tp, th);
- tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ tcp_dropwithreset(m, th, tp, tlen);
m = NULL; /* mbuf chain got consumed. */
dropunlock:
@@ -1453,7 +1449,7 @@ drop:
* is at least 3/8 of the current socket buffer size.
* 3. receive buffer size has not hit maximal automatic size;
*
- * If all of the criteria are met we increaset the socket buffer
+ * If all of the criteria are met, we increase the socket buffer
* by a 1/2 (bounded by the max). This allows us to keep ahead
* of slow-start but also makes it so our peer never gets limited
* by our rwnd which we then open up causing a burst.
@@ -1519,7 +1515,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
uint16_t thflags;
int acked, ourfinisacked, needoutput = 0;
sackstatus_t sack_changed;
- int rstreason, todrop, win, incforsyn = 0;
+ int todrop, win, incforsyn = 0;
uint32_t tiwin;
uint16_t nsegs;
char *s;
@@ -1530,7 +1526,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
struct tcpopt to;
int tfo_syn;
u_int maxseg = 0;
+ bool no_data;
+ no_data = (tlen == 0);
thflags = tcp_get_flags(th);
tp->sackhint.last_sack_ack = 0;
sack_changed = SACK_NOCHANGE;
@@ -1562,7 +1560,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
*/
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
- rstreason = BANDLIM_UNLIMITED;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -1769,7 +1766,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tp->ts_recent = to.to_tsval;
}
- if (tlen == 0) {
+ if (no_data) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
!IN_RECOVERY(tp->t_flags) &&
@@ -1978,7 +1975,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
- rstreason = BANDLIM_RST_OPENPORT;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -1991,7 +1987,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* FIN, or a RST.
*/
if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
- rstreason = BANDLIM_RST_OPENPORT;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
} else if (thflags & TH_SYN) {
@@ -2212,7 +2207,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
tp = tcp_drop(tp, ECONNRESET);
- rstreason = BANDLIM_UNLIMITED;
} else {
tcp_ecn_input_syn_sent(tp, thflags, iptos);
tcp_send_challenge_ack(tp, th, m);
@@ -2259,7 +2253,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* for the "LAND" DoS attack.
*/
if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
- rstreason = BANDLIM_RST_OPENPORT;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -2341,7 +2334,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
tp = tcp_close(tp);
TCPSTAT_INC(tcps_rcvafterclose);
- rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
}
@@ -2572,7 +2564,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
maxseg = tcp_maxseg(tp);
- if (tlen == 0 &&
+ if (no_data &&
(tiwin == tp->snd_wnd ||
(tp->t_flags & TF_SACK_PERMIT))) {
/*
@@ -2815,9 +2807,11 @@ enter_recovery:
KASSERT((tp->t_dupacks == 2 &&
tp->snd_limited == 0) ||
(sent == maxseg + 1 &&
- tp->t_flags & TF_SENTFIN),
- ("%s: sent too much",
- __func__));
+ tp->t_flags & TF_SENTFIN) ||
+ (sent < 2 * maxseg &&
+ tp->t_flags & TF_NODELAY),
+ ("%s: sent too much: %u>%u",
+ __func__, sent, maxseg));
tp->snd_limited = 2;
} else if (sent > 0) {
++tp->snd_limited;
@@ -3126,8 +3120,7 @@ step6:
(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
/* keep track of pure window updates */
- if (tlen == 0 &&
- tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
TCPSTAT_INC(tcps_rcvwinupd);
tp->snd_wnd = tiwin;
tp->snd_wl1 = th->th_seq;
@@ -3437,7 +3430,6 @@ dropafterack:
if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max)) ) {
- rstreason = BANDLIM_RST_OPENPORT;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -3449,11 +3441,10 @@ dropafterack:
return;
dropwithreset:
+ tcp_dropwithreset(m, th, NULL, tlen);
if (tp != NULL) {
- tcp_dropwithreset(m, th, tp, tlen, rstreason);
INP_WUNLOCK(inp);
- } else
- tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+ }
return;
drop:
@@ -3473,8 +3464,7 @@ drop:
* tp may be NULL.
*/
void
-tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
- int tlen, int rstreason)
+tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen)
{
#ifdef INET
struct ip *ip;
@@ -3514,7 +3504,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
#endif
/* Perform bandwidth limiting. */
- if (badport_bandlim(rstreason) < 0)
+ if (badport_bandlim(BANDLIM_TCP_RST) < 0)
goto drop;
/* tcp_respond consumes the mbuf chain. */
@@ -3745,7 +3735,7 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
* the minimum feasible timer (which is 2 ticks).
*/
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
- max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+ max(tp->t_rttmin, rtt + 2), tcp_rexmit_max);
/*
* We received an ack for a packet that wasn't retransmitted;
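With rstreason gone, the blackhole decision in dropwithreset reduces to a predicate on closed_port; a sketch of the equivalent logic (helper name hypothetical; the additional local-source check is omitted):
	static bool
	blackhole_suppresses_rst(bool closed_port, uint16_t thflags,
	    int blackhole_mode)
	{
		if (!closed_port)
			/* Open port: only mode 3 suppresses the RST. */
			return (blackhole_mode == 3);
		if (blackhole_mode == 1)
			/* Mode 1 blackholes only SYNs to closed ports. */
			return ((thflags & TH_SYN) != 0);
		return (blackhole_mode > 1);
	}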
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index e9ad05382b81..e24790ece43d 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -29,6 +29,7 @@
#include <sys/cdefs.h>
#include "opt_inet.h"
+#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/hash.h>
@@ -43,11 +44,18 @@
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
+#ifdef DDB
+#include <sys/time.h>
+#endif
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <sys/counter.h>
#include <dev/tcp_log/tcp_log_dev.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>
@@ -1840,40 +1848,36 @@ retry:
log_buf->tlb_txbuf.tls_sb_ccc = 0;
}
/* Copy values from tp to the log entry. */
-#define COPY_STAT(f) log_buf->tlb_ ## f = tp->f
-#define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f
- COPY_STAT_T(state);
- COPY_STAT_T(starttime);
- COPY_STAT(iss);
- COPY_STAT_T(flags);
- COPY_STAT(snd_una);
- COPY_STAT(snd_max);
- COPY_STAT(snd_cwnd);
- COPY_STAT(snd_nxt);
- COPY_STAT(snd_recover);
- COPY_STAT(snd_wnd);
- COPY_STAT(snd_ssthresh);
- COPY_STAT_T(srtt);
- COPY_STAT_T(rttvar);
- COPY_STAT(rcv_up);
- COPY_STAT(rcv_adv);
- COPY_STAT_T(flags2);
- COPY_STAT(rcv_nxt);
- COPY_STAT(rcv_wnd);
- COPY_STAT_T(dupacks);
- COPY_STAT_T(segqlen);
- COPY_STAT(snd_numholes);
+ log_buf->tlb_state = tp->t_state;
+ log_buf->tlb_starttime = tp->t_starttime;
+ log_buf->tlb_iss = tp->iss;
+ log_buf->tlb_flags = tp->t_flags;
+ log_buf->tlb_snd_una = tp->snd_una;
+ log_buf->tlb_snd_max = tp->snd_max;
+ log_buf->tlb_snd_cwnd = tp->snd_cwnd;
+ log_buf->tlb_snd_nxt = tp->snd_nxt;
+ log_buf->tlb_snd_recover = tp->snd_recover;
+ log_buf->tlb_snd_wnd = tp->snd_wnd;
+ log_buf->tlb_snd_ssthresh = tp->snd_ssthresh;
+ log_buf->tlb_srtt = tp->t_srtt;
+ log_buf->tlb_rttvar = tp->t_rttvar;
+ log_buf->tlb_rcv_up = tp->rcv_up;
+ log_buf->tlb_rcv_adv = tp->rcv_adv;
+ log_buf->tlb_flags2 = tp->t_flags2;
+ log_buf->tlb_rcv_nxt = tp->rcv_nxt;
+ log_buf->tlb_rcv_wnd = tp->rcv_wnd;
+ log_buf->tlb_dupacks = tp->t_dupacks;
+ log_buf->tlb_segqlen = tp->t_segqlen;
+ log_buf->tlb_snd_numholes = tp->snd_numholes;
log_buf->tlb_flex1 = 0;
log_buf->tlb_flex2 = 0;
- COPY_STAT_T(fbyte_in);
- COPY_STAT_T(fbyte_out);
- COPY_STAT(snd_scale);
- COPY_STAT(rcv_scale);
+ log_buf->tlb_fbyte_in = tp->t_fbyte_in;
+ log_buf->tlb_fbyte_out = tp->t_fbyte_out;
+ log_buf->tlb_snd_scale = tp->snd_scale;
+ log_buf->tlb_rcv_scale = tp->rcv_scale;
log_buf->_pad[0] = 0;
log_buf->_pad[1] = 0;
log_buf->_pad[2] = 0;
-#undef COPY_STAT
-#undef COPY_STAT_T
/* Copy stack-specific info. */
if (stackinfo != NULL) {
memcpy(&log_buf->tlb_stackinfo, stackinfo,
@@ -2874,10 +2878,11 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
/* double check log state now that we have the lock */
if (inp->inp_flags & INP_DROPPED)
goto done;
- if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
+ if (tcp_bblogging_on(tp)) {
struct timeval tv;
tcp_log_eventspecific_t log;
+ memset(&log, 0, sizeof(log));
microuptime(&tv);
log.u_sf.offset = offset;
log.u_sf.length = nbytes;
@@ -2975,3 +2980,370 @@ skip_closed_req:
done:
INP_WUNLOCK(inp);
}
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_tcphdr(struct tcp_log_buffer *tlm_buf)
+{
+ struct sackblk sack;
+ struct tcphdr *th;
+ int cnt, i, j, opt, optlen, num_sacks;
+ uint32_t val, ecr;
+ uint16_t mss;
+ uint16_t flags;
+
+ if ((tlm_buf->tlb_eventflags & TLB_FLAG_HDR) == 0) {
+ return;
+ }
+ th = &tlm_buf->tlb_th;
+ flags = tcp_get_flags(th);
+ if (flags & TH_FIN) {
+ db_printf("F");
+ }
+ if (flags & TH_SYN) {
+ db_printf("S");
+ }
+ if (flags & TH_RST) {
+ db_printf("R");
+ }
+ if (flags & TH_PUSH) {
+ db_printf("P");
+ }
+ if (flags & TH_ACK) {
+ db_printf(".");
+ }
+ if (flags & TH_URG) {
+ db_printf("U");
+ }
+ if (flags & TH_ECE) {
+ db_printf("E");
+ }
+ if (flags & TH_CWR) {
+ db_printf("W");
+ }
+ if (flags & TH_AE) {
+ db_printf("A");
+ }
+ db_printf(" %u:%u(%u)", ntohl(th->th_seq),
+ ntohl(th->th_seq) + tlm_buf->tlb_len, tlm_buf->tlb_len);
+ if (flags & TH_ACK) {
+ db_printf(" ack %u", ntohl(th->th_ack));
+ }
+ db_printf(" win %u", ntohs(th->th_win));
+ if (flags & TH_URG) {
+ db_printf(" urg %u", ntohs(th->th_urp));
+ }
+ cnt = (th->th_off << 2) - sizeof(struct tcphdr);
+ if (cnt > 0) {
+ db_printf(" <");
+ for (i = 0; i < cnt; i += optlen) {
+ opt = tlm_buf->tlb_opts[i];
+ if (opt == TCPOPT_EOL || opt == TCPOPT_NOP) {
+ optlen = 1;
+ } else {
+ if (cnt - i < 2) {
+ break;
+ }
+ optlen = tlm_buf->tlb_opts[i + 1];
+ if (optlen < 2 || optlen > cnt - i) {
+ break;
+ }
+ }
+ if (i > 0) {
+ db_printf(",");
+ }
+ switch (opt) {
+ case TCPOPT_EOL:
+ db_printf("eol");
+ break;
+ case TCPOPT_NOP:
+ db_printf("nop");
+ break;
+ case TCPOPT_MAXSEG:
+ if (optlen != TCPOLEN_MAXSEG) {
+ break;
+ }
+ bcopy(tlm_buf->tlb_opts + i + 2, &mss,
+ sizeof(uint16_t));
+ db_printf("mss %u", ntohs(mss));
+ break;
+ case TCPOPT_WINDOW:
+ if (optlen != TCPOLEN_WINDOW) {
+ break;
+ }
+ db_printf("wscale %u",
+ tlm_buf->tlb_opts[i + 2]);
+ break;
+ case TCPOPT_SACK_PERMITTED:
+ if (optlen != TCPOLEN_SACK_PERMITTED) {
+ break;
+ }
+ db_printf("sackOK");
+ break;
+ case TCPOPT_SACK:
+ if (optlen == TCPOLEN_SACKHDR ||
+ (optlen - 2) % TCPOLEN_SACK != 0) {
+ break;
+ }
+ num_sacks = (optlen - 2) / TCPOLEN_SACK;
+ db_printf("sack");
+ for (j = 0; j < num_sacks; j++) {
+ bcopy(tlm_buf->tlb_opts + i + 2 +
+ j * TCPOLEN_SACK, &sack,
+ TCPOLEN_SACK);
+ db_printf(" %u:%u", ntohl(sack.start),
+ ntohl(sack.end));
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (optlen != TCPOLEN_TIMESTAMP) {
+ break;
+ }
+ bcopy(tlm_buf->tlb_opts + i + 2, &val,
+ sizeof(uint32_t));
+ bcopy(tlm_buf->tlb_opts + i + 6, &ecr,
+ sizeof(uint32_t));
+ db_printf("TS val %u ecr %u", ntohl(val),
+ ntohl(ecr));
+ break;
+ case TCPOPT_SIGNATURE:
+ db_printf("md5");
+ if (optlen > 2) {
+ db_printf(" ");
+ }
+ for (j = 0; j < optlen - 2; j++) {
+ db_printf("%02x",
+ tlm_buf->tlb_opts[i + 2 + j]);
+ }
+ break;
+ case TCPOPT_FAST_OPEN:
+ db_printf("FO");
+ if (optlen > 2) {
+ db_printf(" ");
+ }
+ for (j = 0; j < optlen - 2; j++) {
+ db_printf("%02x",
+ tlm_buf->tlb_opts[i + 2 + j]);
+ }
+ break;
+ default:
+ db_printf("opt=%u len=%u", opt, optlen);
+ break;
+ }
+ }
+ db_printf(">");
+ }
+}
+static void
+db_print_pru(struct tcp_log_buffer *tlm_buf)
+{
+ switch (tlm_buf->tlb_flex1) {
+ case PRU_ATTACH:
+ db_printf("ATTACH");
+ break;
+ case PRU_DETACH:
+ db_printf("DETACH");
+ break;
+ case PRU_BIND:
+ db_printf("BIND");
+ break;
+ case PRU_LISTEN:
+ db_printf("LISTEN");
+ break;
+ case PRU_CONNECT:
+ db_printf("CONNECT");
+ break;
+ case PRU_ACCEPT:
+ db_printf("ACCEPT");
+ break;
+ case PRU_DISCONNECT:
+ db_printf("DISCONNECT");
+ break;
+ case PRU_SHUTDOWN:
+ db_printf("SHUTDOWN");
+ break;
+ case PRU_RCVD:
+ db_printf("RCVD");
+ break;
+ case PRU_SEND:
+ db_printf("SEND");
+ break;
+ case PRU_ABORT:
+ db_printf("ABORT");
+ break;
+ case PRU_CONTROL:
+ db_printf("CONTROL");
+ break;
+ case PRU_SENSE:
+ db_printf("SENSE");
+ break;
+ case PRU_RCVOOB:
+ db_printf("RCVOOB");
+ break;
+ case PRU_SENDOOB:
+ db_printf("SENDOOB");
+ break;
+ case PRU_SOCKADDR:
+ db_printf("SOCKADDR");
+ break;
+ case PRU_PEERADDR:
+ db_printf("PEERADDR");
+ break;
+ case PRU_CONNECT2:
+ db_printf("CONNECT2");
+ break;
+ case PRU_FASTTIMO:
+ db_printf("FASTTIMO");
+ break;
+ case PRU_SLOWTIMO:
+ db_printf("SLOWTIMO");
+ break;
+ case PRU_PROTORCV:
+ db_printf("PROTORCV");
+ break;
+ case PRU_PROTOSEND:
+ db_printf("PROTOSEND");
+ break;
+ case PRU_SEND_EOF:
+ db_printf("SEND_EOF");
+ break;
+ case PRU_SOSETLABEL:
+ db_printf("SOSETLABEL");
+ break;
+ case PRU_CLOSE:
+ db_printf("CLOSE");
+ break;
+ case PRU_FLUSH:
+ db_printf("FLUSH");
+ break;
+ default:
+ db_printf("Unknown PRU (%u)", tlm_buf->tlb_flex1);
+ break;
+ }
+ if (tlm_buf->tlb_errno >= 0) {
+ db_printf(", error: %d", tlm_buf->tlb_errno);
+ }
+}
+
+static void
+db_print_rto(struct tcp_log_buffer *tlm_buf)
+{
+ tt_what what;
+ tt_which which;
+
+ what = (tlm_buf->tlb_flex1 & 0xffffff00) >> 8;
+ which = tlm_buf->tlb_flex1 & 0x000000ff;
+ switch (what) {
+ case TT_PROCESSING:
+ db_printf("Processing ");
+ break;
+ case TT_PROCESSED:
+ db_printf("Processed ");
+ break;
+ case TT_STARTING:
+ db_printf("Starting ");
+ break;
+ case TT_STOPPING:
+ db_printf("Stopping ");
+ break;
+ default:
+ db_printf("Unknown operation (%u) for ", what);
+ break;
+ }
+ switch (which) {
+ case TT_REXMT:
+ db_printf("Retransmission ");
+ break;
+ case TT_PERSIST:
+ db_printf("Persist ");
+ break;
+ case TT_KEEP:
+ db_printf("Keepalive ");
+ break;
+ case TT_2MSL:
+ db_printf("2 MSL ");
+ break;
+ case TT_DELACK:
+ db_printf("Delayed ACK ");
+ break;
+ default:
+ db_printf("Unknown (%u) ", which);
+ break;
+ }
+ db_printf("timer");
+ if (what == TT_STARTING) {
+ db_printf(": %u ms", tlm_buf->tlb_flex2);
+ }
+}
+
+static void
+db_print_usersend(struct tcp_log_buffer *tlm_buf)
+{
+ if ((tlm_buf->tlb_eventflags & TLB_FLAG_RXBUF) == 0) {
+ return;
+ }
+ if ((tlm_buf->tlb_eventflags & TLB_FLAG_TXBUF) == 0) {
+ return;
+ }
+ db_printf("usersend: rcv.acc: %u rcv.ccc: %u snd.acc: %u snd.ccc: %u",
+ tlm_buf->tlb_rxbuf.tls_sb_acc, tlm_buf->tlb_rxbuf.tls_sb_ccc,
+ tlm_buf->tlb_txbuf.tls_sb_acc, tlm_buf->tlb_txbuf.tls_sb_ccc);
+}
+
+void
+db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent)
+{
+ struct tcp_log_mem *log_entry;
+ struct tcp_log_buffer *tlm_buf, *prev_tlm_buf;
+ int64_t delta_t;
+
+ indent += 2;
+ prev_tlm_buf = NULL;
+ STAILQ_FOREACH(log_entry, log_entries, tlm_queue) {
+ db_print_indent(indent);
+ tlm_buf = &log_entry->tlm_buf;
+ if (prev_tlm_buf == NULL) {
+ db_printf(" 0.000 ");
+ } else {
+ delta_t = sbttoms(tvtosbt(tlm_buf->tlb_tv) -
+ tvtosbt(prev_tlm_buf->tlb_tv));
+ db_printf("+%u.%03u ", (uint32_t)(delta_t / 1000),
+ (uint32_t)(delta_t % 1000));
+ }
+ switch (tlm_buf->tlb_eventid) {
+ case TCP_LOG_IN:
+ db_printf("< ");
+ db_print_tcphdr(tlm_buf);
+ break;
+ case TCP_LOG_OUT:
+ db_printf("> ");
+ db_print_tcphdr(tlm_buf);
+ break;
+ case TCP_LOG_RTO:
+ db_print_rto(tlm_buf);
+ break;
+ case TCP_LOG_PRU:
+ db_print_pru(tlm_buf);
+ break;
+ case TCP_LOG_USERSEND:
+ db_print_usersend(tlm_buf);
+ break;
+ default:
+ break;
+ }
+ db_printf("\n");
+ prev_tlm_buf = tlm_buf;
+ if (db_pager_quit)
+ break;
+ }
+}
+#endif
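For the inter-event timestamps, db_print_bblog_entries() converts both timevals to sbintime_t (tvtosbt()) and renders the difference in milliseconds (sbttoms()). The same arithmetic in plain integers, as a standalone sketch of the "+seconds.milliseconds" column:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/time.h>

    /* Millisecond delta between two timevals, in plain integer math. */
    static int64_t
    tv_delta_ms(const struct timeval *a, const struct timeval *b)
    {
            return ((int64_t)(a->tv_sec - b->tv_sec) * 1000 +
                (a->tv_usec - b->tv_usec) / 1000);
    }

    int
    main(void)
    {
            struct timeval prev = { .tv_sec = 10, .tv_usec = 250000 };
            struct timeval cur = { .tv_sec = 11, .tv_usec = 300000 };
            int64_t delta_t = tv_delta_ms(&cur, &prev);

            /* Same rendering as the DDB printer: prints "+1.050". */
            printf("+%u.%03u\n", (uint32_t)(delta_t / 1000),
                (uint32_t)(delta_t % 1000));
            return (0);
    }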
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 9ee2d97d47c2..f8c064b6a104 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -60,14 +60,6 @@ struct tcp_log_verbose
uint8_t _pad[4];
} ALIGN_TCP_LOG;
-/* Internal RACK state variables. */
-struct tcp_log_rack
-{
- uint32_t tlr_rack_rtt; /* rc_rack_rtt */
- uint8_t tlr_state; /* Internal RACK state */
- uint8_t _pad[3]; /* Padding */
-};
-
struct tcp_log_bbr {
uint64_t cur_del_rate;
uint64_t delRate;
@@ -126,7 +118,6 @@ struct tcp_log_sendfile {
*/
union tcp_log_stackspecific
{
- struct tcp_log_rack u_rack;
struct tcp_log_bbr u_bbr;
struct tcp_log_sendfile u_sf;
struct tcp_log_raw u_raw; /* "raw" log access */
@@ -185,7 +176,6 @@ struct tcp_log_buffer
uint8_t _pad[3]; /* Padding */
/* Per-stack info */
union tcp_log_stackspecific tlb_stackinfo;
-#define tlb_rack tlb_stackinfo.u_rack
/* The packet */
uint32_t tlb_len; /* The packet's data length */
@@ -387,12 +377,12 @@ extern int32_t tcp_trace_point_count;
/*
* Returns true if any sort of BB logging is enabled,
- * commonly used throughout the codebase.
+ * commonly used throughout the codebase.
*/
static inline int
tcp_bblogging_on(struct tcpcb *tp)
{
- if (tp->_t_logstate <= TCP_LOG_STATE_OFF)
+ if (tp->_t_logstate <= TCP_LOG_STATE_OFF)
return (0);
if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS)
return (0);
@@ -437,7 +427,7 @@ tcp_set_bblog_state(struct tcpcb *tp, uint8_t ls, uint8_t bbpoint)
}
}
-static inline uint32_t
+static inline uint32_t
tcp_get_bblog_state(struct tcpcb *tp)
{
return (tp->_t_logstate);
@@ -549,12 +539,12 @@ struct tcpcb;
NULL, NULL, 0, NULL); \
} while (0)
#endif /* TCP_LOG_FORCEVERBOSE */
+/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). */
#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
do { \
- if (tcp_bblogging_on(tp)) \
- tcp_log_event(tp, th, rxbuf, txbuf, eventid, \
- errornum, len, stackinfo, th_hostorder, \
- NULL, NULL, 0, tv); \
+ KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \
+ tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \
+ stackinfo, th_hostorder, NULL, NULL, 0, tv); \
} while (0)
#ifdef TCP_BLACKBOX
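With the macro change above, TCP_LOG_EVENTP() no longer tests the logging state itself; the caller must guard it with tcp_bblogging_on() or trip the KASSERT on debug kernels. A hypothetical caller sketch (the local variable names and the choice of TCP_LOG_USERSEND are illustrative only):

    if (tcp_bblogging_on(tp)) {
            union tcp_log_stackspecific log;
            struct timeval tv;

            memset(&log, 0, sizeof(log));
            log.u_bbr.timeStamp = tcp_get_usecs(&tv);
            TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
                &tptosocket(tp)->so_snd, TCP_LOG_USERSEND, 0, 0, &log,
                false, &tv);
    }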
@@ -580,6 +570,9 @@ void tcp_log_flowend(struct tcpcb *tp);
void tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes,
int flags);
int tcp_log_apply_ratio(struct tcpcb *tp, int ratio);
+#ifdef DDB
+void db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent);
+#endif
#else /* !TCP_BLACKBOX */
#define tcp_log_verbose (false)
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 10afed17bf3b..7512679bd4e9 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1301,9 +1301,9 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h
return (TCP_LRO_CANNOT);
#endif
if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
- ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
+ ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
(m->m_pkthdr.csum_data != 0xffff)) {
- /*
+ /*
* The checksum either did not have hardware offload
* or it was a bad checksum. We can't LRO such
* a packet.
@@ -1334,7 +1334,7 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h
#endif
/* If no hardware or arrival stamp on the packet add timestamp */
if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) {
- m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
+ m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
m->m_flags |= M_TSTMP_LRO;
}
@@ -1429,9 +1429,9 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
int error;
if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
- ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
+ ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
(m->m_pkthdr.csum_data != 0xffff)) {
- /*
+ /*
* The checksum either did not have hardware offload
* or it was a bad checksum. We can't LRO such
* a packet.
@@ -1481,7 +1481,7 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
((mb->m_flags & M_TSTMP) == 0)) {
/* Add in an LRO time since no hardware */
binuptime(&lc->lro_last_queue_time);
- mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
+ mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
mb->m_flags |= M_TSTMP_LRO;
}
diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c
index 7e756285da45..43587285fe26 100644
--- a/sys/netinet/tcp_lro_hpts.c
+++ b/sys/netinet/tcp_lro_hpts.c
@@ -188,7 +188,7 @@ tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
log.u_bbr.cur_del_rate = (uintptr_t)m;
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
bintime2timeval(&lc->lro_last_queue_time, &btv);
- log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
+ log.u_bbr.flex6 = tcp_tv_to_usec(&btv);
log.u_bbr.flex7 = le->compressed;
log.u_bbr.pacing_gain = le->uncompressed;
if (in_epoch(net_epoch_preempt))
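The tcp_tv_to_usectick()/tcp_tv_to_mssectick()/tcp_tv_to_lusectick() to tcp_tv_to_usec()/tcp_tv_to_msec()/tcp_tv_to_lusec() renames recurring below are mechanical. As a sketch of the assumed semantics behind them: 32-bit microsecond and millisecond timestamps (which wrap) and a 64-bit "long" microsecond count, all derived from a struct timeval.

    #include <stdint.h>
    #include <sys/time.h>

    static uint32_t
    tv_to_usec32(const struct timeval *tv)
    {
            /* Wraps modulo 2^32; fine for relative timestamps. */
            return ((uint32_t)(tv->tv_sec * 1000000 + tv->tv_usec));
    }

    static uint32_t
    tv_to_msec32(const struct timeval *tv)
    {
            return ((uint32_t)(tv->tv_sec * 1000 + tv->tv_usec / 1000));
    }

    static uint64_t
    tv_to_usec64(const struct timeval *tv)
    {
            /* The "lusec" flavor: no truncation to 32 bits. */
            return ((uint64_t)tv->tv_sec * 1000000 + tv->tv_usec);
    }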
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index bc5b42ee6f2c..2dfb7faf56e3 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1250,7 +1250,7 @@ send:
* fack acks recoverypoint.
*/
if ((tp->t_flags & TF_LRD) && SEQ_GEQ(p->rxmit, p->end))
- p->rxmit = tp->snd_recover;
+ p->rxmit = SEQ_MAX(p->rxmit, tp->snd_recover);
tp->sackhint.sack_bytes_rexmit += len;
}
if (IN_RECOVERY(tp->t_flags)) {
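The tcp_output.c hunk replaces an unconditional rewind of p->rxmit with a forward-only clamp: SEQ_MAX() picks whichever sequence number is later in modulo-2^32 sequence space, so rxmit can never move backwards past snd_recover. A small self-contained demonstration of the comparison macros from <netinet/tcp_seq.h>:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t tcp_seq;

    /* Modulo-2^32 sequence comparison, as in <netinet/tcp_seq.h>. */
    #define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)
    #define SEQ_MAX(a, b)   (SEQ_GT(a, b) ? (a) : (b))

    int
    main(void)
    {
            tcp_seq rxmit = 0xfffffff0U;        /* just before wrap */
            tcp_seq snd_recover = 0x00000010U;  /* just after wrap */

            /* snd_recover is "later" despite being numerically smaller. */
            printf("0x%08x\n", SEQ_MAX(rxmit, snd_recover));
            return (0);
    }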
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 90d789f0e224..b6c55fac50b3 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -283,7 +283,7 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
INP_WLOCK_ASSERT(tptoinpcb(tp));
/* Check arguments. */
- KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));
+ KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("SEQ_GT(rcv_start, rcv_end)"));
if ((rcv_start == rcv_end) &&
(tp->rcv_numsacks >= 1) &&
@@ -498,8 +498,8 @@ tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
tp->snd_numholes--;
atomic_subtract_int(&V_tcp_sack_globalholes, 1);
- KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
- KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
+ KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes < 0"));
+ KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes < 0"));
}
/*
@@ -583,6 +583,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
*/
if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
left_edge_delta = th_ack - tp->snd_una;
+ delivered_data += left_edge_delta;
sack_blocks[num_sack_blks].start = tp->snd_una;
sack_blocks[num_sack_blks++].end = th_ack;
/*
@@ -590,7 +591,6 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
* due to DSACK blocks
*/
if (SEQ_LT(tp->snd_fack, th_ack)) {
- delivered_data += th_ack - tp->snd_una;
tp->snd_fack = th_ack;
sack_changed = SACK_CHANGE;
}
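Moving the delivered_data increment out of the SEQ_LT(snd_fack, th_ack) branch means the cumulatively ACKed span is counted whenever snd_una advances, even when snd_fack already sits at or beyond th_ack (as after earlier SACKs or a DSACK). A simplified, wrap-free illustration of the two placements:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t snd_una = 1000, th_ack = 2000, snd_fack = 3000;
            uint32_t delivered_old = 0, delivered_new = 0;

            /* Old placement: counted only when snd_fack < th_ack. */
            if (snd_fack < th_ack)
                    delivered_old += th_ack - snd_una;
            /* New placement: counted whenever the cumulative ACK advances. */
            if (snd_una < th_ack)
                    delivered_new += th_ack - snd_una;
            printf("old=%u new=%u\n", delivered_old, delivered_new);
            return (0);
    }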
@@ -684,7 +684,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
delivered_data += sblkp->end - sblkp->start;
tp->sackhint.hole_bytes += temp->end - temp->start;
KASSERT(tp->sackhint.hole_bytes >= 0,
- ("sackhint hole bytes >= 0"));
+ ("sackhint hole bytes < 0"));
tp->snd_fack = sblkp->end;
sblkp--;
sack_changed = SACK_NEWLOSS;
@@ -744,7 +744,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
while (cur != NULL) {
if (!(sblkp >= sack_blocks)) {
if (((loss_sblks >= tcprexmtthresh) ||
- (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
+ (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
break;
loss_thresh += loss_hiack - cur->end;
loss_hiack = cur->start;
@@ -783,7 +783,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
tp->sackhint.sack_bytes_rexmit -=
(SEQ_MIN(cur->rxmit, cur->end) - cur->start);
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
- ("sackhint bytes rtx >= 0"));
+ ("sackhint bytes rtx < 0"));
sack_changed = SACK_CHANGE;
if (SEQ_LEQ(sblkp->start, cur->start)) {
/* Data acks at least the beginning of hole. */
@@ -816,7 +816,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
cur->end = sblkp->start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
- cur->rxmit = tp->snd_recover;
+ cur->rxmit = SEQ_MAX(cur->rxmit, tp->snd_recover);
} else {
/*
* ACKs some data in middle of a hole; need
@@ -843,7 +843,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
cur->rxmit = SEQ_MIN(cur->rxmit,
cur->end);
if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
- cur->rxmit = tp->snd_recover;
+ cur->rxmit = SEQ_MAX(cur->rxmit, tp->snd_recover);
delivered_data += (sblkp->end - sblkp->start);
}
}
@@ -874,13 +874,13 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
if (TAILQ_EMPTY(&tp->snd_holes)) {
KASSERT(tp->sackhint.hole_bytes == 0,
- ("SACK scoreboard empty, but accounting non-zero\n"));
+ ("SACK scoreboard empty, but sackhint hole bytes != 0"));
tp->sackhint.sack_bytes_rexmit = 0;
tp->sackhint.sacked_bytes = 0;
tp->sackhint.lost_bytes = 0;
} else {
KASSERT(tp->sackhint.hole_bytes > 0,
- ("SACK scoreboard not empty, but has no bytes\n"));
+ ("SACK scoreboard not empty, but sackhint hole bytes <= 0"));
tp->sackhint.delivered_data = delivered_data;
tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0"));
@@ -918,9 +918,9 @@ tcp_free_sackholes(struct tcpcb *tp)
tp->sackhint.hole_bytes = 0;
tp->sackhint.lost_bytes = 0;
- KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
+ KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes != 0"));
KASSERT(tp->sackhint.nexthole == NULL,
- ("tp->sackhint.nexthole == NULL"));
+ ("tp->sackhint.nexthole != NULL"));
}
/*
@@ -1061,11 +1061,15 @@ tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
}
}
}
- KASSERT(SEQ_LT(hole->start, hole->end), ("%s: hole.start >= hole.end", __func__));
+ KASSERT(SEQ_LT(hole->start, hole->end),
+ ("%s: SEQ_GEQ(hole.start, hole.end)", __func__));
if (!(V_tcp_do_newsack)) {
- KASSERT(SEQ_LT(hole->start, tp->snd_fack), ("%s: hole.start >= snd.fack", __func__));
- KASSERT(SEQ_LT(hole->end, tp->snd_fack), ("%s: hole.end >= snd.fack", __func__));
- KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack), ("%s: hole.rxmit >= snd.fack", __func__));
+ KASSERT(SEQ_LT(hole->start, tp->snd_fack),
+ ("%s: SEG_GEQ(hole.start, snd.fack)", __func__));
+ KASSERT(SEQ_LT(hole->end, tp->snd_fack),
+ ("%s: SEG_GEQ(hole.end, snd.fack)", __func__));
+ KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack),
+ ("%s: SEQ_GEQ(hole.rxmit, snd.fack)", __func__));
if (SEQ_GEQ(hole->start, hole->end) ||
SEQ_GEQ(hole->start, tp->snd_fack) ||
SEQ_GEQ(hole->end, tp->snd_fack) ||
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 17a0744961ce..fed259f4d8e1 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -2173,7 +2173,7 @@ bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin)
log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay;
log.u_bbr.flex4 = bbr->rc_tp->ts_offset;
log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
- log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv);
+ log.u_bbr.pkts_out = tcp_tv_to_msec(&bbr->rc_tv);
log.u_bbr.flex6 = tsin;
log.u_bbr.flex7 = 0;
log.u_bbr.flex8 = bbr->rc_ack_was_delayed;
@@ -2241,13 +2241,13 @@ bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uin
mbuf_tstmp2timespec(m, &ts);
tv.tv_sec = ts.tv_sec;
tv.tv_usec = ts.tv_nsec / 1000;
- log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv);
+ log.u_bbr.lt_epoch = tcp_tv_to_usec(&tv);
} else {
log.u_bbr.lt_epoch = 0;
}
if (m->m_flags & M_TSTMP_LRO) {
mbuf_tstmp2timeval(m, &tv);
- log.u_bbr.flex5 = tcp_tv_to_usectick(&tv);
+ log.u_bbr.flex5 = tcp_tv_to_usec(&tv);
} else {
/* No arrival timestamp */
log.u_bbr.flex5 = 0;
@@ -5126,8 +5126,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
tp->t_maxseg = tp->t_pmtud_saved_maxseg;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
@@ -6792,7 +6792,7 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
(ack_type == BBR_CUM_ACKED) &&
(to->to_flags & TOF_TS) &&
(to->to_tsecr != 0)) {
- t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr;
+ t = tcp_tv_to_msec(&bbr->rc_tv) - to->to_tsecr;
if (t < 1)
t = 1;
t *= MS_IN_USEC;
@@ -7330,7 +7330,7 @@ bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
uint32_t ts, now, rtt;
ts = bbr_ts_convert(to->to_tsecr);
- now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv));
+ now = bbr_ts_convert(tcp_tv_to_msec(&bbr->rc_tv));
rtt = now - ts;
if (rtt < 1)
rtt = 1;
@@ -7863,7 +7863,7 @@ nothing_left:
/* tcp_close will kill the inp pre-log the Reset */
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
tp = tcp_close(tp);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
BBR_STAT_INC(bbr_dropped_af_data);
return (1);
}
@@ -8461,7 +8461,7 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
if ((to->to_flags & TOF_TS) != 0 &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -8893,7 +8893,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((to->to_flags & TOF_TS) != 0) {
uint32_t t, rtt;
- t = tcp_tv_to_mssectick(&bbr->rc_tv);
+ t = tcp_tv_to_msec(&bbr->rc_tv);
if (TSTMP_GEQ(t, to->to_tsecr)) {
rtt = t - to->to_tsecr;
if (rtt == 0) {
@@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
if (tp->t_flags & TF_FASTOPEN) {
@@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
@@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -9034,7 +9034,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
tp->snd_wnd = tiwin;
@@ -9067,7 +9067,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((to->to_flags & TOF_TS) != 0) {
uint32_t t, rtt;
- t = tcp_tv_to_mssectick(&bbr->rc_tv);
+ t = tcp_tv_to_msec(&bbr->rc_tv);
if (TSTMP_GEQ(t, to->to_tsecr)) {
rtt = t - to->to_tsecr;
if (rtt == 0) {
@@ -9258,7 +9258,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -9355,7 +9355,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -9405,7 +9405,7 @@ close_now:
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
tp = tcp_close(tp);
KMOD_TCPSTAT_INC(tcps_rcvafterclose);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
+ ctf_do_dropwithreset(m, tp, th, *tlen);
return (1);
}
if (sbavail(&so->so_snd) == 0)
@@ -9486,7 +9486,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -9602,7 +9602,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -9704,7 +9704,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -9818,7 +9818,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN | TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
tp->ts_recent = to->to_tsval;
}
/*
@@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -10141,7 +10141,7 @@ bbr_init(struct tcpcb *tp, void **ptr)
* flags.
*/
bbr_stop_all_timers(tp, bbr);
- /*
+ /*
* Validate the timers are not in usec, if they are convert.
* BBR should in theory move to USEC and get rid of a
* lot of the TICKS_2 calls.. but for now we stay
@@ -10150,7 +10150,7 @@ bbr_init(struct tcpcb *tp, void **ptr)
tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ tp->t_rttmin, tcp_rexmit_max);
bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
return (0);
}
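The bbr_init() hunk swaps the compile-time TCPTV_REXMTMAX ceiling for the runtime-tunable tcp_rexmit_max. A sketch of the clamp semantics, with a RANGESET macro written in the spirit of TCPT_RANGESET from <netinet/tcp_timer.h> and illustrative srtt/rttvar values:

    #include <stdint.h>
    #include <stdio.h>

    #define RANGESET(tv, value, tvmin, tvmax) do {  \
            (tv) = (value);                         \
            if ((tv) < (tvmin))                     \
                    (tv) = (tvmin);                 \
            else if ((tv) > (tvmax))                \
                    (tv) = (tvmax);                 \
    } while (0)

    int
    main(void)
    {
            uint32_t srtt = 800, rttvar = 200, rttmin = 30;
            uint32_t rexmit_max = 64000, rxtcur;

            /* Same shape as the bbr_init() computation above. */
            RANGESET(rxtcur, ((srtt >> 2) + rttvar) >> 1, rttmin, rexmit_max);
            printf("rxtcur=%u\n", rxtcur);      /* 200, within the range */
            return (0);
    }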
@@ -11327,7 +11327,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
mbuf_tstmp2timespec(m, &ts);
bbr->rc_tv.tv_sec = ts.tv_sec;
bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
- bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
+ bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usec(&bbr->rc_tv);
} else if (m->m_flags & M_TSTMP_LRO) {
/* Next the arrival timestamp */
struct timespec ts;
@@ -11335,7 +11335,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
mbuf_tstmp2timespec(m, &ts);
bbr->rc_tv.tv_sec = ts.tv_sec;
bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
- bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
+ bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usec(&bbr->rc_tv);
} else {
/*
* Ok just get the current time.
@@ -11376,7 +11376,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
*/
if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
to.to_tsecr -= tp->ts_offset;
- if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv)))
+ if (TSTMP_GT(to.to_tsecr, tcp_tv_to_msec(&bbr->rc_tv)))
to.to_tsecr = 0;
}
/*
@@ -11414,7 +11414,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
(tp->t_flags & TF_REQ_TSTMP)) {
tp->t_flags |= TF_RCVD_TSTMP;
tp->ts_recent = to.to_tsval;
- tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->ts_recent_age = tcp_tv_to_msec(&bbr->rc_tv);
} else
tp->t_flags &= ~TF_REQ_TSTMP;
if (to.to_flags & TOF_MSS)
@@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
if (tiwin > bbr->r_ctl.rc_high_rwnd)
@@ -11544,7 +11544,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
if (nxt_pkt == 0) {
if ((bbr->r_wanted_output != 0) ||
- (tp->t_flags & TF_ACKNOW)) {
+ (tp->t_flags & TF_ACKNOW)) {
bbr->rc_output_starts_timer = 0;
did_out = 1;
@@ -11870,7 +11870,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
/* We take a cache hit here */
memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
- cts = tcp_tv_to_usectick(&bbr->rc_tv);
+ cts = tcp_tv_to_usec(&bbr->rc_tv);
inp = bbr->rc_inp;
hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
tp->t_flags2 &= ~TF2_HPTS_CALLS;
@@ -12885,7 +12885,7 @@ send:
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset;
+ to.to_tsval = tcp_tv_to_msec(&bbr->rc_tv) + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
local_options += TCPOLEN_TIMESTAMP + 2;
@@ -12893,7 +12893,7 @@ send:
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
- tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv);
+ tp->rfbuf_ts = tcp_tv_to_msec(&bbr->rc_tv);
/* Selective ACK's. */
if (flags & TH_SYN)
to.to_flags |= TOF_SACKPERM;
@@ -13172,11 +13172,7 @@ send:
mb, moff, &len,
if_hw_tsomaxsegcount,
if_hw_tsomaxsegsize, msb,
- ((rsm == NULL) ? hw_tls : 0)
-#ifdef NETFLIX_COPY_ARGS
- , NULL, NULL
-#endif
- );
+ ((rsm == NULL) ? hw_tls : 0));
if (len <= maxseg) {
/*
* Must have ran out of mbufs for the copy
@@ -13806,8 +13802,8 @@ nomore:
tp->t_maxseg = old_maxseg - 40;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
@@ -14127,17 +14123,17 @@ bbr_switch_failed(struct tcpcb *tp)
toval = bbr->rc_pacer_started - cts;
} else {
/* one slot please */
- toval = HPTS_TICKS_PER_SLOT;
+ toval = HPTS_USECS_PER_SLOT;
}
} else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
toval = bbr->r_ctl.rc_timer_exp - cts;
} else {
/* one slot please */
- toval = HPTS_TICKS_PER_SLOT;
+ toval = HPTS_USECS_PER_SLOT;
}
} else
- toval = HPTS_TICKS_PER_SLOT;
+ toval = HPTS_USECS_PER_SLOT;
(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
__LINE__, &diag);
bbr_log_hpts_diag(bbr, cts, &diag);
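The HPTS_TICKS_PER_SLOT to HPTS_USECS_PER_SLOT rename makes the unit explicit: a pacer wheel slot is a fixed number of microseconds, and HPTS_USEC_TO_SLOTS() rounds a microsecond timeout up to whole slots. A sketch under the assumption of 10 microseconds per slot (the historical HPTS granularity):

    #include <stdint.h>
    #include <stdio.h>

    #define USECS_PER_SLOT          10      /* assumed slot width */
    #define USEC_TO_SLOTS(x)        (((x) + USECS_PER_SLOT - 1) / USECS_PER_SLOT)

    int
    main(void)
    {
            uint32_t toval = 25;    /* a 25 us timeout */

            /* Rounds up: 25 us needs 3 whole slots. */
            printf("%u us -> %u slots\n", toval, USEC_TO_SLOTS(toval));
            return (0);
    }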
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index f5bc435890e7..71dd4de6baf9 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -40,7 +40,6 @@
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
-#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
@@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
-static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */
+static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */
static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
@@ -605,7 +604,7 @@ rack_get_lt_bw(struct tcp_rack *rack)
/* Include all the current bytes too */
microuptime(&tv);
bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq);
- tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
+ tim += (tcp_tv_to_lusec(&tv) - rack->r_ctl.lt_timemark);
}
if ((bytes != 0) && (tim != 0))
return ((bytes * (uint64_t)1000000) / tim);
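The hunk above only renames the timestamp helper; the surrounding long-term bandwidth computation is unchanged: bytes accumulated over a microsecond interval, scaled to bytes per second. Restated as a standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    lt_bw(uint64_t bytes, uint64_t tim_usec)
    {
            if (bytes == 0 || tim_usec == 0)
                    return (0);
            return ((bytes * (uint64_t)1000000) / tim_usec);
    }

    int
    main(void)
    {
            /* 1 MB delivered in 100 ms -> 10 MB/s. */
            printf("%llu bytes/sec\n",
                (unsigned long long)lt_bw(1000000, 100000));
            return (0);
    }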
@@ -621,7 +620,7 @@ rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8)
struct tcpcb *tp;
uint32_t old_beta;
uint32_t old_beta_ecn;
- int error, failed = 0;
+ int error = 0, failed = 0;
tp = rack->rc_tp;
if (tp->t_cc == NULL) {
@@ -684,7 +683,7 @@ out:
struct newreno *ptr;
ptr = ((struct newreno *)tp->t_ccv.cc_data);
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = ptr->beta;
log.u_bbr.flex2 = ptr->beta_ecn;
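The memset target change repeated throughout rack.c below matters because u_bbr is only one member of union tcp_log_stackspecific: zeroing &log.u_bbr leaves any bytes beyond that member's size uninitialized, while memset(&log, 0, sizeof(log)) covers the whole object. A minimal illustration with a made-up union:

    #include <stdio.h>
    #include <string.h>

    union sample {
            struct { int a; } small;
            struct { int a; char pad[100]; } big;
    };

    int
    main(void)
    {
            union sample log;

            /* Zeroing one member can leave the rest of the union dirty. */
            memset(&log.small, 0, sizeof(log.small));
            /* Zeroing the object itself covers every member. */
            memset(&log, 0, sizeof(log));
            printf("%zu vs %zu bytes cleared\n",
                sizeof(log.small), sizeof(log));
            return (0);
    }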
@@ -938,7 +937,7 @@ rack_init_sysctls(void)
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "time_between", CTLFLAG_RW,
- & rack_time_between_probertt, 96000000,
+ &rack_time_between_probertt, 96000000,
"How many useconds between the lowest rtt falling must past before we enter probertt");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
@@ -2246,7 +2245,7 @@ rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped)
ent = rack->r_ctl.rc_last_sft;
microuptime(&tv);
- timenow = tcp_tv_to_lusectick(&tv);
+ timenow = tcp_tv_to_lusec(&tv);
if (timenow >= ent->deadline) {
/* No time left we do DGP only */
rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
@@ -2678,7 +2677,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
*/
return;
}
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = tsused;
log.u_bbr.flex2 = thresh;
log.u_bbr.flex3 = rsm->r_flags;
@@ -2709,7 +2708,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = rack->rc_tp->t_srtt;
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
@@ -2752,7 +2751,7 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex8 = to_num;
log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
@@ -2792,7 +2791,7 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex8 = flag;
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.cur_del_rate = (uintptr_t)prev;
@@ -2840,7 +2839,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l
if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = t;
log.u_bbr.flex2 = len;
@@ -2889,7 +2888,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l
log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
- log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ log.u_bbr.bw_inuse = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
log.u_bbr.bw_inuse <<= 32;
if (rsm)
log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
@@ -3013,7 +3012,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = tick;
@@ -3042,7 +3041,7 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = slot;
if (rack->rack_no_prr)
@@ -3149,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = slot;
log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
@@ -3185,7 +3184,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
@@ -3230,7 +3229,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack,
/* No you can't use 1, its for the real to cancel */
return;
}
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = flex1;
log.u_bbr.flex2 = flex2;
@@ -3255,7 +3254,7 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = timers;
log.u_bbr.flex2 = ret;
log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
@@ -3285,7 +3284,7 @@ rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line)
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
if (rack->rack_no_prr)
@@ -3480,16 +3479,16 @@ static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
if (rsm->r_flags & RACK_APP_LIMITED) {
- if (rack->r_ctl.rc_app_limited_cnt > 0) {
- rack->r_ctl.rc_app_limited_cnt--;
- }
+ KASSERT((rack->r_ctl.rc_app_limited_cnt > 0),
+ ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm));
+ rack->r_ctl.rc_app_limited_cnt--;
}
if (rsm->r_limit_type) {
/* currently there is only one limit type */
rack->r_ctl.rc_num_split_allocs--;
}
if (rsm == rack->r_ctl.rc_first_appl) {
- rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start);
+ rack->r_ctl.cleared_app_ack_seq = rsm->r_end;
rack->r_ctl.cleared_app_ack = 1;
if (rack->r_ctl.rc_app_limited_cnt == 0)
rack->r_ctl.rc_first_appl = NULL;
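Two things happen in the rack_free() hunk: the defensive "decrement only if positive" guard becomes a KASSERT plus an unconditional decrement, so accounting drift panics a debug kernel instead of being silently papered over, and cleared_app_ack_seq drops the redundant r_start + (r_end - r_start) in favor of r_end. A userland analogue of the assertion pattern:

    #include <assert.h>

    static unsigned int app_limited_cnt = 1;

    static void
    release_app_limited(void)
    {
            /* Debug-build analogue: assert fires instead of skipping. */
            assert(app_limited_cnt > 0);
            app_limited_cnt--;
    }

    int
    main(void)
    {
            release_app_limited();
            return (0);
    }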
@@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
* earlier.
*
* So lets calculate the BDP with the "known" b/w using
- * the SRTT has our rtt and then multiply it by the
- * goal.
+ * the SRTT as our rtt and then multiply it by the goal.
*/
bw = rack_get_bw(rack);
srtt = (uint64_t)tp->t_srtt;
@@ -3646,7 +3644,7 @@ rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_
}
/* Now what about time? */
srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
- tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
+ tim = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
/*
* We do not allow a measurement if we are in recovery
@@ -4118,7 +4116,7 @@ rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
@@ -4864,7 +4862,7 @@ rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, ui
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = add_part;
log.u_bbr.flex2 = sub_part;
@@ -4893,7 +4891,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
uint64_t resid_bw, subpart = 0, addpart = 0, srtt;
int did_add = 0;
- us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
if (TSTMP_GEQ(us_cts, tp->gput_ts))
tim = us_cts - tp->gput_ts;
@@ -5214,7 +5212,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = rack->r_ctl.current_round;
log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
@@ -5250,7 +5248,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = rack->r_ctl.current_round;
log.u_bbr.flex2 = (uint32_t)gp_est;
@@ -5357,7 +5355,7 @@ skip_measurement:
rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
- tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
rack->app_limited_needs_set = 0;
tp->gput_seq = th_ack;
if (rack->in_probe_rtt)
@@ -5492,7 +5490,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint
rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = tp->snd_max;
- tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
if (tmark >= rack->r_ctl.lt_timemark) {
rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
}
@@ -5533,7 +5531,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = th_ack;
log.u_bbr.flex2 = tp->t_ccv.flags;
@@ -5648,7 +5646,7 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = th_ack;
log.u_bbr.flex2 = tp->t_ccv.flags;
@@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
tp->t_badrxtwin = 0;
break;
}
- if ((CC_ALGO(tp)->cong_signal != NULL) &&
+ if ((CC_ALGO(tp)->cong_signal != NULL) &&
(type != CC_RTO)){
tp->t_ccv.curack = ack;
CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
@@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li
*
* If reorder-fade is configured, then we track the last time we saw
* re-ordering occur. If we reach the point where enough time as
- * passed we no longer consider reordering has occuring.
+ * passed we no longer consider reordering as occurring.
*
* Or if reorder-face is 0, then once we see reordering we consider
* the connection to alway be subject to reordering and just set lro
@@ -6347,7 +6345,7 @@ activate_tlp:
if (to < rack_tlp_min) {
to = rack_tlp_min;
}
- if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
+ if (to > TICKS_2_USEC(tcp_rexmit_max)) {
/*
* If the TLP time works out to larger than the max
* RTO lets not do TLP.. just RTO.
@@ -6392,7 +6390,7 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_se
rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = snd_una;
- tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
if (tmark >= rack->r_ctl.lt_timemark) {
rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
}
@@ -6481,7 +6479,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = diag->p_nxt_slot;
log.u_bbr.flex2 = diag->p_cur_slot;
log.u_bbr.flex3 = diag->slot_req;
@@ -6520,7 +6518,7 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = sb->sb_flags;
log.u_bbr.flex2 = len;
log.u_bbr.flex3 = sb->sb_state;
@@ -6594,22 +6592,22 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* on the clock. We always have a min
* 10 slots (10 x 10 i.e. 100 usecs).
*/
- if (slot <= HPTS_TICKS_PER_SLOT) {
+ if (slot <= HPTS_USECS_PER_SLOT) {
/* We gain delay */
- rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
- slot = HPTS_TICKS_PER_SLOT;
+ rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot);
+ slot = HPTS_USECS_PER_SLOT;
} else {
/* We take off some */
- rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
- slot = HPTS_TICKS_PER_SLOT;
+ rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT);
+ slot = HPTS_USECS_PER_SLOT;
}
} else {
slot -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
- if (slot < HPTS_TICKS_PER_SLOT) {
- rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
- slot = HPTS_TICKS_PER_SLOT;
+ if (slot < HPTS_USECS_PER_SLOT) {
+ rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot;
+ slot = HPTS_USECS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
@@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
/* Push bit must go to the right edge as well */
if (rsm->r_flags & RACK_HAD_PUSH)
rsm->r_flags &= ~RACK_HAD_PUSH;
+ /* Update the count if app limited */
+ if (nrsm->r_flags & RACK_APP_LIMITED)
+ rack->r_ctl.rc_app_limited_cnt++;
/* Clone over the state of the hw_tls flag */
nrsm->r_hw_tls = rsm->r_hw_tls;
/*
@@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack,
l_rsm->r_flags |= RACK_TLP;
if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
l_rsm->r_flags |= RACK_RWND_COLLAPSED;
- if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
+ if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
/*
* If both are app-limited then let the
@@ -7887,8 +7888,8 @@ drop_it:
tp->t_maxseg = tp->t_pmtud_saved_maxseg;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
@@ -8032,6 +8033,7 @@ skip_time_check:
ret = rack_timeout_rack(tp, rack, cts);
} else if (timers & PACE_TMR_TLP) {
rack->r_ctl.rc_tlp_rxt_last_time = cts;
+ rack->r_fast_output = 0;
ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
} else if (timers & PACE_TMR_RXT) {
rack->r_ctl.rc_tlp_rxt_last_time = cts;
@@ -8136,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
* remove the lost desgination and reduce the
* bytes considered lost.
*/
- rsm->r_flags &= ~RACK_WAS_LOST;
+ rsm->r_flags &= ~RACK_WAS_LOST;
KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -8778,7 +8780,7 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
}
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
#endif
- rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
/*
* the retransmit should happen at rtt + 4 * rttvar. Because of the
* way we do the smoothing, srtt and rttvar will each average +1/2
@@ -8831,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts
val = rack_probertt_lower_within * rack_time_between_probertt;
val /= 100;
- if ((rack->in_probe_rtt == 0) &&
+ if ((rack->in_probe_rtt == 0) &&
(rack->rc_skip_timely == 0) &&
((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
rack_enter_probertt(rack, us_cts);
@@ -8884,8 +8886,8 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
rack->r_ctl.rc_rack_min_rtt = 1;
}
}
- if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
- us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
+ us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
else
us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
if (us_rtt == 0)
@@ -8894,7 +8896,7 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
/* Kick the RTT to the CC */
CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
}
- rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+ rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usec(&rack->r_ctl.act_rcv_time));
if (ack_type == SACKED) {
rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
@@ -8989,8 +8991,8 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
* we retransmitted. This is because
* we match the timestamps.
*/
- if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
- us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
+ if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
+ us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
else
us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
@@ -9183,7 +9185,7 @@ rack_need_set_test(struct tcpcb *tp,
seq = tp->gput_seq;
ts = tp->gput_ts;
rack->app_limited_needs_set = 0;
- tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
/* Do we start at a new end? */
if ((use_which == RACK_USE_BEG) &&
SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
@@ -10368,7 +10370,7 @@ more:
* and yet before retransmitting we get an ack
* which can happen due to reordering.
*/
- rsm->r_flags &= ~RACK_WAS_LOST;
+ rsm->r_flags &= ~RACK_WAS_LOST;
KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -10818,7 +10820,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
changed = th_ack - rsm->r_start;
if (changed) {
rack_process_to_cumack(tp, rack, th_ack, cts, to,
- tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
+ tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
}
if ((to->to_flags & TOF_SACK) == 0) {
/* We are done nothing left and no sack. */
@@ -11064,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
* We need to skip anything already set
* to be retransmitted.
*/
- if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
(rsm->r_flags & RACK_MUST_RXT)) {
rsm = TAILQ_NEXT(rsm, r_tnext);
continue;
@@ -11696,7 +11698,7 @@ rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack)
rack_log_hybrid_sends(rack, ent, __LINE__);
/* calculate the time based on the ack arrival */
data = ent->end - ent->start;
- laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ laa = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
if (ent->flags & TCP_TRK_TRACK_FLG_FSND) {
if (ent->first_send > ent->localtime)
ftim = ent->first_send;
@@ -11842,7 +11844,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
* less than and we have not closed our window.
*/
if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
- rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
if (rack->r_ctl.rc_reorder_ts == 0)
rack->r_ctl.rc_reorder_ts = 1;
}
@@ -12036,7 +12038,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* tcp_close will kill the inp pre-log the Reset */
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
tp = tcp_close(tp);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
}
@@ -12874,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -13088,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
if (tp->t_flags & TF_FASTOPEN) {
@@ -13101,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
@@ -13135,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return (1);
}
if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -13398,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -13494,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -13516,7 +13518,7 @@ rack_check_data_after_close(struct mbuf *m,
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
tp = tcp_close(tp);
KMOD_TCPSTAT_INC(tcps_rcvafterclose);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
+ ctf_do_dropwithreset(m, tp, th, *tlen);
return (1);
}
if (sbavail(&so->so_snd) == 0)
@@ -13644,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -13745,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -13847,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -13951,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, tlen);
return (1);
}
}
@@ -14227,7 +14229,7 @@ rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex8 = mod;
log.u_bbr.flex1 = flex1;
@@ -14366,17 +14368,17 @@ rack_switch_failed(struct tcpcb *tp)
toval = rack->r_ctl.rc_last_output_to - cts;
} else {
/* one slot please */
- toval = HPTS_TICKS_PER_SLOT;
+ toval = HPTS_USECS_PER_SLOT;
}
} else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
toval = rack->r_ctl.rc_timer_exp - cts;
} else {
/* one slot please */
- toval = HPTS_TICKS_PER_SLOT;
+ toval = HPTS_USECS_PER_SLOT;
}
} else
- toval = HPTS_TICKS_PER_SLOT;
+ toval = HPTS_USECS_PER_SLOT;
(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
__LINE__, &diag);
rack_log_hpts_diag(rack, cts, &diag, &tv);
@@ -14636,9 +14638,6 @@ rack_init(struct tcpcb *tp, void **ptr)
if (rack->r_ctl.pcm_s == NULL) {
rack->r_ctl.pcm_i.cnt_alloc = 0;
}
-#ifdef NETFLIX_STATS
- rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask;
-#endif
rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
if (rack_enable_shared_cwnd)
@@ -14744,12 +14743,12 @@ rack_init(struct tcpcb *tp, void **ptr)
rack->r_ctl.rack_per_of_gp_ss = 250;
}
rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
- rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
- rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
rack_probertt_filter_life);
- us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
rack->r_ctl.rc_time_of_last_probertt = us_cts;
rack->r_ctl.rc_went_idle_time = us_cts;
@@ -14958,7 +14957,7 @@ rack_init(struct tcpcb *tp, void **ptr)
if (TSTMP_GT(qr.timer_pacing_to, us_cts))
tov = qr.timer_pacing_to - us_cts;
else
- tov = HPTS_TICKS_PER_SLOT;
+ tov = HPTS_USECS_PER_SLOT;
}
if (qr.timer_hpts_flags & PACE_TMR_MASK) {
rack->r_ctl.rc_timer_exp = qr.timer_timer_exp;
@@ -14966,7 +14965,7 @@ rack_init(struct tcpcb *tp, void **ptr)
if (TSTMP_GT(qr.timer_timer_exp, us_cts))
tov = qr.timer_timer_exp - us_cts;
else
- tov = HPTS_TICKS_PER_SLOT;
+ tov = HPTS_USECS_PER_SLOT;
}
}
rack_log_chg_info(tp, rack, 4,
@@ -15117,7 +15116,7 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex8 = 10;
log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced;
log.u_bbr.flex2 = rack->rc_free_cnt;
@@ -15361,7 +15360,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
tcp_req = tcp_req_find_req_for_seq(tp, ae->ack);
}
#endif
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr == 0)
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
@@ -15386,7 +15385,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
ts.tv_nsec = ae->timestamp % 1000000000;
ltv.tv_sec = ts.tv_sec;
ltv.tv_usec = ts.tv_nsec / 1000;
- log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
+ log.u_bbr.lt_epoch = tcp_tv_to_usec(&ltv);
} else if (ae->flags & TSTMP_LRO) {
/* Record the LRO the arrival timestamp */
log.u_bbr.flex3 = M_TSTMP_LRO;
@@ -15394,7 +15393,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
ts.tv_nsec = ae->timestamp % 1000000000;
ltv.tv_sec = ts.tv_sec;
ltv.tv_usec = ts.tv_nsec / 1000;
- log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
+ log.u_bbr.flex5 = tcp_tv_to_usec(&ltv);
}
log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
/* Log the rcv time */
@@ -15562,10 +15561,10 @@ rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2,
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
-
+
(void)tcp_get_usecs(&tv);
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_tv_to_usec(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.flex8 = mod;
log.u_bbr.flex1 = flex1;
@@ -15647,7 +15646,7 @@ rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = rack->r_ctl.current_round;
log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
@@ -15748,8 +15747,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
the_win = tp->snd_wnd;
win_seq = tp->snd_wl1;
win_upd_ack = tp->snd_wl2;
- cts = tcp_tv_to_usectick(tv);
- ms_cts = tcp_tv_to_mssectick(tv);
+ cts = tcp_tv_to_usec(tv);
+ ms_cts = tcp_tv_to_msec(tv);
rack->r_ctl.rc_rcvtime = cts;
segsiz = ctf_fixed_maxseg(tp);
if ((rack->rc_gp_dyn_mul) &&
@@ -15865,7 +15864,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
* or it could be a keep-alive or persists
*/
if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
- rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
if (rack->r_ctl.rc_reorder_ts == 0)
rack->r_ctl.rc_reorder_ts = 1;
}
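[Note: tcp_tv_to_usectick(), tcp_tv_to_mssectick() and tcp_tv_to_lusectick() are renamed tcp_tv_to_usec(), tcp_tv_to_msec() and tcp_tv_to_lusec() throughout; the values were never ticks, just microsecond/millisecond counts. Assuming the traditional shapes of these converters (32-bit truncating for usec/msec, 64-bit for lusec), they amount to:

    static inline uint32_t
    tcp_tv_to_usec(const struct timeval *tv)
    {
        return ((uint32_t)(tv->tv_sec * 1000000 + tv->tv_usec));
    }

    static inline uint32_t
    tcp_tv_to_msec(const struct timeval *tv)
    {
        return ((uint32_t)(tv->tv_sec * 1000 + tv->tv_usec / 1000));
    }

    static inline uint64_t
    tcp_tv_to_lusec(const struct timeval *tv)
    {
        return ((uint64_t)tv->tv_sec * 1000000 + tv->tv_usec);
    }
]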
@@ -15884,7 +15883,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
}
if (rack->forced_ack) {
rack_handle_probe_response(rack, tiwin,
- tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+ tcp_tv_to_usec(&rack->r_ctl.act_rcv_time));
}
#ifdef TCP_ACCOUNTING
win_up_req = 1;
@@ -15931,7 +15930,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
rack->r_ctl.act_rcv_time = *tv;
}
rack_process_to_cumack(tp, rack, ae->ack, cts, to,
- tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
+ tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
#ifdef TCP_REQUEST_TRK
rack_req_check_for_comp(rack, high_seq);
#endif
@@ -16399,7 +16398,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* must process the ack coming in but need to defer sending
	 * anything because a pacing timer is running.
*/
- us_cts = tcp_tv_to_usectick(tv);
+ us_cts = tcp_tv_to_usec(tv);
if (m->m_flags & M_ACKCMP) {
/*
* All compressed ack's are ack's by definition so
@@ -16467,8 +16466,8 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if (m->m_flags & M_ACKCMP) {
panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
}
- cts = tcp_tv_to_usectick(tv);
- ms_cts = tcp_tv_to_mssectick(tv);
+ cts = tcp_tv_to_usec(tv);
+ ms_cts = tcp_tv_to_msec(tv);
nsegs = m->m_pkthdr.lro_nsegs;
counter_u64_add(rack_proc_non_comp_ack, 1);
#ifdef TCP_ACCOUNTING
@@ -16570,7 +16569,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack);
}
#endif
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr == 0)
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
@@ -16596,13 +16595,13 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
mbuf_tstmp2timespec(m, &ts);
ltv.tv_sec = ts.tv_sec;
ltv.tv_usec = ts.tv_nsec / 1000;
- log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
+ log.u_bbr.lt_epoch = tcp_tv_to_usec(&ltv);
} else if (m->m_flags & M_TSTMP_LRO) {
/* Record the LRO the arrival timestamp */
mbuf_tstmp2timespec(m, &ts);
ltv.tv_sec = ts.tv_sec;
ltv.tv_usec = ts.tv_nsec / 1000;
- log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
+ log.u_bbr.flex5 = tcp_tv_to_usec(&ltv);
}
log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
/* Log the rcv time */
@@ -16654,7 +16653,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
#ifdef TCP_ACCOUNTING
sched_unpin();
#endif
@@ -16820,7 +16819,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
}
if (thflags & TH_FIN)
tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
- us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
if ((rack->rc_gp_dyn_mul) &&
(rack->use_fixed_rate == 0) &&
(rack->rc_always_pace)) {
@@ -16918,7 +16917,7 @@ do_output_now:
} else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
goto do_output_now;
} else if ((no_output == 1) &&
- (nxt_pkt == 0) &&
+ (nxt_pkt == 0) &&
(tcp_in_hpts(rack->rc_tp) == 0)) {
/*
* We are not in hpts and we had a pacing timer up. Use
@@ -17178,6 +17177,12 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
log.u_bbr.cwnd_gain <<= 1;
log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
+ log.u_bbr.cwnd_gain <<= 1;
+ log.u_bbr.cwnd_gain |= rack->use_fixed_rate;
+ log.u_bbr.cwnd_gain <<= 1;
+ log.u_bbr.cwnd_gain |= rack->rc_always_pace;
+ log.u_bbr.cwnd_gain <<= 1;
+ log.u_bbr.cwnd_gain |= rack->gp_ready;
log.u_bbr.bbr_substate = quality;
log.u_bbr.bbr_state = rack->dgp_on;
log.u_bbr.bbr_state <<= 1;
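[Note: the three added pairs extend the shift-and-or packing already applied to cwnd_gain: each boolean becomes the new low-order bit, so a log reader recovers the flags in reverse order of insertion by testing bits from bit 0 upward:

    /* Decode sketch: gp_ready was packed last, so it sits in bit 0. */
    gp_ready       = (cwnd_gain >> 0) & 1;
    rc_always_pace = (cwnd_gain >> 1) & 1;
    use_fixed_rate = (cwnd_gain >> 2) & 1;
]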
@@ -17344,7 +17349,7 @@ at_lt_bw:
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.flex1 = rack_bw_multipler;
log.u_bbr.flex2 = len;
@@ -17539,8 +17544,8 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
rack->r_ctl.rc_last_us_rtt,
88, __LINE__, NULL, gain);
}
- if ((bw_est == 0) || (rate_wanted == 0) ||
- ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
+ if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
+ (rack->use_fixed_rate == 0)) {
/*
* No way yet to make a b/w estimate or
* our raise is set incorrectly.
@@ -17979,7 +17984,7 @@ start_set:
tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
rack->r_ctl.rc_gp_cumack_ts = 0;
if ((rack->r_ctl.cleared_app_ack == 1) &&
- (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) {
+ (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) {
/*
* We just cleared an application limited period
* so the next seq out needs to skip the first
@@ -18102,7 +18107,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = error;
log.u_bbr.flex2 = flags;
@@ -18367,7 +18372,7 @@ rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack,
err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
#endif
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = p_rate;
log.u_bbr.flex2 = p_queue;
@@ -18820,7 +18825,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
counter_u64_add(rack_collapsed_win_rxt, 1);
counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
}
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
@@ -19039,7 +19044,7 @@ rack_sndbuf_autoscale(struct tcp_rack *rack)
static int
rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
- uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
+ uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line)
{
/*
* Enter to do fast output. We are given that the sched_pin is
@@ -19212,7 +19217,7 @@ again:
}
if (rack->r_ctl.fsb.rfo_apply_push &&
(len == rack->r_ctl.fsb.left_to_send)) {
- tcp_set_flags(th, flags | TH_PUSH);
+ flags |= TH_PUSH;
add_flag |= RACK_HAD_PUSH;
}
if ((m->m_next == NULL) || (len <= 0)){
@@ -19369,7 +19374,7 @@ again:
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
@@ -19391,11 +19396,11 @@ again:
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.flex5 = log.u_bbr.inflight;
log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
- log.u_bbr.delivered = 0;
+ log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send;
log.u_bbr.rttProp = 0;
log.u_bbr.delRate = rack->r_must_retran;
log.u_bbr.delRate <<= 1;
- log.u_bbr.pkt_epoch = __LINE__;
+ log.u_bbr.pkt_epoch = line;
/* For fast output no retrans so just inflight and how many mss we send */
log.u_bbr.flex5 = log.u_bbr.inflight;
log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
@@ -19437,7 +19442,7 @@ again:
}
if ((error == 0) && (rack->lt_bw_up == 0)) {
/* Unlikely */
- rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv);
+ rack->r_ctl.lt_timemark = tcp_tv_to_lusec(tv);
rack->r_ctl.lt_seq = tp->snd_una;
rack->lt_bw_up = 1;
} else if ((error == 0) &&
@@ -19468,7 +19473,7 @@ again:
tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
rack->forced_ack = 0; /* If we send something zap the FA flag */
- tot_len += len;
+ *tot_len += len;
if ((tp->t_flags & TF_GPUTINPROG) == 0)
rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
tp->snd_max += len;
@@ -19504,6 +19509,7 @@ again:
}
if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
(max_val > len) &&
+ (*tot_len < rack->r_ctl.rc_pace_max_segs) &&
(tso == 0)) {
max_val -= len;
len = segsiz;
@@ -19515,14 +19521,14 @@ again:
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
counter_u64_add(rack_fto_send, 1);
- slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__);
- rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
+ slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
+ rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
- tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
+ tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz);
}
sched_unpin();
#endif
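[Note: rack_fast_output() now accumulates into a caller-owned counter (long *tot_len) and records the calling line, so rack_output() can keep one running total across the fast and slow paths, using it both for the pace_max_seg cutoff and for sizing the pacing delay. Reduced to the calling convention, a sketch:

    long tot_len = 0;
    int error = 0, ret;

    ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv,
        &tot_len, &error, __LINE__);
    if (ret == 0 && tot_len < pace_max_seg) {
        /* The fast path sent something but the pacing budget is not
         * spent; fall through and let the slow path send the rest. */
    }
]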
@@ -19779,7 +19785,7 @@ rack_output(struct tcpcb *tp)
#endif
early = 0;
cts = tcp_get_usecs(&tv);
- ms_cts = tcp_tv_to_mssectick(&tv);
+ ms_cts = tcp_tv_to_msec(&tv);
if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
tcp_in_hpts(rack->rc_tp)) {
/*
@@ -19884,20 +19890,36 @@ rack_output(struct tcpcb *tp)
TCPS_HAVEESTABLISHED(tp->t_state)) {
rack_set_state(tp, rack);
}
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ minseg = segsiz;
+ if (rack->r_ctl.rc_pace_max_segs == 0)
+ pace_max_seg = rack->rc_user_set_max_segs * segsiz;
+ else
+ pace_max_seg = rack->r_ctl.rc_pace_max_segs;
if ((rack->r_fast_output) &&
(doing_tlp == 0) &&
(tp->rcv_numsacks == 0)) {
int ret;
error = 0;
- ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
- if (ret >= 0)
+ ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
+ if (ret > 0)
return(ret);
else if (error) {
inp = rack->rc_inp;
so = inp->inp_socket;
sb = &so->so_snd;
goto nomore;
+ } else {
+			/* Return == 0: if there is more we can send, tot_len-wise, fall through and send it */
+ if (tot_len_this_send >= pace_max_seg)
+ return (ret);
+#ifdef TCP_ACCOUNTING
+ /* We need to re-pin since fast_output un-pined */
+ sched_pin();
+ ts_val = get_cyclecount();
+#endif
+			/* Fall out of this block so we can send anything more that may bring us up to pace_max_seg */
}
}
inp = rack->rc_inp;
@@ -20001,15 +20023,9 @@ rack_output(struct tcpcb *tp)
again:
sendalot = 0;
cts = tcp_get_usecs(&tv);
- ms_cts = tcp_tv_to_mssectick(&tv);
+ ms_cts = tcp_tv_to_msec(&tv);
tso = 0;
mtu = 0;
- segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
- minseg = segsiz;
- if (rack->r_ctl.rc_pace_max_segs == 0)
- pace_max_seg = rack->rc_user_set_max_segs * segsiz;
- else
- pace_max_seg = rack->r_ctl.rc_pace_max_segs;
if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(rack->r_ctl.pcm_max_seg == 0)) {
/*
@@ -20025,7 +20041,7 @@ again:
rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
}
}
- if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
+ if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
uint32_t rw_avail, cwa;
if (tp->snd_wnd > ctf_outstanding(tp))
@@ -20871,6 +20887,7 @@ just_return_nolock:
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
((IN_RECOVERY(tp->t_flags)) == 0) &&
+ (doing_tlp == 0) &&
(rack->r_must_retran == 0) &&
((tp->t_flags & TF_NEEDFIN) == 0) &&
(len > 0) && (orig_len > 0) &&
@@ -21012,7 +21029,7 @@ just_return_nolock:
} else
log = 1;
}
- /* Mark the last packet has app limited */
+ /* Mark the last packet as app limited */
rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
if (rack->r_ctl.rc_app_limited_cnt == 0)
@@ -21364,7 +21381,8 @@ send:
if (max_len <= 0) {
len = 0;
} else if (len > max_len) {
- sendalot = 1;
+ if (doing_tlp == 0)
+ sendalot = 1;
len = max_len;
mark = 2;
}
@@ -21535,11 +21553,7 @@ send:
m->m_next = tcp_m_copym(
mb, moff, &len,
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
- ((rsm == NULL) ? hw_tls : 0)
-#ifdef NETFLIX_COPY_ARGS
- , &s_mb, &s_moff
-#endif
- );
+ ((rsm == NULL) ? hw_tls : 0));
if (len <= (tp->t_maxseg - optlen)) {
/*
* Must have ran out of mbufs for the copy
@@ -21593,7 +21607,6 @@ send:
flags |= TH_PUSH;
add_flag |= RACK_HAD_PUSH;
}
-
SOCK_SENDBUF_UNLOCK(so);
} else {
SOCK_SENDBUF_UNLOCK(so);
@@ -21886,7 +21899,7 @@ send:
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
@@ -22062,6 +22075,8 @@ out:
* In transmit state, time the transmission and arrange for the
* retransmit. In persist state, just set snd_max.
*/
+ if ((rsm == NULL) && doing_tlp)
+ add_flag |= RACK_TLP;
rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
rack_to_usec_ts(&tv),
rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
@@ -22075,7 +22090,7 @@ out:
}
if (rsm == NULL) {
if (rack->lt_bw_up == 0) {
- rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
+ rack->r_ctl.lt_timemark = tcp_tv_to_lusec(&tv);
rack->r_ctl.lt_seq = tp->snd_una;
rack->lt_bw_up = 1;
} else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) {
@@ -22148,15 +22163,14 @@ out:
rack->r_ctl.rc_prr_sndcnt = 0;
}
sub_from_prr = 0;
- if (doing_tlp) {
- /* Make sure the TLP is added */
- add_flag |= RACK_TLP;
- } else if (rsm) {
- /* If its a resend without TLP then it must not have the flag */
- rsm->r_flags &= ~RACK_TLP;
- }
-
-
+ if (rsm != NULL) {
+ if (doing_tlp)
+ /* Make sure the TLP is added */
+ rsm->r_flags |= RACK_TLP;
+ else
+			/* If it's a resend without TLP then it must not have the flag */
+ rsm->r_flags &= ~RACK_TLP;
+ }
if ((error == 0) &&
(len > 0) &&
(tp->snd_una == tp->snd_max))
@@ -22494,6 +22508,7 @@ enobufs:
((flags & (TH_SYN|TH_FIN)) == 0) &&
(rsm == NULL) &&
(ipoptlen == 0) &&
+ (doing_tlp == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
((IN_RECOVERY(tp->t_flags)) == 0) &&
@@ -22520,6 +22535,7 @@ enobufs:
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
(rsm == NULL) &&
+ (doing_tlp == 0) &&
(ipoptlen == 0) &&
(rack->r_must_retran == 0) &&
rack->r_fsb_inited &&
@@ -22536,7 +22552,7 @@ enobufs:
segsiz, pace_max_seg, hw_tls, flags);
if (rack->r_fast_output) {
error = 0;
- ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
+ ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
if (ret >= 0)
return (ret);
else if (error)
@@ -22822,7 +22838,7 @@ process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid)
rack->r_ctl.rc_fixed_pacing_rate_ca = 0;
rack->r_ctl.rc_fixed_pacing_rate_ss = 0;
/* Now allocate or find our entry that will have these settings */
- sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0);
+ sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusec(&tv), 0);
if (sft == NULL) {
rack->rc_tp->tcp_hybrid_error++;
/* no space, where would it have gone? */
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index da26b8cb1f9b..fc12672a45f7 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -507,13 +507,11 @@ ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
void
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
- int32_t rstreason, int32_t tlen)
+ int32_t tlen)
{
- if (tp != NULL) {
- tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ tcp_dropwithreset(m, th, tp, tlen);
+ if (tp != NULL)
INP_WUNLOCK(tptoinpcb(tp));
- } else
- tcp_dropwithreset(m, th, NULL, tlen, rstreason);
}
void
@@ -672,7 +670,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
*ret_val = 1;
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, tlen);
return;
} else
*ret_val = 0;
@@ -866,10 +864,10 @@ ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
void
ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
- int32_t rstreason, int32_t tlen)
+ int32_t tlen)
{
- tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ tcp_dropwithreset(m, th, tp, tlen);
tp = tcp_drop(tp, ETIMEDOUT);
if (tp)
INP_WUNLOCK(tptoinpcb(tp));
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h
index 6a8a056d89b0..cd33cb8ce50b 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.h
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -101,7 +101,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
void
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
- struct tcphdr *th, int32_t rstreason, int32_t tlen);
+ struct tcphdr *th, int32_t tlen);
void
ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
@@ -125,7 +125,7 @@ ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
void
ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
- int32_t rstreason, int32_t tlen);
+ int32_t tlen);
uint32_t
ctf_fixed_maxseg(struct tcpcb *tp);
diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c
index 09e90da88895..759bfda98357 100644
--- a/sys/netinet/tcp_stacks/rack_pcm.c
+++ b/sys/netinet/tcp_stacks/rack_pcm.c
@@ -172,9 +172,9 @@ rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, uint32_t start, uint3
goto skip_ack_accounting;
}
/*
- * Record ACK data.
+ * Record ACK data.
*/
- ack_arrival = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ ack_arrival = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
if (SEQ_GT(end, rack->r_ctl.pcm_i.eseq)) {
/* Trim the end to the end of our range if it is beyond */
end = rack->r_ctl.pcm_i.eseq;
@@ -241,8 +241,8 @@ skip_ack_accounting:
for (i=0; i<rack->r_ctl.pcm_i.cnt; i++) {
e = &rack->r_ctl.pcm_s[i];
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_tv_to_usec(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.flex8 = 1;
log.u_bbr.flex1 = e->sseq;
@@ -286,7 +286,7 @@ skip_ack_accounting:
* Prev time holds the last ack arrival time.
*/
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ log.u_bbr.timeStamp = tcp_tv_to_usec(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.flex8 = 2;
log.u_bbr.flex1 = rack->r_ctl.pcm_i.sseq;
@@ -305,7 +305,7 @@ skip_ack_accounting:
0, &log, false, NULL, NULL, 0, &tv);
}
}
- /*
+ /*
* Here we need a lot to be added including:
* 1) Some form of measurement, where if we think the measurement
* is valid we iterate over the PCM data and come up with a path
diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c
index fc9ee8454a1e..2b70548f3cc6 100644
--- a/sys/netinet/tcp_stacks/sack_filter.c
+++ b/sys/netinet/tcp_stacks/sack_filter.c
@@ -400,7 +400,7 @@ sack_filter_run(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq
break;
}
/* Copy it out to the outbound */
- memcpy(&in[at], &blkboard[i], sizeof(struct sackblk));
+ memcpy(&in[at], &blkboard[i], sizeof(struct sackblk));
at++;
room--;
/* now lets add it to our sack-board */
@@ -588,7 +588,7 @@ sack_filter_blks(struct tcpcb *tp, struct sack_filter *sf, struct sackblk *in, i
sf->sf_ack = th_ack;
for(i=0, sf->sf_cur=0; i<numblks; i++) {
- if ((in[i].end != tp->snd_max) &&
+ if ((in[i].end != tp->snd_max) &&
((in[i].end - in[i].start) < segmax)) {
/*
* We do not accept blocks less than a MSS minus all
@@ -707,7 +707,7 @@ main(int argc, char **argv)
out = stdout;
memset(&tp, 0, sizeof(tp));
tp.t_maxseg = 1460;
-
+
while ((i = getopt(argc, argv, "dIi:o:?hS:")) != -1) {
switch (i) {
case 'S':
@@ -883,7 +883,7 @@ main(int argc, char **argv)
} else {
printf("can't open sack_setup.bin -- sorry no load\n");
}
-
+
} else if (strncmp(buffer, "help", 4) == 0) {
help:
fprintf(out, "You can input:\n");
diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h
index b12fcf84567c..a1c0684a4359 100644
--- a/sys/netinet/tcp_stacks/sack_filter.h
+++ b/sys/netinet/tcp_stacks/sack_filter.h
@@ -42,7 +42,7 @@
* previously processed sack information.
*
* The second thing that the sack filter does is help protect against malicious
- * attackers that are trying to attack any linked lists (or other data structures)
+ * attackers that are trying to attack any linked lists (or other data structures)
* that are used in sack processing. Consider an attacker sending in sacks for
* every other byte of data outstanding. This could in theory drastically split
* up any scoreboard you are maintaining and make you search through a very large
diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h
index f88efe3c9ef9..10ddd12bda75 100644
--- a/sys/netinet/tcp_stacks/tcp_bbr.h
+++ b/sys/netinet/tcp_stacks/tcp_bbr.h
@@ -347,8 +347,6 @@ struct bbr_log_sysctl_out {
/*
* Locking for the rack control block.
* a) Locked by INP_WLOCK
- * b) Locked by the hpts-mutex
- *
*/
#define BBR_STATE_STARTUP 0x01
#define BBR_STATE_DRAIN 0x02
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index 4374594a1d82..144b4fabf7eb 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -327,8 +327,6 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
/*
* Locking for the rack control block.
* a) Locked by INP_WLOCK
- * b) Locked by the hpts-mutex
- *
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
#define RETRAN_CNT_SIZE 16
@@ -614,7 +612,6 @@ struct rack_control {
struct tcp_rack {
/* First cache line 0x00 */
- TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */
int32_t(*r_substate) (struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, struct tcpopt *,
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 34964ed8283c..2e039ebbfdd2 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -82,6 +82,7 @@
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
+#include <netinet/icmp_var.h>
#ifdef INET6
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
@@ -1032,10 +1033,6 @@ tcp_default_fb_init(struct tcpcb *tp, void **ptr)
/* We don't use the pointer */
*ptr = NULL;
- KASSERT(tp->t_state < TCPS_TIME_WAIT,
- ("%s: connection %p in unexpected state %d", __func__, tp,
- tp->t_state));
-
/* Make sure we get no interesting mbuf queuing behavior */
/* All mbuf queue/ack compress flags should be off */
tcp_lro_features_off(tp);
@@ -1052,7 +1049,8 @@ tcp_default_fb_init(struct tcpcb *tp, void **ptr)
if (tp->t_rxtshift == 0)
tp->t_rxtcur = rexmt;
else
- TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
+ TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin,
+ tcp_rexmit_max);
/*
* Nothing to do for ESTABLISHED or LISTEN states. And, we don't
@@ -1454,6 +1452,7 @@ tcp_vnet_init(void *arg __unused)
VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
V_tcp_msl = TCPTV_MSL;
+ V_tcp_msl_local = TCPTV_MSL_LOCAL;
arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);
}
VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
@@ -1473,11 +1472,8 @@ tcp_init(void *arg __unused)
tcp_keepintvl = TCPTV_KEEPINTVL;
tcp_maxpersistidle = TCPTV_KEEP_IDLE;
tcp_rexmit_initial = TCPTV_RTOBASE;
- if (tcp_rexmit_initial < 1)
- tcp_rexmit_initial = 1;
tcp_rexmit_min = TCPTV_MIN;
- if (tcp_rexmit_min < 1)
- tcp_rexmit_min = 1;
+ tcp_rexmit_max = TCPTV_REXMTMAX;
tcp_persmin = TCPTV_PERSMIN;
tcp_persmax = TCPTV_PERSMAX;
tcp_rexmit_slop = TCPTV_CPU_VAR;
@@ -2086,7 +2082,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(tp);
log.u_bbr.flex8 = 4;
log.u_bbr.pkts_out = tp->t_maxseg;
@@ -2161,6 +2157,13 @@ tcp_send_challenge_ack(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m)
sbintime_t now;
bool send_challenge_ack;
+ /*
+ * The sending of a challenge ACK could be triggered by a blind attacker
+	 * to detect an existing TCP connection. To mitigate that, also
+	 * increment the global counter that would have been incremented
+	 * had the attacker guessed wrongly.
+ */
+ (void)badport_bandlim(BANDLIM_TCP_RST);
if (V_tcp_ack_war_time_window == 0 || V_tcp_ack_war_cnt == 0) {
/* ACK war protection is disabled. */
send_challenge_ack = true;
@@ -2664,6 +2667,272 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
NULL, 0, tcp_pcblist, "S,xtcpcb",
"List of active TCP connections");
+#define SND_TAG_STATUS_MAXLEN 128
+
+#ifdef KERN_TLS
+
+static struct sx ktlslist_lock;
+SX_SYSINIT(ktlslistlock, &ktlslist_lock, "ktlslist");
+static uint64_t ktls_glob_gen = 1;
+
+static int
+tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys)
+{
+ struct xinpgen xig;
+ struct inpcb *inp;
+ struct socket *so;
+ struct ktls_session *ksr, *kss;
+ char *buf;
+ struct xktls_session *xktls;
+ uint64_t ipi_gencnt;
+ size_t buflen, len, sz;
+ u_int cnt;
+ int error;
+ bool ek, p;
+
+ sx_assert(&ktlslist_lock, SA_XLOCKED);
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ len = 0;
+ cnt = 0;
+ ipi_gencnt = V_tcbinfo.ipi_gencnt;
+ bzero(&xig, sizeof(xig));
+ xig.xig_len = sizeof(xig);
+ xig.xig_gen = ktls_glob_gen++;
+ xig.xig_sogen = so_gencnt;
+
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_RLOCKPCB);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ if (inp->inp_gencnt > ipi_gencnt ||
+ cr_canseeinpcb(req->td->td_ucred, inp) != 0)
+ continue;
+
+ so = inp->inp_socket;
+ if (so != NULL && so->so_gencnt <= xig.xig_sogen) {
+ p = false;
+ ek = export_keys && cr_canexport_ktlskeys(
+ req->td, inp);
+ ksr = so->so_rcv.sb_tls_info;
+ if (ksr != NULL) {
+ ksr->gen = xig.xig_gen;
+ p = true;
+ if (ek) {
+ sz = SIZE_T_MAX;
+ ktls_session_copy_keys(ksr,
+ NULL, &sz);
+ len += sz;
+ }
+ if (ksr->snd_tag != NULL &&
+ ksr->snd_tag->sw->snd_tag_status_str !=
+ NULL) {
+ sz = SND_TAG_STATUS_MAXLEN;
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = ksr->snd_tag->sw->
+ snd_tag_status_str(
+ ksr->snd_tag, NULL, &sz);
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0)
+ len += sz;
+ }
+ }
+ kss = so->so_snd.sb_tls_info;
+ if (kss != NULL) {
+ kss->gen = xig.xig_gen;
+ p = true;
+ if (ek) {
+ sz = SIZE_T_MAX;
+ ktls_session_copy_keys(kss,
+ NULL, &sz);
+ len += sz;
+ }
+ if (kss->snd_tag != NULL &&
+ kss->snd_tag->sw->snd_tag_status_str !=
+ NULL) {
+ sz = SND_TAG_STATUS_MAXLEN;
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = kss->snd_tag->sw->
+ snd_tag_status_str(
+ kss->snd_tag, NULL, &sz);
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0)
+ len += sz;
+ }
+ }
+ if (p) {
+ len += sizeof(*xktls);
+ len = roundup2(len, __alignof(struct
+ xktls_session));
+ }
+ }
+ }
+ if (req->oldptr == NULL) {
+ len += 2 * sizeof(xig);
+ len += 3 * len / 4;
+ req->oldidx = len;
+ return (0);
+ }
+
+ if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
+ return (error);
+
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error != 0)
+ return (error);
+
+ buflen = roundup2(sizeof(*xktls) + 2 * TLS_MAX_PARAM_SIZE +
+ 2 * SND_TAG_STATUS_MAXLEN, __alignof(struct xktls_session));
+ buf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
+ struct inpcb_iterator inpi1 = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_RLOCKPCB);
+ while ((inp = inp_next(&inpi1)) != NULL) {
+ if (inp->inp_gencnt > ipi_gencnt ||
+ cr_canseeinpcb(req->td->td_ucred, inp) != 0)
+ continue;
+
+ so = inp->inp_socket;
+ if (so == NULL)
+ continue;
+
+ p = false;
+ ek = export_keys && cr_canexport_ktlskeys(req->td, inp);
+ ksr = so->so_rcv.sb_tls_info;
+ kss = so->so_snd.sb_tls_info;
+ xktls = (struct xktls_session *)buf;
+ if (ksr != NULL && ksr->gen == xig.xig_gen) {
+ p = true;
+ ktls_session_to_xktls_onedir(ksr, ek, &xktls->rcv);
+ }
+ if (kss != NULL && kss->gen == xig.xig_gen) {
+ p = true;
+ ktls_session_to_xktls_onedir(kss, ek, &xktls->snd);
+ }
+ if (!p)
+ continue;
+
+ xktls->inp_gencnt = inp->inp_gencnt;
+ xktls->so_pcb = (kvaddr_t)inp;
+ memcpy(&xktls->coninf, &inp->inp_inc, sizeof(xktls->coninf));
+ len = sizeof(*xktls);
+ if (ksr != NULL && ksr->gen == xig.xig_gen) {
+ if (ek) {
+ sz = buflen - len;
+ ktls_session_copy_keys(ksr, buf + len, &sz);
+ len += sz;
+ } else {
+ xktls->rcv.cipher_key_len = 0;
+ xktls->rcv.auth_key_len = 0;
+ }
+ if (ksr->snd_tag != NULL &&
+ ksr->snd_tag->sw->snd_tag_status_str != NULL) {
+ sz = SND_TAG_STATUS_MAXLEN;
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = ksr->snd_tag->sw->snd_tag_status_str(
+ ksr->snd_tag, buf + len, &sz);
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0) {
+ xktls->rcv.drv_st_len = sz;
+ len += sz;
+ }
+ }
+ }
+ if (kss != NULL && kss->gen == xig.xig_gen) {
+ if (ek) {
+ sz = buflen - len;
+ ktls_session_copy_keys(kss, buf + len, &sz);
+ len += sz;
+ } else {
+ xktls->snd.cipher_key_len = 0;
+ xktls->snd.auth_key_len = 0;
+ }
+ if (kss->snd_tag != NULL &&
+ kss->snd_tag->sw->snd_tag_status_str != NULL) {
+ sz = SND_TAG_STATUS_MAXLEN;
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = kss->snd_tag->sw->snd_tag_status_str(
+ kss->snd_tag, buf + len, &sz);
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0) {
+ xktls->snd.drv_st_len = sz;
+ len += sz;
+ }
+ }
+ }
+ len = roundup2(len, __alignof(*xktls));
+ xktls->tsz = len;
+ xktls->fsz = sizeof(*xktls);
+
+ error = SYSCTL_OUT(req, xktls, len);
+ if (error != 0) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ cnt++;
+ }
+
+ if (error == 0) {
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = cnt;
+ error = SYSCTL_OUT(req, &xig, sizeof(xig));
+ }
+
+ zfree(buf, M_TEMP);
+ return (error);
+}
+
+static int
+tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys)
+{
+ int repeats, error;
+
+ for (repeats = 0; repeats < 100; repeats++) {
+ if (sx_xlock_sig(&ktlslist_lock))
+ return (EINTR);
+ error = tcp_ktlslist_locked(oidp, arg1, arg2, req,
+ export_keys);
+ sx_xunlock(&ktlslist_lock);
+ if (error != EDEADLK)
+ break;
+ if (sig_intr() != 0) {
+ error = EINTR;
+ break;
+ }
+ req->oldidx = 0;
+ }
+ return (error);
+}
+
+static int
+tcp_ktlslist_nokeys(SYSCTL_HANDLER_ARGS)
+{
+ return (tcp_ktlslist1(oidp, arg1, arg2, req, false));
+}
+
+static int
+tcp_ktlslist_wkeys(SYSCTL_HANDLER_ARGS)
+{
+ return (tcp_ktlslist1(oidp, arg1, arg2, req, true));
+}
+
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST, ktlslist,
+ CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, tcp_ktlslist_nokeys, "S,xktls_session",
+ "List of active kTLS sessions for TCP connections");
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST_WKEYS, ktlslist_wkeys,
+ CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, tcp_ktlslist_wkeys, "S,xktls_session",
+ "List of active kTLS sessions for TCP connections with keys");
+#endif /* KERN_TLS */
+
#ifdef INET
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
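[Note: tcp_ktlslist_locked() honors the usual two-pass sysctl protocol: with req->oldptr == NULL it computes a padded size estimate into req->oldidx; otherwise it emits a struct xinpgen header, one variable-length xktls_session record per session (tsz bytes each, alignment-padded, optionally followed by key material and driver status strings), and a trailing xinpgen. A hedged userland sketch of reading it, error handling elided:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdlib.h>

    size_t len = 0;
    void *buf;

    /* Pass 1: probe how much space the kernel thinks the dump may need. */
    sysctlbyname("net.inet.tcp.ktlslist", NULL, &len, NULL, 0);
    buf = malloc(len);
    /* Pass 2: fetch the records; len is updated to the bytes returned. */
    sysctlbyname("net.inet.tcp.ktlslist", buf, &len, NULL, 0);

The EDEADLK dance in tcp_ktlslist1() exists because snd_tag_status_str() may sleep, forcing the inpcb lock to be dropped; if the pcb goes away in the meantime the whole scan is retried (up to 100 times) under the ktlslist sx lock.]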
@@ -2936,7 +3205,7 @@ tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
* small, set to the min.
*/
if (mtu < IPV6_MMTU)
- mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for? */
+ mtu = IPV6_MMTU;
return (mtu);
}
@@ -4276,7 +4545,7 @@ tcp_change_time_units(struct tcpcb *tp, int granularity)
panic("Unknown granularity:%d tp:%p",
granularity, tp);
}
-#endif
+#endif
}
void
@@ -4364,7 +4633,7 @@ tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req,
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(tp);
log.u_bbr.flex8 = val;
log.u_bbr.rttProp = req->timestamp;
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 606d808676e1..80e6b53d10df 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -131,17 +131,18 @@ static void syncache_timer(void *);
static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t,
uint8_t *, uintptr_t);
static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *);
-static struct syncache
- *syncookie_lookup(struct in_conninfo *, struct syncache_head *,
- struct syncache *, struct tcphdr *, struct tcpopt *,
- struct socket *, uint16_t);
+static bool syncookie_expand(struct in_conninfo *,
+ const struct syncache_head *, struct syncache *,
+ struct tcphdr *, struct tcpopt *, struct socket *,
+ uint16_t);
static void syncache_pause(struct in_conninfo *);
static void syncache_unpause(void *);
static void syncookie_reseed(void *);
#ifdef INVARIANTS
-static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
- struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
- struct socket *lso, uint16_t port);
+static void syncookie_cmp(struct in_conninfo *,
+ const struct syncache_head *, struct syncache *,
+ struct tcphdr *, struct tcpopt *, struct socket *,
+ uint16_t);
#endif
/*
@@ -442,7 +443,7 @@ syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
else
TCPT_RANGESET(rexmt,
tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits],
- tcp_rexmit_min, TCPTV_REXMTMAX);
+ tcp_rexmit_min, tcp_rexmit_max);
sc->sc_rxttime = ticks + rexmt;
sc->sc_rxmits++;
if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) {
@@ -1096,6 +1097,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
*/
if (locked && !V_tcp_syncookies) {
SCH_UNLOCK(sch);
+ TCPSTAT_INC(tcps_sc_spurcookie);
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Spurious ACK, "
"segment rejected (syncookies disabled)\n",
@@ -1105,17 +1107,21 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
if (locked && !V_tcp_syncookiesonly &&
sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) {
SCH_UNLOCK(sch);
+ TCPSTAT_INC(tcps_sc_spurcookie);
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Spurious ACK, "
"segment rejected (no syncache entry)\n",
s, __func__);
goto failed;
}
- bzero(&scs, sizeof(scs));
- sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port);
if (locked)
SCH_UNLOCK(sch);
- if (sc == NULL) {
+ bzero(&scs, sizeof(scs));
+ if (syncookie_expand(inc, sch, &scs, th, to, *lsop, port)) {
+ sc = &scs;
+ TCPSTAT_INC(tcps_sc_recvcookie);
+ } else {
+ TCPSTAT_INC(tcps_sc_failcookie);
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Segment failed "
"SYNCOOKIE authentication, segment rejected "
@@ -2251,8 +2257,8 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc)
return (iss);
}
-static struct syncache *
-syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
+static bool
+syncookie_expand(struct in_conninfo *inc, const struct syncache_head *sch,
struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
struct socket *lso, uint16_t port)
{
@@ -2282,7 +2288,7 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
/* The recomputed hash matches the ACK if this was a genuine cookie. */
if ((ack & ~0xff) != (hash & ~0xff))
- return (NULL);
+ return (false);
/* Fill in the syncache values. */
sc->sc_flags = 0;
@@ -2342,47 +2348,47 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
sc->sc_port = port;
- TCPSTAT_INC(tcps_sc_recvcookie);
- return (sc);
+ return (true);
}
#ifdef INVARIANTS
-static int
-syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
+static void
+syncookie_cmp(struct in_conninfo *inc, const struct syncache_head *sch,
struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
struct socket *lso, uint16_t port)
{
- struct syncache scs, *scx;
+ struct syncache scs;
char *s;
bzero(&scs, sizeof(scs));
- scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port);
+ if (syncookie_expand(inc, sch, &scs, th, to, lso, port) &&
+ (sc->sc_peer_mss != scs.sc_peer_mss ||
+ sc->sc_requested_r_scale != scs.sc_requested_r_scale ||
+ sc->sc_requested_s_scale != scs.sc_requested_s_scale ||
+ (sc->sc_flags & SCF_SACK) != (scs.sc_flags & SCF_SACK))) {
- if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL)
- return (0);
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL)
+ return;
- if (scx != NULL) {
- if (sc->sc_peer_mss != scx->sc_peer_mss)
+ if (sc->sc_peer_mss != scs.sc_peer_mss)
log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n",
- s, __func__, sc->sc_peer_mss, scx->sc_peer_mss);
+ s, __func__, sc->sc_peer_mss, scs.sc_peer_mss);
- if (sc->sc_requested_r_scale != scx->sc_requested_r_scale)
+ if (sc->sc_requested_r_scale != scs.sc_requested_r_scale)
log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n",
s, __func__, sc->sc_requested_r_scale,
- scx->sc_requested_r_scale);
+ scs.sc_requested_r_scale);
- if (sc->sc_requested_s_scale != scx->sc_requested_s_scale)
+ if (sc->sc_requested_s_scale != scs.sc_requested_s_scale)
log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n",
s, __func__, sc->sc_requested_s_scale,
- scx->sc_requested_s_scale);
+ scs.sc_requested_s_scale);
- if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK))
+ if ((sc->sc_flags & SCF_SACK) != (scs.sc_flags & SCF_SACK))
log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__);
- }
- if (s != NULL)
free(s, M_TCPLOG);
- return (0);
+ }
}
#endif /* INVARIANTS */
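[Note: syncookie_lookup() used to return either NULL or a pointer into the caller-supplied scratch syncache; syncookie_expand() instead returns a bool and fills the caller's stack struct, making ownership explicit and letting the INVARIANTS-only syncookie_cmp() decode into its own copy without aliasing. The new caller shape, reduced:

    struct syncache scs;

    bzero(&scs, sizeof(scs));
    if (syncookie_expand(inc, sch, &scs, th, to, *lsop, port)) {
        sc = &scs;                          /* genuine cookie */
        TCPSTAT_INC(tcps_sc_recvcookie);
    } else {
        TCPSTAT_INC(tcps_sc_failcookie);    /* failed authentication */
    }

Moving the tcps_sc_recvcookie/tcps_sc_spurcookie/tcps_sc_failcookie counters out to the callers also means the INVARIANTS comparison path no longer inflates the statistics.]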
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 4d8dafaec31d..3b9fe7a317b0 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -74,39 +74,33 @@
#include <netinet/tcpip.h>
int tcp_persmin;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT | CTLFLAG_RW,
&tcp_persmin, 0, sysctl_msec_to_ticks, "I",
"minimum persistence interval");
int tcp_persmax;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT | CTLFLAG_RW,
&tcp_persmax, 0, sysctl_msec_to_ticks, "I",
"maximum persistence interval");
int tcp_keepinit;
-SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW,
&tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
"time to establish connection");
int tcp_keepidle;
-SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW,
&tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
"time before keepalive probes begin");
int tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
- &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
+ CTLTYPE_INT | CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
"time between keepalive probes");
int tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
- &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
+ CTLTYPE_INT | CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
"Time before a delayed ACK is sent");
VNET_DEFINE(int, tcp_msl);
@@ -115,21 +109,29 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
&VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
"Maximum segment lifetime");
+VNET_DEFINE(int, tcp_msl_local);
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl_local,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
+ &VNET_NAME(tcp_msl_local), 0, sysctl_msec_to_ticks, "I",
+ "Maximum segment lifetime for local communication");
+
int tcp_rexmit_initial;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT | CTLFLAG_RW,
&tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
"Initial Retransmission Timeout");
int tcp_rexmit_min;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT | CTLFLAG_RW,
&tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
"Minimum Retransmission Timeout");
+int tcp_rexmit_max;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_max, CTLTYPE_INT | CTLFLAG_RW,
+ &tcp_rexmit_max, 0, sysctl_msec_to_ticks, "I",
+ "Maximum Retransmission Timeout");
+
int tcp_rexmit_slop;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT | CTLFLAG_RW,
&tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
"Retransmission Timer Slop");
@@ -144,8 +146,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
"Recycle closed FIN_WAIT_2 connections faster");
int tcp_finwait2_timeout;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT | CTLFLAG_RW,
&tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
"FIN-WAIT2 timeout");
@@ -162,8 +163,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
"Drop TCP options from 3rd and later retransmitted SYN");
int tcp_maxunacktime = TCPTV_MAXUNACKTIME;
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime,
- CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, CTLTYPE_INT | CTLFLAG_RW,
&tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
"Maximum time (in ms) that a session can linger without making progress");
@@ -629,8 +629,7 @@ tcp_timer_rexmt(struct tcpcb *tp)
rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
- TCPT_RANGESET(tp->t_rxtcur, rexmt,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, tcp_rexmit_max);
/*
* We enter the path for PLMTUD if connection is established or, if
@@ -758,8 +757,8 @@ tcp_timer_rexmt(struct tcpcb *tp)
tp->t_maxseg = tp->t_pmtud_saved_maxseg;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+			 * process incoming SACKs since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
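[Note: the retransmit clamp now honors the runtime tunable tcp_rexmit_max instead of the compile-time TCPTV_REXMTMAX. TCPT_RANGESET(tv, value, min, max) simply bounds value into [min, max], so a deeply backed-off RTO is clamped rather than growing without limit:

    rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
    TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, tcp_rexmit_max);
    /* e.g. a 200 ms base RTO backed off at a high t_rxtshift easily
     * exceeds 64 s, so t_rxtcur becomes tcp_rexmit_max (64 s by
     * default, now adjustable via net.inet.tcp.rexmit_max). */
]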
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index a3ca268417ba..34a0f1375463 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -32,6 +32,8 @@
#ifndef _NETINET_TCP_TIMER_H_
#define _NETINET_TCP_TIMER_H_
+#ifdef _KERNEL
+
/*
* The TCPT_REXMT timer is used to force retransmissions.
* The TCP has the TCPT_REXMT timer set whenever segments
@@ -71,21 +73,22 @@
/*
* Time constants.
*/
-#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */
+#define TCPTV_MSL MSEC_2_TICKS(30000) /* max seg lifetime (hah!) */
+#define TCPTV_MSL_LOCAL MSEC_2_TICKS(10) /* max seg lifetime for local comm */
#define TCPTV_SRTTBASE 0 /* base roundtrip time;
if 0, no idea yet */
-#define TCPTV_RTOBASE ( 1*hz) /* assumed RTO if no info */
+#define TCPTV_RTOBASE MSEC_2_TICKS(1000) /* assumed RTO if no info */
-#define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */
-#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */
+#define TCPTV_PERSMIN MSEC_2_TICKS(5000) /* minimum persist interval */
+#define TCPTV_PERSMAX MSEC_2_TICKS(60000) /* maximum persist interval */
-#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */
-#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */
-#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */
+#define TCPTV_KEEP_INIT MSEC_2_TICKS(75000) /* initial connect keepalive */
+#define TCPTV_KEEP_IDLE MSEC_2_TICKS(120*60*1000) /* dflt time before probing */
+#define TCPTV_KEEPINTVL MSEC_2_TICKS(75000) /* default probe interval */
#define TCPTV_KEEPCNT 8 /* max probes before drop */
#define TCPTV_MAXUNACKTIME 0 /* max time without making progress */
-#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */
+#define TCPTV_FINWAIT2_TIMEOUT MSEC_2_TICKS(60000) /* FIN_WAIT_2 timeout if no receiver */
/*
* Minimum retransmit timer is 3 ticks, for algorithmic stability.
@@ -107,15 +110,13 @@
* The prior minimum of 1*hz (1 second) badly breaks throughput on any
* networks faster then a modem that has minor (e.g. 1%) packet loss.
*/
-#define TCPTV_MIN ( hz/33 ) /* minimum allowable value */
-#define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */
-#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */
-
-#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */
+#define TCPTV_MIN MSEC_2_TICKS(30) /* minimum allowable value */
+#define TCPTV_CPU_VAR MSEC_2_TICKS(200) /* cpu variance allowed (200ms) */
+#define TCPTV_REXMTMAX MSEC_2_TICKS(64000) /* max allowable REXMT value */
#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */
-#define TCPTV_DELACK ( hz/25 ) /* 40ms timeout */
+#define TCPTV_DELACK MSEC_2_TICKS(40) /* 40ms timeout */
/*
* If we exceed this number of retransmits for a single segment, we'll consider
@@ -135,8 +136,6 @@
(tv) = (tvmax); \
} while(0)
-#ifdef _KERNEL
-
#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
#define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
@@ -165,6 +164,7 @@ extern int tcp_maxunacktime; /* max time without making progress */
extern int tcp_maxpersistidle;
extern int tcp_rexmit_initial;
extern int tcp_rexmit_min;
+extern int tcp_rexmit_max;
extern int tcp_rexmit_slop;
extern int tcp_ttl; /* time to live for TCP segs */
extern int tcp_backoff[];
@@ -184,6 +184,8 @@ VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss);
#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss)
VNET_DECLARE(int, tcp_msl);
#define V_tcp_msl VNET(tcp_msl)
+VNET_DECLARE(int, tcp_msl_local);
+#define V_tcp_msl_local VNET(tcp_msl_local)
#endif /* _KERNEL */
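[Note: rewriting the time constants with MSEC_2_TICKS() keeps them correct for any hz and allows sub-second values such as the new 10 ms TCPTV_MSL_LOCAL. The macro's assumed shape scales milliseconds by hz and never rounds down to zero; see the system headers for the authoritative definition:

    /* Assumed shape; the real macro lives in the system headers. */
    #define MSEC_2_TICKS(ms) max(1, (int)(((uint64_t)(ms) * hz) / 1000))
]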
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 8d77db275310..ce63fcf9ffc0 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -87,12 +87,52 @@
#include <security/mac/mac_framework.h>
-VNET_DEFINE_STATIC(bool, nolocaltimewait) = true;
+VNET_DEFINE_STATIC(bool, nolocaltimewait) = false;
#define V_nolocaltimewait VNET(nolocaltimewait)
-SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait,
- CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), true,
+
+static int
+sysctl_net_inet_tcp_nolocaltimewait(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ bool new;
+
+ new = V_nolocaltimewait;
+ error = sysctl_handle_bool(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ V_nolocaltimewait = new;
+ gone_in(16, "net.inet.tcp.nolocaltimewait is obsolete."
+ " Use net.inet.tcp.msl_local instead.\n");
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, nolocaltimewait,
+ CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_U8,
+ &VNET_NAME(nolocaltimewait), 0, sysctl_net_inet_tcp_nolocaltimewait, "CU",
"Do not create TCP TIME_WAIT state for local connections");
+static u_int
+tcp_eff_msl(struct tcpcb *tp)
+{
+ struct inpcb *inp = tptoinpcb(tp);
+#ifdef INET6
+ bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
+#endif
+
+ if (
+#ifdef INET6
+ isipv6 ? in6_localip(&inp->in6p_faddr) :
+#endif
+#ifdef INET
+ in_localip(inp->inp_faddr))
+#else
+ false)
+#endif
+ return (V_tcp_msl_local);
+ else
+ return (V_tcp_msl);
+}
+
/*
* Move a TCP connection into TIME_WAIT state.
* inp is locked, and is unlocked before returning.
@@ -127,7 +167,7 @@ tcp_twstart(struct tcpcb *tp)
if (V_nolocaltimewait && (
#ifdef INET6
- isipv6 ? in6_localaddr(&inp->in6p_faddr) :
+ isipv6 ? in6_localip(&inp->in6p_faddr) :
#endif
#ifdef INET
in_localip(inp->inp_faddr)
@@ -140,7 +180,7 @@ tcp_twstart(struct tcpcb *tp)
return;
}
- tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl);
+ tcp_timer_activate(tp, TT_2MSL, 2 * tcp_eff_msl(tp));
INP_WUNLOCK(inp);
}
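[Note: taken together with the tcp_timer.h changes, local connections now get a short TIME_WAIT instead of none: tcp_eff_msl() picks V_tcp_msl_local (10 ms by default) when the foreign address is local and V_tcp_msl (30 s by default) otherwise, and the 2*MSL timer is armed from that. Administrators who relied on nolocaltimewait can widen or narrow the local window at runtime, e.g. sysctl net.inet.tcp.msl_local=200, without losing TIME_WAIT semantics entirely.]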
@@ -283,7 +323,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
if (thflags & TH_FIN) {
seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
if (seq + 1 == tp->rcv_nxt)
- tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl);
+ tcp_timer_activate(tp, TT_2MSL, 2 * tcp_eff_msl(tp));
}
/*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index fbc204097b25..98c934955121 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -146,7 +146,7 @@ tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error)
}
/*
- * TCP attaches to socket via pru_attach(), reserving space,
+ * TCP attaches to socket via pr_attach(), reserving space,
* and an internet control block.
*/
static int
@@ -164,7 +164,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td)
goto out;
so->so_rcv.sb_flags |= SB_AUTOSIZE;
- so->so_snd.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE);
error = in_pcballoc(so, &V_tcbinfo);
if (error)
goto out;
@@ -523,7 +523,7 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
}
if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
goto out;
- if (SOLISTENING(so) || so->so_options & SO_REUSEPORT_LB) {
+ if (SOLISTENING(so)) {
error = EOPNOTSUPP;
goto out;
}
@@ -590,7 +590,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
error = EAFNOSUPPORT;
goto out;
}
- if (SOLISTENING(so) || so->so_options & SO_REUSEPORT_LB) {
+ if (SOLISTENING(so)) {
error = EOPNOTSUPP;
goto out;
}
@@ -907,8 +907,8 @@ out:
/*
* Do a send by putting data in output queue and updating urgent
* marker if URG set. Possibly send more data. Unlike the other
- * pru_*() routines, the mbuf chains are our responsibility. We
- * must either enqueue them or free them. The other pru_* routines
+ * pr_*() routines, the mbuf chains are our responsibility. We
+ * must either enqueue them or free them. The other pr_*() routines
* generally are caller-frees.
*/
static int
@@ -1419,6 +1419,7 @@ struct protosw tcp_protosw = {
.pr_rcvd = tcp_usr_rcvd,
.pr_rcvoob = tcp_usr_rcvoob,
.pr_send = tcp_usr_send,
+ .pr_sendfile_wait = sendfile_wait_generic,
.pr_ready = tcp_usr_ready,
.pr_shutdown = tcp_usr_shutdown,
.pr_sockaddr = in_getsockaddr,
@@ -1447,6 +1448,7 @@ struct protosw tcp6_protosw = {
.pr_rcvd = tcp_usr_rcvd,
.pr_rcvoob = tcp_usr_rcvoob,
.pr_send = tcp_usr_send,
+ .pr_sendfile_wait = sendfile_wait_generic,
.pr_ready = tcp_usr_ready,
.pr_shutdown = tcp_usr_shutdown,
.pr_sockaddr = in6_mapped_sockaddr,
@@ -1476,6 +1478,8 @@ tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td)
(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
SS_ISDISCONNECTED)) != 0))
return (EISCONN);
+ if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
+ return (EOPNOTSUPP);
INP_HASH_WLOCK(&V_tcbinfo);
error = in_pcbconnect(inp, sin, td->td_ucred);
@@ -1516,8 +1520,11 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td)
INP_WLOCK_ASSERT(inp);
if (__predict_false((so->so_state &
- (SS_ISCONNECTING | SS_ISCONNECTED)) != 0))
+ (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
+ SS_ISDISCONNECTED)) != 0))
return (EISCONN);
+ if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
+ return (EOPNOTSUPP);
INP_HASH_WLOCK(&V_tcbinfo);
error = in6_pcbconnect(inp, sin6, td->td_ucred, true);
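[Note: moving the SO_REUSEPORT_LB rejection out of the pr_connect handlers and into tcp_connect()/tcp6_connect() (while teaching tcp6_connect() about the disconnecting states, matching tcp_connect()) covers every path that reaches the connect helpers. From userland the visible behavior is the same in spirit: a load-balancing group socket is for accepting, not connecting. A hedged sketch, destination setup elided:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <errno.h>

    struct sockaddr_in sin;     /* filled in elsewhere */
    int one = 1;
    int s = socket(AF_INET, SOCK_STREAM, 0);

    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, &one, sizeof(one));
    /* connect() on an LB socket still fails ... */
    if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 &&
        errno == EOPNOTSUPP) {
        /* ... with EOPNOTSUPP, now enforced one layer down. */
    }
]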
@@ -1761,9 +1768,9 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
/*
* Release the ref count the lookup
* acquired.
- */
+ */
refcount_release(&blk->tfb_refcnt);
- /*
+ /*
* Now there is a chance that the
* init() function mucked with some
* things before it failed, such as
@@ -1793,7 +1800,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
* new one already.
*/
refcount_release(&tp->t_fb->tfb_refcnt);
- /*
+ /*
* Set in the new stack.
*/
tp->t_fb = blk;
@@ -1927,7 +1934,7 @@ tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
CC_LIST_RUNLOCK();
return(ESRCH);
}
- /*
+ /*
* With a reference the algorithm cannot be removed
* so we hold a reference through the change process.
*/
@@ -3044,7 +3051,44 @@ db_print_toobflags(char t_oobflags)
}
static void
-db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
+db_print_bblog_state(int state)
+{
+ switch (state) {
+ case TCP_LOG_STATE_RATIO_OFF:
+ db_printf("TCP_LOG_STATE_RATIO_OFF");
+ break;
+ case TCP_LOG_STATE_CLEAR:
+ db_printf("TCP_LOG_STATE_CLEAR");
+ break;
+ case TCP_LOG_STATE_OFF:
+ db_printf("TCP_LOG_STATE_OFF");
+ break;
+ case TCP_LOG_STATE_TAIL:
+ db_printf("TCP_LOG_STATE_TAIL");
+ break;
+ case TCP_LOG_STATE_HEAD:
+ db_printf("TCP_LOG_STATE_HEAD");
+ break;
+ case TCP_LOG_STATE_HEAD_AUTO:
+ db_printf("TCP_LOG_STATE_HEAD_AUTO");
+ break;
+ case TCP_LOG_STATE_CONTINUAL:
+ db_printf("TCP_LOG_STATE_CONTINUAL");
+ break;
+ case TCP_LOG_STATE_TAIL_AUTO:
+ db_printf("TCP_LOG_STATE_TAIL_AUTO");
+ break;
+ case TCP_LOG_VIA_BBPOINTS:
+ db_printf("TCP_LOG_STATE_BBPOINTS");
+ break;
+ default:
+ db_printf("UNKNOWN(%d)", state);
+ break;
+ }
+}
+
+static void
+db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog)
{
db_print_indent(indent);
@@ -3154,18 +3198,68 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
db_print_indent(indent);
db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n",
tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
+
+ db_print_indent(indent);
+ db_printf("t_fb.tfb_tcp_block_name: %s\n", tp->t_fb->tfb_tcp_block_name);
+
+ db_print_indent(indent);
+ db_printf("t_cc.name: %s\n", tp->t_cc->name);
+
+ db_print_indent(indent);
+ db_printf("_t_logstate: %d (", tp->_t_logstate);
+ db_print_bblog_state(tp->_t_logstate);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("t_lognum: %d t_loglimit: %d t_logsn: %u\n",
+ tp->t_lognum, tp->t_loglimit, tp->t_logsn);
+
+ if (show_bblog) {
+#ifdef TCP_BLACKBOX
+ db_print_bblog_entries(&tp->t_logs, indent);
+#else
+ db_print_indent(indent);
+ db_printf("BBLog not supported\n");
+#endif
+ }
}
DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
{
struct tcpcb *tp;
+ bool show_bblog;
if (!have_addr) {
db_printf("usage: show tcpcb <addr>\n");
return;
}
+ show_bblog = strchr(modif, 'b') != NULL;
tp = (struct tcpcb *)addr;
- db_print_tcpcb(tp, "tcpcb", 0);
+ db_print_tcpcb(tp, "tcpcb", 0, show_bblog);
+}
+
+DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct inpcb *inp;
+ bool only_locked, show_bblog;
+
+ only_locked = strchr(modif, 'l') != NULL;
+ show_bblog = strchr(modif, 'b') != NULL;
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
+ if (only_locked &&
+ inp->inp_lock.rw_lock == RW_UNLOCKED)
+ continue;
+ db_print_tcpcb(intotcpcb(inp), "tcpcb", 0, show_bblog);
+ if (db_pager_quit)
+ break;
+ }
+ CURVNET_RESTORE();
+ if (db_pager_quit)
+ break;
+ }
}
#endif
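[Note: with these additions, ddb's "show tcpcb <addr>" accepts a "b" modifier to dump the connection's BB log entries (when the kernel is built with TCP_BLACKBOX), and the new "show all tcpcbs" walks the tcbinfo list of every VNET; its "l" modifier restricts output to inpcbs that are currently locked, so something like "show all tcpcbs/lb" at the debugger prompt prints only the connections of interest along with their logs.]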
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 5be024ededc7..53856bae9a66 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -182,7 +182,7 @@ struct tcp_sendfile_track {
* snd_una). When the response comes back indicating
* that there was data (return value 1), then the caller
* can build a sendmap entry based on the range and the
- * times. The next query would then be done at the
+ * times. The next query would then be done at the
 * newly created sendmap_end. This repeats until sendmap_end == snd_max.
*
* Flags in sendmap_flags are defined below as well.
@@ -197,7 +197,7 @@ struct tcp_sendfile_track {
* The rack_times are a misc collection of information that
 * the old stack might possibly fill in. Of course it's possible
* that an old stack may not have a piece of information. If so
- * then setting that value to zero is advised. Setting any
+ * then setting that value to zero is advised. Any
 * timestamp passed should be set to zero only when it
 * is unfilled. This may mean that a time is off by a microsecond,
* but this is ok in the grand scheme of things.
@@ -205,13 +205,13 @@ struct tcp_sendfile_track {
 * When switching stacks it is desirable to get as much information
 * from the old stack to the new stack as possible, though the stacks
 * will not always be compatible in the types of information. The
- * init() function needs to take care when it begins changing
+ * init() function needs to take care when it begins changing
* things such as inp_flags2 and the timer units to position these
* changes at a point where it is unlikely they will fail after
 * making such changes. A stack can optionally have an "undo"
- * function
+ * function.
*
- * To transfer information to the old stack from the new in
+ * To transfer information to the old stack from the new in
* respect to LRO and the inp_flags2, the new stack should set
* the inp_flags2 to what it supports. The old stack in its
* fini() function should call the tcp_handle_orphaned_packets()
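
A sketch of the fini()-side half of that handshake, under the convention described above; only tcp_handle_orphaned_packets() is from the real API, and the stack-private teardown is elided:

    static void
    example_stack_fini(struct tcpcb *tp, int tcb_is_destroyed)
    {
            /*
             * Hand any mbufs this stack queued (LRO, compressed ACKs)
             * back to the generic code so the next stack does not
             * inherit stale queues.
             */
            tcp_handle_orphaned_packets(tp);
            /* ... free stack-private state hanging off tp->t_fb_ptr ... */
    }
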
@@ -528,15 +528,6 @@ typedef enum {
/* Minimum map entries limit value, if set */
#define TCP_MIN_MAP_ENTRIES_LIMIT 128
-/*
- * TODO: We yet need to brave plowing in
- * to tcp_input() and the pru_usrreq() block.
- * Right now these go to the old standards which
- * are somewhat ok, but in the long term may
- * need to be changed. If we do tackle tcp_input()
- * then we need to get rid of the tcp_do_segment()
- * function below.
- */
/* Flags for tcp functions */
#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
#define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */
@@ -553,13 +544,13 @@ typedef enum {
* do is:
 * a) Make sure that the inp_flags2 is set up correctly
* for LRO. There are two flags that the previous
- * stack may have set INP_MBUF_ACKCMP and
+ * stack may have set INP_MBUF_ACKCMP and
* INP_SUPPORTS_MBUFQ. If the new stack does not
* support these it *should* clear the flags.
* b) Make sure that the timers are in the proper
* granularity that the stack wants. The stack
* should check the t_tmr_granularity field. Currently
- * there are two values that it may hold
+ * there are two values that it may hold
* TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC.
* Use the functions tcp_timer_convert(tp, granularity);
* to move the timers to the correct format for your stack.
@@ -567,14 +558,14 @@ typedef enum {
* The new stack may also optionally query the tfb_chg_query
* function if the old stack has one. The new stack may ask
* for one of three entries and can also state to the old
- * stack its support for the INP_MBUF_ACKCMP and
+ * stack its support for the INP_MBUF_ACKCMP and
 * INP_SUPPORTS_MBUFQ. This is important since, if there are
 * queued ACKs without that statement, the old stack will
 * be forced to discard the queued ACKs. The requests that
* can be made for information by the new stacks are:
*
* Note also that the tfb_tcp_fb_init() when called can
- * determine if a query is needed by looking at the
+ * determine if a query is needed by looking at the
* value passed in the ptr. The ptr is designed to be
 * set to any allocated memory; the condition
 * (ptr == &tp->t_fb_ptr) holds during the initial
@@ -582,17 +573,17 @@ typedef enum {
* setup of a tcb (which means no query would be needed).
* If, however, the value is not t_fb_ptr, then the caller
* is in the middle of a stack switch and is the new stack.
- * A query would be appropriate (if the new stack support
+ * A query would be appropriate (if the new stack supports
* the query mechanism).
*
* TCP_QUERY_SENDMAP - Query of outstanding data.
* TCP_QUERY_TIMERS_UP - Query about running timers.
- * TCP_SUPPORTED_LRO - Declaration in req_param of
- * the inp_flags2 supported by
+ * TCP_SUPPORTED_LRO - Declaration in req_param of
+ * the inp_flags2 supported by
* the new stack.
* TCP_QUERY_RACK_TIMES - Enquire about various timestamps
* and states the old stack may be in.
- *
+ *
* tfb_tcp_fb_fini is changed to add a flag to tell
* the old stack if the tcb is being destroyed or
* not. A one in the flag means the TCB is being
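
A sketch of the init()-side check and query described above; the structure and field names beyond tfb_chg_query, t_fb_ptr, and TCP_QUERY_SENDMAP abbreviate the real request layout, and error handling is elided:

    static int
    example_fb_init(struct tcpcb *tp, void **ptr)
    {
            struct tcp_query_resp qr;

            if (ptr != &tp->t_fb_ptr &&
                tp->t_fb->tfb_chg_query != NULL) {
                    /* Mid stack-switch: tp->t_fb is still the old stack. */
                    memset(&qr, 0, sizeof(qr));
                    qr.req = TCP_QUERY_SENDMAP;
                    qr.req_param = tp->snd_una;     /* start of the range */
                    if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 1) {
                            /*
                             * Build a sendmap entry from the returned
                             * range and times; repeat until snd_max.
                             */
                    }
            }
            /* ... normal per-connection setup ... */
            return (0);
    }
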
@@ -936,9 +927,12 @@ struct in_conninfo;
+ (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
/*
- * TCP statistics.
- * Many of these should be kept per connection,
- * but that's inconvenient at the moment.
+ * Global (per-VNET) TCP statistics. The below structure represents what we
+ * export to the userland, but in the kernel we have an array of counter_u64_t
+ * with as many elements as there are members in the structure. The counters
+ * shall be incremented by TCPSTAT_INC() or KMOD_TCPSTAT_INC(). Adding a new
+ * counter also requires adding corresponding SDT probes into in_kdtrace.h and
+ * into in_kdtrace.c.
*/
struct tcpstat {
uint64_t tcps_connattempt; /* connections initiated */
@@ -1024,6 +1018,8 @@ struct tcpstat {
uint64_t tcps_sc_zonefail; /* zalloc() failed */
uint64_t tcps_sc_sendcookie; /* SYN cookie sent */
uint64_t tcps_sc_recvcookie; /* SYN cookie received */
+ uint64_t tcps_sc_spurcookie; /* SYN cookie spurious, rejected */
+ uint64_t tcps_sc_failcookie; /* SYN cookie failed, rejected */
uint64_t tcps_hc_added; /* entry added to hostcache */
uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */
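
Per the updated comment, each member here is shadowed by a counter_u64_t in the kernel, so bumping one of the new counters is a single macro call at the rejection site (call sites illustrative):

    TCPSTAT_INC(tcps_sc_spurcookie);        /* from in-kernel TCP code */
    KMOD_TCPSTAT_INC(tcps_sc_failcookie);   /* from a TCP stack module */
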
@@ -1243,6 +1239,9 @@ struct tcp_function_info {
#define TCPCTL_SACK 14 /* Selective Acknowledgement, RFC 2018 */
#define TCPCTL_DROP 15 /* drop tcp connection */
#define TCPCTL_STATES 16 /* connection counts by TCP state */
+#define TCPCTL_KTLSLIST 17 /* connections with active KTLS
+ session */
+#define TCPCTL_KTLSLIST_WKEYS 18 /* KTLSLIST with key data exported */
#ifdef _KERNEL
#ifdef SYSCTL_DECL
@@ -1380,8 +1379,7 @@ int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *,
void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
void tcp_dooptions(struct tcpopt *, u_char *, int, int);
-void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
- struct tcpcb *, int, int);
+void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int);
void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
void tcp_xmit_timer(struct tcpcb *, int);
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 76aadad9a3b9..4203029ff7c3 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -525,7 +525,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
/*
* Temporary failure during offload, take this PCB back.
* Detach from the TOE driver and do the rest of what
- * TCP's pru_connect would have done if the connection
+ * TCP's pr_connect() would have done if the connection
* wasn't offloaded.
*/
diff --git a/sys/netinet/toecore.h b/sys/netinet/toecore.h
index 612c2fe1caf5..843b261ec162 100644
--- a/sys/netinet/toecore.h
+++ b/sys/netinet/toecore.h
@@ -66,7 +66,7 @@ struct toedev {
void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *);
/*
- * This is called by the kernel during pru_rcvd for an offloaded TCP
+ * This is called by the kernel during pr_rcvd() for an offloaded TCP
* connection and provides an opportunity for the TOE driver to manage
* its rx window and credits.
*/
diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h
index edff456ba70e..010f2210b516 100644
--- a/sys/netinet/udp.h
+++ b/sys/netinet/udp.h
@@ -44,7 +44,7 @@ struct udphdr {
u_short uh_dport; /* destination port */
u_short uh_ulen; /* udp length */
u_short uh_sum; /* udp checksum */
-};
+} __packed;
/*
* User-settable options (used with setsockopt).
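
Marking the header __packed pins its layout to the 8-byte UDP wire format regardless of compiler padding rules. A quick compile-time check of that intent (placement illustrative):

    _Static_assert(sizeof(struct udphdr) == 8,
        "struct udphdr must match the UDP wire header");

All four members are u_short, so common ABIs produce no padding anyway; the attribute makes the overlay guarantee explicit.
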
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index dafbaf6dc672..3e6519118a40 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -243,7 +243,6 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
struct sockaddr_in6 udp_in6;
#endif
struct udpcb *up;
- bool filtered;
INP_LOCK_ASSERT(inp);
@@ -252,13 +251,19 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
*/
up = intoudpcb(inp);
if (up->u_tun_func != NULL) {
+ bool filtered;
+
in_pcbref(inp);
INP_RUNLOCK(inp);
filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
up->u_tun_ctx);
INP_RLOCK(inp);
- if (filtered)
- return (in_pcbrele_rlocked(inp));
+ if (in_pcbrele_rlocked(inp))
+ return (1);
+ if (filtered) {
+ INP_RUNLOCK(inp);
+ return (1);
+ }
}
off += sizeof(struct udphdr);
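
The rewritten block is an instance of the usual drop-the-inpcb-lock-around-a-callback pattern; in generic form (a sketch, not the exact udp_append() code):

    in_pcbref(inp);                 /* keep inp alive while unlocked */
    INP_RUNLOCK(inp);
    consumed = callback(m, ...);    /* may sleep, take other locks */
    INP_RLOCK(inp);
    if (in_pcbrele_rlocked(inp))    /* true: last ref gone, inp freed,
                                       lock already dropped */
            return (1);
    if (consumed) {
            INP_RUNLOCK(inp);
            return (1);
    }
    /* continue with inp still read-locked */

The old code released the reference only on the filtered path (and returned the release's result as the filter status), leaking the reference on the common path; the new code always balances in_pcbref() and distinguishes "inp went away" from "packet consumed".
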
@@ -443,7 +448,7 @@ udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
/*
* No matching pcb found; discard datagram. (No need
* to send an ICMP Port Unreachable for a broadcast
- * or multicast datgram.)
+ * or multicast datagram.)
*/
UDPSTAT_INC(udps_noport);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
@@ -555,6 +560,12 @@ udp_input(struct mbuf **mp, int *offp, int proto)
ip->ip_dst.s_addr, htonl((u_short)len +
m->m_pkthdr.csum_data + proto));
uh_sum ^= 0xffff;
+ } else if (m->m_pkthdr.csum_flags & CSUM_IP_UDP) {
+ /*
+ * Packet from local host (maybe from a VM).
+ * Checksum not required.
+ */
+ uh_sum = 0;
} else {
char b[offsetof(struct ipovly, ih_src)];
struct ipovly *ipov = (struct ipovly *)ip;
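
The new CSUM_IP_UDP branch trusts mbufs whose transmit path already took care of the UDP checksum. On the send side that marking looks roughly like this (a sketch of the convention as in udp_output(), with len being the payload length):

    /*
     * Request checksum offload: seed uh_sum with the pseudo-header
     * sum and mark the mbuf; CSUM_UDP is an alias for CSUM_IP_UDP.
     */
    uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
        htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
    m->m_pkthdr.csum_flags = CSUM_UDP;
    m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
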
@@ -643,7 +654,11 @@ udp_input(struct mbuf **mp, int *offp, int proto)
else
UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
UDPSTAT_INC(udps_noport);
- if (m->m_flags & (M_BCAST | M_MCAST)) {
+ if (m->m_flags & M_MCAST) {
+ UDPSTAT_INC(udps_noportmcast);
+ goto badunlocked;
+ }
+ if (m->m_flags & M_BCAST) {
UDPSTAT_INC(udps_noportbcast);
goto badunlocked;
}