aboutsummaryrefslogtreecommitdiff
path: root/sys/netinet/tcp_hpts.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet/tcp_hpts.c')
-rw-r--r--sys/netinet/tcp_hpts.c159
1 files changed, 100 insertions, 59 deletions
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 5b39c94e0e58..b77ebc928809 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -170,6 +170,50 @@
#define NUM_OF_HPTSI_SLOTS 102400
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define DEFAULT_CONNECTION_THRESHOLD 100
+
+/*
+ * When using the hpts, a TCP stack must make sure
+ * that once an INP_DROPPED flag is applied to an INP
+ * it does not expect tcp_output() to ever be
+ * called by the hpts. The hpts will *not* call
+ * any output (or input) functions on a TCB that
+ * is in the DROPPED state.
+ *
+ * This implies that final ACKs and RSTs that might
+ * be sent when a TCB is still around must be
+ * sent from a routine like tcp_respond().
+ */
+#define LOWEST_SLEEP_ALLOWED 50
+#define DEFAULT_MIN_SLEEP 250 /* Default hpts sleep time in usecs;
+ * this determines the min granularity of the
+ * hpts. If 1, granularity is 10 usecs at
+ * the cost of more CPU (context switching).
+ * Note: do not set this to 0.
+ */
+#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
+#define DYNAMIC_MAX_SLEEP 5000 /* 5ms */
+
+/* Thresholds for raising/lowering sleep */
+#define SLOTS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
+#define SLOTS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
+/**
+ *
+ * Dynamic adjustment of sleeping times is done in "new" mode
+ * where we are depending mainly on syscall returns and lro returns
+ * to push hpts forward, and the timer is only a backstop.
+ *
+ * When we are in the "new" mode, i.e. conn_cnt > conn_cnt_thresh,
+ * we do a dynamic adjustment on the time we sleep.
+ * If the lateness of the first client served (in slots) is
+ * greater than slots_indicate_less_sleep (1000 slots or 10ms),
+ * the sleep time is halved, but never below dynamic_min_sleep.
+ * If it is less than slots_indicate_more_sleep (100 slots or
+ * 1ms), the sleep time is doubled, capped at dynamic_max_sleep.
+ *
+ */
+
/* Each hpts has its own p_mtx which is used for locking */
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
@@ -244,11 +288,10 @@ static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
static void tcp_hpts_thread(void *ctx);
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
-static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
+static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
-
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"TCP Hpts controls");
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -366,7 +409,7 @@ sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
new = hpts_sleep_max;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
- if ((new < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) ||
+ if ((new < (dynamic_min_sleep/HPTS_USECS_PER_SLOT)) ||
(new > HPTS_MAX_SLEEP_ALLOWED))
error = EINVAL;
else
@@ -404,15 +447,15 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
&sysctl_net_inet_tcp_hpts_min_sleep, "IU",
"The minimum time the hpts must sleep before processing more slots");
-static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
-static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
+static int slots_indicate_more_sleep = SLOTS_INDICATE_MORE_SLEEP;
+static int slots_indicate_less_sleep = SLOTS_INDICATE_LESS_SLEEP;
static int tcp_hpts_no_wake_over_thresh = 1;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
- &ticks_indicate_more_sleep, 0,
+ &slots_indicate_more_sleep, 0,
"If we only process this many or less on a timeout, we need longer sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
- &ticks_indicate_less_sleep, 0,
+ &slots_indicate_less_sleep, 0,
"If we process this many or more on a timeout, we need less sleep on the next callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
@@ -433,38 +476,40 @@ static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
int slots_to_run, int idx, bool from_callout)
{
- union tcp_log_stackspecific log;
- /*
- * Unused logs are
- * 64 bit - delRate, rttProp, bw_inuse
- * 16 bit - cwnd_gain
- * 8 bit - bbr_state, bbr_substate, inhpts;
- */
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.flex1 = hpts->p_nxt_slot;
- log.u_bbr.flex2 = hpts->p_cur_slot;
- log.u_bbr.flex3 = hpts->p_prev_slot;
- log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
- log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.flex7 = hpts->p_cpu;
- log.u_bbr.flex8 = (uint8_t)from_callout;
- log.u_bbr.inflight = slots_to_run;
- log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
- log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
- log.u_bbr.epoch = hpts->saved_curslot;
- log.u_bbr.lt_epoch = hpts->saved_prev_slot;
- log.u_bbr.pkts_out = hpts->p_delayed_by;
- log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.pacing_gain = hpts->p_cpu;
- log.u_bbr.pkt_epoch = hpts->p_runningslot;
- log.u_bbr.use_lt_bw = 1;
- TCP_LOG_EVENTP(tp, NULL,
- &tptosocket(tp)->so_rcv,
- &tptosocket(tp)->so_snd,
- BBR_LOG_HPTSDIAG, 0,
- 0, &log, false, tv);
+ if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts;
+ */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usec(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
+ }
}
static void
@@ -875,7 +920,7 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
return (slot_on);
}
/* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hptstick(&tv);
+ wheel_cts = tcp_tv_to_hpts_slot(&tv);
/* Map it onto the wheel */
wheel_slot = tick_to_wheel(wheel_cts);
/* Now what's the max we can place it at? */
@@ -947,7 +992,7 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
* We need to reschedule the hpts's time-out.
*/
hpts->p_hpts_sleep_time = slot;
- need_new_to = slot * HPTS_TICKS_PER_SLOT;
+ need_new_to = slot * HPTS_USECS_PER_SLOT;
}
}
/*
@@ -1102,7 +1147,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
hpts->p_lasttick = hpts->p_curtick;
hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
if ((hpts->p_on_queue_cnt == 0) ||
(hpts->p_lasttick == hpts->p_curtick)) {
@@ -1118,8 +1163,7 @@ again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) >
- ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
+ if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) &&
(hpts->p_on_queue_cnt != 0)) {
/*
* Wheel wrap is occurring, basically we
@@ -1200,7 +1244,7 @@ again:
* was not any (i.e. if slots_to_run == 1, no delay).
*/
hpts->p_delayed_by = (slots_to_run - (i + 1)) *
- HPTS_TICKS_PER_SLOT;
+ HPTS_USECS_PER_SLOT;
runningslot = hpts->p_runningslot;
hptsh = &hpts->p_hptss[runningslot];
@@ -1353,10 +1397,7 @@ again:
}
CURVNET_SET(inp->inp_vnet);
/* Let's do any logging that we might want to */
- if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
- tcp_hpts_log(hpts, tp, &tv, slots_to_run, i,
- from_callout);
- }
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
@@ -1447,7 +1488,7 @@ no_one:
goto again;
}
no_run:
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1487,7 +1528,7 @@ no_run:
}
void
-__tcp_set_hpts(struct tcpcb *tp, int32_t line)
+tcp_set_hpts(struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
@@ -1570,7 +1611,7 @@ __tcp_run_hpts(void)
ticks_ran = tcp_hptsi(hpts, false);
/* We may want to adjust the sleep values here */
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
- if (ticks_ran > ticks_indicate_less_sleep) {
+ if (ticks_ran > slots_indicate_less_sleep) {
struct timeval tv;
sbintime_t sb;
@@ -1580,7 +1621,7 @@ __tcp_run_hpts(void)
/* Reschedule with new to value */
tcp_hpts_set_max_sleep(hpts, 0);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
/* Validate it's in the right ranges */
if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
hpts->overidden_sleep = tv.tv_usec;
@@ -1602,7 +1643,7 @@ __tcp_run_hpts(void)
callout_reset_sbt_on(&hpts->co, sb, 0,
hpts_timeout_swi, hpts, hpts->p_cpu,
(C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else if (ticks_ran < ticks_indicate_more_sleep) {
+ } else if (ticks_ran < slots_indicate_more_sleep) {
/* For the further sleep, don't reschedule hpts */
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
@@ -1684,7 +1725,7 @@ tcp_hpts_thread(void *ctx)
hpts->p_hpts_active = 1;
ticks_ran = tcp_hptsi(hpts, true);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) {
hpts->hit_callout_thresh = 1;
atomic_add_int(&hpts_that_need_softclock, 1);
@@ -1698,11 +1739,11 @@ tcp_hpts_thread(void *ctx)
* Only adjust sleep time if we were
* called from the callout i.e. direct_wake == 0.
*/
- if (ticks_ran < ticks_indicate_more_sleep) {
+ if (ticks_ran < slots_indicate_more_sleep) {
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
- } else if (ticks_ran > ticks_indicate_less_sleep) {
+ } else if (ticks_ran > slots_indicate_less_sleep) {
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
@@ -1949,7 +1990,7 @@ tcp_hpts_mod_load(void)
hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[i] = tcp_tv_to_usectick(&tv);
+ tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv);
hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
@@ -1996,7 +2037,7 @@ tcp_hpts_mod_load(void)
}
}
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
hpts->sleeping = tv.tv_usec;
sb = tvtosbt(tv);
callout_reset_sbt_on(&hpts->co, sb, 0,