Diffstat (limited to 'sys/netinet/tcp_hpts.c')
-rw-r--r--	sys/netinet/tcp_hpts.c	933
1 file changed, 508 insertions(+), 425 deletions(-)
| diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 63bbe4bba11b..c54459bb5f01 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -39,15 +39,14 @@   * First, and probably the main thing its used by Rack and BBR, it can   * be used to call tcp_output() of a transport stack at some time in the future.   * The normal way this is done is that tcp_output() of the stack schedules - * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The - * slot is the time from now that the stack wants to be called but it - * must be converted to tcp_hpts's notion of slot. This is done with - * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical + * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The + * usecs is the time from now that the stack wants to be called and is + * passing time directly in microseconds. So a typical   * call from the tcp_output() routine might look like:   * - * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); + * tcp_hpts_insert(tp, 550, NULL);   * - * The above would schedule tcp_output() to be called in 550 useconds. + * The above would schedule tcp_output() to be called in 550 microseconds.   * Note that if using this mechanism the stack will want to add near   * its top a check to prevent unwanted calls (from user land or the   * arrival of incoming ack's). So it would add something like: @@ -149,27 +148,44 @@  #include <netinet/tcpip.h>  #include <netinet/cc/cc.h>  #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h>  #include <netinet/tcp_log_buf.h>  #ifdef tcp_offload  #include <netinet/tcp_offload.h>  #endif -/* - * The hpts uses a 102400 wheel. The wheel - * defines the time in 10 usec increments (102400 x 10). - * This gives a range of 10usec - 1024ms to place - * an entry within. If the user requests more than - * 1.024 second, a remaineder is attached and the hpts - * when seeing the remainder will re-insert the - * inpcb forward in time from where it is until - * the remainder is zero. - */ +/* Global instance for TCP HPTS */ +struct tcp_hptsi *tcp_hptsi_pace; + +/* Default function table for production use. */ +const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = { +	.microuptime = microuptime, +	.swi_add = swi_add, +	.swi_remove = swi_remove, +	.swi_sched = swi_sched, +	.intr_event_bind = intr_event_bind, +	.intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset, +	.callout_init = callout_init, +	.callout_reset_sbt_on = callout_reset_sbt_on, +	._callout_stop_safe = _callout_stop_safe, +}; -#define NUM_OF_HPTSI_SLOTS 102400 +#ifdef TCP_HPTS_KTEST +#define microuptime pace->funcs->microuptime +#define swi_add pace->funcs->swi_add +#define swi_remove pace->funcs->swi_remove +#define swi_sched pace->funcs->swi_sched +#define intr_event_bind pace->funcs->intr_event_bind +#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset +#define callout_init pace->funcs->callout_init +#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on +#define _callout_stop_safe pace->funcs->_callout_stop_safe +#endif -/* The number of connections after which the dynamic sleep logic kicks in. */ -#define DEFAULT_CONNECTION_THRESHOLD 100 +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); + +static void tcp_hpts_thread(void *ctx);  /*   * When using the hpts, a TCP stack must make sure @@ -204,87 +220,22 @@   *   * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh   * then we do a dynamic adjustment on the time we sleep. 
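To make the new calling convention concrete, here is a minimal sketch (not part of the patch) of a stack output routine pacing itself with the microsecond-based API described in the rewritten header comment above. tcp_in_hpts() is assumed as the "prevent unwanted calls" guard the comment alludes to, and the transmit body is a placeholder:

/*
 * Hedged sketch: a transport stack's tcp_output() rescheduling itself
 * 550 microseconds out via the new microsecond-based insert API.
 */
static int
example_tcp_output(struct tcpcb *tp)
{
	if (tcp_in_hpts(tp))		/* assumed guard; already paced */
		return (0);
	/* ... transmit whatever the congestion window allows ... */
	tcp_hpts_insert(tp, 550, NULL);	/* call us again in 550 usecs */
	return (0);
}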
- * Our threshold is if the lateness of the first client served (in ticks) is + * Our threshold is if the lateness of the first client served (in slots) is   * greater than or equal too slots_indicate_more_sleep (10ms - * or 10000 ticks). If we were that late, the actual sleep time - * is adjusted down by 50%. If the ticks_ran is less than - * slots_indicate_more_sleep (100 ticks or 1000usecs). + * or 10000 slots). If we were that late, the actual sleep time + * is adjusted down by 50%. If the slots_ran is less than + * slots_indicate_more_sleep (100 slots or 1000usecs).   *   */ -/* Each hpts has its own p_mtx which is used for locking */ -#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED) -#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx) -#define	HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx) -#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx) -struct tcp_hpts_entry { -	/* Cache line 0x00 */ -	struct mtx p_mtx;	/* Mutex for hpts */ -	struct timeval p_mysleep;	/* Our min sleep time */ -	uint64_t syscall_cnt; -	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */ -	uint16_t p_hpts_active; /* Flag that says hpts is awake  */ -	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ -	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */ -	uint32_t p_runningslot; /* Current tick we are at if we are running */ -	uint32_t p_prev_slot;	/* Previous slot we were on */ -	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */ -	uint32_t p_nxt_slot;	/* The next slot outside the current range of -				 * slots that the hpts is running on. */ -	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */ -	uint32_t p_lasttick;	/* Last tick before the current one */ -	uint8_t p_direct_wake :1, /* boolean */ -		p_on_min_sleep:1, /* boolean */ -		p_hpts_wake_scheduled:1, /* boolean */ -		hit_callout_thresh:1, -		p_avail:4; -	uint8_t p_fill[3];	  /* Fill to 32 bits */ -	/* Cache line 0x40 */ -	struct hptsh { -		TAILQ_HEAD(, tcpcb)	head; -		uint32_t		count; -		uint32_t		gencnt; -	} *p_hptss;			/* Hptsi wheel */ -	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max -					 * of 255ms */ -	uint32_t overidden_sleep;	/* what was overrided by min-sleep for logging */ -	uint32_t saved_lasttick;	/* for logging */ -	uint32_t saved_curtick;		/* for logging */ -	uint32_t saved_curslot;		/* for logging */ -	uint32_t saved_prev_slot;       /* for logging */ -	uint32_t p_delayed_by;	/* How much were we delayed by */ -	/* Cache line 0x80 */ -	struct sysctl_ctx_list hpts_ctx; -	struct sysctl_oid *hpts_root; -	struct intr_event *ie; -	void *ie_cookie; -	uint16_t p_num;		/* The hpts number one per cpu */ -	uint16_t p_cpu;		/* The hpts CPU */ -	/* There is extra space in here */ -	/* Cache line 0x100 */ -	struct callout co __aligned(CACHE_LINE_SIZE); -}               __aligned(CACHE_LINE_SIZE); - -static struct tcp_hptsi { -	struct cpu_group **grps; -	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */ -	uint32_t *cts_last_ran; -	uint32_t grp_cnt; -	uint32_t rp_num_hptss;	/* Number of hpts threads */ -} tcp_pace; - -static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");  #ifdef RSS -static int tcp_bind_threads = 1; +int tcp_bind_threads = 1;  #else -static int tcp_bind_threads = 2; +int tcp_bind_threads = 2;  #endif  static int tcp_use_irq_cpu = 0;  static int hpts_does_tp_logging = 0; - -static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); -static void tcp_hpts_thread(void *ctx); - +static int32_t tcp_hpts_precision = 120;  
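The tcp_hptsi_funcs table and the TCP_HPTS_KTEST macro overrides introduced above exist so a test build can swap the timing and callout primitives underneath an HPTS instance. As a hedged illustration (fake_microuptime and fake_now are hypothetical test names, not from the patch), a ktest might substitute only the clock and keep the production entry points for everything else:

/* Hypothetical test clock: the test advances fake_now by hand. */
static struct timeval fake_now;

static void
fake_microuptime(struct timeval *tv)
{
	*tv = fake_now;
}

static const struct tcp_hptsi_funcs test_funcs = {
	.microuptime = fake_microuptime,	/* only override: fake clock */
	.swi_add = swi_add,
	.swi_remove = swi_remove,
	.swi_sched = swi_sched,
	.intr_event_bind = intr_event_bind,
	.intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
	.callout_init = callout_init,
	.callout_reset_sbt_on = callout_reset_sbt_on,
	._callout_stop_safe = _callout_stop_safe,
};

A private instance would then be created with tcp_hptsi_create(&test_funcs, false), so no sysctl nodes are registered for it.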
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;  static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;  static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; @@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,  SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,      "TCP Hpts statistics"); -#define	timersub(tvp, uvp, vvp)						\ -	do {								\ -		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\ -		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\ -		if ((vvp)->tv_usec < 0) {				\ -			(vvp)->tv_sec--;				\ -			(vvp)->tv_usec += 1000000;			\ -		}							\ -	} while (0) - -static int32_t tcp_hpts_precision = 120; - -static struct hpts_domain_info { -	int count; -	int cpu[MAXCPU]; -} hpts_domains[MAXMEMDOM]; -  counter_u64_t hpts_hopelessly_behind;  SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, @@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,      &tcp_hpts_no_wake_over_thresh, 0,      "When we are over the threshold on the pacer do we prohibit wakeups?"); -static uint16_t -hpts_random_cpu(void) +uint16_t +tcp_hptsi_random_cpu(struct tcp_hptsi *pace)  {  	uint16_t cpuid;  	uint32_t ran;  	ran = arc4random(); -	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); +	cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);  	return (cpuid);  } @@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,  		log.u_bbr.flex2 = hpts->p_cur_slot;  		log.u_bbr.flex3 = hpts->p_prev_slot;  		log.u_bbr.flex4 = idx; -		log.u_bbr.flex5 = hpts->p_curtick;  		log.u_bbr.flex6 = hpts->p_on_queue_cnt;  		log.u_bbr.flex7 = hpts->p_cpu;  		log.u_bbr.flex8 = (uint8_t)from_callout;  		log.u_bbr.inflight = slots_to_run;  		log.u_bbr.applimited = hpts->overidden_sleep; -		log.u_bbr.delivered = hpts->saved_curtick;  		log.u_bbr.timeStamp = tcp_tv_to_usec(tv);  		log.u_bbr.epoch = hpts->saved_curslot;  		log.u_bbr.lt_epoch = hpts->saved_prev_slot; @@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,  	}  } +/* + * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI + * for the HPTS entry to run. + */  static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) +tcp_hpts_sleep_timeout(void *arg)  { +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif +	struct tcp_hpts_entry *hpts; + +	hpts = (struct tcp_hpts_entry *)arg; +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif +	swi_sched(hpts->ie_cookie, 0); +} + +/* + * Reset the HPTS callout timer with the provided timeval. Returns the results + * of the callout_reset_sbt_on() function. + */ +static int +tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv) +{ +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif +	sbintime_t sb; + +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif + +	/* Store off to make visible the actual sleep time */ +	hpts->sleeping = tv->tv_usec; + +	sb = tvtosbt(*tv); +	return (callout_reset_sbt_on( +		    &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu, +		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)))); +} + +/* + * Schedules the SWI for the HTPS entry to run, if not already scheduled or + * running. 
+ */ +void +tcp_hpts_wake(struct tcp_hpts_entry *hpts) +{ +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif +  	HPTS_MTX_ASSERT(hpts); +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif +  	if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {  		hpts->p_direct_wake = 0;  		return; @@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)  }  static void -hpts_timeout_swi(void *arg) -{ -	struct tcp_hpts_entry *hpts; - -	hpts = (struct tcp_hpts_entry *)arg; -	swi_sched(hpts->ie_cookie, 0); -} - -static void  tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)  {  	struct inpcb *inp = tptoinpcb(tp); @@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)  }  static struct tcp_hpts_entry * -tcp_hpts_lock(struct tcpcb *tp) +tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)  {  	struct tcp_hpts_entry *hpts;  	INP_LOCK_ASSERT(tptoinpcb(tp)); -	hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; +	hpts = pace->rp_ent[tp->t_hpts_cpu];  	HPTS_LOCK(hpts);  	return (hpts); @@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp)   * and has never received a first packet.   */  void -tcp_hpts_init(struct tcpcb *tp) +__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)  { -  	if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) { -		tp->t_hpts_cpu = hpts_random_cpu(); +		tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);  		MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));  	}  } @@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp)   * INP lock and then get the hpts lock.   */  void -tcp_hpts_remove(struct tcpcb *tp) +__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)  {  	struct tcp_hpts_entry *hpts;  	struct hptsh *hptsh;  	INP_WLOCK_ASSERT(tptoinpcb(tp)); -	hpts = tcp_hpts_lock(tp); +	hpts = tcp_hpts_lock(pace, tp);  	if (tp->t_in_hpts == IHPTS_ONQUEUE) {  		hptsh = &hpts->p_hptss[tp->t_hpts_slot];  		tp->t_hpts_request = 0; @@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus)  {  	/*  	 * Given a slot on the wheel, what slot -	 * is that plus ticks out? +	 * is that plus slots out?  	 */ -	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot)); +	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot));  	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);  }  static inline int -tick_to_wheel(uint32_t cts_in_wticks) +cts_to_wheel(uint32_t cts)  {  	/* -	 * Given a timestamp in ticks (so by -	 * default to get it to a real time one -	 * would multiply by 10.. i.e the number -	 * of ticks in a slot) map it to our limited -	 * space wheel. +	 * Given a timestamp in useconds map it to our limited space wheel.  	 */ -	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); +	return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);  }  static inline int @@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  	if ((hpts->p_hpts_active == 1) &&  	    (hpts->p_wheel_complete == 0)) {  		end_slot = hpts->p_runningslot; -		/* Back up one tick */ +		/* Back up one slot */  		if (end_slot == 0)  			end_slot = NUM_OF_HPTSI_SLOTS - 1;  		else @@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  		 * not active, or we have  		 * completed the pass over  		 * the wheel, we can use the -		 * prev tick and subtract one from it. This puts us +		 * prev slot and subtract one from it. This puts us  		 * as far out as possible on the wheel.  		 
*/  		end_slot = hpts->p_prev_slot; @@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  		/*  		 * Now we have close to the full wheel left minus the  		 * time it has been since the pacer went to sleep. Note -		 * that wheel_tick, passed in, should be the current time +		 * that wheel_slot, passed in, should be the current time  		 * from the perspective of the caller, mapped to the wheel.  		 */  		if (hpts->p_prev_slot != wheel_slot) @@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  #ifdef INVARIANTS  static void  check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, -    uint32_t hptsslot, int line) +    uint32_t hptsslot)  {  	/*  	 * Sanity checks for the pacer with invariants @@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,  }  #endif -uint32_t -tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) +void +__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, +	struct hpts_diag *diag)  {  	struct tcp_hpts_entry *hpts;  	struct timeval tv; -	uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0; +	uint32_t slot, wheel_cts, last_slot, need_new_to = 0;  	int32_t wheel_slot, maxslots;  	bool need_wakeup = false; @@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  	MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE));  	/* +	 * Convert microseconds to slots for internal use.  	 * We now return the next-slot the hpts will be on, beyond its  	 * current run (if up) or where it was when it stopped if it is  	 * sleeping.  	 */ -	hpts = tcp_hpts_lock(tp); +	slot = HPTS_USEC_TO_SLOTS(usecs); +	hpts = tcp_hpts_lock(pace, tp);  	microuptime(&tv);  	if (diag) {  		memset(diag, 0, sizeof(struct hpts_diag)); @@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  		diag->p_runningslot = hpts->p_runningslot;  		diag->p_nxt_slot = hpts->p_nxt_slot;  		diag->p_cur_slot = hpts->p_cur_slot; -		diag->p_curtick = hpts->p_curtick; -		diag->p_lasttick = hpts->p_lasttick;  		diag->slot_req = slot;  		diag->p_on_min_sleep = hpts->p_on_min_sleep;  		diag->hpts_sleep_time = hpts->p_hpts_sleep_time; @@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  			 * timeout is not 1.  			 */  			hpts->p_direct_wake = 1; -			tcp_wakehpts(hpts); +			tcp_hpts_wake(hpts);  		} -		slot_on = hpts->p_nxt_slot;  		HPTS_UNLOCK(hpts); -		return (slot_on); +		return;  	} -	/* Get the current time relative to the wheel */ -	wheel_cts = tcp_tv_to_hpts_slot(&tv); -	/* Map it onto the wheel */ -	wheel_slot = tick_to_wheel(wheel_cts); +	/* Get the current time stamp and map it onto the wheel */ +	wheel_cts = tcp_tv_to_usec(&tv); +	wheel_slot = cts_to_wheel(wheel_cts);  	/* Now what's the max we can place it at? 
*/  	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);  	if (diag) { @@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  		tp->t_hpts_slot = last_slot;  	}  	if (diag) { -		diag->slot_remaining = tp->t_hpts_request; +		diag->time_remaining = tp->t_hpts_request;  		diag->inp_hptsslot = tp->t_hpts_slot;  	}  #ifdef INVARIANTS -	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); +	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot);  #endif  	if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))  		tcp_hpts_insert_internal(tp, hpts); @@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  	}  	/*  	 * Now how far is the hpts sleeping to? if active is 1, its -	 * up and ticking we do nothing, otherwise we may need to +	 * up and running we do nothing, otherwise we may need to  	 * reschedule its callout if need_new_to is set from above.  	 */  	if (need_wakeup) {  		hpts->p_direct_wake = 1; -		tcp_wakehpts(hpts); +		tcp_hpts_wake(hpts);  		if (diag) {  			diag->need_new_to = 0;  			diag->co_ret = 0xffff0000; @@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  	} else if (need_new_to) {  		int32_t co_ret;  		struct timeval tv; -		sbintime_t sb;  		tv.tv_sec = 0;  		tv.tv_usec = 0; @@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  			tv.tv_sec++;  			need_new_to -= HPTS_USEC_IN_SEC;  		} -		tv.tv_usec = need_new_to; -		sb = tvtosbt(tv); -		co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, -					      hpts_timeout_swi, hpts, hpts->p_cpu, -					      (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); +		tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */ +		co_ret = tcp_hpts_sleep(hpts, &tv);  		if (diag) {  			diag->need_new_to = need_new_to;  			diag->co_ret = co_ret;  		}  	} -	slot_on = hpts->p_nxt_slot;  	HPTS_UNLOCK(hpts); - -	return (slot_on);  }  static uint16_t -hpts_cpuid(struct tcpcb *tp, int *failed) +hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)  {  	struct inpcb *inp = tptoinpcb(tp);  	u_int cpuid; @@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)  #ifdef RSS  	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);  	if (cpuid == NETISR_CPUID_NONE) -		return (hpts_random_cpu()); +		return (tcp_hptsi_random_cpu(pace));  	else  		return (cpuid);  #endif @@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)  	 */  	if (inp->inp_flowtype == M_HASHTYPE_NONE) {  		counter_u64_add(cpu_uses_random, 1); -		return (hpts_random_cpu()); +		return (tcp_hptsi_random_cpu(pace));  	}  	/*  	 * Hash to a thread based on the flowid.  
If we are using numa, @@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)  #ifdef NUMA  	} else {  		/* Hash into the cpu's that use that domain */ -		di = &hpts_domains[inp->inp_numa_domain]; +		di = &pace->domains[inp->inp_numa_domain];  		cpuid = di->cpu[inp->inp_flowid % di->count];  	}  #endif @@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)  	}  } -static int32_t +static bool +tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run) +{ +	return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT)); +} + +int32_t  tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)  { +	struct tcp_hptsi *pace;  	struct tcpcb *tp;  	struct timeval tv;  	int32_t slots_to_run, i, error; @@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)  	int32_t wrap_loop_cnt = 0;  	int32_t slot_pos_of_endpoint = 0;  	int32_t orig_exit_slot; +	uint32_t cts, cts_last_run;  	bool completed_measure, seen_endpoint;  	completed_measure = false; @@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)  	HPTS_MTX_ASSERT(hpts);  	NET_EPOCH_ASSERT(); + +	pace = hpts->p_hptsi; +	MPASS(pace != NULL); +  	/* record previous info for any logging */ -	hpts->saved_lasttick = hpts->p_lasttick; -	hpts->saved_curtick = hpts->p_curtick;  	hpts->saved_curslot = hpts->p_cur_slot;  	hpts->saved_prev_slot = hpts->p_prev_slot; -	hpts->p_lasttick = hpts->p_curtick; -	hpts->p_curtick = tcp_gethptstick(&tv); -	tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); -	orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); +	microuptime(&tv); +	cts_last_run = pace->cts_last_ran[hpts->p_cpu]; +	pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv); + +	orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);  	if ((hpts->p_on_queue_cnt == 0) || -	    (hpts->p_lasttick == hpts->p_curtick)) { +	    !tcp_hpts_different_slots(cts, cts_last_run)) {  		/* -		 * No time has yet passed, -		 * or nothing to do. +		 * Not enough time has yet passed or nothing to do.  		 */  		hpts->p_prev_slot = hpts->p_cur_slot; -		hpts->p_lasttick = hpts->p_curtick;  		goto no_run;  	}  again:  	hpts->p_wheel_complete = 0;  	HPTS_MTX_ASSERT(hpts);  	slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); -	if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) && -	    (hpts->p_on_queue_cnt != 0)) { +	if ((hpts->p_on_queue_cnt != 0) && +	    ((cts - cts_last_run) > +	     ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) {  		/*  		 * Wheel wrap is occuring, basically we  		 * are behind and the distance between @@ -1238,7 +1214,7 @@ again:  		uint32_t runningslot;  		/* -		 * Calculate our delay, if there are no extra ticks there +		 * Calculate our delay, if there are no extra slots there  		 * was not any (i.e. if slots_to_run == 1, no delay).  		 */  		hpts->p_delayed_by = (slots_to_run - (i + 1)) * @@ -1391,7 +1367,7 @@ again:  				 * gets added to the hpts (not this one)  				 * :-)  				 */ -				tcp_set_hpts(tp); +				__tcp_set_hpts(pace, tp);  			}  			CURVNET_SET(inp->inp_vnet);  			/* Lets do any logging that we might want to */ @@ -1450,10 +1426,12 @@ no_one:  	hpts->p_delayed_by = 0;  	/*  	 * Check to see if we took an excess amount of time and need to run -	 * more ticks (if we did not hit eno-bufs). +	 * more slots (if we did not hit eno-bufs).  	 
*/  	hpts->p_prev_slot = hpts->p_cur_slot; -	hpts->p_lasttick = hpts->p_curtick; +	microuptime(&tv); +	cts_last_run = cts; +	cts = tcp_tv_to_usec(&tv);  	if (!from_callout || (loop_cnt > max_pacer_loops)) {  		/*  		 * Something is serious slow we have @@ -1465,7 +1443,7 @@ no_one:  		 * can never catch up :(  		 *  		 * We will just lie to this thread -		 * and let it thing p_curtick is +		 * and let it think p_curslot is  		 * correct. When it next awakens  		 * it will find itself further behind.  		 */ @@ -1473,20 +1451,19 @@ no_one:  			counter_u64_add(hpts_hopelessly_behind, 1);  		goto no_run;  	} -	hpts->p_curtick = tcp_gethptstick(&tv); -	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + +	hpts->p_cur_slot = cts_to_wheel(cts);  	if (!seen_endpoint) {  		/* We saw no endpoint but we may be looping */  		orig_exit_slot = hpts->p_cur_slot;  	} -	if ((wrap_loop_cnt < 2) && -	    (hpts->p_lasttick != hpts->p_curtick)) { +	if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) {  		counter_u64_add(hpts_loops, 1);  		loop_cnt++;  		goto again;  	}  no_run: -	tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); +	pace->cts_last_ran[hpts->p_cpu] = cts;  	/*  	 * Set flag to tell that we are done for  	 * any slot input that happens during @@ -1494,25 +1471,36 @@ no_run:  	 */  	hpts->p_wheel_complete = 1;  	/* -	 * Now did we spend too long running input and need to run more ticks? -	 * Note that if wrap_loop_cnt < 2 then we should have the conditions -	 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt -	 * is greater than 2, then the condtion most likely are *not* true. -	 * Also if we are called not from the callout, we don't run the wheel -	 * multiple times so the slots may not align either. -	 */ -	KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || -		 (wrap_loop_cnt >= 2) || !from_callout), -		("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, -		 hpts->p_prev_slot, hpts->p_cur_slot)); -	KASSERT(((hpts->p_lasttick == hpts->p_curtick) -		 || (wrap_loop_cnt >= 2) || !from_callout), -		("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, -		 hpts->p_lasttick, hpts->p_curtick)); -	if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { -		hpts->p_curtick = tcp_gethptstick(&tv); +	* If enough time has elapsed that we should be processing the next +	* slot(s), then we should have kept running and not marked the wheel as +	* complete. +	* +	* But there are several other conditions where we would have stopped +	* processing, so the prev/cur slots and cts variables won't match. +	* These conditions are: +	* +	* - Calls not from callouts don't run multiple times +	* - The wheel is empty +	* - We've processed more than max_pacer_loops times +	* - We've wrapped more than 2 times +	* +	* This assert catches when the logic above has violated this design. +	* +	*/ +	KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) || +		 (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) || +		 ((hpts->p_prev_slot == hpts->p_cur_slot) && +		  !tcp_hpts_different_slots(cts, cts_last_run))), +		("H:%p Shouldn't be done! 
prev_slot:%u, cur_slot:%u, " +		 "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d", +		 hpts, hpts->p_prev_slot, hpts->p_cur_slot, +		 cts_last_run, cts, loop_cnt, wrap_loop_cnt)); + +	if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) { +		microuptime(&tv); +		cts = tcp_tv_to_usec(&tv); +		hpts->p_cur_slot = cts_to_wheel(cts);  		counter_u64_add(hpts_loops, 1); -		hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);  		goto again;  	} @@ -1526,16 +1514,16 @@ no_run:  }  void -tcp_set_hpts(struct tcpcb *tp) +__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)  {  	struct tcp_hpts_entry *hpts;  	int failed;  	INP_WLOCK_ASSERT(tptoinpcb(tp)); -	hpts = tcp_hpts_lock(tp); +	hpts = tcp_hpts_lock(pace, tp);  	if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { -		tp->t_hpts_cpu = hpts_cpuid(tp, &failed); +		tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);  		if (failed == 0)  			tp->t_flags2 |= TF2_HPTS_CPU_SET;  	} @@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp)  }  static struct tcp_hpts_entry * -tcp_choose_hpts_to_run(void) +tcp_choose_hpts_to_run(struct tcp_hptsi *pace)  { +	struct timeval tv;  	int i, oldest_idx, start, end;  	uint32_t cts, time_since_ran, calc; -	cts = tcp_get_usecs(NULL); +	microuptime(&tv); +	cts = tcp_tv_to_usec(&tv);  	time_since_ran = 0;  	/* Default is all one group */  	start = 0; -	end = tcp_pace.rp_num_hptss; +	end = pace->rp_num_hptss;  	/*  	 * If we have more than one L3 group figure out which one  	 * this CPU is in.  	 */ -	if (tcp_pace.grp_cnt > 1) { -		for (i = 0; i < tcp_pace.grp_cnt; i++) { -			if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { -				start = tcp_pace.grps[i]->cg_first; -				end = (tcp_pace.grps[i]->cg_last + 1); +	if (pace->grp_cnt > 1) { +		for (i = 0; i < pace->grp_cnt; i++) { +			if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) { +				start = pace->grps[i]->cg_first; +				end = (pace->grps[i]->cg_last + 1);  				break;  			}  		}  	}  	oldest_idx = -1;  	for (i = start; i < end; i++) { -		if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) -			calc = cts - tcp_pace.cts_last_ran[i]; +		if (TSTMP_GT(cts, pace->cts_last_ran[i])) +			calc = cts - pace->cts_last_ran[i];  		else  			calc = 0;  		if (calc > time_since_ran) { @@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void)  		}  	}  	if (oldest_idx >= 0) -		return(tcp_pace.rp_ent[oldest_idx]); +		return(pace->rp_ent[oldest_idx]);  	else -		return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); +		return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);  }  static void @@ -1588,9 +1578,9 @@ __tcp_run_hpts(void)  {  	struct epoch_tracker et;  	struct tcp_hpts_entry *hpts; -	int ticks_ran; +	int slots_ran; -	hpts = tcp_choose_hpts_to_run(); +	hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);  	if (hpts->p_hpts_active) {  		/* Already active */ @@ -1606,12 +1596,11 @@ __tcp_run_hpts(void)  	hpts->syscall_cnt++;  	counter_u64_add(hpts_direct_call, 1);  	hpts->p_hpts_active = 1; -	ticks_ran = tcp_hptsi(hpts, false); +	slots_ran = tcp_hptsi(hpts, false);  	/* We may want to adjust the sleep values here */  	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { -		if (ticks_ran > slots_indicate_less_sleep) { +		if (slots_ran > slots_indicate_less_sleep) {  			struct timeval tv; -			sbintime_t sb;  			hpts->p_mysleep.tv_usec /= 2;  			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) @@ -1635,13 +1624,8 @@ __tcp_run_hpts(void)  			 * the dynamic value and set the on_min_sleep  			 * flag so we will not be awoken.  			 
*/ -			sb = tvtosbt(tv); -			/* Store off to make visible the actual sleep time */ -			hpts->sleeping = tv.tv_usec; -			callout_reset_sbt_on(&hpts->co, sb, 0, -					     hpts_timeout_swi, hpts, hpts->p_cpu, -					     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); -		} else if (ticks_ran < slots_indicate_more_sleep) { +			(void)tcp_hpts_sleep(hpts, &tv); +		} else if (slots_ran < slots_indicate_more_sleep) {  			/* For the further sleep, don't reschedule  hpts */  			hpts->p_mysleep.tv_usec *= 2;  			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) @@ -1658,17 +1642,22 @@ out_with_mtx:  static void  tcp_hpts_thread(void *ctx)  { +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif  	struct tcp_hpts_entry *hpts;  	struct epoch_tracker et;  	struct timeval tv; -	sbintime_t sb; -	int ticks_ran; +	int slots_ran;  	hpts = (struct tcp_hpts_entry *)ctx; +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif  	HPTS_LOCK(hpts);  	if (hpts->p_direct_wake) {  		/* Signaled by input or output with low occupancy count. */ -		callout_stop(&hpts->co); +		_callout_stop_safe(&hpts->co, 0);  		counter_u64_add(hpts_direct_awakening, 1);  	} else {  		/* Timed out, the normal case. */ @@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx)  	}  	hpts->sleeping = 0;  	hpts->p_hpts_active = 1; -	ticks_ran = tcp_hptsi(hpts, true); +	slots_ran = tcp_hptsi(hpts, true);  	tv.tv_sec = 0;  	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;  	if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { @@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx)  			 * Only adjust sleep time if we were  			 * called from the callout i.e. direct_wake == 0.  			 */ -			if (ticks_ran < slots_indicate_more_sleep) { +			if (slots_ran < slots_indicate_more_sleep) {  				hpts->p_mysleep.tv_usec *= 2;  				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)  					hpts->p_mysleep.tv_usec = dynamic_max_sleep; -			} else if (ticks_ran > slots_indicate_less_sleep) { +			} else if (slots_ran > slots_indicate_less_sleep) {  				hpts->p_mysleep.tv_usec /= 2;  				if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)  					hpts->p_mysleep.tv_usec = dynamic_min_sleep; @@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx)  	hpts->p_hpts_active = 0;  back_to_sleep:  	hpts->p_direct_wake = 0; -	sb = tvtosbt(tv); -	/* Store off to make visible the actual sleep time */ -	hpts->sleeping = tv.tv_usec; -	callout_reset_sbt_on(&hpts->co, sb, 0, -			     hpts_timeout_swi, hpts, hpts->p_cpu, -			     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); +	(void)tcp_hpts_sleep(hpts, &tv);  	NET_EPOCH_EXIT(et);  	HPTS_UNLOCK(hpts);  } -#undef	timersub -  static int32_t  hpts_count_level(struct cpu_group *cg)  { @@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g  	}  } -static void -tcp_hpts_mod_load(void) +/* + * Initialize a tcp_hptsi structure. This performs the core initialization + * without starting threads. + */ +struct tcp_hptsi* +tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)  { +	struct tcp_hptsi *pace;  	struct cpu_group *cpu_top; -	int32_t error __diagused; -	int32_t i, j, bound = 0, created = 0; +	uint32_t i, j, cts; +	int32_t count;  	size_t sz, asz;  	struct timeval tv; -	sbintime_t sb;  	struct tcp_hpts_entry *hpts; -	struct pcpu *pc;  	char unit[16];  	uint32_t ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; -	int count, domain; +	KASSERT(funcs != NULL, ("funcs is NULL")); + +	/* Allocate the main structure */ +	pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO); +	if (pace == NULL) +		return (NULL); + +	memset(pace, 0, sizeof(*pace)); +	pace->funcs = funcs; + +	/* Setup CPU topology information */  #ifdef SMP  	cpu_top = smp_topo();  #else  	cpu_top = NULL;  #endif -	tcp_pace.rp_num_hptss = ncpus; -	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); -	hpts_loops = counter_u64_alloc(M_WAITOK); -	back_tosleep = counter_u64_alloc(M_WAITOK); -	combined_wheel_wrap = counter_u64_alloc(M_WAITOK); -	wheel_wrap = counter_u64_alloc(M_WAITOK); -	hpts_wake_timeout = counter_u64_alloc(M_WAITOK); -	hpts_direct_awakening = counter_u64_alloc(M_WAITOK); -	hpts_back_tosleep = counter_u64_alloc(M_WAITOK); -	hpts_direct_call = counter_u64_alloc(M_WAITOK); -	cpu_uses_flowid = counter_u64_alloc(M_WAITOK); -	cpu_uses_random = counter_u64_alloc(M_WAITOK); +	pace->rp_num_hptss = ncpus; -	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); -	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); -	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); -	tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); -	tcp_pace.grp_cnt = 0; +	/* Allocate hpts entry array */ +	sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *)); +	pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); + +	/* Allocate timestamp tracking array */ +	sz = (sizeof(uint32_t) * pace->rp_num_hptss); +	pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); + +	/* Setup CPU groups */  	if (cpu_top == NULL) { -		tcp_pace.grp_cnt = 1; +		pace->grp_cnt = 1;  	} else {  		/* Find out how many cache level 3 domains we have */  		count = 0; -		tcp_pace.grp_cnt = hpts_count_level(cpu_top); -		if (tcp_pace.grp_cnt == 0) { -			tcp_pace.grp_cnt = 1; +		pace->grp_cnt = hpts_count_level(cpu_top); +		if (pace->grp_cnt == 0) { +			pace->grp_cnt = 1;  		} -		sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); -		tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); +		sz = (pace->grp_cnt * sizeof(struct cpu_group *)); +		pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);  		/* Now populate the groups */ -		if (tcp_pace.grp_cnt == 1) { +		if (pace->grp_cnt == 1) {  			/*  			 * All we need is the top level all cpu's are in  			 * the same cache so when we use grp[0]->cg_mask @@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void)  			 * all cpu's in it. The level here is probably  			 * zero which is ok.  			 */ -			tcp_pace.grps[0] = cpu_top; +			pace->grps[0] = cpu_top;  		} else {  			/*  			 * Here we must find all the level three cache domains  			 * and setup our pointers to them.  			 */  			count = 0; -			hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); +			hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);  		}  	} + +	/* Cache the current time for initializing the hpts entries */ +	microuptime(&tv); +	cts = tcp_tv_to_usec(&tv); + +	/* Initialize each hpts entry */  	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; -	for (i = 0; i < tcp_pace.rp_num_hptss; i++) { -		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), +	for (i = 0; i < pace->rp_num_hptss; i++) { +		pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),  		    M_TCPHPTS, M_WAITOK | M_ZERO); -		tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); -		hpts = tcp_pace.rp_ent[i]; -		/* -		 * Init all the hpts structures that are not specifically -		 * zero'd by the allocations. 
Also lets attach them to the -		 * appropriate sysctl block as well. -		 */ -		mtx_init(&hpts->p_mtx, "tcp_hpts_lck", -		    "hpts", MTX_DEF | MTX_DUPOK); -		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { -			TAILQ_INIT(&hpts->p_hptss[j].head); -			hpts->p_hptss[j].count = 0; -			hpts->p_hptss[j].gencnt = 0; -		} -		sysctl_ctx_init(&hpts->hpts_ctx); -		sprintf(unit, "%d", i); -		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, -		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), -		    OID_AUTO, -		    unit, -		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0, -		    ""); -		SYSCTL_ADD_INT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "out_qcnt", CTLFLAG_RD, -		    &hpts->p_on_queue_cnt, 0, -		    "Count TCB's awaiting output processing"); -		SYSCTL_ADD_U16(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "active", CTLFLAG_RD, -		    &hpts->p_hpts_active, 0, -		    "Is the hpts active"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "curslot", CTLFLAG_RD, -		    &hpts->p_cur_slot, 0, -		    "What the current running pacers goal"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "runtick", CTLFLAG_RD, -		    &hpts->p_runningslot, 0, -		    "What the running pacers current slot is"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "curtick", CTLFLAG_RD, -		    &hpts->p_curtick, 0, -		    "What the running pacers last tick mapped to the wheel was"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "lastran", CTLFLAG_RD, -		    &tcp_pace.cts_last_ran[i], 0, -		    "The last usec tick that this hpts ran"); -		SYSCTL_ADD_LONG(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "cur_min_sleep", CTLFLAG_RD, -		    &hpts->p_mysleep.tv_usec, -		    "What the running pacers is using for p_mysleep.tv_usec"); -		SYSCTL_ADD_U64(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "now_sleeping", CTLFLAG_RD, -		    &hpts->sleeping, 0, -		    "What the running pacers is actually sleeping for"); -		SYSCTL_ADD_U64(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "syscall_cnt", CTLFLAG_RD, -		    &hpts->syscall_cnt, 0, -		    "How many times we had syscalls on this hpts"); +		pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, +		    M_WAITOK | M_ZERO); +		hpts = pace->rp_ent[i]; +		/* Basic initialization */  		hpts->p_hpts_sleep_time = hpts_sleep_max; -		hpts->p_num = i; -		hpts->p_curtick = tcp_gethptstick(&tv); -		tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv); -		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); -		hpts->p_cpu = 0xffff; +		hpts->p_cpu = i; +		pace->cts_last_ran[i] = cts; +		hpts->p_cur_slot = cts_to_wheel(cts); +		hpts->p_prev_slot = hpts->p_cur_slot;  		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);  		callout_init(&hpts->co, 1); +		hpts->p_hptsi = pace; +		mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", +		    MTX_DEF | MTX_DUPOK); +		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { +			TAILQ_INIT(&hpts->p_hptss[j].head); +		} + +		/* Setup SYSCTL if requested */ +		if (enable_sysctl) { +			sysctl_ctx_init(&hpts->hpts_ctx); +			sprintf(unit, "%d", i); +			hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, +			    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), +			    OID_AUTO, +			    unit, +			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0, +			    ""); +			SYSCTL_ADD_INT(&hpts->hpts_ctx, +			   
 SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "out_qcnt", CTLFLAG_RD, +			    &hpts->p_on_queue_cnt, 0, +			    "Count TCB's awaiting output processing"); +			SYSCTL_ADD_U16(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "active", CTLFLAG_RD, +			    &hpts->p_hpts_active, 0, +			    "Is the hpts active"); +			SYSCTL_ADD_UINT(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "curslot", CTLFLAG_RD, +			    &hpts->p_cur_slot, 0, +			    "What the current running pacers goal"); +			SYSCTL_ADD_UINT(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "runslot", CTLFLAG_RD, +			    &hpts->p_runningslot, 0, +			    "What the running pacers current slot is"); +			SYSCTL_ADD_UINT(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "lastran", CTLFLAG_RD, +			    &pace->cts_last_ran[i], 0, +			    "The last usec timestamp that this hpts ran"); +			SYSCTL_ADD_LONG(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "cur_min_sleep", CTLFLAG_RD, +			    &hpts->p_mysleep.tv_usec, +			    "What the running pacers is using for p_mysleep.tv_usec"); +			SYSCTL_ADD_U64(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "now_sleeping", CTLFLAG_RD, +			    &hpts->sleeping, 0, +			    "What the running pacers is actually sleeping for"); +			SYSCTL_ADD_U64(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "syscall_cnt", CTLFLAG_RD, +			    &hpts->syscall_cnt, 0, +			    "How many times we had syscalls on this hpts"); +		}  	} -	/* Don't try to bind to NUMA domains if we don't have any */ -	if (vm_ndomains == 1 && tcp_bind_threads == 2) -		tcp_bind_threads = 0; -	/* -	 * Now lets start ithreads to handle the hptss. -	 */ -	for (i = 0; i < tcp_pace.rp_num_hptss; i++) { -		hpts = tcp_pace.rp_ent[i]; -		hpts->p_cpu = i; +	return (pace); +} + +/* + * Create threads for a tcp_hptsi structure and starts timers for the current + * (minimum) sleep interval. 
+ */ +void +tcp_hptsi_start(struct tcp_hptsi *pace) +{ +	struct tcp_hpts_entry *hpts; +	struct pcpu *pc; +	struct timeval tv; +	uint32_t i, j; +	int count, domain; +	int error __diagused; + +	KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL")); + +	/* Start threads for each hpts entry */ +	for (i = 0; i < pace->rp_num_hptss; i++) { +		hpts = pace->rp_ent[i]; + +		KASSERT(hpts->ie_cookie == NULL, +		    ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));  		error = swi_add(&hpts->ie, "hpts",  		    tcp_hpts_thread, (void *)hpts,  		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);  		KASSERT(error == 0, -			("Can't add hpts:%p i:%d err:%d", -			 hpts, i, error)); -		created++; -		hpts->p_mysleep.tv_sec = 0; -		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; +		    ("Can't add hpts:%p i:%d err:%d", hpts, i, error)); +  		if (tcp_bind_threads == 1) { -			if (intr_event_bind(hpts->ie, i) == 0) -				bound++; +			(void)intr_event_bind(hpts->ie, i);  		} else if (tcp_bind_threads == 2) {  			/* Find the group for this CPU (i) and bind into it */ -			for (j = 0; j < tcp_pace.grp_cnt; j++) { -				if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { +			for (j = 0; j < pace->grp_cnt; j++) { +				if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) {  					if (intr_event_bind_ithread_cpuset(hpts->ie, -						&tcp_pace.grps[j]->cg_mask) == 0) { -						bound++; +					    &pace->grps[j]->cg_mask) == 0) {  						pc = pcpu_find(i);  						domain = pc->pc_domain; -						count = hpts_domains[domain].count; -						hpts_domains[domain].cpu[count] = i; -						hpts_domains[domain].count++; +						count = pace->domains[domain].count; +						pace->domains[domain].cpu[count] = i; +						pace->domains[domain].count++;  						break;  					}  				}  			}  		} + +		hpts->p_mysleep.tv_sec = 0; +		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;  		tv.tv_sec = 0;  		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; -		hpts->sleeping = tv.tv_usec; -		sb = tvtosbt(tv); -		callout_reset_sbt_on(&hpts->co, sb, 0, -				     hpts_timeout_swi, hpts, hpts->p_cpu, -				     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); -	} -	/* -	 * If we somehow have an empty domain, fall back to choosing -	 * among all htps threads. -	 */ -	for (i = 0; i < vm_ndomains; i++) { -		if (hpts_domains[i].count == 0) { -			tcp_bind_threads = 0; -			break; -		} +		(void)tcp_hpts_sleep(hpts, &tv);  	} -	tcp_hpts_softclock = __tcp_run_hpts; -	tcp_lro_hpts_init(); -	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", -	    created, bound, -	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");  } -static void -tcp_hpts_mod_unload(void) +/* + * Stop all callouts/threads for a tcp_hptsi structure. 
+ */ +void +tcp_hptsi_stop(struct tcp_hptsi *pace)  { +	struct tcp_hpts_entry *hpts;  	int rv __diagused; +	uint32_t i; -	tcp_lro_hpts_uninit(); -	atomic_store_ptr(&tcp_hpts_softclock, NULL); +	KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL")); -	for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { -		struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; +	for (i = 0; i < pace->rp_num_hptss; i++) { +		hpts = pace->rp_ent[i]; +		KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i)); +		KASSERT(hpts->ie_cookie != NULL, +		    ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i)); -		rv = callout_drain(&hpts->co); +		rv = _callout_stop_safe(&hpts->co, CS_DRAIN);  		MPASS(rv != 0);  		rv = swi_remove(hpts->ie_cookie);  		MPASS(rv == 0); +		hpts->ie_cookie = NULL; +	} +} -		rv = sysctl_ctx_free(&hpts->hpts_ctx); -		MPASS(rv == 0); +/* + * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create. + */ +void +tcp_hptsi_destroy(struct tcp_hptsi *pace) +{ +	struct tcp_hpts_entry *hpts; +	uint32_t i; + +	KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL")); +	KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL")); + +	/* Cleanup each hpts entry */ +	for (i = 0; i < pace->rp_num_hptss; i++) { +		hpts = pace->rp_ent[i]; +		if (hpts != NULL) { +			/* Cleanup SYSCTL if it was initialized */ +			if (hpts->hpts_root != NULL) { +				sysctl_ctx_free(&hpts->hpts_ctx); +			} -		mtx_destroy(&hpts->p_mtx); -		free(hpts->p_hptss, M_TCPHPTS); -		free(hpts, M_TCPHPTS); +			mtx_destroy(&hpts->p_mtx); +			free(hpts->p_hptss, M_TCPHPTS); +			free(hpts, M_TCPHPTS); +		}  	} -	free(tcp_pace.rp_ent, M_TCPHPTS); -	free(tcp_pace.cts_last_ran, M_TCPHPTS); +	/* Cleanup main arrays */ +	free(pace->rp_ent, M_TCPHPTS); +	free(pace->cts_last_ran, M_TCPHPTS);  #ifdef SMP -	free(tcp_pace.grps, M_TCPHPTS); +	free(pace->grps, M_TCPHPTS);  #endif +	/* Free the main structure */ +	free(pace, M_TCPHPTS); +} + +static int +tcp_hpts_mod_load(void) +{ +	int i; + +	/* Don't try to bind to NUMA domains if we don't have any */ +	if (vm_ndomains == 1 && tcp_bind_threads == 2) +		tcp_bind_threads = 0; + +	/* Create the tcp_hptsi structure */ +	tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true); +	if (tcp_hptsi_pace == NULL) +		return (ENOMEM); + +	/* Initialize global counters */ +	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); +	hpts_loops = counter_u64_alloc(M_WAITOK); +	back_tosleep = counter_u64_alloc(M_WAITOK); +	combined_wheel_wrap = counter_u64_alloc(M_WAITOK); +	wheel_wrap = counter_u64_alloc(M_WAITOK); +	hpts_wake_timeout = counter_u64_alloc(M_WAITOK); +	hpts_direct_awakening = counter_u64_alloc(M_WAITOK); +	hpts_back_tosleep = counter_u64_alloc(M_WAITOK); +	hpts_direct_call = counter_u64_alloc(M_WAITOK); +	cpu_uses_flowid = counter_u64_alloc(M_WAITOK); +	cpu_uses_random = counter_u64_alloc(M_WAITOK); + +	/* Start the threads */ +	tcp_hptsi_start(tcp_hptsi_pace); + +	/* Enable the global HPTS softclock function */ +	tcp_hpts_softclock = __tcp_run_hpts; + +	/* Initialize LRO HPTS */ +	tcp_lro_hpts_init(); + +	/* +	 * If we somehow have an empty domain, fall back to choosing among all +	 * HPTS threads. +	 */ +	for (i = 0; i < vm_ndomains; i++) { +		if (tcp_hptsi_pace->domains[i].count == 0) { +			tcp_bind_threads = 0; +			break; +		} +	} + +	printf("TCP HPTS started %u (%s) swi interrupt threads\n", +		tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ? +		 "(unbounded)" : +		 (tcp_bind_threads == 1 ? 
"per-cpu" : "per-NUMA-domain")); + +	return (0); +} + +static void +tcp_hpts_mod_unload(void) +{ +	tcp_lro_hpts_uninit(); + +	/* Disable the global HPTS softclock function */ +	atomic_store_ptr(&tcp_hpts_softclock, NULL); + +	tcp_hptsi_stop(tcp_hptsi_pace); +	tcp_hptsi_destroy(tcp_hptsi_pace); +	tcp_hptsi_pace = NULL; + +	/* Cleanup global counters */  	counter_u64_free(hpts_hopelessly_behind);  	counter_u64_free(hpts_loops);  	counter_u64_free(back_tosleep); @@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void)  }  static int -tcp_hpts_modevent(module_t mod, int what, void *arg) +tcp_hpts_mod_event(module_t mod, int what, void *arg)  { -  	switch (what) {  	case MOD_LOAD: -		tcp_hpts_mod_load(); -		return (0); +		return (tcp_hpts_mod_load());  	case MOD_QUIESCE:  		/*  		 * Since we are a dependency of TCP stack modules, they should @@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg)  static moduledata_t tcp_hpts_module = {  	.name = "tcphpts", -	.evhand = tcp_hpts_modevent, +	.evhand = tcp_hpts_mod_event,  };  DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); | 

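Finally, the create/start/stop/destroy entry points added by this change give the pacer a self-contained lifecycle, which is what lets the module load path (and, presumably, a KTEST) manage an instance explicitly. A hedged sketch of that lifecycle, using only names visible in the diff; the insertion call and error handling are placeholders:

/*
 * Sketch: instantiate a private pacer, run it, and tear it down.
 * tcp_hptsi_create() is called without sysctl registration here.
 */
static int
example_private_pacer(void)
{
	struct tcp_hptsi *pace;

	pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, false);
	if (pace == NULL)
		return (ENOMEM);
	tcp_hptsi_start(pace);
	/* ... queue tcpcbs via __tcp_hpts_insert(pace, tp, usecs, NULL) ... */
	tcp_hptsi_stop(pace);
	tcp_hptsi_destroy(pace);
	return (0);
}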