-rw-r--r--  sys/conf/files                         1
-rw-r--r--  sys/conf/options                       1
-rw-r--r--  sys/netinet/in_pcb.c                  48
-rw-r--r--  sys/netinet/in_pcb.h                  80
-rw-r--r--  sys/netinet/tcp_hpts.c              1964
-rw-r--r--  sys/netinet/tcp_hpts.h               304
-rw-r--r--  sys/netinet/tcp_stacks/fastpath.c     10
-rw-r--r--  sys/netinet/tcp_subr.c               261
-rw-r--r--  sys/netinet/tcp_syncache.c             6
-rw-r--r--  sys/netinet/tcp_usrreq.c              23
-rw-r--r--  sys/netinet/tcp_var.h                 17
-rw-r--r--  sys/sys/kern_prefetch.h               50
-rw-r--r--  sys/sys/mbuf.h                         5
13 files changed, 2735 insertions, 35 deletions
diff --git a/sys/conf/files b/sys/conf/files
index b2da16980cee..60a172d2f18d 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4355,6 +4355,7 @@ netinet/tcp_log_buf.c optional tcp_blackbox inet | tcp_blackbox inet6
netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
+netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
diff --git a/sys/conf/options b/sys/conf/options
index 2c50d3b47f3f..980cf73df885 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -218,6 +218,7 @@ SYSVMSG opt_sysvipc.h
SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
+TCPHPTS opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
UMTX_CHAINS opt_global.h
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 8545bd833a9c..19eb5af9596d 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
+#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
@@ -87,6 +88,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
+#ifdef TCPHPTS
+#include <netinet/tcp_hpts.h>
+#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#endif
@@ -1224,9 +1228,28 @@ in_pcbrele_rlocked(struct inpcb *inp)
}
return (0);
}
-
+
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+	 * this point in any form. We must
+ * get the lock to be sure.
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_RUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
@@ -1255,7 +1278,26 @@ in_pcbrele_wlocked(struct inpcb *inp)
}
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+	 * this point in any form. We must
+ * get the lock to be sure.
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_WUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 574ab4077aea..f4b6da20a2db 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -156,6 +156,7 @@ struct in_conninfo {
* from the global list.
*
* Key:
+ * (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
@@ -164,6 +165,51 @@ struct in_conninfo {
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
+ *
+ * Notes on the tcp_hpts:
+ *
+ * First, the hpts lock order is:
+ * 1) INP_WLOCK()
+ * 2) HPTS_LOCK() i.e. hpts->p_mtx
+ *
+ * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
+ * You may check the inp->inp_in_hpts flag without the hpts lock.
+ * Only the hpts will clear this flag, and it does so holding
+ * only the hpts lock. This means that in your tcp_output()
+ * routine, when you test the inp_in_hpts flag for 1,
+ * it may be transitioning to 0 (by the hpts).
+ * That's ok, since that just means an extra call to tcp_output
+ * that will most likely find that the call you executed
+ * (when the mismatch occurred) has put the TCB back
+ * on the hpts, and it will return. If your
+ * call did not add the inp back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note that you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Usually
+ * you are either doing this from a timer, where you need and have
+ * the INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ *
+ * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
+ * inp_input_cpu_set fields are controlled completely by
+ * the hpts. Do not ever set these. The inp_hpts_cpu_set
+ * and inp_input_cpu_set fields indicate if the hpts has
+ * set up the respective cpu field. If the _set field is 0,
+ * it is advised to enqueue the packet with the appropriate
+ * hpts_immediate() call. If the _set field is 1, then
+ * you may compare the inp_*_cpu field to curcpu and
+ * may want to insert onto the hpts again if these fields
+ * are not equal (i.e. you are not on the expected CPU).
+ *
+ * A note on inp_hpts_calls and inp_input_calls: these
+ * flags are set when the hpts calls either the output
+ * or the do_segment routine, respectively. If the routine
+ * being called wants to use this, then it needs to
+ * clear the flag before returning. The hpts will not
+ * clear the flag. The flags can be used to tell whether
+ * the hpts is the caller of the respective
+ * routine.
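+ *
+ * For illustration only (a hedged sketch, not part of this change),
+ * a stack's tcp_output() might use these flags roughly like:
+ *
+ *    if (inp->inp_in_hpts)
+ *        return (0);
+ *    hpts_calling = inp->inp_hpts_calls;
+ *    inp->inp_hpts_calls = 0;
+ *
+ * The first test defers output while a pacing timer is pending; the
+ * next two lines record whether the hpts was the caller and clear
+ * the flag, since the hpts never clears it itself. The local
+ * hpts_calling variable here is hypothetical.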
*
* A few other notes:
*
@@ -190,14 +236,45 @@ struct inpcb {
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
-#define inp_start_zero inp_refcount
+#define inp_start_zero inp_hpts
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
+ TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */
+
+ uint32_t inp_hpts_request; /* Current hpts request, zero if
+ * fits in the pacing window (i&b). */
+ /*
+ * Note the next fields are protected by a
+ * different lock (hpts-lock). This means that
+ * they must correspond in size to the smallest
+ * protectable bit field (uint8_t on x86, and
+ * other platforms potentially uint32_t?). Also,
+ * since CPU switches can occur at different times, the two
+ * fields can *not* be collapsed into a single bit field.
+ */
+#if defined(__amd64__) || defined(__i386__)
+ volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint8_t inp_in_input; /* on input hpts (lock b) */
+#else
+ volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint32_t inp_in_input; /* on input hpts (lock b) */
+#endif
+ volatile uint16_t inp_hpts_cpu; /* Lock (i) */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
+ volatile uint16_t inp_input_cpu; /* Lock (i) */
+ volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
+ inp_input_cpu_set : 1, /* on input hpts (i) */
+ inp_hpts_calls :1, /* (i) from output hpts */
+ inp_input_calls :1, /* (i) from input hpts */
+ inp_spare_bits2 : 4;
+ uint8_t inp_spare_byte; /* Compiler hole */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
+	uint32_t inp_hptsslot;	/* Hpts wheel slot this tcb is on, lock (i&b) */
+ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
+ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
@@ -638,6 +715,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
/*
* Flags passed to in_pcblookup*() functions.
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
new file mode 100644
index 000000000000..b3b8c9c0bb84
--- /dev/null
+++ b/sys/netinet/tcp_hpts.c
@@ -0,0 +1,1964 @@
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+/**
+ * Some notes about usage.
+ *
+ * The tcp_hpts system is designed to provide a high precision timer
+ * system for tcp. Its main purpose is to provide a mechanism for
+ * pacing packets out onto the wire. It can be used in two ways
+ * by a given TCP stack (and those two methods can be used simultaneously).
+ *
+ * First, and probably the main thing it's used by Rack and BBR for, it can
+ * be used to call tcp_output() of a transport stack at some time in the future.
+ * The normal way this is done is that tcp_output() of the stack schedules
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
+ * slot is the time from now that the stack wants to be called but it
+ * must be converted to tcp_hpts's notion of slot. This is done with
+ * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * call from the tcp_output() routine might look like:
+ *
+ * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ *
+ * The above would schedule tcp_output() to be called in 550 microseconds.
+ * Note that if using this mechanism the stack will want to add near
+ * its top a check to prevent unwanted calls (from user land or the
+ * arrival of incoming ACKs). So it would add something like:
+ *
+ * if (inp->inp_in_hpts)
+ * return;
+ *
+ * to prevent output processing until the allotted time has gone by.
+ * Of course this is a bare-bones example and the stack will probably
+ * have more considerations than just the above.
+ *
+ * Now the tcp_hpts system will call tcp_output in one of two forms:
+ * it will first check to see if the stack has defined a
+ * tfb_tcp_output_wtime() function; if so, that is the routine it
+ * will call. If that function is not defined then it will call the
+ * tfb_tcp_output() function. The only difference between these
+ * two calls is that the former passes the time in to the function
+ * so the function does not have to access the time (which tcp_hpts
+ * already has). What these functions do is of course totally up
+ * to the individual tcp stack.
+ *
+ * Now the second function (actually two functions, I guess :D)
+ * that the tcp_hpts system provides is the ability to either abort
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality.
+ *
+ * So in order to use the input redirection function the
+ * stack changes its tcp_do_segment() routine so that, instead
+ * of processing the data, it calls the function:
+ *
+ * tcp_queue_pkt_to_input()
+ *
+ * You will note that the arguments to this function look
+ * a lot like tcp_do_segment's arguments. This function
+ * will assure that the tcp_hpts system
+ * calls the function tfb_tcp_hpts_do_segment() from the
+ * correct CPU. Note that multiple calls can get pushed
+ * into the tcp_hpts system; this will be indicated by
+ * the next-to-last argument to tfb_tcp_hpts_do_segment()
+ * (nxt_pkt). If nxt_pkt is a 1 then another packet is
+ * coming. If nxt_pkt is a 0 then this is the last call
+ * that the tcp_hpts system has available for the tcp stack.
+ *
+ * The other point of the input system is to be able to safely
+ * drop a tcp connection without worrying about the recursive
+ * locking that may be occurring on the INP_WLOCK. So if
+ * a stack wants to drop a connection it calls:
+ *
+ * tcp_set_inp_to_drop(tp, ETIMEDOUT)
+ *
+ * To schedule the tcp_hpts system to call
+ *
+ * tcp_drop(tp, drop_reason)
+ *
+ * at a future point. This is quite handy to prevent locking
+ * issues when dropping connections.
+ *
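+ * For illustration only (a hedged sketch, not part of this change),
+ * a stack's tfb_tcp_hpts_do_segment() might use the nxt_pkt
+ * argument to batch its response, generating output only once the
+ * hpts has handed it the last queued packet:
+ *
+ *    if (nxt_pkt == 0) {
+ *        tp->t_flags |= TF_ACKNOW;
+ *        (void)tp->t_fb->tfb_tcp_output(tp);
+ *    }
+ *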
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/counter.h>
+#include <sys/time.h>
+#include <sys/kthread.h>
+#include <sys/kern_prefetch.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_hpts.h>
+
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+#include "opt_rss.h"
+
+MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+#ifdef RSS
+static int tcp_bind_threads = 1;
+#else
+static int tcp_bind_threads = 0;
+#endif
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+
+static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
+
+TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
+
+static struct tcp_hptsi tcp_pace;
+
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp,
+ struct tcpcb **tp);
+static void tcp_wakehpts(struct tcp_hpts_entry *p);
+static void tcp_wakeinput(struct tcp_hpts_entry *p);
+static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hpts_thread(void *ctx);
+static void tcp_init_hptsi(void *st);
+
+int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
+static int32_t tcp_hpts_callout_skip_swi = 0;
+
+SYSCTL_DECL(_net_inet_tcp);
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
+
+#define timersub(tvp, uvp, vvp) \
+ do { \
+ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
+ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
+ if ((vvp)->tv_usec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_usec += 1000000; \
+ } \
+ } while (0)
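+
+/*
+ * Usage note: timersub(&end, &start, &delta) stores end - start in
+ * delta, normalizing tv_usec into [0, 1000000).
+ */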
+
+static int32_t logging_on = 0;
+static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
+static int32_t tcp_hpts_precision = 120;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+ &tcp_hpts_precision, 120,
+ "Value for PRE() precision of callout");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &logging_on, 0,
+ "Turn on logging if compiled in");
+
+counter_u64_t hpts_loops;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+ &hpts_loops, "Number of times hpts had to loop to catch up");
+
+counter_u64_t back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+ &back_tosleep, "Number of times hpts found no tcbs");
+
+static int32_t in_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
+ &in_newts_every_tcb, 0,
+ "Do we have a new cts every tcb we process for input");
+static int32_t in_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
+ &in_ts_percision, 0,
+ "Do we use percise timestamp for clients on input");
+static int32_t out_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
+ &out_newts_every_tcb, 0,
+ "Do we have a new cts every tcb we process for output");
+static int32_t out_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
+ &out_ts_percision, 0,
+ "Do we use a percise timestamp for every output cts");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+ &hpts_sleep_max, 0,
+ "The maximum time the hpts will sleep <1 - 254>");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+ &tcp_min_hptsi_time, 0,
+ "The minimum time the hpts must sleep before processing more slots");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
+ &tcp_hpts_callout_skip_swi, 0,
+ "Do we have the callout call directly to the hpts?");
+
+static void
+__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
+ uint32_t ticknow, int32_t line)
+{
+ struct hpts_log *pl;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (hpts->p_log == NULL)
+ return;
+ pl = &hpts->p_log[hpts->p_log_at];
+ hpts->p_log_at++;
+ if (hpts->p_log_at >= hpts->p_logsize) {
+ hpts->p_log_at = 0;
+ hpts->p_log_wrapped = 1;
+ }
+ pl->inp = inp;
+ if (inp) {
+ pl->t_paceslot = inp->inp_hptsslot;
+ pl->t_hptsreq = inp->inp_hpts_request;
+ pl->p_onhpts = inp->inp_in_hpts;
+ pl->p_oninput = inp->inp_in_input;
+ } else {
+ pl->t_paceslot = 0;
+ pl->t_hptsreq = 0;
+ pl->p_onhpts = 0;
+ pl->p_oninput = 0;
+ }
+ pl->is_notempty = 1;
+ pl->event = event;
+ pl->line = line;
+ pl->cts = tcp_get_usecs(NULL);
+ pl->p_curtick = hpts->p_curtick;
+ pl->p_prevtick = hpts->p_prevtick;
+ pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
+ pl->ticknow = ticknow;
+ pl->slot_req = slot;
+ pl->p_nxt_slot = hpts->p_nxt_slot;
+ pl->p_cur_slot = hpts->p_cur_slot;
+ pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
+ pl->p_flags = (hpts->p_cpu & 0x7f);
+ pl->p_flags <<= 7;
+ pl->p_flags |= (hpts->p_num & 0x7f);
+ pl->p_flags <<= 2;
+ if (hpts->p_hpts_active) {
+ pl->p_flags |= HPTS_HPTS_ACTIVE;
+ }
+}
+
+#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
+
+static void
+hpts_timeout_swi(void *arg)
+{
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+static void
+hpts_timeout_dir(void *arg)
+{
+ tcp_hpts_thread(arg);
+}
+
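+/*
+ * Remove the inp from the given wheel bucket and keep the on-queue
+ * count sane; "clear" additionally resets inp_hpts_request and the
+ * inp_in_hpts flag.
+ */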
+static inline void
+hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_hpts_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_hpts == 0) {
+ /* We are not on the hpts? */
+ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
+ }
+ if (TAILQ_EMPTY(head) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /* We should not be empty with a queue count */
+ panic("%s hpts:%p hpts bucket empty but cnt:%d",
+ __FUNCTION__, hpts, hpts->p_on_queue_cnt);
+ }
+#endif
+ TAILQ_REMOVE(head, inp, inp_hpts);
+ hpts->p_on_queue_cnt--;
+ if (hpts->p_on_queue_cnt < 0) {
+		/* Count should not go negative. */
+#ifdef INVARIANTS
+ panic("Hpts goes negative inp:%p hpts:%p",
+ inp, hpts);
+#endif
+ hpts->p_on_queue_cnt = 0;
+ }
+ if (clear) {
+ inp->inp_hpts_request = 0;
+ inp->inp_in_hpts = 0;
+ }
+}
+
+static inline void
+hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_hpts_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if ((noref == 0) && (inp->inp_in_hpts == 1)) {
+ /* We are already on the hpts? */
+ panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_INSERT_TAIL(head, inp, inp_hpts);
+ inp->inp_in_hpts = 1;
+ hpts->p_on_queue_cnt++;
+ if (noref == 0) {
+ in_pcbref(inp);
+ }
+}
+
+static inline void
+hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_input_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_input == 0) {
+ /* We are not on the input hpts? */
+ panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
+ hpts->p_on_inqueue_cnt--;
+ if (hpts->p_on_inqueue_cnt < 0) {
+#ifdef INVARIANTS
+ panic("Hpts in goes negative inp:%p hpts:%p",
+ inp, hpts);
+#endif
+ hpts->p_on_inqueue_cnt = 0;
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ /* We should not be empty with a queue count */
+ panic("%s hpts:%p in_hpts input empty but cnt:%d",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ if (clear)
+ inp->inp_in_input = 0;
+}
+
+static inline void
+hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_input_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_input == 1) {
+ /* We are already on the input hpts? */
+ panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
+ inp->inp_in_input = 1;
+ hpts->p_on_inqueue_cnt++;
+ in_pcbref(inp);
+}
+
+static int
+sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
+{
+ struct tcp_hpts_entry *hpts;
+ size_t sz;
+ int32_t logging_was, i;
+ int32_t error = 0;
+
+ /*
+	 * HACK: Turn off logging so no locks are required. This really
+	 * needs a memory barrier :)
+ */
+ logging_was = logging_on;
+ logging_on = 0;
+ if (!req->oldptr) {
+ /* How much? */
+ sz = 0;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ hpts = tcp_pace.rp_ent[i];
+ if (hpts->p_log == NULL)
+ continue;
+ sz += (sizeof(struct hpts_log) * hpts->p_logsize);
+ }
+ error = SYSCTL_OUT(req, 0, sz);
+ } else {
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ hpts = tcp_pace.rp_ent[i];
+ if (hpts->p_log == NULL)
+ continue;
+ if (hpts->p_log_wrapped)
+ sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+ else
+ sz = (sizeof(struct hpts_log) * hpts->p_log_at);
+ error = SYSCTL_OUT(req, hpts->p_log, sz);
+ }
+ }
+ logging_on = logging_was;
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
+
+
+/*
+ * Try to get the INP_INFO lock.
+ *
+ * This function always succeeds in getting the lock. It will clear
+ * *tpp and return (1) if something critical changed while the inpcb
+ * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
+ *
+ * This function relies on the fact that the hpts always holds a
+ * reference on the inpcb while the segment is on the hptsi wheel and
+ * in the input queue.
+ *
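+ * A hedged usage sketch (not part of this change); a return of 1
+ * means *tpp was cleared and the caller must bail out:
+ *
+ *    if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
+ *        goto out;
+ *    }
+ *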
+ */
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
+{
+ struct tcp_function_block *tfb;
+ struct tcpcb *tp;
+ void *ptr;
+
+ /* Try the easy way. */
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
+ return (0);
+
+ /*
+ * OK, let's try the hard way. We'll save the function pointer block
+ * to make sure that doesn't change while we aren't holding the
+ * lock.
+ */
+ tp = *tpp;
+ tfb = tp->t_fb;
+ ptr = tp->t_fb_ptr;
+ INP_WUNLOCK(inp);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ /* If the session went away, return an error. */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ *tpp = NULL;
+ return (1);
+ }
+ /*
+ * If the function block or stack-specific data block changed,
+ * report an error.
+ */
+ tp = intotcpcb(inp);
+	if ((tp->t_fb != tfb) || (tp->t_fb_ptr != ptr)) {
+ *tpp = NULL;
+ return (1);
+ }
+ return (0);
+}
+
+
+static void
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
+{
+ HPTS_MTX_ASSERT(hpts);
+ swi_sched(hpts->ie_cookie, 0);
+ if (hpts->p_hpts_active == 2) {
+ /* Rare sleeping on a ENOBUF */
+ wakeup_one(hpts);
+ }
+}
+
+static void
+tcp_wakeinput(struct tcp_hpts_entry *hpts)
+{
+ HPTS_MTX_ASSERT(hpts);
+ swi_sched(hpts->ie_cookie, 0);
+ if (hpts->p_hpts_active == 2) {
+ /* Rare sleeping on a ENOBUF */
+ wakeup_one(hpts);
+ }
+}
+
+struct tcp_hpts_entry *
+tcp_cur_hpts(struct inpcb *inp)
+{
+ int32_t hpts_num;
+ struct tcp_hpts_entry *hpts;
+
+ hpts_num = inp->inp_hpts_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+ return (hpts);
+}
+
+struct tcp_hpts_entry *
+tcp_hpts_lock(struct inpcb *inp)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t hpts_num;
+
+again:
+ hpts_num = inp->inp_hpts_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (hpts_num != inp->inp_hpts_cpu) {
+ mtx_unlock(&hpts->p_mtx);
+ goto again;
+ }
+ return (hpts);
+}
+
+struct tcp_hpts_entry *
+tcp_input_lock(struct inpcb *inp)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t hpts_num;
+
+again:
+ hpts_num = inp->inp_input_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (hpts_num != inp->inp_input_cpu) {
+ mtx_unlock(&hpts->p_mtx);
+ goto again;
+ }
+ return (hpts);
+}
+
+static void
+tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
+{
+ int32_t add_freed;
+
+ if (inp->inp_flags2 & INP_FREED) {
+ /*
+ * Need to play a special trick so that in_pcbrele_wlocked
+ * does not return 1 when it really should have returned 0.
+ */
+ add_freed = 1;
+ inp->inp_flags2 &= ~INP_FREED;
+ } else {
+ add_freed = 0;
+ }
+#ifndef INP_REF_DEBUG
+ if (in_pcbrele_wlocked(inp)) {
+ /*
+ * This should not happen. We have the inpcb referred to by
+ * the main socket (why we are called) and the hpts. It
+ * should always return 0.
+ */
+ panic("inpcb:%p release ret 1",
+ inp);
+ }
+#else
+ if (__in_pcbrele_wlocked(inp, line)) {
+ /*
+ * This should not happen. We have the inpcb referred to by
+ * the main socket (why we are called) and the hpts. It
+ * should always return 0.
+ */
+ panic("inpcb:%p release ret 1",
+ inp);
+ }
+#endif
+ if (add_freed) {
+ inp->inp_flags2 |= INP_FREED;
+ }
+}
+
+static void
+tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+ if (inp->inp_in_hpts) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
+ tcp_remove_hpts_ref(inp, hpts, line);
+ }
+}
+
+static void
+tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_input) {
+ hpts_sane_input_remove(hpts, inp, 1);
+ tcp_remove_hpts_ref(inp, hpts, line);
+ }
+}
+
+/*
+ * Normally called with the INP lock held, but it
+ * does not matter; the hpts lock is the key,
+ * and the lock order allows us to hold the
+ * INP lock and then get the hpts lock.
+ *
+ * Valid values in the flags are
+ * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
+ * HPTS_REMOVE_INPUT - remove from the input of the hpts.
+ * Note that you can OR both values together and get both
+ * actions.
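+ *
+ * A hedged usage sketch (not part of this change), assuming the
+ * tcp_hpts_remove() wrapper macro from tcp_hpts.h:
+ *
+ *    tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT | HPTS_REMOVE_INPUT);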
+ */
+void
+__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ if (flags & HPTS_REMOVE_OUTPUT) {
+ hpts = tcp_hpts_lock(inp);
+ tcp_hpts_remove_locked_output(hpts, inp, flags, line);
+ mtx_unlock(&hpts->p_mtx);
+ }
+ if (flags & HPTS_REMOVE_INPUT) {
+ hpts = tcp_input_lock(inp);
+ tcp_hpts_remove_locked_input(hpts, inp, flags, line);
+ mtx_unlock(&hpts->p_mtx);
+ }
+}
+
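+/*
+ * Map an offset of "plus" ticks from the last tick the wheel
+ * processed (p_prevtick) onto a wheel slot index.
+ */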
+static inline int
+hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+{
+ return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+}
+
+static int
+tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
+{
+ int32_t need_wake = 0;
+ uint32_t ticknow = 0;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_hpts == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ if (hpts->p_hpts_active == 0) {
+ /* A sleeping hpts we want in next slot to run */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
+ hpts_tick(hpts, 1));
+ }
+ inp->inp_hptsslot = hpts_tick(hpts, 1);
+ inp->inp_hpts_request = 0;
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
+ }
+ need_wake = 1;
+ } else if ((void *)inp == hpts->p_inp) {
+ /*
+			 * We can't allow you to go into the same slot we
+			 * are in; we must place you in the next slot out.
+ */
+ inp->inp_hptsslot = hpts->p_nxt_slot;
+ } else
+ inp->inp_hptsslot = hpts->p_cur_slot;
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+ inp->inp_hpts_request = 0;
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
+ }
+ if (need_wake) {
+ /*
+ * Activate the hpts if it is sleeping and its
+ * timeout is not 1.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
+ }
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ }
+ }
+ return (need_wake);
+}
+
+int
+__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
+{
+ int32_t ret;
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
+ mtx_unlock(&hpts->p_mtx);
+ return (ret);
+}
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
+ struct hpts_diag *diag, int32_t noref)
+{
+ int32_t need_new_to = 0;
+ int32_t need_wakeup = 0;
+ uint32_t largest_slot;
+ uint32_t ticknow = 0;
+ uint32_t slot_calc;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (diag) {
+ memset(diag, 0, sizeof(struct hpts_diag));
+ diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_nxt_slot = hpts->p_nxt_slot;
+ diag->p_cur_slot = hpts->p_cur_slot;
+ diag->slot_req = slot;
+ }
+ if ((inp->inp_in_hpts == 0) || noref) {
+ inp->inp_hpts_request = slot;
+ if (slot == 0) {
+ /* Immediate */
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ return;
+ }
+ if (hpts->p_hpts_active) {
+ /*
+			 * It's slot - 1 since nxt_slot is the next tick that
+			 * will go off while the hpts is awake.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
+ }
+ /*
+			 * We want to make sure that we don't place an inp in
+ * the range of p_cur_slot <-> p_nxt_slot. If we
+ * take from p_nxt_slot to the end, plus p_cur_slot
+ * and then take away 2, we will know how many is
+ * the max slots we can use.
+ */
+ if (hpts->p_nxt_slot > hpts->p_cur_slot) {
+ /*
+ * Non-wrap case nxt_slot <-> cur_slot we
+ * don't want to land in. So the diff gives
+ * us what is taken away from the number of
+ * slots.
+ */
+ largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
+ } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
+ largest_slot = NUM_OF_HPTSI_SLOTS - 2;
+ } else {
+ /*
+ * Wrap case so the diff gives us the number
+ * of slots that we can land in.
+ */
+ largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ }
+ /*
+			 * We take away two so we never have a problem (20
+			 * usecs out of 1024000 usecs).
+ */
+ largest_slot -= 2;
+ if (inp->inp_hpts_request > largest_slot) {
+ /*
+ * Restrict max jump of slots and remember
+ * leftover
+ */
+ slot = largest_slot;
+ inp->inp_hpts_request -= largest_slot;
+ } else {
+ /* This one will run when we hit it */
+ inp->inp_hpts_request = 0;
+ }
+ if (hpts->p_nxt_slot == hpts->p_cur_slot)
+ slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
+ else
+ slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
+ if (slot_calc == hpts->p_cur_slot) {
+#ifdef INVARIANTS
+ /* TSNH */
+ panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
+ hpts, slot_calc, slot, largest_slot);
+#endif
+ if (slot_calc)
+ slot_calc--;
+ else
+ slot_calc = NUM_OF_HPTSI_SLOTS - 1;
+ }
+ inp->inp_hptsslot = slot_calc;
+ if (diag) {
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
+ } else {
+ /*
+ * The hpts is sleeping, we need to figure out where
+ * it will wake up at and if we need to reschedule
+ * its time-out.
+ */
+ uint32_t have_slept, yet_to_sleep;
+ uint32_t slot_now;
+ struct timeval tv;
+
+ ticknow = tcp_gethptstick(&tv);
+ slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
+ /*
+ * The user wants to be inserted at (slot_now +
+ * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
+ */
+ largest_slot = NUM_OF_HPTSI_SLOTS - 2;
+ if (inp->inp_hpts_request > largest_slot) {
+ /* Adjust the residual in inp_hpts_request */
+ slot = largest_slot;
+ inp->inp_hpts_request -= largest_slot;
+ } else {
+ /* No residual it all fits */
+ inp->inp_hpts_request = 0;
+ }
+ inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
+ if (diag) {
+ diag->slot_now = slot_now;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
+ }
+ /* Now do we need to restart the hpts's timer? */
+ if (TSTMP_GT(ticknow, hpts->p_curtick))
+ have_slept = ticknow - hpts->p_curtick;
+ else
+ have_slept = 0;
+ if (have_slept < hpts->p_hpts_sleep_time) {
+ /* This should be what happens */
+ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
+ } else {
+ /* We are over-due */
+ yet_to_sleep = 0;
+ need_wakeup = 1;
+ }
+ if (diag) {
+ diag->have_slept = have_slept;
+ diag->yet_to_sleep = yet_to_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
+ }
+ if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ /*
+				 * We need to reschedule the hpts's time-out.
+ */
+ hpts->p_hpts_sleep_time = slot;
+ need_new_to = slot * HPTS_TICKS_PER_USEC;
+ }
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
+ }
+ /*
+		 * Now, how far out is the hpts sleeping? If active is 1, it's
+		 * up and ticking and we do nothing; otherwise we may need to
+		 * reschedule its callout if need_new_to is set from above.
+ */
+ if (need_wakeup) {
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
+ }
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ if (diag) {
+ diag->need_new_to = 0;
+ diag->co_ret = 0xffff0000;
+ }
+ } else if (need_new_to) {
+ int32_t co_ret;
+ struct timeval tv;
+ sbintime_t sb;
+
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ while (need_new_to > HPTS_USEC_IN_SEC) {
+ tv.tv_sec++;
+ need_new_to -= HPTS_USEC_IN_SEC;
+ }
+ tv.tv_usec = need_new_to;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ if (diag) {
+ diag->need_new_to = need_new_to;
+ diag->co_ret = co_ret;
+ }
+ }
+ } else {
+#ifdef INVARIANTS
+ panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
+#endif
+ }
+}
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
+ struct tcp_hpts_entry *hpts;
+ uint32_t slot_on, cts;
+ struct timeval tv;
+
+ /*
+	 * We now return the next slot the hpts will be on, beyond its
+ * current run (if up) or where it was when it stopped if it is
+ * sleeping.
+ */
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ if (in_ts_percision)
+ microuptime(&tv);
+ else
+ getmicrouptime(&tv);
+ cts = tcp_tv_to_usectick(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ slot_on = hpts->p_nxt_slot;
+ mtx_unlock(&hpts->p_mtx);
+ return (slot_on);
+}
+
+uint32_t
+__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line)
+{
+ return (tcp_hpts_insert_diag(inp, slot, line, NULL));
+}
+
+int
+__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
+{
+ int32_t retval = 0;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_input == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ hpts_sane_input_insert(hpts, inp, line);
+ retval = 1;
+ if (hpts->p_hpts_active == 0) {
+ /*
+ * Activate the hpts if it is sleeping.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
+ }
+ retval = 2;
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ } else if (hpts->p_hpts_active == 0) {
+ retval = 4;
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ return (retval);
+}
+
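+/*
+ * Stash the parse results for the segment in the mbuf pkthdr and
+ * chain the mbuf (via m_nextpkt) onto the tcb's t_in_pkt list for
+ * the hpts input path to process later.
+ */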
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked)
+{
+ /* Setup packet for input first */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
+ m->m_pkthdr.pace_tlen = (uint16_t) tlen;
+ m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
+ m->m_pkthdr.pace_tos = iptos;
+ m->m_pkthdr.pace_lock = (uint8_t) ti_locked;
+ if (tp->t_in_pkt == NULL) {
+ tp->t_in_pkt = m;
+ tp->t_tail_pkt = m;
+ } else {
+ tp->t_tail_pkt->m_nextpkt = m;
+ tp->t_tail_pkt = m;
+ }
+}
+
+
+int32_t
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t ret;
+
+ tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos, ti_locked);
+ hpts = tcp_input_lock(tp->t_inpcb);
+ ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ mtx_unlock(&hpts->p_mtx);
+ return (ret);
+}
+
+void
+__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp;
+
+ tp = intotcpcb(inp);
+ hpts = tcp_input_lock(tp->t_inpcb);
+ if (inp->inp_in_input == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ hpts_sane_input_insert(hpts, inp, line);
+ if (hpts->p_hpts_active == 0) {
+ /*
+ * Activate the hpts if it is sleeping.
+ */
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ } else if (hpts->p_hpts_active == 0) {
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ inp->inp_hpts_drop_reas = reason;
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static uint16_t
+hpts_random_cpu(struct inpcb *inp)
+{
+	/*
+	 * No flow type set; distribute the load randomly.
+	 */
+ uint16_t cpuid;
+ uint32_t ran;
+
+ /*
+ * If one has been set use it i.e. we want both in and out on the
+ * same hpts.
+ */
+ if (inp->inp_input_cpu_set) {
+ return (inp->inp_input_cpu);
+ } else if (inp->inp_hpts_cpu_set) {
+ return (inp->inp_hpts_cpu);
+ }
+ /* Nothing set use a random number */
+ ran = arc4random();
+ cpuid = (ran & 0xffff) % mp_ncpus;
+ return (cpuid);
+}
+
+static uint16_t
+hpts_cpuid(struct inpcb *inp)
+{
+	uint16_t cpuid;
+
+ /*
+ * If one has been set use it i.e. we want both in and out on the
+ * same hpts.
+ */
+ if (inp->inp_input_cpu_set) {
+ return (inp->inp_input_cpu);
+ } else if (inp->inp_hpts_cpu_set) {
+ return (inp->inp_hpts_cpu);
+ }
+ /* If one is set the other must be the same */
+#ifdef RSS
+ cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
+ if (cpuid == NETISR_CPUID_NONE)
+ return (hpts_random_cpu(inp));
+ else
+ return (cpuid);
+#else
+ /*
+ * We don't have a flowid -> cpuid mapping, so cheat and just map
+ * unknown cpuids to curcpu. Not the best, but apparently better
+ * than defaulting to swi 0.
+ */
+ if (inp->inp_flowtype != M_HASHTYPE_NONE) {
+ cpuid = inp->inp_flowid % mp_ncpus;
+ return (cpuid);
+ }
+ cpuid = hpts_random_cpu(inp);
+ return (cpuid);
+#endif
+}
+
+/*
+ * Do NOT try to optimize the processing of inp's
+ * by first pulling off all the inp's into a temporary
+ * list (e.g. TAILQ_CONCAT). If you do that the subtle
+ * interactions of switching CPUs will kill you because of
+ * problems in the linked list manipulation. Basically
+ * you would switch CPUs with the hpts mutex locked,
+ * but then while you were processing one of the inp's,
+ * some other one that you switched will get a new
+ * packet on the different CPU. It will insert it
+ * on the new hpts's input list. Creating a temporary
+ * link in the inp will not fix it either, since
+ * the other hpts will be doing the same thing and
+ * you will both end up using the temporary link.
+ *
+ * You will die in an ASSERT for tailq corruption if you
+ * run INVARIANTS or you will die horribly without
+ * INVARIANTS in some unknown way with a corrupt linked
+ * list.
+ */
+static void
+tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+ struct mbuf *m, *n;
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ uint16_t drop_reason;
+ int16_t set_cpu;
+ uint32_t did_prefetch = 0;
+ int32_t ti_locked = TI_UNLOCKED;
+
+ HPTS_MTX_ASSERT(hpts);
+ while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
+ HPTS_MTX_ASSERT(hpts);
+ hpts_sane_input_remove(hpts, inp, 0);
+ if (inp->inp_input_cpu_set == 0) {
+ set_cpu = 1;
+ } else {
+ set_cpu = 0;
+ }
+ hpts->p_inp = inp;
+ drop_reason = inp->inp_hpts_drop_reas;
+ inp->inp_in_input = 0;
+ mtx_unlock(&hpts->p_mtx);
+ if (drop_reason) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ } else {
+ ti_locked = TI_UNLOCKED;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+out:
+ hpts->p_inp = NULL;
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ if (in_pcbrele_wlocked(inp) == 0) {
+ INP_WUNLOCK(inp);
+ }
+ ti_locked = TI_UNLOCKED;
+ mtx_lock(&hpts->p_mtx);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if ((tp == NULL) || (tp->t_inpcb == NULL)) {
+ goto out;
+ }
+ if (drop_reason) {
+ /* This tcb is being destroyed for drop_reason */
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+ tp = tcp_drop(tp, drop_reason);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (tp == NULL) {
+ INP_WLOCK(inp);
+ }
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+ mtx_lock(&hpts->p_mtx);
+ continue;
+ }
+ if (set_cpu) {
+ /*
+			 * Set up so the next time we will move to the right
+			 * CPU. This should be a rare event. It will
+			 * sometimes happen when we are the client side
+			 * (usually not the server). Somehow tcp_output()
+			 * gets called before tcp_do_segment() sets the
+			 * initial state. This means the r_cpu and r_hpts_cpu
+			 * are 0. We get on the hpts, and then tcp_input()
+			 * gets called setting up the r_cpu to the correct
+			 * value. The hpts goes off and sees the mismatch.
+			 * We simply correct it here and the CPU will switch
+			 * to the new hpts next time the tcb gets added to
+			 * the hpts (not this time) :-)
+ */
+ tcp_set_hpts(inp);
+ }
+ CURVNET_SET(tp->t_vnet);
+ m = tp->t_in_pkt;
+ n = NULL;
+ if (m != NULL &&
+ (m->m_pkthdr.pace_lock == TI_RLOCKED ||
+ tp->t_state != TCPS_ESTABLISHED)) {
+ ti_locked = TI_RLOCKED;
+ if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
+ CURVNET_RESTORE();
+ goto out;
+ }
+ m = tp->t_in_pkt;
+ }
+ if (in_newts_every_tcb) {
+ if (in_ts_percision)
+ microuptime(tv);
+ else
+ getmicrouptime(tv);
+ }
+ if (tp->t_fb_ptr != NULL) {
+ kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ did_prefetch = 1;
+ }
+		/* Any input work to do? If so, do it first */
+ if ((m != NULL) && (m == tp->t_in_pkt)) {
+ struct tcphdr *th;
+ int32_t tlen, drop_hdrlen, nxt_pkt;
+ uint8_t iptos;
+
+ n = m->m_nextpkt;
+ tp->t_in_pkt = tp->t_tail_pkt = NULL;
+ while (m) {
+ th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
+ tlen = m->m_pkthdr.pace_tlen;
+ drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
+ iptos = m->m_pkthdr.pace_tos;
+ m->m_nextpkt = NULL;
+ if (n)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ inp->inp_input_calls = 1;
+ if (tp->t_fb->tfb_tcp_hpts_do_segment) {
+ /* Use the hpts specific do_segment */
+ (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
+ tp, drop_hdrlen,
+ tlen, iptos, ti_locked, nxt_pkt, tv);
+ } else {
+ /* Use the default do_segment */
+ (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
+ tp, drop_hdrlen,
+ tlen, iptos, ti_locked);
+ }
+ /*
+				 * Do segment returns unlocked; we need the
+				 * lock again, but we also need some KASSERTs
+				 * here.
+ */
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_UNLOCK_ASSERT(inp);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ if (m != NULL &&
+ m->m_pkthdr.pace_lock == TI_RLOCKED) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ } else
+ ti_locked = TI_UNLOCKED;
+ INP_WLOCK(inp);
+ /*
+ * Since we have an opening here we must
+ * re-check if the tcb went away while we
+ * were getting the lock(s).
+ */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_free:
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+ CURVNET_RESTORE();
+ goto out;
+ }
+ /*
+ * Now that we hold the INP lock, check if
+ * we need to upgrade our lock.
+ */
+ if (ti_locked == TI_UNLOCKED &&
+ (tp->t_state != TCPS_ESTABLISHED)) {
+ ti_locked = TI_RLOCKED;
+ if (tcp_hptsi_lock_inpinfo(inp, &tp))
+ goto out_free;
+ }
+ } /** end while(m) */
+ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_UNLOCK_ASSERT(inp);
+ ti_locked = TI_UNLOCKED;
+ mtx_lock(&hpts->p_mtx);
+ hpts->p_inp = NULL;
+ CURVNET_RESTORE();
+ }
+}
+
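+/*
+ * Compute how many wheel ticks have elapsed since the hpts last ran,
+ * clamped to the wheel size minus 2, and set p_nxt_slot to where the
+ * wheel will be once caught up. Returns -1 if no tick has passed
+ * (we were woken right away).
+ */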
+static int
+tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
+{
+ int32_t ticks_to_run;
+
+ if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
+ ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
+ if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
+ }
+ } else {
+ if (hpts->p_prevtick == hpts->p_curtick) {
+ /* This happens when we get woken up right away */
+ return (-1);
+ }
+ ticks_to_run = 1;
+ }
+	/* Set where we will be when we catch up */
+ hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
+ if (hpts->p_nxt_slot == hpts->p_cur_slot) {
+ panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
+ hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
+ }
+ return (ticks_to_run);
+}
+
+static void
+tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp = NULL, *ninp;
+ struct timeval tv;
+	int32_t ticks_to_run, i, error, tick_now, interim_tick;
+ int32_t paced_cnt = 0;
+ int32_t did_prefetch = 0;
+ int32_t prefetch_ninp = 0;
+ int32_t prefetch_tp = 0;
+ uint32_t cts;
+ int16_t set_cpu;
+
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_curtick = tcp_tv_to_hptstick(ctick);
+ cts = tcp_tv_to_usectick(ctick);
+ memcpy(&tv, ctick, sizeof(struct timeval));
+ hpts->p_cur_slot = hpts_tick(hpts, 1);
+
+ /* Figure out if we had missed ticks */
+again:
+ HPTS_MTX_ASSERT(hpts);
+ ticks_to_run = tcp_hpts_est_run(hpts);
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ panic("tp:%p in_hpts input empty but cnt:%d",
+ hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ HPTS_MTX_ASSERT(hpts);
+	/* Reset the ticks to run and time if we need to */
+	interim_tick = tcp_gethptstick(&tv);
+	if (interim_tick != hpts->p_curtick) {
+ /* Save off the new time we execute to */
+ *ctick = tv;
+		hpts->p_curtick = interim_tick;
+ cts = tcp_tv_to_usectick(&tv);
+ hpts->p_cur_slot = hpts_tick(hpts, 1);
+ ticks_to_run = tcp_hpts_est_run(hpts);
+ }
+ if (ticks_to_run == -1) {
+ goto no_run;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
+ }
+ if (hpts->p_on_queue_cnt == 0) {
+ goto no_one;
+ }
+ HPTS_MTX_ASSERT(hpts);
+ for (i = 0; i < ticks_to_run; i++) {
+ /*
+		 * Calculate our delay; if there are no extra ticks, there
+		 * was no delay.
+ */
+ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
+ HPTS_MTX_ASSERT(hpts);
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ /* For debugging */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
+ }
+ hpts->p_inp = inp;
+ paced_cnt++;
+ if (hpts->p_cur_slot != inp->inp_hptsslot) {
+ panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
+ hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ }
+ /* Now pull it */
+ if (inp->inp_hpts_cpu_set == 0) {
+ set_cpu = 1;
+ } else {
+ set_cpu = 0;
+ }
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ /* We prefetch the next inp if possible */
+ kern_prefetch(ninp, &prefetch_ninp);
+ prefetch_ninp = 1;
+ }
+ if (inp->inp_hpts_request) {
+ /*
+ * This guy is deferred out further in time
+				 * than our wheel had on it. Push him back
+ * on the wheel.
+ */
+ int32_t remaining_slots;
+
+ remaining_slots = ticks_to_run - (i + 1);
+ if (inp->inp_hpts_request > remaining_slots) {
+ /*
+ * Keep INVARIANTS happy by clearing
+ * the flag
+ */
+ tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ inp->inp_hpts_request = 0;
+ }
+ /*
+ * We clear the hpts flag here after dealing with
+ * remaining slots. This way anyone looking with the
+			 * TCB lock will see it's on the hpts until just
+ * before we unlock.
+ */
+ inp->inp_in_hpts = 0;
+ mtx_unlock(&hpts->p_mtx);
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+out_now:
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ INP_WUNLOCK(inp);
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if ((tp == NULL) || (tp->t_inpcb == NULL)) {
+ goto out_now;
+ }
+ if (set_cpu) {
+ /*
+				 * Set up so the next time we will move to
+				 * the right CPU. This should be a rare
+				 * event. It will sometimes happen when we
+				 * are the client side (usually not the
+				 * server). Somehow tcp_output() gets called
+				 * before tcp_do_segment() sets the
+				 * initial state. This means the r_cpu and
+				 * r_hpts_cpu are 0. We get on the hpts, and
+				 * then tcp_input() gets called setting up
+				 * the r_cpu to the correct value. The hpts
+				 * goes off and sees the mismatch. We
+				 * simply correct it here and the CPU will
+				 * switch to the new hpts next time the tcb
+				 * gets added to the hpts (not this one)
+ * :-)
+ */
+ tcp_set_hpts(inp);
+ }
+ if (out_newts_every_tcb) {
+ struct timeval sv;
+
+ if (out_ts_percision)
+ microuptime(&sv);
+ else
+ getmicrouptime(&sv);
+ cts = tcp_tv_to_usectick(&sv);
+ }
+ CURVNET_SET(tp->t_vnet);
+ /*
+			 * There is a hole here: we get the refcnt on the
+			 * inp so it will still be preserved, but to make
+			 * sure we can get at the INP we need to hold the p_mtx
+			 * above while we pull out the tp/inp. As long as
+			 * fini gets the lock first, we are assured of having
+			 * a sane INP we can lock and test.
+ */
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx before tcp-output:%d",
+ hpts, __LINE__);
+ }
+#endif
+ if (tp->t_fb_ptr != NULL) {
+ kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ did_prefetch = 1;
+ }
+ inp->inp_hpts_calls = 1;
+ if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
+ error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
+ } else {
+ error = tp->t_fb->tfb_tcp_output(tp);
+ }
+ if (ninp && ninp->inp_ppcb) {
+ /*
+ * If we have a nxt inp, see if we can
+ * prefetch its ppcb. Note this may seem
+ * "risky" since we have no locks (other
+				 * than the previous inp) and there is no
+				 * assurance that ninp was not pulled while
+				 * we were processing inp and freed. If this
+				 * occurred it could mean that either:
+				 *
+				 * a) It's NULL (which is fine, we won't go
+				 * here) <or> b) It's valid (which is cool, we
+ * will prefetch it) <or> c) The inp got
+ * freed back to the slab which was
+ * reallocated. Then the piece of memory was
+ * re-used and something else (not an
+ * address) is in inp_ppcb. If that occurs
+ * we don't crash, but take a TLB shootdown
+ * performance hit (same as if it was NULL
+ * and we tried to pre-fetch it).
+ *
+				 * Considering that the likelihood of <c> is
+ * quite rare we will take a risk on doing
+ * this. If performance drops after testing
+ * we can always take this out. NB: the
+ * kern_prefetch on amd64 actually has
+ * protection against a bad address now via
+ * the DMAP_() tests. This will prevent the
+ * TLB hit, and instead if <c> occurs just
+ * cause us to load cache with a useless
+ * address (to us).
+ */
+ kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
+ prefetch_tp = 1;
+ }
+ INP_WUNLOCK(inp);
+ INP_UNLOCK_ASSERT(inp);
+ CURVNET_RESTORE();
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
+ hpts->p_inp = NULL;
+ }
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_inp = NULL;
+ hpts->p_cur_slot++;
+ if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_cur_slot = 0;
+ }
+ }
+no_one:
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_prevtick = hpts->p_curtick;
+ hpts->p_delayed_by = 0;
+ /*
+ * Check to see if we took an excess amount of time and need to run
+	 * more ticks (if we did not hit ENOBUFS).
+ */
+ /* Re-run any input that may be there */
+ (void)tcp_gethptstick(&tv);
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ panic("tp:%p in_hpts input empty but cnt:%d",
+ hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ tick_now = tcp_gethptstick(&tv);
+ if (SEQ_GT(tick_now, hpts->p_prevtick)) {
+ struct timeval res;
+
+ /* Did we really spend a full tick or more in here? */
+ timersub(&tv, ctick, &res);
+ if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ counter_u64_add(hpts_loops, 1);
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
+ }
+ *ctick = res;
+ hpts->p_curtick = tick_now;
+ goto again;
+ }
+ }
+no_run:
+ {
+ uint32_t t = 0, i, fnd = 0;
+
+ if (hpts->p_on_queue_cnt) {
+ /*
+ * Find next slot that is occupied and use that to
+ * be the sleep time.
+ */
+ for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
+ fnd = 1;
+ break;
+ }
+ t = (t + 1) % NUM_OF_HPTSI_SLOTS;
+ }
+ if (fnd) {
+ hpts->p_hpts_sleep_time = i;
+ } else {
+ counter_u64_add(back_tosleep, 1);
+#ifdef INVARIANTS
+ panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+#endif
+ hpts->p_on_queue_cnt = 0;
+ goto non_found;
+ }
+ t++;
+ } else {
+ /* No one on the wheel; sleep for all but 2 slots */
+non_found:
+ if (hpts_sleep_max == 0)
+ hpts_sleep_max = 1;
+ hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
+ t = 0;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
+ }
+ }
+}
+
+void
+__tcp_set_hpts(struct inpcb *inp, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ if ((inp->inp_in_hpts == 0) &&
+ (inp->inp_hpts_cpu_set == 0)) {
+ inp->inp_hpts_cpu = hpts_cpuid(inp);
+ inp->inp_hpts_cpu_set = 1;
+ }
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if ((inp->inp_input_cpu_set == 0) &&
+ (inp->inp_in_input == 0)) {
+ inp->inp_input_cpu = hpts_cpuid(inp);
+ inp->inp_input_cpu_set = 1;
+ }
+ mtx_unlock(&hpts->p_mtx);
+}
+
+uint16_t
+tcp_hpts_delayedby(struct inpcb *inp)
+{
+ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
+}
+
+static void
+tcp_hpts_thread(void *ctx)
+{
+ struct tcp_hpts_entry *hpts;
+ struct timeval tv;
+ sbintime_t sb;
+
+ hpts = (struct tcp_hpts_entry *)ctx;
+ mtx_lock(&hpts->p_mtx);
+ if (hpts->p_direct_wake) {
+ /* Signaled by input */
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
+ callout_stop(&hpts->co);
+ } else {
+ /* Timed out */
+ if (callout_pending(&hpts->co) ||
+ !callout_active(&hpts->co)) {
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
+ mtx_unlock(&hpts->p_mtx);
+ return;
+ }
+ callout_deactivate(&hpts->co);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
+ }
+ hpts->p_hpts_active = 1;
+ (void)tcp_gethptstick(&tv);
+ tcp_hptsi(hpts, &tv);
+ HPTS_MTX_ASSERT(hpts);
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ tv.tv_usec = tcp_min_hptsi_time;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /* Clear the min sleep flag */
+ hpts->p_on_min_sleep = 0;
+ }
+ hpts->p_hpts_active = 0;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ hpts->p_direct_wake = 0;
+ mtx_unlock(&hpts->p_mtx);
+}
+
+#undef timersub
+
+static void
+tcp_init_hptsi(void *st)
+{
+ int32_t i, j, error, bound = 0, created = 0;
+ size_t sz, asz;
+ struct timeval tv;
+ sbintime_t sb;
+ struct tcp_hpts_entry *hpts;
+ char unit[16];
+ uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+
+ tcp_pace.rp_proc = NULL;
+ tcp_pace.rp_num_hptss = ncpus;
+ hpts_loops = counter_u64_alloc(M_WAITOK);
+ back_tosleep = counter_u64_alloc(M_WAITOK);
+
+ sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ M_TCPHPTS, M_WAITOK | M_ZERO);
+ tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
+ M_TCPHPTS, M_WAITOK);
+ hpts = tcp_pace.rp_ent[i];
+ /*
+ * Init all the hpts structures that are not specifically
+ * zero'd by the allocations. Also attach them to the
+ * appropriate sysctl block.
+ */
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
+ "hpts", MTX_DEF | MTX_DUPOK);
+ TAILQ_INIT(&hpts->p_input);
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j]);
+ }
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ sprintf(unit, "%d", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "in_qcnt", CTLFLAG_RD,
+ &hpts->p_on_inqueue_cnt, 0,
+ "Count TCB's awaiting input processing");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current slot is if active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curtick", CTLFLAG_RD,
+ &hpts->p_curtick, 0,
+ "What the current tick on if active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "logsize", CTLFLAG_RD,
+ &hpts->p_logsize, 0,
+ "Hpts logging buffer size");
+ hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ hpts->p_num = i;
+ hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prevtick -= 1;
+ hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_cpu = 0xffff;
+ hpts->p_nxt_slot = 1;
+ hpts->p_logsize = tcp_hpts_logging_size;
+ if (hpts->p_logsize) {
+ sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+ hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ }
+ callout_init(&hpts->co, 1);
+ }
+ /*
+ * Now let's start ithreads to handle the hptss.
+ */
+ CPU_FOREACH(i) {
+ hpts = tcp_pace.rp_ent[i];
+ hpts->p_cpu = i;
+ error = swi_add(&hpts->ie, "hpts",
+ tcp_hpts_thread, (void *)hpts,
+ SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
+ if (error) {
+ panic("Can't add hpts:%p i:%d err:%d",
+ hpts, i, error);
+ }
+ created++;
+ if (tcp_bind_threads) {
+ if (intr_event_bind(hpts->ie, i) == 0)
+ bound++;
+ }
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ }
+ printf("TCP Hpts created %d swi interrupt thread and bound %d\n",
+ created, bound);
+ return;
+}
+
+SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
new file mode 100644
index 000000000000..c5a3a5f197bd
--- /dev/null
+++ b/sys/netinet/tcp_hpts.h
@@ -0,0 +1,304 @@
+#ifndef __tcp_hpts_h__
+#define __tcp_hpts_h__
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$")
+ */
+
+/*
+ * The hpts uses a 102400-slot wheel. The wheel
+ * defines time in 10 usec increments (102400 x 10 usec).
+ * This gives a range of 10 usec to 1.024 seconds in
+ * which to place an entry. If the user requests more
+ * than 1.024 seconds, a remainder is attached and the
+ * hpts, on seeing the remainder, will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+TAILQ_HEAD(hptsh, inpcb);
+
+/* Number of usec in an hpts tick */
+#define HPTS_TICKS_PER_USEC 10
+#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_USEC_TO_SLOTS(x) ((x + 9) / 10)
+#define HPTS_USEC_IN_SEC 1000000
+#define HPTS_MSEC_IN_SEC 1000
+#define HPTS_USEC_IN_MSEC 1000
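+
+/*
+ * An illustrative example of the conversion macros above (the 2.5 ms
+ * delay is a made-up value): a stack wanting to pace its next send
+ * 2.5 ms into the future would ask for
+ *
+ *	HPTS_MS_TO_SLOTS(2) + HPTS_USEC_TO_SLOTS(500) == 200 + 50 == 250
+ *
+ * wheel slots, i.e. 250 ticks of 10 usec each.
+ */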
+
+#define DEFAULT_HPTS_LOG 3072
+
+/*
+ * Log flags consist of
+ * 7f 7f 1 1 bits
+ * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
+ *
+ * So, for example, cpu 10, number 10, with
+ * input active would show up as:
+ * p_flags = 0001010 0001010 1 0
+ * <or>
+ * p_flags = 0x142a
+ */
+#define HPTS_HPTS_ACTIVE 0x01
+#define HPTS_INPUT_ACTIVE 0x02
+
+#define HPTSLOG_IMMEDIATE 1
+#define HPTSLOG_INSERT_NORMAL 2
+#define HPTSLOG_INSERT_SLEEPER 3
+#define HPTSLOG_SLEEP_AFTER 4
+#define HPTSLOG_SLEEP_BEFORE 5
+#define HPTSLOG_INSERTED 6
+#define HPTSLOG_WAKEUP_HPTS 7
+#define HPTSLOG_SETTORUN 8
+#define HPTSLOG_HPTSI 9
+#define HPTSLOG_TOLONG 10
+#define HPTSLOG_AWAKENS 11
+#define HPTSLOG_TIMESOUT 12
+#define HPTSLOG_SLEEPSET 13
+#define HPTSLOG_WAKEUP_INPUT 14
+#define HPTSLOG_RESCHEDULE 15
+#define HPTSLOG_AWAKE 16
+#define HPTSLOG_INP_DONE 17
+
+struct hpts_log {
+ struct inpcb *inp;
+ int32_t event;
+ uint32_t cts;
+ int32_t line;
+ uint32_t ticknow;
+ uint32_t t_paceslot;
+ uint32_t t_hptsreq;
+ uint32_t p_curtick;
+ uint32_t p_prevtick;
+ uint32_t slot_req;
+ uint32_t p_on_queue_cnt;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t p_hpts_sleep_time;
+ uint16_t p_flags;
+ uint8_t p_onhpts;
+ uint8_t p_oninput;
+ uint8_t is_notempty;
+};
+
+struct hpts_diag {
+ uint32_t p_hpts_active;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t slot_req;
+ uint32_t inp_hptsslot;
+ uint32_t slot_now;
+ uint32_t have_slept;
+ uint32_t hpts_sleep_time;
+ uint32_t yet_to_sleep;
+ uint32_t need_new_to;
+ int32_t co_ret;
+ uint8_t p_on_min_sleep;
+};
+
+#ifdef _KERNEL
+/* Each hpts has its own p_mtx which is used for locking */
+struct tcp_hpts_entry {
+ /* Cache line 0x00 */
+ struct mtx p_mtx; /* Mutex for hpts */
+ uint32_t p_hpts_active; /* Flag that says hpts is awake */
+ uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
+ uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
+ uint32_t p_nxt_slot; /* The next slot outside the current range of
+ * slots that the hpts is running on. */
+ int32_t p_on_queue_cnt; /* Count on queue in this hpts */
+ uint32_t enobuf_cnt;
+ uint16_t p_log_at;
+ uint8_t p_direct_wake :1, /* boolean */
+ p_log_wrapped :1, /* boolean */
+ p_on_min_sleep:1; /* boolean */
+ uint8_t p_fill;
+ /* Cache line 0x40 */
+ void *p_inp;
+ struct hptsh p_input; /* For the tcp-input runner */
+ /* Hptsi wheel */
+ struct hptsh *p_hptss;
+ struct hpts_log *p_log;
+ uint32_t p_logsize;
+ int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
+ uint32_t hit_no_enobuf;
+ uint32_t p_dyn_adjust;
+ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
+ * of 255ms */
+ uint32_t p_delayed_by; /* How much were we delayed by */
+ /* Cache line 0x80 */
+ struct sysctl_ctx_list hpts_ctx;
+ struct sysctl_oid *hpts_root;
+ struct intr_event *ie;
+ void *ie_cookie;
+ uint16_t p_num; /* The hpts number one per cpu */
+ uint16_t p_cpu; /* The hpts CPU */
+ /* There is extra space in here */
+ /* Cache line 0x100 */
+ struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+struct tcp_hptsi {
+ struct proc *rp_proc; /* Process structure for hpts */
+ struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t rp_num_hptss; /* Number of hpts threads */
+};
+
+#endif
+
+#define HPTS_REMOVE_INPUT 0x01
+#define HPTS_REMOVE_OUTPUT 0x02
+#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
+
+/*
+ * When using the hpts, a TCP stack must make sure
+ * that once an INP_DROPPED flag is applied to an INP
+ * it does not expect tcp_output() to ever be
+ * called by the hpts. The hpts will *not* call
+ * any output (or input) functions on a TCB that
+ * is in the DROPPED state.
+ *
+ * This implies that final ACKs and RSTs which might
+ * be sent while a TCB is still around must be
+ * sent from a routine like tcp_respond().
+ */
+#define DEFAULT_MIN_SLEEP 250 /* Default number of usec for the hpts to sleep;
+ * this determines the min granularity of the
+ * hpts. If 0, granularity is 10 usec at
+ * the cost of more CPU (context switching). */
+#ifdef _KERNEL
+#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
+struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
+int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_hpts_immediate(a) __tcp_queue_to_hpts_immediate(a, __LINE__)
+
+struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
+#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
+void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
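+
+/*
+ * For example (as tcp_usrreq.c does later in this diff when switching
+ * stacks), a caller holding the INP_WLOCK() can make sure a connection
+ * is off both the output wheel and the input queue with:
+ *
+ *	tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+ */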
+
+/*
+ * To insert a TCB on the hpts you *must* be holding the
+ * INP_WLOCK(). The hpts insert code will then acquire
+ * the hpts's lock and insert the TCB on the requested
+ * slot, possibly waking up the hpts if you are requesting
+ * a time earlier than what the hpts is sleeping to (if
+ * the hpts is sleeping). You may check the inp->inp_in_hpts
+ * flag without the hpts lock. The hpts is the only one
+ * that will clear this flag, holding only the hpts lock. This
+ * means that in your tcp_output() routine, when you test
+ * it for 1 (so you won't call output), it may be transitioning
+ * to 0 (by the hpts). That is fine, since it just
+ * means an extra call to tcp_output, which most likely will
+ * find that the call you executed (when the mismatch occurred)
+ * has put the TCB back on the hpts, and it will return. If your
+ * call did not add it back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Though usually
+ * you are either doing this from a timer, where you need
+ * the INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ */
+uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
+#define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
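+
+/*
+ * A minimal usage sketch following the rules above; the caller code is
+ * hypothetical and "slot_usec" is an assumed variable holding the
+ * desired pacing delay in usec:
+ *
+ *	INP_WLOCK(inp);
+ *	if (inp->inp_in_hpts == 0)
+ *		tcp_hpts_insert(inp, HPTS_USEC_TO_SLOTS(slot_usec));
+ *	INP_WUNLOCK(inp);
+ */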
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
+
+int
+ __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
+#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__)
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked);
+int
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line);
+#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+
+uint16_t tcp_hpts_delayedby(struct inpcb *inp);
+
+void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+
+void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
+#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+
+extern int32_t tcp_min_hptsi_time;
+
+static __inline uint32_t
+tcp_tv_to_hptstick(struct timeval *sv)
+{
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
+}
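+
+/*
+ * For example (illustrative arithmetic only): a timeval of
+ * { .tv_sec = 1, .tv_usec = 250 } maps to 100000 + 25 == 100025
+ * hpts ticks, since each tick is 10 usec.
+ */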
+
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
+static __inline uint32_t
+tcp_tv_to_usectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+static __inline uint32_t
+tcp_tv_to_mssectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
+}
+
+static __inline void
+tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
+{
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static __inline uint32_t
+tcp_get_usecs(struct timeval *tv)
+{
+ struct timeval tvd;
+
+ if (tv == NULL)
+ tv = &tvd;
+ microuptime(tv);
+ return (tcp_tv_to_usectick(tv));
+}
+
+#endif
+#endif
diff --git a/sys/netinet/tcp_stacks/fastpath.c b/sys/netinet/tcp_stacks/fastpath.c
index 92db0d551fee..c6632a22c058 100644
--- a/sys/netinet/tcp_stacks/fastpath.c
+++ b/sys/netinet/tcp_stacks/fastpath.c
@@ -2404,7 +2404,7 @@ tcp_addfastpaths(module_t mod, int type, void *data)
err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
if (err) {
printf("Failed to register fastslow module -- err:%d\n", err);
- deregister_tcp_functions(&__tcp_fastack);
+ deregister_tcp_functions(&__tcp_fastack, false, true);
return(err);
}
break;
@@ -2412,12 +2412,12 @@ tcp_addfastpaths(module_t mod, int type, void *data)
if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
return(EBUSY);
}
+ err = deregister_tcp_functions(&__tcp_fastack, true, false);
+ err = deregister_tcp_functions(&__tcp_fastslow, true, false);
break;
case MOD_UNLOAD:
- err = deregister_tcp_functions(&__tcp_fastack);
- if (err == EBUSY)
- break;
- err = deregister_tcp_functions(&__tcp_fastslow);
+ err = deregister_tcp_functions(&__tcp_fastack, false, true);
+ err = deregister_tcp_functions(&__tcp_fastslow, false, true);
if (err == EBUSY)
break;
err = 0;
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 77cfc8d12724..bc03fb37de46 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -232,6 +232,9 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone);
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
#endif
+static int tcp_default_fb_init(struct tcpcb *tp);
+static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
+static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static void tcp_mtudisc(struct inpcb *, int);
@@ -240,18 +243,13 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
static struct tcp_function_block tcp_def_funcblk = {
- "default",
- tcp_output,
- tcp_do_segment,
- tcp_default_ctloutput,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- 0,
- 0
+ .tfb_tcp_block_name = "freebsd",
+ .tfb_tcp_output = tcp_output,
+ .tfb_tcp_do_segment = tcp_do_segment,
+ .tfb_tcp_ctloutput = tcp_default_ctloutput,
+ .tfb_tcp_handoff_ok = tcp_default_handoff_ok,
+ .tfb_tcp_fb_init = tcp_default_fb_init,
+ .tfb_tcp_fb_fini = tcp_default_fb_fini,
};
int t_functions_inited = 0;
@@ -328,6 +326,88 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk)
return(rblk);
}
+static struct tcp_function_block *
+find_and_ref_tcp_default_fb(void)
+{
+ struct tcp_function_block *rblk;
+
+ rw_rlock(&tcp_function_lock);
+ rblk = tcp_func_set_ptr;
+ refcount_acquire(&rblk->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ return (rblk);
+}
+
+void
+tcp_switch_back_to_default(struct tcpcb *tp)
+{
+ struct tcp_function_block *tfb;
+
+ KASSERT(tp->t_fb != &tcp_def_funcblk,
+ ("%s: called by the built-in default stack", __func__));
+
+ /*
+ * Release the old stack. This function will either find a new one
+ * or panic.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+
+ /*
+ * Now, we'll find a new function block to use.
+ * Start by trying the current user-selected
+ * default, unless this stack is the user-selected
+ * default.
+ */
+ tfb = find_and_ref_tcp_default_fb();
+ if (tfb == tp->t_fb) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Does the stack accept this connection? */
+ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
+ (*tfb->tfb_tcp_handoff_ok)(tp)) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Try to use that stack. */
+ if (tfb != NULL) {
+ /* Initialize the new stack. If it succeeds, we are done. */
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init == NULL ||
+ (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ return;
+
+ /*
+ * Initialization failed. Release the reference count on
+ * the stack.
+ */
+ refcount_release(&tfb->tfb_refcnt);
+ }
+
+ /*
+ * If that wasn't feasible, use the built-in default
+ * stack which is not allowed to reject anyone.
+ */
+ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
+ if (tfb == NULL) {
+ /* There should always be a default. */
+ panic("Can't refer to tcp_def_funcblk");
+ }
+ if (tfb->tfb_tcp_handoff_ok != NULL) {
+ if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
+ /* The default stack cannot say no */
+ panic("Default stack rejects a new session?");
+ }
+ }
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init != NULL &&
+ (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ /* The default stack cannot fail */
+ panic("Default stack initialization failed");
+ }
+}
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
@@ -507,6 +587,89 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
"List TCP function block name-to-ID mappings");
/*
+ * tfb_tcp_handoff_ok() function for the default stack.
+ * Note that we'll basically try to take all comers.
+ */
+static int
+tcp_default_handoff_ok(struct tcpcb *tp)
+{
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_init() function for the default stack.
+ *
+ * This handles making sure we have appropriate timers set if you are
+ * transitioning a socket that has some amount of setup done.
+ *
+ * The init() function for the default stack can *never* return non-zero,
+ * i.e. it is required to always succeed since it is the stack of last resort!
+ */
+static int
+tcp_default_fb_init(struct tcpcb *tp)
+{
+
+ struct socket *so;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
+ ("%s: connection %p in unexpected state %d", __func__, tp,
+ tp->t_state));
+
+ /*
+ * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
+ * know what to do for unexpected states (which includes TIME_WAIT).
+ */
+ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
+ return (0);
+
+ /*
+ * Make sure some kind of transmission timer is set if there is
+ * outstanding data.
+ */
+ so = tp->t_inpcb->inp_socket;
+ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
+ tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
+ tcp_timer_active(tp, TT_PERSIST))) {
+ /*
+ * If the session has established and it looks like it should
+ * be in the persist state, set the persist timer. Otherwise,
+ * set the retransmit timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
+ (int32_t)(tp->snd_nxt - tp->snd_una) <
+ (int32_t)sbavail(&so->so_snd))
+ tcp_setpersist(tp);
+ else
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ }
+
+ /* All non-embryonic sessions get a keepalive timer. */
+ if (!tcp_timer_active(tp, TT_KEEP))
+ tcp_timer_activate(tp, TT_KEEP,
+ TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
+ TP_KEEPINIT(tp));
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_fini() function for the default stack.
+ *
+ * This changes state as necessary (or prudent) to prepare for another stack
+ * to assume responsibility for the connection.
+ */
+static void
+tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return;
+}
+
+/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
* Note that this can be overridden by the kernel environment
@@ -732,11 +895,28 @@ register_tcp_functions(struct tcp_function_block *blk, int wait)
return (register_tcp_functions_as_name(blk, NULL, wait));
}
+/*
+ * Deregister all names associated with a function block. This
+ * functionally removes the function block from use within the system.
+ *
+ * When called with a true quiesce argument, mark the function block
+ * as being removed so no more stacks will use it and determine
+ * whether the removal would succeed.
+ *
+ * When called with a false quiesce argument, actually attempt the
+ * removal.
+ *
+ * When called with a force argument, attempt to switch all TCBs to
+ * use the default stack instead of returning EBUSY.
+ *
+ * Returns 0 on success (or if the removal would succeed), or an
+ * error code on failure.
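+ *
+ * For example (mirroring the fastpath.c changes in this diff), a
+ * module event handler would use:
+ *
+ *	case MOD_QUIESCE:
+ *		err = deregister_tcp_functions(&my_fb, true, false);
+ *		break;
+ *	case MOD_UNLOAD:
+ *		err = deregister_tcp_functions(&my_fb, false, true);
+ *		break;
+ *
+ * where "my_fb" stands in for the module's tcp_function_block.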
+ */
int
-deregister_tcp_functions(struct tcp_function_block *blk)
+deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force)
{
struct tcp_function *f;
- int error=ENOENT;
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
@@ -748,22 +928,63 @@ deregister_tcp_functions(struct tcp_function_block *blk)
rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ /* Mark the block so no more stacks can use it. */
+ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+ /*
+ * If TCBs are still attached to the stack, attempt to switch them
+ * to the default stack.
+ */
+ if (force && blk->tfb_refcnt) {
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ rw_wunlock(&tcp_function_lock);
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ INP_WLOCK(inp);
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if (tp == NULL || tp->t_fb != blk) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tcp_switch_back_to_default(tp);
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ rw_wlock(&tcp_function_lock);
+ }
if (blk->tfb_refcnt) {
- /* Still tcb attached, mark it. */
- blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
- rw_wunlock(&tcp_function_lock);
+ /* TCBs still attached. */
+ rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ if (quiesce) {
+ /* Skip removal. */
+ rw_wunlock(&tcp_function_lock);
+ return (0);
+ }
+ /* Remove any function names that map to this function block. */
while (find_tcp_fb_locked(blk, &f) != NULL) {
- /* Found */
TAILQ_REMOVE(&t_functions, f, tf_next);
tcp_fb_cnt--;
f->tf_fb = NULL;
free(f, M_TCPFUNCTIONS);
- error = 0;
}
rw_wunlock(&tcp_function_lock);
- return (error);
+ return (0);
}
void
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index a91fd1eca220..8061e512fd7a 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -852,6 +852,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = rblk;
+ /*
+ * XXXrrs this is quite dangerous, it is possible
+ * for the new function to fail to init. We are also
+ * not asking if the handoff is ok, though at
+ * the very start that's probably ok.
+ */
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index c824fbbf2202..c9d8c844e7e4 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1521,17 +1521,34 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
*/
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
}
+#ifdef TCPHPTS
+ /* Assure that we are not on any hpts */
+ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+#endif
+ if (blk->tfb_tcp_fb_init) {
+ error = (*blk->tfb_tcp_fb_init)(tp);
+ if (error) {
+ refcount_release(&blk->tfb_refcnt);
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
+ /* Fall back failed, drop the connection */
+ INP_WUNLOCK(inp);
+ soabort(so);
+ return(error);
+ }
+ }
+ goto err_out;
+ }
+ }
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = blk;
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
- }
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
sopt->sopt_name);
}
#endif
+err_out:
INP_WUNLOCK(inp);
return (error);
} else if ((sopt->sopt_dir == SOPT_GET) &&
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 53a748ebb2dd..2c1847740287 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -90,6 +90,8 @@ struct tcpcb {
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
+ struct mbuf *t_in_pkt; /* head of the input packet queue for the tcp_hpts system */
+ struct mbuf *t_tail_pkt; /* tail of the input packet queue for the tcp_hpts system */
struct tcp_timer *t_timers; /* All the TCP timers in one struct */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
@@ -257,14 +259,19 @@ struct tcptemp {
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
+ int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
int);
+ void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
/* Optional memory allocation/free routine */
- void (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
@@ -274,6 +281,7 @@ struct tcp_function_block {
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@@ -851,9 +859,12 @@ int register_tcp_functions_as_names(struct tcp_function_block *blk,
int wait, const char *names[], int *num_names);
int register_tcp_functions_as_name(struct tcp_function_block *blk,
const char *name, int wait);
-int deregister_tcp_functions(struct tcp_function_block *blk);
+int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force);
struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
-struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+void tcp_switch_back_to_default(struct tcpcb *tp);
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
diff --git a/sys/sys/kern_prefetch.h b/sys/sys/kern_prefetch.h
new file mode 100644
index 000000000000..5acf06597498
--- /dev/null
+++ b/sys/sys/kern_prefetch.h
@@ -0,0 +1,50 @@
+#ifndef __kern_prefetch_h__
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$")
+ */
+#define __kern_prefetch_h__
+#ifdef _KERNEL
+#if defined(__amd64__)
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/pmap.h>
+#endif
+
+static __inline void
+kern_prefetch(const volatile void *addr, void *before)
+{
+#if defined(__amd64__)
+ __asm __volatile("prefetcht1 (%1)":"=rm"(*((int32_t *)before)):"r"(addr):);
+#else
+ __builtin_prefetch(addr);
+#endif
+}
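+
+/*
+ * A usage sketch (mirroring how the hpts output loop in this diff
+ * warms the cache before touching a control block; "did_prefetch" is
+ * just a local that keeps the compiler from discarding the prefetch):
+ *
+ *	int did_prefetch = 0;
+ *
+ *	if (tp->t_fb_ptr != NULL) {
+ *		kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ *		did_prefetch = 1;
+ *	}
+ */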
+
+#endif
+#endif
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index ba1e88c6175d..81aed4e75f88 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -196,6 +196,11 @@ struct pkthdr {
#define lro_nsegs tso_segsz
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
+#define pace_thoff PH_loc.sixteen[0]
+#define pace_tlen PH_loc.sixteen[1]
+#define pace_drphdrlen PH_loc.sixteen[2]
+#define pace_tos PH_loc.eight[6]
+#define pace_lock PH_loc.eight[7]
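+/*
+ * A hedged sketch of the intended use of the pace_* fields (assumed
+ * from the names and from the tcp_queue_pkt_to_input() prototype in
+ * this diff): the hpts input path stashes the parsed header facts in
+ * PH_loc while the mbuf waits on the tp->t_in_pkt queue, e.g.
+ *
+ *	m->m_pkthdr.pace_tlen = (uint16_t)tlen;
+ *	m->m_pkthdr.pace_drphdrlen = (uint16_t)drop_hdrlen;
+ *	m->m_pkthdr.pace_tos = iptos;
+ *	m->m_pkthdr.pace_lock = ti_locked;
+ */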
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is