-rw-r--r--  sys/conf/files                         1
-rw-r--r--  sys/conf/options                       1
-rw-r--r--  sys/netinet/in_pcb.c                  48
-rw-r--r--  sys/netinet/in_pcb.h                  80
-rw-r--r--  sys/netinet/tcp_hpts.c              1964
-rw-r--r--  sys/netinet/tcp_hpts.h               304
-rw-r--r--  sys/netinet/tcp_stacks/fastpath.c     10
-rw-r--r--  sys/netinet/tcp_subr.c               261
-rw-r--r--  sys/netinet/tcp_syncache.c             6
-rw-r--r--  sys/netinet/tcp_usrreq.c              23
-rw-r--r--  sys/netinet/tcp_var.h                 17
-rw-r--r--  sys/sys/kern_prefetch.h               50
-rw-r--r--  sys/sys/mbuf.h                         5
13 files changed, 2735 insertions, 35 deletions
diff --git a/sys/conf/files b/sys/conf/files
index b2da16980cee..60a172d2f18d 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4355,6 +4355,7 @@ netinet/tcp_log_buf.c optional tcp_blackbox inet | tcp_blackbox inet6
netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
+netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
diff --git a/sys/conf/options b/sys/conf/options
index 2c50d3b47f3f..980cf73df885 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -218,6 +218,7 @@ SYSVMSG opt_sysvipc.h
SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
+TCPHPTS opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
UMTX_CHAINS opt_global.h
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 8545bd833a9c..19eb5af9596d 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
+#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
@@ -87,6 +88,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
+#ifdef TCPHPTS
+#include <netinet/tcp_hpts.h>
+#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#endif
@@ -1224,9 +1228,28 @@ in_pcbrele_rlocked(struct inpcb *inp)
}
return (0);
}
-
+
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+	 * this point in any form. We must
+ * get the lock to be sure.
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_RUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
@@ -1255,7 +1278,26 @@ in_pcbrele_wlocked(struct inpcb *inp)
}
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+	 * this point in any form. We must
+ * get the lock to be sure.
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_WUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 574ab4077aea..f4b6da20a2db 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -156,6 +156,7 @@ struct in_conninfo {
* from the global list.
*
* Key:
+ * (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
@@ -164,6 +165,51 @@ struct in_conninfo {
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
+ *
+ * Notes on the tcp_hpts:
+ *
+ * First, the hpts lock order is:
+ * 1) INP_WLOCK()
+ * 2) HPTS_LOCK() i.e. hpts->p_mtx
+ *
+ * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
+ * You may check the inp->inp_in_hpts flag without the hpts lock.
+ * Only the hpts will clear this flag, and it does so holding
+ * only the hpts lock. This means that in your tcp_output()
+ * routine, when you test the inp_in_hpts flag for 1,
+ * it may be transitioning to 0 (by the hpts).
+ * That's ok, since that just means an extra call to tcp_output
+ * that will most likely find that the call you executed
+ * (when the mismatch occurred) has put the TCB back
+ * on the hpts, and it will return. If your
+ * call did not add the inp back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note that you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Usually
+ * you are either doing this from a timer, where you need and have
+ * the INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ *
+ * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
+ * inp_input_cpu_set fields are controlled completely by
+ * the hpts. Do not ever set these. The inp_hpts_cpu_set
+ * and inp_input_cpu_set fields indicate if the hpts has
+ * set up the respective cpu field. If the _set field is 0,
+ * it is advised to enqueue the packet with the appropriate
+ * hpts_immediate() call. If the _set field is 1, then
+ * you may compare the inp_*_cpu field to curcpu and
+ * may want to insert onto the hpts again if these fields
+ * are not equal (i.e. you are not on the expected CPU).
+ *
+ * A note on inp_hpts_calls and inp_input_calls: these
+ * flags are set when the hpts calls either the output
+ * or the do_segment routine, respectively. If the routine
+ * being called wants to use this, then it needs to
+ * clear the flag before returning. The hpts will not
+ * clear the flag. The flags can be used to tell whether
+ * the hpts is the caller of the respective
+ * routine.
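+ *
+ * For illustration only (a hedged sketch, not part of this change),
+ * a stack's tcp_output() might use these flags roughly like:
+ *
+ *    if (inp->inp_in_hpts)
+ *        return (0);
+ *    hpts_calling = inp->inp_hpts_calls;
+ *    inp->inp_hpts_calls = 0;
+ *
+ * The first test defers output while a pacing timer is pending; the
+ * next two lines record whether the hpts was the caller and clear
+ * the flag, since the hpts never clears it itself. The local
+ * hpts_calling variable here is hypothetical.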
*
* A few other notes:
*
@@ -190,14 +236,45 @@ struct inpcb {
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
-#define inp_start_zero inp_refcount
+#define inp_start_zero inp_hpts
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
+ TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */
+
+ uint32_t inp_hpts_request; /* Current hpts request, zero if
+ * fits in the pacing window (i&b). */
+ /*
+ * Note the next fields are protected by a
+ * different lock (hpts-lock). This means that
+ * they must correspond in size to the smallest
+ * protectable bit field (uint8_t on x86, and
+ * other platforms potentially uint32_t?). Also,
+ * since CPU switches can occur at different times, the two
+ * fields can *not* be collapsed into a single bit field.
+ */
+#if defined(__amd64__) || defined(__i386__)
+ volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint8_t inp_in_input; /* on input hpts (lock b) */
+#else
+ volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint32_t inp_in_input; /* on input hpts (lock b) */
+#endif
+ volatile uint16_t inp_hpts_cpu; /* Lock (i) */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
+ volatile uint16_t inp_input_cpu; /* Lock (i) */
+ volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
+ inp_input_cpu_set : 1, /* on input hpts (i) */
+ inp_hpts_calls :1, /* (i) from output hpts */
+ inp_input_calls :1, /* (i) from input hpts */
+ inp_spare_bits2 : 4;
+ uint8_t inp_spare_byte; /* Compiler hole */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
+	uint32_t inp_hptsslot;	/* Hpts wheel slot this tcb is on, lock (i&b) */
+ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
+ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
@@ -638,6 +715,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
/*
* Flags passed to in_pcblookup*() functions.
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
new file mode 100644
index 000000000000..b3b8c9c0bb84
--- /dev/null
+++ b/sys/netinet/tcp_hpts.c
@@ -0,0 +1,1964 @@
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+/**
+ * Some notes about usage.
+ *
+ * The tcp_hpts system is designed to provide a high precision timer
+ * system for tcp. Its main purpose is to provide a mechanism for
+ * pacing packets out onto the wire. It can be used in two ways
+ * by a given TCP stack (and those two methods can be used simultaneously).
+ *
+ * First, and probably the main thing it's used by Rack and BBR for, it can
+ * be used to call tcp_output() of a transport stack at some time in the future.
+ * The normal way this is done is that tcp_output() of the stack schedules
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
+ * slot is the time from now that the stack wants to be called but it
+ * must be converted to tcp_hpts's notion of slot. This is done with
+ * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * call from the tcp_output() routine might look like:
+ *
+ * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ *
+ * The above would schedule tcp_output() to be called in 550 microseconds.
+ * Note that if using this mechanism the stack will want to add near
+ * its top a check to prevent unwanted calls (from user land or the
+ * arrival of incoming ACKs). So it would add something like:
+ *
+ * if (inp->inp_in_hpts)
+ * return;
+ *
+ * to prevent output processing until the allotted time has gone by.
+ * Of course this is a bare-bones example and the stack will probably
+ * have more considerations than just the above.
+ *
+ * Now the tcp_hpts system will call tcp_output in one of two forms:
+ * it will first check to see if the stack has defined a
+ * tfb_tcp_output_wtime() function; if so, that is the routine it
+ * will call. If that function is not defined then it will call the
+ * tfb_tcp_output() function. The only difference between these
+ * two calls is that the former passes the time in to the function
+ * so the function does not have to access the time (which tcp_hpts
+ * already has). What these functions do is of course totally up
+ * to the individual tcp stack.
+ *
+ * Now the second function (actually two functions, I guess :D)
+ * that the tcp_hpts system provides is the ability to either abort
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality.
+ *
+ * So in order to use the input redirection function the
+ * stack changes its tcp_do_segment() routine so that, instead
+ * of processing the data, it calls the function:
+ *
+ * tcp_queue_pkt_to_input()
+ *
+ * You will note that the arguments to this function look
+ * a lot like tcp_do_segment's arguments. This function
+ * will assure that the tcp_hpts system
+ * calls the function tfb_tcp_hpts_do_segment() from the
+ * correct CPU. Note that multiple calls can get pushed
+ * into the tcp_hpts system; this will be indicated by
+ * the next-to-last argument to tfb_tcp_hpts_do_segment()
+ * (nxt_pkt). If nxt_pkt is a 1 then another packet is
+ * coming. If nxt_pkt is a 0 then this is the last call
+ * that the tcp_hpts system has available for the tcp stack.
+ *
+ * The other point of the input system is to be able to safely
+ * drop a tcp connection without worrying about the recursive
+ * locking that may be occurring on the INP_WLOCK. So if
+ * a stack wants to drop a connection it calls:
+ *
+ * tcp_set_inp_to_drop(tp, ETIMEDOUT)
+ *
+ * To schedule the tcp_hpts system to call
+ *
+ * tcp_drop(tp, drop_reason)
+ *
+ * at a future point. This is quite handy to prevent locking
+ * issues when dropping connections.
+ *
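+ * For illustration only (a hedged sketch, not part of this change),
+ * a stack's tfb_tcp_hpts_do_segment() might use the nxt_pkt
+ * argument to batch its response, generating output only once the
+ * hpts has handed it the last queued packet:
+ *
+ *    if (nxt_pkt == 0) {
+ *        tp->t_flags |= TF_ACKNOW;
+ *        (void)tp->t_fb->tfb_tcp_output(tp);
+ *    }
+ *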
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/counter.h>
+#include <sys/time.h>
+#include <sys/kthread.h>
+#include <sys/kern_prefetch.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_hpts.h>
+
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+#include "opt_rss.h"
+
+MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+#ifdef RSS
+static int tcp_bind_threads = 1;
+#else
+static int tcp_bind_threads = 0;
+#endif
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+
+static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
+
+TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
+
+static struct tcp_hptsi tcp_pace;
+
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp,
+ struct tcpcb **tp);
+static void tcp_wakehpts(struct tcp_hpts_entry *p);
+static void tcp_wakeinput(struct tcp_hpts_entry *p);
+static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hpts_thread(void *ctx);
+static void tcp_init_hptsi(void *st);
+
+int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
+static int32_t tcp_hpts_callout_skip_swi = 0;
+
+SYSCTL_DECL(_net_inet_tcp);
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
+
+#define timersub(tvp, uvp, vvp) \
+ do { \
+ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
+ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
+ if ((vvp)->tv_usec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_usec += 1000000; \
+ } \
+ } while (0)
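+
+/*
+ * Usage note: timersub(&end, &start, &delta) stores end - start in
+ * delta, normalizing tv_usec into [0, 1000000).
+ */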
+
+static int32_t logging_on = 0;
+static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
+static int32_t tcp_hpts_precision = 120;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+ &tcp_hpts_precision, 120,
+ "Value for PRE() precision of callout");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &logging_on, 0,
+ "Turn on logging if compiled in");
+
+counter_u64_t hpts_loops;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+ &hpts_loops, "Number of times hpts had to loop to catch up");
+
+counter_u64_t back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+ &back_tosleep, "Number of times hpts found no tcbs");
+
+static int32_t in_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
+ &in_newts_every_tcb, 0,
+ "Do we have a new cts every tcb we process for input");
+static int32_t in_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
+ &in_ts_percision, 0,
+ "Do we use percise timestamp for clients on input");
+static int32_t out_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
+ &out_newts_every_tcb, 0,
+ "Do we have a new cts every tcb we process for output");
+static int32_t out_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
+ &out_ts_percision, 0,
+ "Do we use a percise timestamp for every output cts");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+ &hpts_sleep_max, 0,
+ "The maximum time the hpts will sleep <1 - 254>");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+ &tcp_min_hptsi_time, 0,
+ "The minimum time the hpts must sleep before processing more slots");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
+ &tcp_hpts_callout_skip_swi, 0,
+ "Do we have the callout call directly to the hpts?");
+
+static void
+__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
+ uint32_t ticknow, int32_t line)
+{
+ struct hpts_log *pl;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (hpts->p_log == NULL)
+ return;
+ pl = &hpts->p_log[hpts->p_log_at];
+ hpts->p_log_at++;
+ if (hpts->p_log_at >= hpts->p_logsize) {
+ hpts->p_log_at = 0;
+ hpts->p_log_wrapped = 1;
+ }
+ pl->inp = inp;
+ if (inp) {
+ pl->t_paceslot = inp->inp_hptsslot;
+ pl->t_hptsreq = inp->inp_hpts_request;
+ pl->p_onhpts = inp->inp_in_hpts;
+ pl->p_oninput = inp->inp_in_input;
+ } else {
+ pl->t_paceslot = 0;
+ pl->t_hptsreq = 0;
+ pl->p_onhpts = 0;
+ pl->p_oninput = 0;
+ }
+ pl->is_notempty = 1;
+ pl->event = event;
+ pl->line = line;
+ pl->cts = tcp_get_usecs(NULL);
+ pl->p_curtick = hpts->p_curtick;
+ pl->p_prevtick = hpts->p_prevtick;
+ pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
+ pl->ticknow = ticknow;
+ pl->slot_req = slot;
+ pl->p_nxt_slot = hpts->p_nxt_slot;
+ pl->p_cur_slot = hpts->p_cur_slot;
+ pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
+ pl->p_flags = (hpts->p_cpu & 0x7f);
+ pl->p_flags <<= 7;
+ pl->p_flags |= (hpts->p_num & 0x7f);
+ pl->p_flags <<= 2;
+ if (hpts->p_hpts_active) {
+ pl->p_flags |= HPTS_HPTS_ACTIVE;
+ }
+}
+
+#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
+
+static void
+hpts_timeout_swi(void *arg)
+{
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+static void
+hpts_timeout_dir(void *arg)
+{
+ tcp_hpts_thread(arg);
+}
+
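+/*
+ * Remove the inp from the given wheel bucket and keep the on-queue
+ * count sane; "clear" additionally resets inp_hpts_request and the
+ * inp_in_hpts flag.
+ */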
+static inline void
+hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_hpts_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_hpts == 0) {
+ /* We are not on the hpts? */
+ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
+ }
+ if (TAILQ_EMPTY(head) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /* We should not be empty with a queue count */
+ panic("%s hpts:%p hpts bucket empty but cnt:%d",
+ __FUNCTION__, hpts, hpts->p_on_queue_cnt);
+ }
+#endif
+ TAILQ_REMOVE(head, inp, inp_hpts);
+ hpts->p_on_queue_cnt--;
+ if (hpts->p_on_queue_cnt < 0) {
+		/* Count should not go negative. */
+#ifdef INVARIANTS
+ panic("Hpts goes negative inp:%p hpts:%p",
+ inp, hpts);
+#endif
+ hpts->p_on_queue_cnt = 0;
+ }
+ if (clear) {
+ inp->inp_hpts_request = 0;
+ inp->inp_in_hpts = 0;
+ }
+}
+
+static inline void
+hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_hpts_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if ((noref == 0) && (inp->inp_in_hpts == 1)) {
+ /* We are already on the hpts? */
+ panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_INSERT_TAIL(head, inp, inp_hpts);
+ inp->inp_in_hpts = 1;
+ hpts->p_on_queue_cnt++;
+ if (noref == 0) {
+ in_pcbref(inp);
+ }
+}
+
+static inline void
+hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_input_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_input == 0) {
+ /* We are not on the input hpts? */
+ panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
+ hpts->p_on_inqueue_cnt--;
+ if (hpts->p_on_inqueue_cnt < 0) {
+#ifdef INVARIANTS
+ panic("Hpts in goes negative inp:%p hpts:%p",
+ inp, hpts);
+#endif
+ hpts->p_on_inqueue_cnt = 0;
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ /* We should not be empty with a queue count */
+ panic("%s hpts:%p in_hpts input empty but cnt:%d",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ if (clear)
+ inp->inp_in_input = 0;
+}
+
+static inline void
+hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_input_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_input == 1) {
+ /* We are already on the input hpts? */
+ panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
+ inp->inp_in_input = 1;
+ hpts->p_on_inqueue_cnt++;
+ in_pcbref(inp);
+}
+
+static int
+sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
+{
+ struct tcp_hpts_entry *hpts;
+ size_t sz;
+ int32_t logging_was, i;
+ int32_t error = 0;
+
+ /*
+	 * HACK: Turn off logging so no locks are required. This really
+	 * needs a memory barrier :)
+ */
+ logging_was = logging_on;
+ logging_on = 0;
+ if (!req->oldptr) {
+ /* How much? */
+ sz = 0;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ hpts = tcp_pace.rp_ent[i];
+ if (hpts->p_log == NULL)
+ continue;
+ sz += (sizeof(struct hpts_log) * hpts->p_logsize);
+ }
+ error = SYSCTL_OUT(req, 0, sz);
+ } else {
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ hpts = tcp_pace.rp_ent[i];
+ if (hpts->p_log == NULL)
+ continue;
+ if (hpts->p_log_wrapped)
+ sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+ else
+ sz = (sizeof(struct hpts_log) * hpts->p_log_at);
+ error = SYSCTL_OUT(req, hpts->p_log, sz);
+ }
+ }
+ logging_on = logging_was;
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
+
+
+/*
+ * Try to get the INP_INFO lock.
+ *
+ * This function always succeeds in getting the lock. It will clear
+ * *tpp and return (1) if something critical changed while the inpcb
+ * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
+ *
+ * This function relies on the fact that the hpts always holds a
+ * reference on the inpcb while the segment is on the hptsi wheel and
+ * in the input queue.
+ *
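+ * A hedged usage sketch (not part of this change); a return of 1
+ * means *tpp was cleared and the caller must bail out:
+ *
+ *    if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
+ *        goto out;
+ *    }
+ *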
+ */
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
+{
+ struct tcp_function_block *tfb;
+ struct tcpcb *tp;
+ void *ptr;
+
+ /* Try the easy way. */
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
+ return (0);
+
+ /*
+ * OK, let's try the hard way. We'll save the function pointer block
+ * to make sure that doesn't change while we aren't holding the
+ * lock.
+ */
+ tp = *tpp;
+ tfb = tp->t_fb;
+ ptr = tp->t_fb_ptr;
+ INP_WUNLOCK(inp);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ /* If the session went away, return an error. */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ *tpp = NULL;
+ return (1);
+ }
+ /*
+ * If the function block or stack-specific data block changed,
+ * report an error.
+ */
+ tp = intotcpcb(inp);
+	if ((tp->t_fb != tfb) || (tp->t_fb_ptr != ptr)) {
+ *tpp = NULL;
+ return (1);
+ }
+ return (0);
+}
+
+
+static void
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
+{
+ HPTS_MTX_ASSERT(hpts);
+ swi_sched(hpts->ie_cookie, 0);
+ if (hpts->p_hpts_active == 2) {
+ /* Rare sleeping on a ENOBUF */
+ wakeup_one(hpts);
+ }
+}
+
+static void
+tcp_wakeinput(struct tcp_hpts_entry *hpts)
+{
+ HPTS_MTX_ASSERT(hpts);
+ swi_sched(hpts->ie_cookie, 0);
+ if (hpts->p_hpts_active == 2) {
+ /* Rare sleeping on a ENOBUF */
+ wakeup_one(hpts);
+ }
+}
+
+struct tcp_hpts_entry *
+tcp_cur_hpts(struct inpcb *inp)
+{
+ int32_t hpts_num;
+ struct tcp_hpts_entry *hpts;
+
+ hpts_num = inp->inp_hpts_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+ return (hpts);
+}
+
+struct tcp_hpts_entry *
+tcp_hpts_lock(struct inpcb *inp)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t hpts_num;
+
+again:
+ hpts_num = inp->inp_hpts_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (hpts_num != inp->inp_hpts_cpu) {
+ mtx_unlock(&hpts->p_mtx);
+ goto again;
+ }
+ return (hpts);
+}
+
+struct tcp_hpts_entry *
+tcp_input_lock(struct inpcb *inp)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t hpts_num;
+
+again:
+ hpts_num = inp->inp_input_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (hpts_num != inp->inp_input_cpu) {
+ mtx_unlock(&hpts->p_mtx);
+ goto again;
+ }
+ return (hpts);
+}
+
+static void
+tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
+{
+ int32_t add_freed;
+
+ if (inp->inp_flags2 & INP_FREED) {
+ /*
+ * Need to play a special trick so that in_pcbrele_wlocked
+ * does not return 1 when it really should have returned 0.
+ */
+ add_freed = 1;
+ inp->inp_flags2 &= ~INP_FREED;
+ } else {
+ add_freed = 0;
+ }
+#ifndef INP_REF_DEBUG
+ if (in_pcbrele_wlocked(inp)) {
+ /*
+ * This should not happen. We have the inpcb referred to by
+ * the main socket (why we are called) and the hpts. It
+ * should always return 0.
+ */
+ panic("inpcb:%p release ret 1",
+ inp);
+ }
+#else
+ if (__in_pcbrele_wlocked(inp, line)) {
+ /*
+ * This should not happen. We have the inpcb referred to by
+ * the main socket (why we are called) and the hpts. It
+ * should always return 0.
+ */
+ panic("inpcb:%p release ret 1",
+ inp);
+ }
+#endif
+ if (add_freed) {
+ inp->inp_flags2 |= INP_FREED;
+ }
+}
+
+static void
+tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+ if (inp->inp_in_hpts) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
+ tcp_remove_hpts_ref(inp, hpts, line);
+ }
+}
+
+static void
+tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_input) {
+ hpts_sane_input_remove(hpts, inp, 1);
+ tcp_remove_hpts_ref(inp, hpts, line);
+ }
+}
+
+/*
+ * Normally called with the INP lock held, but it
+ * does not matter; the hpts lock is the key,
+ * and the lock order allows us to hold the
+ * INP lock and then get the hpts lock.
+ *
+ * Valid values in the flags are
+ * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
+ * HPTS_REMOVE_INPUT - remove from the input of the hpts.
+ * Note that you can OR both values together and get both
+ * actions.
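+ *
+ * A hedged usage sketch (not part of this change), assuming the
+ * tcp_hpts_remove() wrapper macro from tcp_hpts.h:
+ *
+ *    tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT | HPTS_REMOVE_INPUT);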
+ */
+void
+__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ if (flags & HPTS_REMOVE_OUTPUT) {
+ hpts = tcp_hpts_lock(inp);
+ tcp_hpts_remove_locked_output(hpts, inp, flags, line);
+ mtx_unlock(&hpts->p_mtx);
+ }
+ if (flags & HPTS_REMOVE_INPUT) {
+ hpts = tcp_input_lock(inp);
+ tcp_hpts_remove_locked_input(hpts, inp, flags, line);
+ mtx_unlock(&hpts->p_mtx);
+ }
+}
+
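+/*
+ * Map an offset of "plus" ticks from the last tick the wheel
+ * processed (p_prevtick) onto a wheel slot index.
+ */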
+static inline int
+hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+{
+ return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+}
+
+static int
+tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
+{
+ int32_t need_wake = 0;
+ uint32_t ticknow = 0;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_hpts == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ if (hpts->p_hpts_active == 0) {
+ /* A sleeping hpts we want in next slot to run */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
+ hpts_tick(hpts, 1));
+ }
+ inp->inp_hptsslot = hpts_tick(hpts, 1);
+ inp->inp_hpts_request = 0;
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
+ }
+ need_wake = 1;
+ } else if ((void *)inp == hpts->p_inp) {
+ /*
+			 * We can't allow you to go into the same slot we
+			 * are in; we must place you in the next slot out.
+ */
+ inp->inp_hptsslot = hpts->p_nxt_slot;
+ } else
+ inp->inp_hptsslot = hpts->p_cur_slot;
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+ inp->inp_hpts_request = 0;
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
+ }
+ if (need_wake) {
+ /*
+ * Activate the hpts if it is sleeping and its
+ * timeout is not 1.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
+ }
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ }
+ }
+ return (need_wake);
+}
+
+int
+__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
+{
+ int32_t ret;
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
+ mtx_unlock(&hpts->p_mtx);
+ return (ret);
+}
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
+ struct hpts_diag *diag, int32_t noref)
+{
+ int32_t need_new_to = 0;
+ int32_t need_wakeup = 0;
+ uint32_t largest_slot;
+ uint32_t ticknow = 0;
+ uint32_t slot_calc;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (diag) {
+ memset(diag, 0, sizeof(struct hpts_diag));
+ diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_nxt_slot = hpts->p_nxt_slot;
+ diag->p_cur_slot = hpts->p_cur_slot;
+ diag->slot_req = slot;
+ }
+ if ((inp->inp_in_hpts == 0) || noref) {
+ inp->inp_hpts_request = slot;
+ if (slot == 0) {
+ /* Immediate */
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ return;
+ }
+ if (hpts->p_hpts_active) {
+ /*
+			 * It's slot - 1 since nxt_slot is the next tick that
+			 * will go off while the hpts is awake.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
+ }
+ /*
+			 * We want to make sure that we don't place an inp in
+ * the range of p_cur_slot <-> p_nxt_slot. If we
+ * take from p_nxt_slot to the end, plus p_cur_slot
+ * and then take away 2, we will know how many is
+ * the max slots we can use.
+ */
+ if (hpts->p_nxt_slot > hpts->p_cur_slot) {
+ /*
+ * Non-wrap case nxt_slot <-> cur_slot we
+ * don't want to land in. So the diff gives
+ * us what is taken away from the number of
+ * slots.
+ */
+ largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
+ } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
+ largest_slot = NUM_OF_HPTSI_SLOTS - 2;
+ } else {
+ /*
+ * Wrap case so the diff gives us the number
+ * of slots that we can land in.
+ */
+ largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ }
+ /*
+			 * We take away two so we never have a problem (20
+			 * usecs out of 1024000 usecs).
+ */
+ largest_slot -= 2;
+ if (inp->inp_hpts_request > largest_slot) {
+ /*
+ * Restrict max jump of slots and remember
+ * leftover
+ */
+ slot = largest_slot;
+ inp->inp_hpts_request -= largest_slot;
+ } else {
+ /* This one will run when we hit it */
+ inp->inp_hpts_request = 0;
+ }
+ if (hpts->p_nxt_slot == hpts->p_cur_slot)
+ slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
+ else
+ slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
+ if (slot_calc == hpts->p_cur_slot) {
+#ifdef INVARIANTS
+ /* TSNH */
+ panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
+ hpts, slot_calc, slot, largest_slot);
+#endif
+ if (slot_calc)
+ slot_calc--;
+ else
+ slot_calc = NUM_OF_HPTSI_SLOTS - 1;
+ }
+ inp->inp_hptsslot = slot_calc;
+ if (diag) {
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
+ } else {
+ /*
+ * The hpts is sleeping, we need to figure out where
+ * it will wake up at and if we need to reschedule
+ * its time-out.
+ */
+ uint32_t have_slept, yet_to_sleep;
+ uint32_t slot_now;
+ struct timeval tv;
+
+ ticknow = tcp_gethptstick(&tv);
+ slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
+ /*
+ * The user wants to be inserted at (slot_now +
+ * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
+ */
+ largest_slot = NUM_OF_HPTSI_SLOTS - 2;
+ if (inp->inp_hpts_request > largest_slot) {
+ /* Adjust the residual in inp_hpts_request */
+ slot = largest_slot;
+ inp->inp_hpts_request -= largest_slot;
+ } else {
+ /* No residual it all fits */
+ inp->inp_hpts_request = 0;
+ }
+ inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
+ if (diag) {
+ diag->slot_now = slot_now;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
+ }
+ /* Now do we need to restart the hpts's timer? */
+ if (TSTMP_GT(ticknow, hpts->p_curtick))
+ have_slept = ticknow - hpts->p_curtick;
+ else
+ have_slept = 0;
+ if (have_slept < hpts->p_hpts_sleep_time) {
+ /* This should be what happens */
+ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
+ } else {
+ /* We are over-due */
+ yet_to_sleep = 0;
+ need_wakeup = 1;
+ }
+ if (diag) {
+ diag->have_slept = have_slept;
+ diag->yet_to_sleep = yet_to_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
+ }
+ if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ /*
+				 * We need to reschedule the hpts's time-out.
+ */
+ hpts->p_hpts_sleep_time = slot;
+ need_new_to = slot * HPTS_TICKS_PER_USEC;
+ }
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
+ }
+ /*
+		 * Now, how far out is the hpts sleeping? If active is 1, it's
+		 * up and ticking and we do nothing; otherwise we may need to
+		 * reschedule its callout if need_new_to is set from above.
+ */
+ if (need_wakeup) {
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
+ }
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ if (diag) {
+ diag->need_new_to = 0;
+ diag->co_ret = 0xffff0000;
+ }
+ } else if (need_new_to) {
+ int32_t co_ret;
+ struct timeval tv;
+ sbintime_t sb;
+
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ while (need_new_to > HPTS_USEC_IN_SEC) {
+ tv.tv_sec++;
+ need_new_to -= HPTS_USEC_IN_SEC;
+ }
+ tv.tv_usec = need_new_to;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ if (diag) {
+ diag->need_new_to = need_new_to;
+ diag->co_ret = co_ret;
+ }
+ }
+ } else {
+#ifdef INVARIANTS
+ panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
+#endif
+ }
+}
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
+ struct tcp_hpts_entry *hpts;
+ uint32_t slot_on, cts;
+ struct timeval tv;
+
+ /*
+	 * We now return the next slot the hpts will be on, beyond its
+ * current run (if up) or where it was when it stopped if it is
+ * sleeping.
+ */
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ if (in_ts_percision)
+ microuptime(&tv);
+ else
+ getmicrouptime(&tv);
+ cts = tcp_tv_to_usectick(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ slot_on = hpts->p_nxt_slot;
+ mtx_unlock(&hpts->p_mtx);
+ return (slot_on);
+}
+
+uint32_t
+__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line)
+{
+ return (tcp_hpts_insert_diag(inp, slot, line, NULL));
+}
+
+int
+__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
+{
+ int32_t retval = 0;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_input == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ hpts_sane_input_insert(hpts, inp, line);
+ retval = 1;
+ if (hpts->p_hpts_active == 0) {
+ /*
+ * Activate the hpts if it is sleeping.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
+ }
+ retval = 2;
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ } else if (hpts->p_hpts_active == 0) {
+ retval = 4;
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ return (retval);
+}
+
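+/*
+ * Stash the parse results for the segment in the mbuf pkthdr and
+ * chain the mbuf (via m_nextpkt) onto the tcb's t_in_pkt list for
+ * the hpts input path to process later.
+ */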
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked)
+{
+ /* Setup packet for input first */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
+ m->m_pkthdr.pace_tlen = (uint16_t) tlen;
+ m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
+ m->m_pkthdr.pace_tos = iptos;
+ m->m_pkthdr.pace_lock = (uint8_t) ti_locked;
+ if (tp->t_in_pkt == NULL) {
+ tp->t_in_pkt = m;
+ tp->t_tail_pkt = m;
+ } else {
+ tp->t_tail_pkt->m_nextpkt = m;
+ tp->t_tail_pkt = m;
+ }
+}
+
+
+int32_t
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t ret;
+
+ tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos, ti_locked);
+ hpts = tcp_input_lock(tp->t_inpcb);
+ ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ mtx_unlock(&hpts->p_mtx);
+ return (ret);
+}
+
+void
+__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp;
+
+ tp = intotcpcb(inp);
+ hpts = tcp_input_lock(tp->t_inpcb);
+ if (inp->inp_in_input == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ hpts_sane_input_insert(hpts, inp, line);
+ if (hpts->p_hpts_active == 0) {
+ /*
+ * Activate the hpts if it is sleeping.
+ */
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ } else if (hpts->p_hpts_active == 0) {
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ inp->inp_hpts_drop_reas = reason;
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static uint16_t
+hpts_random_cpu(struct inpcb *inp)
+{
+	/*
+	 * No flow type set; distribute the load randomly.
+	 */
+ uint16_t cpuid;
+ uint32_t ran;
+
+ /*
+ * If one has been set use it i.e. we want both in and out on the
+ * same hpts.
+ */
+ if (inp->inp_input_cpu_set) {
+ return (inp->inp_input_cpu);
+ } else if (inp->inp_hpts_cpu_set) {
+ return (inp->inp_hpts_cpu);
+ }
+ /* Nothing set use a random number */
+ ran = arc4random();
+ cpuid = (ran & 0xffff) % mp_ncpus;
+ return (cpuid);
+}
+
+static uint16_t
+hpts_cpuid(struct inpcb *inp)
+{
+	uint16_t cpuid;
+
+ /*
+ * If one has been set use it i.e. we want both in and out on the
+ * same hpts.
+ */
+ if (inp->inp_input_cpu_set) {
+ return (inp->inp_input_cpu);
+ } else if (inp->inp_hpts_cpu_set) {
+ return (inp->inp_hpts_cpu);
+ }
+ /* If one is set the other must be the same */
+#ifdef RSS
+ cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
+ if (cpuid == NETISR_CPUID_NONE)
+ return (hpts_random_cpu(inp));
+ else
+ return (cpuid);
+#else
+ /*
+ * We don't have a flowid -> cpuid mapping, so cheat and just map
+ * unknown cpuids to curcpu. Not the best, but apparently better
+ * than defaulting to swi 0.
+ */
+ if (inp->inp_flowtype != M_HASHTYPE_NONE) {
+ cpuid = inp->inp_flowid % mp_ncpus;
+ return (cpuid);
+ }
+ cpuid = hpts_random_cpu(inp);
+ return (cpuid);
+#endif
+}
+
+/*
+ * Do NOT try to optimize the processing of inp's
+ * by first pulling off all the inp's into a temporary
+ * list (e.g. TAILQ_CONCAT). If you do that the subtle
+ * interactions of switching CPUs will kill you because of
+ * problems in the linked list manipulation. Basically
+ * you would switch CPUs with the hpts mutex locked,
+ * but then while you were processing one of the inp's,
+ * some other one that you switched will get a new
+ * packet on the different CPU. It will insert it
+ * on the new hpts's input list. Creating a temporary
+ * link in the inp will not fix it either, since
+ * the other hpts will be doing the same thing and
+ * you will both end up using the temporary link.
+ *
+ * You will die in an ASSERT for tailq corruption if you
+ * run INVARIANTS or you will die horribly without
+ * INVARIANTS in some unknown way with a corrupt linked
+ * list.
+ */
+static void
+tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+ struct mbuf *m, *n;
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ uint16_t drop_reason;
+ int16_t set_cpu;
+ uint32_t did_prefetch = 0;
+ int32_t ti_locked = TI_UNLOCKED;
+
+ HPTS_MTX_ASSERT(hpts);
+ while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
+ HPTS_MTX_ASSERT(hpts);
+ hpts_sane_input_remove(hpts, inp, 0);
+ if (inp->inp_input_cpu_set == 0) {
+ set_cpu = 1;
+ } else {
+ set_cpu = 0;
+ }
+ hpts->p_inp = inp;
+ drop_reason = inp->inp_hpts_drop_reas;
+ inp->inp_in_input = 0;
+ mtx_unlock(&hpts->p_mtx);
+ if (drop_reason) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ } else {
+ ti_locked = TI_UNLOCKED;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+out:
+ hpts->p_inp = NULL;
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ if (in_pcbrele_wlocked(inp) == 0) {
+ INP_WUNLOCK(inp);
+ }
+ ti_locked = TI_UNLOCKED;
+ mtx_lock(&hpts->p_mtx);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if ((tp == NULL) || (tp->t_inpcb == NULL)) {
+ goto out;
+ }
+ if (drop_reason) {
+ /* This tcb is being destroyed for drop_reason */
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+ tp = tcp_drop(tp, drop_reason);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (tp == NULL) {
+ INP_WLOCK(inp);
+ }
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+ mtx_lock(&hpts->p_mtx);
+ continue;
+ }
+ if (set_cpu) {
+ /*
+			 * Set up so the next time we will move to the right
+			 * CPU. This should be a rare event. It will
+			 * sometimes happen when we are the client side
+			 * (usually not the server). Somehow tcp_output()
+			 * gets called before tcp_do_segment() sets the
+			 * initial state. This means the r_cpu and r_hpts_cpu
+			 * are 0. We get on the hpts, and then tcp_input()
+			 * gets called setting up the r_cpu to the correct
+			 * value. The hpts goes off and sees the mismatch.
+			 * We simply correct it here and the CPU will switch
+			 * to the new hpts next time the tcb gets added to
+			 * the hpts (not this time) :-)
+ */
+ tcp_set_hpts(inp);
+ }
+ CURVNET_SET(tp->t_vnet);
+ m = tp->t_in_pkt;
+ n = NULL;
+ if (m != NULL &&
+ (m->m_pkthdr.pace_lock == TI_RLOCKED ||
+ tp->t_state != TCPS_ESTABLISHED)) {
+ ti_locked = TI_RLOCKED;
+ if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
+ CURVNET_RESTORE();
+ goto out;
+ }
+ m = tp->t_in_pkt;
+ }
+ if (in_newts_every_tcb) {
+ if (in_ts_percision)
+ microuptime(tv);
+ else
+ getmicrouptime(tv);
+ }
+ if (tp->t_fb_ptr != NULL) {
+ kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ did_prefetch = 1;
+ }
+		/* Any input work to do? If so, do it first */
+ if ((m != NULL) && (m == tp->t_in_pkt)) {
+ struct tcphdr *th;
+ int32_t tlen, drop_hdrlen, nxt_pkt;
+ uint8_t iptos;
+
+ n = m->m_nextpkt;
+ tp->t_in_pkt = tp->t_tail_pkt = NULL;
+ while (m) {
+ th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
+ tlen = m->m_pkthdr.pace_tlen;
+ drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
+ iptos = m->m_pkthdr.pace_tos;
+ m->m_nextpkt = NULL;
+ if (n)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ inp->inp_input_calls = 1;
+ if (tp->t_fb->tfb_tcp_hpts_do_segment) {
+ /* Use the hpts specific do_segment */
+ (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
+ tp, drop_hdrlen,
+ tlen, iptos, ti_locked, nxt_pkt, tv);
+ } else {
+ /* Use the default do_segment */
+ (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
+ tp, drop_hdrlen,
+ tlen, iptos, ti_locked);
+ }
+ /*
+				 * Do segment returns unlocked; we need the
+				 * lock again, but we also need some KASSERTs
+				 * here.
+ */
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_UNLOCK_ASSERT(inp);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ if (m != NULL &&
+ m->m_pkthdr.pace_lock == TI_RLOCKED) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ } else
+ ti_locked = TI_UNLOCKED;
+ INP_WLOCK(inp);
+ /*
+ * Since we have an opening here we must
+ * re-check if the tcb went away while we
+ * were getting the lock(s).
+ */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_free:
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+ CURVNET_RESTORE();
+ goto out;
+ }
+ /*
+ * Now that we hold the INP lock, check if
+ * we need to upgrade our lock.
+ */
+ if (ti_locked == TI_UNLOCKED &&
+ (tp->t_state != TCPS_ESTABLISHED)) {
+ ti_locked = TI_RLOCKED;
+ if (tcp_hptsi_lock_inpinfo(inp, &tp))
+ goto out_free;
+ }
+ } /** end while(m) */
+ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_UNLOCK_ASSERT(inp);
+ ti_locked = TI_UNLOCKED;
+ mtx_lock(&hpts->p_mtx);
+ hpts->p_inp = NULL;
+ CURVNET_RESTORE();
+ }
+}
+
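+/*
+ * Compute how many wheel ticks have elapsed since the hpts last ran,
+ * clamped to the wheel size minus 2, and set p_nxt_slot to where the
+ * wheel will be once caught up. Returns -1 if no tick has passed
+ * (we were woken right away).
+ */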
+static int
+tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
+{
+ int32_t ticks_to_run;
+
+ if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
+ ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
+ if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
+ }
+ } else {
+ if (hpts->p_prevtick == hpts->p_curtick) {
+ /* This happens when we get woken up right away */
+ return (-1);
+ }
+ ticks_to_run = 1;
+ }
+	/* Set where we will be when we catch up */
+ hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
+ if (hpts->p_nxt_slot == hpts->p_cur_slot) {
+ panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
+ hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
+ }
+ return (ticks_to_run);
+}
+
+static void
+tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp = NULL, *ninp;
+ struct timeval tv;
+	int32_t ticks_to_run, i, error, tick_now, interim_tick;
+ int32_t paced_cnt = 0;
+ int32_t did_prefetch = 0;
+ int32_t prefetch_ninp = 0;
+ int32_t prefetch_tp = 0;
+ uint32_t cts;
+ int16_t set_cpu;
+
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_curtick = tcp_tv_to_hptstick(ctick);
+ cts = tcp_tv_to_usectick(ctick);
+ memcpy(&tv, ctick, sizeof(struct timeval));
+ hpts->p_cur_slot = hpts_tick(hpts, 1);
+
+ /* Figure out if we had missed ticks */
+again:
+ HPTS_MTX_ASSERT(hpts);
+ ticks_to_run = tcp_hpts_est_run(hpts);
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ panic("tp:%p in_hpts input empty but cnt:%d",
+ hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ HPTS_MTX_ASSERT(hpts);
+	/* Reset the ticks to run and time if we need to */
+	interim_tick = tcp_gethptstick(&tv);
+	if (interim_tick != hpts->p_curtick) {
+ /* Save off the new time we execute to */
+ *ctick = tv;
+		hpts->p_curtick = interim_tick;
+ cts = tcp_tv_to_usectick(&tv);
+ hpts->p_cur_slot = hpts_tick(hpts, 1);
+ ticks_to_run = tcp_hpts_est_run(hpts);
+ }
+ if (ticks_to_run == -1) {
+ goto no_run;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
+ }
+ if (hpts->p_on_queue_cnt == 0) {
+ goto no_one;
+ }
+ HPTS_MTX_ASSERT(hpts);
+ for (i = 0; i < ticks_to_run; i++) {
+ /*
+		 * Calculate our delay; if there are no extra ticks, there
+		 * was no delay.
+ */
+ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
+ HPTS_MTX_ASSERT(hpts);
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ /* For debugging */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
+ }
+ hpts->p_inp = inp;
+ paced_cnt++;
+ if (hpts->p_cur_slot != inp->inp_hptsslot) {
+ panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
+ hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ }
+ /* Now pull it */
+ if (inp->inp_hpts_cpu_set == 0) {
+ set_cpu = 1;
+ } else {
+ set_cpu = 0;
+ }
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ /* We prefetch the next inp if possible */
+ kern_prefetch(ninp, &prefetch_ninp);
+ prefetch_ninp = 1;
+ }
+ if (inp->inp_hpts_request) {
+ /*
+ * This guy is deferred out further in time
+				 * than our wheel had on it. Push him back
+ * on the wheel.
+ */
+ int32_t remaining_slots;
+
+ remaining_slots = ticks_to_run - (i + 1);
+ if (inp->inp_hpts_request > remaining_slots) {
+ /*
+ * Keep INVARIANTS happy by clearing
+ * the flag
+ */
+ tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ inp->inp_hpts_request = 0;
+ }
+ /*
+ * We clear the hpts flag here after dealing with
+ * remaining slots. This way anyone looking with the
+			 * TCB lock will see it's on the hpts until just
+ * before we unlock.
+ */
+ inp->inp_in_hpts = 0;
+ mtx_unlock(&hpts->p_mtx);
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+out_now:
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ INP_WUNLOCK(inp);
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if ((tp == NULL) || (tp->t_inpcb == NULL)) {
+ goto out_now;
+ }
+ if (set_cpu) {
+ /*
+				 * Set up so the next time we will move to
+				 * the right CPU. This should be a rare
+				 * event. It will sometimes happen when we
+				 * are the client side (usually not the
+				 * server). Somehow tcp_output() gets called
+				 * before tcp_do_segment() sets the
+				 * initial state. This means the r_cpu and
+				 * r_hpts_cpu are 0. We get on the hpts, and
+				 * then tcp_input() gets called setting up
+				 * the r_cpu to the correct value. The hpts
+				 * goes off and sees the mismatch. We
+				 * simply correct it here and the CPU will
+				 * switch to the new hpts next time the tcb
+				 * gets added to the hpts (not this one)
+ * :-)
+ */
+ tcp_set_hpts(inp);
+ }
+ if (out_newts_every_tcb) {
+ struct timeval sv;
+
+ if (out_ts_percision)
+ microuptime(&sv);
+ else
+ getmicrouptime(&sv);
+ cts = tcp_tv_to_usectick(&sv);
+ }
+ CURVNET_SET(tp->t_vnet);
+ /*
+			 * There is a hole here: we get the refcnt on the
+			 * inp so it will still be preserved, but to make
+			 * sure we can get at the INP we need to hold the p_mtx
+			 * above while we pull out the tp/inp. As long as
+			 * fini gets the lock first, we are assured of having
+			 * a sane INP we can lock and test.
+ */
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx before tcp-output:%d",
+ hpts, __LINE__);
+ }
+#endif
+ if (tp->t_fb_ptr != NULL) {
+ kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ did_prefetch = 1;
+ }
+ inp->inp_hpts_calls = 1;
+ if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
+ error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
+ } else {
+ error = tp->t_fb->tfb_tcp_output(tp);
+ }
+ if (ninp && ninp->inp_ppcb) {
+ /*
+ * If we have a nxt inp, see if we can
+ * prefetch its ppcb. Note this may seem
+ * "risky" since we have no locks (other
+				 * than the previous inp) and there is no
+				 * assurance that ninp was not pulled while
+				 * we were processing inp and freed. If this
+				 * occurred it could mean that either:
+				 *
+				 * a) It's NULL (which is fine, we won't go
+				 * here) <or> b) It's valid (which is cool, we
+ * will prefetch it) <or> c) The inp got
+ * freed back to the slab which was
+ * reallocated. Then the piece of memory was
+ * re-used and something else (not an
+ * address) is in inp_ppcb. If that occurs
+ * we don't crash, but take a TLB shootdown
+ * performance hit (same as if it was NULL
+ * and we tried to pre-fetch it).
+ *
+				 * Considering that the likelihood of <c> is
+ * quite rare we will take a risk on doing
+ * this. If performance drops after testing
+ * we can always take this out. NB: the
+ * kern_prefetch on amd64 actually has
+ * protection against a bad address now via
+ * the DMAP_() tests. This will prevent the
+ * TLB hit, and instead if <c> occurs just
+ * cause us to load cache with a useless
+ * address (to us).
+ */
+ kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
+ prefetch_tp = 1;
+ }
+ INP_WUNLOCK(inp);
+ INP_UNLOCK_ASSERT(inp);
+ CURVNET_RESTORE();
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
+ hpts->p_inp = NULL;
+ }
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_inp = NULL;
+ hpts->p_cur_slot++;
+ if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_cur_slot = 0;
+ }
+ }
+no_one:
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_prevtick = hpts->p_curtick;
+ hpts->p_delayed_by = 0;
+ /*
+ * Check to see if we took an excess amount of time and need to run
+	 * more ticks (if we did not hit ENOBUFS).
+ */
+ /* Re-run any input that may be there */
+ (void)tcp_gethptstick(&tv);
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ panic("tp:%p in_hpts input empty but cnt:%d",
+ hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ tick_now = tcp_gethptstick(&tv);
+ if (SEQ_GT(tick_now, hpts->p_prevtick)) {
+ struct timeval res;
+
+ /* Did we really spend a full tick or more in here? */
+ timersub(&tv, ctick, &res);
+ if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ counter_u64_add(hpts_loops, 1);
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
+ }
+ *ctick = res;
+ hpts->p_curtick = tick_now;
+ goto again;
+ }
+ }
+no_run:
+ {
+ uint32_t t = 0, i, fnd = 0;
+
+ if (hpts->p_on_queue_cnt) {
+ /*
+ * Find next slot that is occupied and use that to
+ * be the sleep time.
+ */
+ for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
+ fnd = 1;
+ break;
+ }
+ t = (t + 1) % NUM_OF_HPTSI_SLOTS;
+ }
+ if (fnd) {
+ hpts->p_hpts_sleep_time = i;
+ } else {
+ counter_u64_add(back_tosleep, 1);
+#ifdef INVARIANTS
+ panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+#endif
+ hpts->p_on_queue_cnt = 0;
+ goto non_found;
+ }
+ t++;
+ } else {
+ /* No one on the wheel; sleep for all but 2 slots */
+non_found:
+ if (hpts_sleep_max == 0)
+ hpts_sleep_max = 1;
+ hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
+ t = 0;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
+ }
+ }
+}
+
+void
+__tcp_set_hpts(struct inpcb *inp, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ if ((inp->inp_in_hpts == 0) &&
+ (inp->inp_hpts_cpu_set == 0)) {
+ inp->inp_hpts_cpu = hpts_cpuid(inp);
+ inp->inp_hpts_cpu_set = 1;
+ }
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if ((inp->inp_input_cpu_set == 0) &&
+ (inp->inp_in_input == 0)) {
+ inp->inp_input_cpu = hpts_cpuid(inp);
+ inp->inp_input_cpu_set = 1;
+ }
+ mtx_unlock(&hpts->p_mtx);
+}
+
+uint16_t
+tcp_hpts_delayedby(struct inpcb *inp)
+{
+ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
+}
+
+static void
+tcp_hpts_thread(void *ctx)
+{
+ struct tcp_hpts_entry *hpts;
+ struct timeval tv;
+ sbintime_t sb;
+
+ hpts = (struct tcp_hpts_entry *)ctx;
+ mtx_lock(&hpts->p_mtx);
+ if (hpts->p_direct_wake) {
+ /* Signaled by input */
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
+ callout_stop(&hpts->co);
+ } else {
+ /* Timed out */
+ if (callout_pending(&hpts->co) ||
+ !callout_active(&hpts->co)) {
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
+ mtx_unlock(&hpts->p_mtx);
+ return;
+ }
+ callout_deactivate(&hpts->co);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
+ }
+ hpts->p_hpts_active = 1;
+ (void)tcp_gethptstick(&tv);
+ tcp_hptsi(hpts, &tv);
+ HPTS_MTX_ASSERT(hpts);
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ tv.tv_usec = tcp_min_hptsi_time;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /* Clear the min sleep flag */
+ hpts->p_on_min_sleep = 0;
+ }
+ hpts->p_hpts_active = 0;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ hpts->p_direct_wake = 0;
+ mtx_unlock(&hpts->p_mtx);
+}
+
+#undef timersub
+
+static void
+tcp_init_hptsi(void *st)
+{
+ int32_t i, j, error, bound = 0, created = 0;
+ size_t sz, asz;
+ struct timeval tv;
+ sbintime_t sb;
+ struct tcp_hpts_entry *hpts;
+ char unit[16];
+ uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+
+ tcp_pace.rp_proc = NULL;
+ tcp_pace.rp_num_hptss = ncpus;
+ hpts_loops = counter_u64_alloc(M_WAITOK);
+ back_tosleep = counter_u64_alloc(M_WAITOK);
+
+ sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ M_TCPHPTS, M_WAITOK | M_ZERO);
+ tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
+ M_TCPHPTS, M_WAITOK);
+ hpts = tcp_pace.rp_ent[i];
+ /*
+ * Init all the hpts structures that are not specifically
+ * zero'd by the allocations. Also attach them to the
+ * appropriate sysctl block.
+ */
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
+ "hpts", MTX_DEF | MTX_DUPOK);
+ TAILQ_INIT(&hpts->p_input);
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j]);
+ }
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ sprintf(unit, "%d", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "in_qcnt", CTLFLAG_RD,
+ &hpts->p_on_inqueue_cnt, 0,
+ "Count TCB's awaiting input processing");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current slot is if active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curtick", CTLFLAG_RD,
+ &hpts->p_curtick, 0,
+ "What the current tick on if active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "logsize", CTLFLAG_RD,
+ &hpts->p_logsize, 0,
+ "Hpts logging buffer size");
+ hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ hpts->p_num = i;
+ hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prevtick -= 1;
+ hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_cpu = 0xffff;
+ hpts->p_nxt_slot = 1;
+ hpts->p_logsize = tcp_hpts_logging_size;
+ if (hpts->p_logsize) {
+ sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+ hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ }
+ callout_init(&hpts->co, 1);
+ }
+ /*
+ * Now let's start ithreads to handle the hptss.
+ */
+ CPU_FOREACH(i) {
+ hpts = tcp_pace.rp_ent[i];
+ hpts->p_cpu = i;
+ error = swi_add(&hpts->ie, "hpts",
+ tcp_hpts_thread, (void *)hpts,
+ SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
+ if (error) {
+ panic("Can't add hpts:%p i:%d err:%d",
+ hpts, i, error);
+ }
+ created++;
+ if (tcp_bind_threads) {
+ if (intr_event_bind(hpts->ie, i) == 0)
+ bound++;
+ }
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ }
+ printf("TCP Hpts created %d swi interrupt thread and bound %d\n",
+ created, bound);
+ return;
+}
+
+SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
new file mode 100644
index 000000000000..c5a3a5f197bd
--- /dev/null
+++ b/sys/netinet/tcp_hpts.h
@@ -0,0 +1,304 @@
+#ifndef __tcp_hpts_h__
+#define __tcp_hpts_h__
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$")
+ */
+
+/*
+ * The hpts uses a 102400-slot wheel. The wheel
+ * defines time in 10 usec increments (102400 x 10 usec).
+ * This gives a range of 10 usec to 1.024 seconds in
+ * which to place an entry. If the user requests more
+ * than 1.024 seconds, a remainder is attached and the
+ * hpts, on seeing the remainder, will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+TAILQ_HEAD(hptsh, inpcb);
+
+/* Number of usec in an hpts tick */
+#define HPTS_TICKS_PER_USEC 10
+#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_USEC_TO_SLOTS(x) ((x + 9) / 10)
+#define HPTS_USEC_IN_SEC 1000000
+#define HPTS_MSEC_IN_SEC 1000
+#define HPTS_USEC_IN_MSEC 1000
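+
+/*
+ * An illustrative example of the conversion macros above (the 2.5 ms
+ * delay is a made-up value): a stack wanting to pace its next send
+ * 2.5 ms into the future would ask for
+ *
+ *	HPTS_MS_TO_SLOTS(2) + HPTS_USEC_TO_SLOTS(500) == 200 + 50 == 250
+ *
+ * wheel slots, i.e. 250 ticks of 10 usec each.
+ */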
+
+#define DEFAULT_HPTS_LOG 3072
+
+/*
+ * Log flags consist of
+ * 7f 7f 1 1 bits
+ * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
+ *
+ * So, for example, cpu 10, number 10, with
+ * input active would show up as:
+ * p_flags = 0001010 0001010 1 0
+ * <or>
+ * p_flags = 0x142a
+ */
+#define HPTS_HPTS_ACTIVE 0x01
+#define HPTS_INPUT_ACTIVE 0x02
+
+#define HPTSLOG_IMMEDIATE 1
+#define HPTSLOG_INSERT_NORMAL 2
+#define HPTSLOG_INSERT_SLEEPER 3
+#define HPTSLOG_SLEEP_AFTER 4
+#define HPTSLOG_SLEEP_BEFORE 5
+#define HPTSLOG_INSERTED 6
+#define HPTSLOG_WAKEUP_HPTS 7
+#define HPTSLOG_SETTORUN 8
+#define HPTSLOG_HPTSI 9
+#define HPTSLOG_TOLONG 10
+#define HPTSLOG_AWAKENS 11
+#define HPTSLOG_TIMESOUT 12
+#define HPTSLOG_SLEEPSET 13
+#define HPTSLOG_WAKEUP_INPUT 14
+#define HPTSLOG_RESCHEDULE 15
+#define HPTSLOG_AWAKE 16
+#define HPTSLOG_INP_DONE 17
+
+struct hpts_log {
+ struct inpcb *inp;
+ int32_t event;
+ uint32_t cts;
+ int32_t line;
+ uint32_t ticknow;
+ uint32_t t_paceslot;
+ uint32_t t_hptsreq;
+ uint32_t p_curtick;
+ uint32_t p_prevtick;
+ uint32_t slot_req;
+ uint32_t p_on_queue_cnt;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t p_hpts_sleep_time;
+ uint16_t p_flags;
+ uint8_t p_onhpts;
+ uint8_t p_oninput;
+ uint8_t is_notempty;
+};
+
+struct hpts_diag {
+ uint32_t p_hpts_active;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t slot_req;
+ uint32_t inp_hptsslot;
+ uint32_t slot_now;
+ uint32_t have_slept;
+ uint32_t hpts_sleep_time;
+ uint32_t yet_to_sleep;
+ uint32_t need_new_to;
+ int32_t co_ret;
+ uint8_t p_on_min_sleep;
+};
+
+#ifdef _KERNEL
+/* Each hpts has its own p_mtx which is used for locking */
+struct tcp_hpts_entry {
+ /* Cache line 0x00 */
+ struct mtx p_mtx; /* Mutex for hpts */
+ uint32_t p_hpts_active; /* Flag that says hpts is awake */
+ uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
+ uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
+ uint32_t p_nxt_slot; /* The next slot outside the current range of
+ * slots that the hpts is running on. */
+ int32_t p_on_queue_cnt; /* Count on queue in this hpts */
+ uint32_t enobuf_cnt;
+ uint16_t p_log_at;
+ uint8_t p_direct_wake :1, /* boolean */
+ p_log_wrapped :1, /* boolean */
+ p_on_min_sleep:1; /* boolean */
+ uint8_t p_fill;
+ /* Cache line 0x40 */
+ void *p_inp;
+ struct hptsh p_input; /* For the tcp-input runner */
+ /* Hptsi wheel */
+ struct hptsh *p_hptss;
+ struct hpts_log *p_log;
+ uint32_t p_logsize;
+ int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
+ uint32_t hit_no_enobuf;
+ uint32_t p_dyn_adjust;
+ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
+ * of 255ms */
+ uint32_t p_delayed_by; /* How much were we delayed by */
+ /* Cache line 0x80 */
+ struct sysctl_ctx_list hpts_ctx;
+ struct sysctl_oid *hpts_root;
+ struct intr_event *ie;
+ void *ie_cookie;
+ uint16_t p_num; /* The hpts number one per cpu */
+ uint16_t p_cpu; /* The hpts CPU */
+ /* There is extra space in here */
+ /* Cache line 0x100 */
+ struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+struct tcp_hptsi {
+ struct proc *rp_proc; /* Process structure for hpts */
+ struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t rp_num_hptss; /* Number of hpts threads */
+};
+
+#endif
+
+#define HPTS_REMOVE_INPUT 0x01
+#define HPTS_REMOVE_OUTPUT 0x02
+#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
+
+/*
+ * When using the hpts, a TCP stack must make sure
+ * that once an INP_DROPPED flag is applied to an INP
+ * it does not expect tcp_output() to ever be
+ * called by the hpts. The hpts will *not* call
+ * any output (or input) functions on a TCB that
+ * is in the DROPPED state.
+ *
+ * This implies that final ACKs and RSTs which might
+ * be sent while a TCB is still around must be
+ * sent from a routine like tcp_respond().
+ */
+#define DEFAULT_MIN_SLEEP 250 /* Default number of usec for the hpts to sleep;
+ * this determines the min granularity of the
+ * hpts. If 0, granularity is 10 usec at
+ * the cost of more CPU (context switching). */
+#ifdef _KERNEL
+#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
+struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
+int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_hpts_immediate(a) __tcp_queue_to_hpts_immediate(a, __LINE__)
+
+struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
+#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
+void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
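+
+/*
+ * For example (as tcp_usrreq.c does later in this diff when switching
+ * stacks), a caller holding the INP_WLOCK() can make sure a connection
+ * is off both the output wheel and the input queue with:
+ *
+ *	tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+ */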
+
+/*
+ * To insert a TCB on the hpts you *must* be holding the
+ * INP_WLOCK(). The hpts insert code will then acquire
+ * the hpts's lock and insert the TCB on the requested
+ * slot, possibly waking up the hpts if you are requesting
+ * a time earlier than what the hpts is sleeping to (if
+ * the hpts is sleeping). You may check the inp->inp_in_hpts
+ * flag without the hpts lock. The hpts is the only one
+ * that will clear this flag, holding only the hpts lock. This
+ * means that in your tcp_output() routine, when you test
+ * it for 1 (so you won't call output), it may be transitioning
+ * to 0 (by the hpts). That is fine, since it just
+ * means an extra call to tcp_output, which most likely will
+ * find that the call you executed (when the mismatch occurred)
+ * has put the TCB back on the hpts, and it will return. If your
+ * call did not add it back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Though usually
+ * you are either doing this from a timer, where you need
+ * the INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ */
+uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
+#define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
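+
+/*
+ * A minimal usage sketch following the rules above; the caller code is
+ * hypothetical and "slot_usec" is an assumed variable holding the
+ * desired pacing delay in usec:
+ *
+ *	INP_WLOCK(inp);
+ *	if (inp->inp_in_hpts == 0)
+ *		tcp_hpts_insert(inp, HPTS_USEC_TO_SLOTS(slot_usec));
+ *	INP_WUNLOCK(inp);
+ */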
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
+
+int
+ __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
+#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__)
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked);
+int
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line);
+#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+
+uint16_t tcp_hpts_delayedby(struct inpcb *inp);
+
+void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+
+void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
+#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+
+extern int32_t tcp_min_hptsi_time;
+
+static __inline uint32_t
+tcp_tv_to_hptstick(struct timeval *sv)
+{
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
+}
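+
+/*
+ * For example (illustrative arithmetic only): a timeval of
+ * { .tv_sec = 1, .tv_usec = 250 } maps to 100000 + 25 == 100025
+ * hpts ticks, since each tick is 10 usec.
+ */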
+
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
+static __inline uint32_t
+tcp_tv_to_usectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+static __inline uint32_t
+tcp_tv_to_mssectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
+}
+
+static __inline void
+tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
+{
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static __inline uint32_t
+tcp_get_usecs(struct timeval *tv)
+{
+ struct timeval tvd;
+
+ if (tv == NULL)
+ tv = &tvd;
+ microuptime(tv);
+ return (tcp_tv_to_usectick(tv));
+}
+
+#endif
+#endif
diff --git a/sys/netinet/tcp_stacks/fastpath.c b/sys/netinet/tcp_stacks/fastpath.c
index 92db0d551fee..c6632a22c058 100644
--- a/sys/netinet/tcp_stacks/fastpath.c
+++ b/sys/netinet/tcp_stacks/fastpath.c
@@ -2404,7 +2404,7 @@ tcp_addfastpaths(module_t mod, int type, void *data)
err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
if (err) {
printf("Failed to register fastslow module -- err:%d\n", err);
- deregister_tcp_functions(&__tcp_fastack);
+ deregister_tcp_functions(&__tcp_fastack, false, true);
return(err);
}
break;
@@ -2412,12 +2412,12 @@ tcp_addfastpaths(module_t mod, int type, void *data)
if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
return(EBUSY);
}
+ err = deregister_tcp_functions(&__tcp_fastack, true, false);
+ err = deregister_tcp_functions(&__tcp_fastslow, true, false);
break;
case MOD_UNLOAD:
- err = deregister_tcp_functions(&__tcp_fastack);
- if (err == EBUSY)
- break;
- err = deregister_tcp_functions(&__tcp_fastslow);
+ err = deregister_tcp_functions(&__tcp_fastack, false, true);
+ err = deregister_tcp_functions(&__tcp_fastslow, false, true);
if (err == EBUSY)
break;
err = 0;
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 77cfc8d12724..bc03fb37de46 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -232,6 +232,9 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone);
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
#endif
+static int tcp_default_fb_init(struct tcpcb *tp);
+static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
+static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static void tcp_mtudisc(struct inpcb *, int);
@@ -240,18 +243,13 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
static struct tcp_function_block tcp_def_funcblk = {
- "default",
- tcp_output,
- tcp_do_segment,
- tcp_default_ctloutput,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- 0,
- 0
+ .tfb_tcp_block_name = "freebsd",
+ .tfb_tcp_output = tcp_output,
+ .tfb_tcp_do_segment = tcp_do_segment,
+ .tfb_tcp_ctloutput = tcp_default_ctloutput,
+ .tfb_tcp_handoff_ok = tcp_default_handoff_ok,
+ .tfb_tcp_fb_init = tcp_default_fb_init,
+ .tfb_tcp_fb_fini = tcp_default_fb_fini,
};
int t_functions_inited = 0;
@@ -328,6 +326,88 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk)
return(rblk);
}
+static struct tcp_function_block *
+find_and_ref_tcp_default_fb(void)
+{
+ struct tcp_function_block *rblk;
+
+ rw_rlock(&tcp_function_lock);
+ rblk = tcp_func_set_ptr;
+ refcount_acquire(&rblk->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ return (rblk);
+}
+
+void
+tcp_switch_back_to_default(struct tcpcb *tp)
+{
+ struct tcp_function_block *tfb;
+
+ KASSERT(tp->t_fb != &tcp_def_funcblk,
+ ("%s: called by the built-in default stack", __func__));
+
+ /*
+ * Release the old stack. This function will either find a new one
+ * or panic.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+
+ /*
+ * Now, we'll find a new function block to use.
+ * Start by trying the current user-selected
+ * default, unless this stack is the user-selected
+ * default.
+ */
+ tfb = find_and_ref_tcp_default_fb();
+ if (tfb == tp->t_fb) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Does the stack accept this connection? */
+ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
+ (*tfb->tfb_tcp_handoff_ok)(tp)) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Try to use that stack. */
+ if (tfb != NULL) {
+ /* Initialize the new stack. If it succeeds, we are done. */
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init == NULL ||
+ (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ return;
+
+ /*
+ * Initialization failed. Release the reference count on
+ * the stack.
+ */
+ refcount_release(&tfb->tfb_refcnt);
+ }
+
+ /*
+ * If that wasn't feasible, use the built-in default
+ * stack which is not allowed to reject anyone.
+ */
+ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
+ if (tfb == NULL) {
+ /* There should always be a default. */
+ panic("Can't refer to tcp_def_funcblk");
+ }
+ if (tfb->tfb_tcp_handoff_ok != NULL) {
+ if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
+ /* The default stack cannot say no */
+ panic("Default stack rejects a new session?");
+ }
+ }
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init != NULL &&
+ (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ /* The default stack cannot fail */
+ panic("Default stack initialization failed");
+ }
+}
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
@@ -507,6 +587,89 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
"List TCP function block name-to-ID mappings");
/*
+ * tfb_tcp_handoff_ok() function for the default stack.
+ * Note that we'll basically try to take all comers.
+ */
+static int
+tcp_default_handoff_ok(struct tcpcb *tp)
+{
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_init() function for the default stack.
+ *
+ * This handles making sure we have appropriate timers set if you are
+ * transitioning a socket that has some amount of setup done.
+ *
+ * The init() function for the default stack can *never* return non-zero,
+ * i.e. it is required to always succeed since it is the stack of last resort!
+ */
+static int
+tcp_default_fb_init(struct tcpcb *tp)
+{
+
+ struct socket *so;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
+ ("%s: connection %p in unexpected state %d", __func__, tp,
+ tp->t_state));
+
+ /*
+ * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
+ * know what to do for unexpected states (which includes TIME_WAIT).
+ */
+ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
+ return (0);
+
+ /*
+ * Make sure some kind of transmission timer is set if there is
+ * outstanding data.
+ */
+ so = tp->t_inpcb->inp_socket;
+ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
+ tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
+ tcp_timer_active(tp, TT_PERSIST))) {
+ /*
+ * If the session has established and it looks like it should
+ * be in the persist state, set the persist timer. Otherwise,
+ * set the retransmit timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
+ (int32_t)(tp->snd_nxt - tp->snd_una) <
+ (int32_t)sbavail(&so->so_snd))
+ tcp_setpersist(tp);
+ else
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ }
+
+ /* All non-embryonic sessions get a keepalive timer. */
+ if (!tcp_timer_active(tp, TT_KEEP))
+ tcp_timer_activate(tp, TT_KEEP,
+ TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
+ TP_KEEPINIT(tp));
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_fini() function for the default stack.
+ *
+ * This changes state as necessary (or prudent) to prepare for another stack
+ * to assume responsibility for the connection.
+ */
+static void
+tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return;
+}
+
+/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
* Note that this can be overridden by the kernel environment
@@ -732,11 +895,28 @@ register_tcp_functions(struct tcp_function_block *blk, int wait)
return (register_tcp_functions_as_name(blk, NULL, wait));
}
+/*
+ * Deregister all names associated with a function block. This
+ * functionally removes the function block from use within the system.
+ *
+ * When called with a true quiesce argument, mark the function block
+ * as being removed so no more stacks will use it and determine
+ * whether the removal would succeed.
+ *
+ * When called with a false quiesce argument, actually attempt the
+ * removal.
+ *
+ * When called with a force argument, attempt to switch all TCBs to
+ * use the default stack instead of returning EBUSY.
+ *
+ * Returns 0 on success (or if the removal would succeed), or an
+ * error code on failure.
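+ *
+ * For example (mirroring the fastpath.c changes in this diff), a
+ * module event handler would use:
+ *
+ *	case MOD_QUIESCE:
+ *		err = deregister_tcp_functions(&my_fb, true, false);
+ *		break;
+ *	case MOD_UNLOAD:
+ *		err = deregister_tcp_functions(&my_fb, false, true);
+ *		break;
+ *
+ * where "my_fb" stands in for the module's tcp_function_block.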
+ */
int
-deregister_tcp_functions(struct tcp_function_block *blk)
+deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force)
{
struct tcp_function *f;
- int error=ENOENT;
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
@@ -748,22 +928,63 @@ deregister_tcp_functions(struct tcp_function_block *blk)
rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ /* Mark the block so no more stacks can use it. */
+ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+ /*
+ * If TCBs are still attached to the stack, attempt to switch them
+ * to the default stack.
+ */
+ if (force && blk->tfb_refcnt) {
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ rw_wunlock(&tcp_function_lock);
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ INP_WLOCK(inp);
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if (tp == NULL || tp->t_fb != blk) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tcp_switch_back_to_default(tp);
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ rw_wlock(&tcp_function_lock);
+ }
if (blk->tfb_refcnt) {
- /* Still tcb attached, mark it. */
- blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
- rw_wunlock(&tcp_function_lock);
+ /* TCBs still attached. */
+ rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ if (quiesce) {
+ /* Skip removal. */
+ rw_wunlock(&tcp_function_lock);
+ return (0);
+ }
+ /* Remove any function names that map to this function block. */
while (find_tcp_fb_locked(blk, &f) != NULL) {
- /* Found */
TAILQ_REMOVE(&t_functions, f, tf_next);
tcp_fb_cnt--;
f->tf_fb = NULL;
free(f, M_TCPFUNCTIONS);
- error = 0;
}
rw_wunlock(&tcp_function_lock);
- return (error);
+ return (0);
}
void
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index a91fd1eca220..8061e512fd7a 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -852,6 +852,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = rblk;
+ /*
+ * XXXrrs this is quite dangerous, it is possible
+ * for the new function to fail to init. We are also
+ * not asking if the handoff is ok, though at
+ * the very start that's probably ok.
+ */
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index c824fbbf2202..c9d8c844e7e4 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1521,17 +1521,34 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
*/
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
}
+#ifdef TCPHPTS
+ /* Assure that we are not on any hpts */
+ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+#endif
+ if (blk->tfb_tcp_fb_init) {
+ error = (*blk->tfb_tcp_fb_init)(tp);
+ if (error) {
+ refcount_release(&blk->tfb_refcnt);
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
+ /* Fall back failed, drop the connection */
+ INP_WUNLOCK(inp);
+ soabort(so);
+ return(error);
+ }
+ }
+ goto err_out;
+ }
+ }
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = blk;
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
- }
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
sopt->sopt_name);
}
#endif
+err_out:
INP_WUNLOCK(inp);
return (error);
} else if ((sopt->sopt_dir == SOPT_GET) &&
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 53a748ebb2dd..2c1847740287 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -90,6 +90,8 @@ struct tcpcb {
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
+ struct mbuf *t_in_pkt; /* head of the input packet queue for the tcp_hpts system */
+ struct mbuf *t_tail_pkt; /* tail of the input packet queue for the tcp_hpts system */
struct tcp_timer *t_timers; /* All the TCP timers in one struct */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
@@ -257,14 +259,19 @@ struct tcptemp {
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
+ int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
int);
+ void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
/* Optional memory allocation/free routine */
- void (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
@@ -274,6 +281,7 @@ struct tcp_function_block {
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@@ -851,9 +859,12 @@ int register_tcp_functions_as_names(struct tcp_function_block *blk,
int wait, const char *names[], int *num_names);
int register_tcp_functions_as_name(struct tcp_function_block *blk,
const char *name, int wait);
-int deregister_tcp_functions(struct tcp_function_block *blk);
+int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force);
struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
-struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+void tcp_switch_back_to_default(struct tcpcb *tp);
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
diff --git a/sys/sys/kern_prefetch.h b/sys/sys/kern_prefetch.h
new file mode 100644
index 000000000000..5acf06597498
--- /dev/null
+++ b/sys/sys/kern_prefetch.h
@@ -0,0 +1,50 @@
+#ifndef __kern_prefetch_h__
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$")
+ */
+#define __kern_prefetch_h__
+#ifdef _KERNEL
+#if defined(__amd64__)
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/pmap.h>
+#endif
+
+static __inline void
+kern_prefetch(const volatile void *addr, void *before)
+{
+#if defined(__amd64__)
+ __asm __volatile("prefetcht1 (%1)":"=rm"(*((int32_t *)before)):"r"(addr):);
+#else
+ __builtin_prefetch(addr);
+#endif
+}
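+
+/*
+ * A usage sketch (mirroring how the hpts output loop in this diff
+ * warms the cache before touching a control block; "did_prefetch" is
+ * just a local that keeps the compiler from discarding the prefetch):
+ *
+ *	int did_prefetch = 0;
+ *
+ *	if (tp->t_fb_ptr != NULL) {
+ *		kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ *		did_prefetch = 1;
+ *	}
+ */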
+
+#endif
+#endif
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index ba1e88c6175d..81aed4e75f88 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -196,6 +196,11 @@ struct pkthdr {
#define lro_nsegs tso_segsz
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
+#define pace_thoff PH_loc.sixteen[0]
+#define pace_tlen PH_loc.sixteen[1]
+#define pace_drphdrlen PH_loc.sixteen[2]
+#define pace_tos PH_loc.eight[6]
+#define pace_lock PH_loc.eight[7]
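+/*
+ * A hedged sketch of the intended use of the pace_* fields (assumed
+ * from the names and from the tcp_queue_pkt_to_input() prototype in
+ * this diff): the hpts input path stashes the parsed header facts in
+ * PH_loc while the mbuf waits on the tp->t_in_pkt queue, e.g.
+ *
+ *	m->m_pkthdr.pace_tlen = (uint16_t)tlen;
+ *	m->m_pkthdr.pace_drphdrlen = (uint16_t)drop_hdrlen;
+ *	m->m_pkthdr.pace_tos = iptos;
+ *	m->m_pkthdr.pace_lock = ti_locked;
+ */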
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is