author     Gleb Smirnoff <glebius@FreeBSD.org>   2023-04-25 19:18:33 +0000
committer  Gleb Smirnoff <glebius@FreeBSD.org>   2023-04-25 19:18:33 +0000
commit     c2a69e846fffb95271c0299e0a81e2033382e9c2 (patch)
tree       29b324199272af7b1359717bfddd3fe7b56faee7
parent     144259f673038635709022506d3adc819da137b6 (diff)
tcp_hpts: move HPTS related fields from inpcb to tcpcb

This makes inpcb lighter and allows future cache line optimizations of
tcpcb. The reason why HPTS originally used inpcb is the compressed
TIME-WAIT state (see 0d7445193ab), which used to free the tcpcb while
the associated connection was still on the HPTS ring.

Reviewed by:		rrs
Differential Revision:	https://reviews.freebsd.org/D39697
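In one picture, a condensed, non-verbatim sketch of the end state (the
tcp_var.h hunk at the bottom of this diff is the authoritative layout):
all pacer linkage and bookkeeping now hang off the tcpcb.

	struct tcpcb {
		...
		/* HPTS. Used by BBR and Rack stacks. */
		TAILQ_ENTRY(tcpcb) t_hpts;	/* linkage to HPTS ring */
		uint32_t t_hpts_request;	/* request beyond current wheel */
		uint32_t t_hpts_slot;		/* HPTS wheel slot this tcb is on */
		uint32_t t_hpts_gencnt;
		uint16_t t_hpts_cpu;		/* CPU chosen by hpts_cpuid() */
		uint16_t t_lro_cpu;		/* CPU derived from LRO */
		enum { IHPTS_NONE = 0, IHPTS_ONQUEUE, IHPTS_MOVING }
		    t_in_hpts;			/* is it linked into HPTS? */
		...
	};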
-rw-r--r--  sys/netinet/in_pcb.c           |   2
-rw-r--r--  sys/netinet/in_pcb.h           |  74
-rw-r--r--  sys/netinet/tcp_hpts.c         | 384
-rw-r--r--  sys/netinet/tcp_hpts.h         |  16
-rw-r--r--  sys/netinet/tcp_lro.c          |   6
-rw-r--r--  sys/netinet/tcp_stacks/bbr.c   |  66
-rw-r--r--  sys/netinet/tcp_stacks/rack.c  | 116
-rw-r--r--  sys/netinet/tcp_subr.c         |  11
-rw-r--r--  sys/netinet/tcp_timewait.c     |   1
-rw-r--r--  sys/netinet/tcp_usrreq.c       |   2
-rw-r--r--  sys/netinet/tcp_var.h          |  20
11 files changed, 311 insertions, 387 deletions
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 9193dfb2372b..350d08360105 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1692,7 +1692,6 @@ in_pcbrele_rlocked(struct inpcb *inp)
MPASS(inp->inp_flags & INP_FREED);
MPASS(inp->inp_socket == NULL);
- MPASS(inp->inp_in_hpts == 0);
crfree(inp->inp_cred);
#ifdef INVARIANTS
inp->inp_cred = NULL;
@@ -1713,7 +1712,6 @@ in_pcbrele_wlocked(struct inpcb *inp)
MPASS(inp->inp_flags & INP_FREED);
MPASS(inp->inp_socket == NULL);
- MPASS(inp->inp_in_hpts == 0);
crfree(inp->inp_cred);
#ifdef INVARIANTS
inp->inp_cred = NULL;
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 984cb9e26561..62c5758268a7 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -145,7 +145,6 @@ struct in_conninfo {
* lock is to be obtained and SMR section exited.
*
* Key:
- * (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (e) - Protected by the SMR section
* (i) - Protected by the inpcb lock
@@ -154,51 +153,6 @@ struct in_conninfo {
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
*
- * Notes on the tcp_hpts:
- *
- * First Hpts lock order is
- * 1) INP_WLOCK()
- * 2) HPTS_LOCK() i.e. hpts->pmtx
- *
- * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
- * You may check the inp->inp_in_hpts flag without the hpts lock.
- * The hpts is the only one that will clear this flag holding
- * only the hpts lock. This means that in your tcp_output()
- * routine when you test for the inp_in_hpts flag to be 1
- * it may be transitioning to 0 (by the hpts).
- * That's ok since that will just mean an extra call to tcp_output
- * that most likely will find the call you executed
- * (when the mis-match occurred) will have put the TCB back
- * on the hpts and it will return. If your
- * call did not add the inp back to the hpts then you will either
- * over-send or the cwnd will block you from sending more.
- *
- * Note you should also be holding the INP_WLOCK() when you
- * call the remove from the hpts as well. Though usually
- * you are either doing this from a timer, where you need and have
- * the INP_WLOCK() or from destroying your TCB where again
- * you should already have the INP_WLOCK().
- *
- * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
- * inp_input_cpu_set fields are controlled completely by
- * the hpts. Do not ever set these. The inp_hpts_cpu_set
- * and inp_input_cpu_set fields indicate if the hpts has
- * setup the respective cpu field. It is advised if this
- * field is 0, to enqueue the packet with the appropriate
- * hpts_immediate() call. If the _set field is 1, then
- * you may compare the inp_*_cpu field to the curcpu and
- * may want to again insert onto the hpts if these fields
- * are not equal (i.e. you are not on the expected CPU).
- *
- * A note on inp_hpts_calls and inp_input_calls, these
- * flags are set when the hpts calls either the output
- * or do_segment routines respectively. If the routine
- * being called wants to use this, then it needs to
- * clear the flag before returning. The hpts will not
- * clear the flag. The flags can be used to tell if
- * the hpts is the function calling the respective
- * routine.
- *
* A few other notes:
*
* When a read lock is held, stability of the field is guaranteed; to write
@@ -219,41 +173,15 @@ struct inpcb {
CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
-#define inp_start_zero inp_hpts
+#define inp_start_zero inp_refcount
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
- TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */
- uint32_t inp_hpts_gencnt; /* XXXGL */
- uint32_t inp_hpts_request; /* Current hpts request, zero if
- * fits in the pacing window (i&b). */
- /*
- * Note the next fields are protected by a
- * different lock (hpts-lock). This means that
- * they must correspond in size to the smallest
- * protectable bit field (uint8_t on x86, and
- * other platfomrs potentially uint32_t?). Also
- * since CPU switches can occur at different times the two
- * fields can *not* be collapsed into a signal bit field.
- */
-#if defined(__amd64__) || defined(__i386__)
- uint8_t inp_in_hpts; /* on output hpts (lock b) */
-#else
- uint32_t inp_in_hpts; /* on output hpts (lock b) */
-#endif
- volatile uint16_t inp_hpts_cpu; /* Lock (i) */
- volatile uint16_t inp_irq_cpu; /* Set by LRO in behalf of or the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
- uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
- inp_hpts_calls :1, /* (i) from output hpts */
- inp_irq_cpu_set :1, /* (i) from LRO/Driver */
- inp_spare_bits2 : 3;
uint8_t inp_numa_domain; /* numa domain */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
- int32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */
- uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index cc1bd71d0d43..59122bb242b9 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -199,7 +199,7 @@ struct tcp_hpts_entry {
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
struct hptsh {
- TAILQ_HEAD(, inpcb) head;
+ TAILQ_HEAD(, tcpcb) head;
uint32_t count;
uint32_t gencnt;
} *p_hptss; /* Hptsi wheel */
@@ -273,12 +273,6 @@ static struct hpts_domain_info {
int cpu[MAXCPU];
} hpts_domains[MAXMEMDOM];
-enum {
- IHPTS_NONE = 0,
- IHPTS_ONQUEUE,
- IHPTS_MOVING,
-};
-
counter_u64_t hpts_hopelessly_behind;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -426,6 +420,17 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
"When we are over the threshold on the pacer do we prohibit wakeups?");
+static uint16_t
+hpts_random_cpu(void)
+{
+ uint16_t cpuid;
+ uint32_t ran;
+
+ ran = arc4random();
+ cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+ return (cpuid);
+}
+
static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
int slots_to_run, int idx, int from_callout)
@@ -489,94 +494,107 @@ hpts_timeout_swi(void *arg)
}
static void
-inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts)
+tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
{
+ struct inpcb *inp = tptoinpcb(tp);
struct hptsh *hptsh;
INP_WLOCK_ASSERT(inp);
HPTS_MTX_ASSERT(hpts);
- MPASS(hpts->p_cpu == inp->inp_hpts_cpu);
+ MPASS(hpts->p_cpu == tp->t_hpts_cpu);
MPASS(!(inp->inp_flags & INP_DROPPED));
- hptsh = &hpts->p_hptss[inp->inp_hptsslot];
+ hptsh = &hpts->p_hptss[tp->t_hpts_slot];
- if (inp->inp_in_hpts == IHPTS_NONE) {
- inp->inp_in_hpts = IHPTS_ONQUEUE;
+ if (tp->t_in_hpts == IHPTS_NONE) {
+ tp->t_in_hpts = IHPTS_ONQUEUE;
in_pcbref(inp);
- } else if (inp->inp_in_hpts == IHPTS_MOVING) {
- inp->inp_in_hpts = IHPTS_ONQUEUE;
+ } else if (tp->t_in_hpts == IHPTS_MOVING) {
+ tp->t_in_hpts = IHPTS_ONQUEUE;
} else
- MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
- inp->inp_hpts_gencnt = hptsh->gencnt;
+ MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
+ tp->t_hpts_gencnt = hptsh->gencnt;
- TAILQ_INSERT_TAIL(&hptsh->head, inp, inp_hpts);
+ TAILQ_INSERT_TAIL(&hptsh->head, tp, t_hpts);
hptsh->count++;
hpts->p_on_queue_cnt++;
}
static struct tcp_hpts_entry *
-tcp_hpts_lock(struct inpcb *inp)
+tcp_hpts_lock(struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
- INP_LOCK_ASSERT(inp);
+ INP_LOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_pace.rp_ent[inp->inp_hpts_cpu];
+ hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
HPTS_LOCK(hpts);
return (hpts);
}
static void
-inp_hpts_release(struct inpcb *inp)
+tcp_hpts_release(struct tcpcb *tp)
{
bool released __diagused;
- inp->inp_in_hpts = IHPTS_NONE;
- released = in_pcbrele_wlocked(inp);
+ tp->t_in_hpts = IHPTS_NONE;
+ released = in_pcbrele_wlocked(tptoinpcb(tp));
MPASS(released == false);
}
/*
+ * Initialize a newborn tcpcb to get it ready for use with HPTS.
+ */
+void
+tcp_hpts_init(struct tcpcb *tp)
+{
+
+ tp->t_hpts_cpu = hpts_random_cpu();
+ tp->t_lro_cpu = HPTS_CPU_NONE;
+ MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
+}
+
+/*
* Called normally with the INP_LOCKED but it
* does not matter, the hpts lock is the key
* but the lock order allows us to hold the
* INP lock and then get the hpts lock.
*/
void
-tcp_hpts_remove(struct inpcb *inp)
+tcp_hpts_remove(struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
struct hptsh *hptsh;
- INP_WLOCK_ASSERT(inp);
+ INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(inp);
- if (inp->inp_in_hpts == IHPTS_ONQUEUE) {
- hptsh = &hpts->p_hptss[inp->inp_hptsslot];
- inp->inp_hpts_request = 0;
- if (__predict_true(inp->inp_hpts_gencnt == hptsh->gencnt)) {
- TAILQ_REMOVE(&hptsh->head, inp, inp_hpts);
+ hpts = tcp_hpts_lock(tp);
+ if (tp->t_in_hpts == IHPTS_ONQUEUE) {
+ hptsh = &hpts->p_hptss[tp->t_hpts_slot];
+ tp->t_hpts_request = 0;
+ if (__predict_true(tp->t_hpts_gencnt == hptsh->gencnt)) {
+ TAILQ_REMOVE(&hptsh->head, tp, t_hpts);
MPASS(hptsh->count > 0);
hptsh->count--;
MPASS(hpts->p_on_queue_cnt > 0);
hpts->p_on_queue_cnt--;
- inp_hpts_release(inp);
+ tcp_hpts_release(tp);
} else {
/*
* tcp_hptsi() now owns the TAILQ head of this inp.
* Can't TAILQ_REMOVE, just mark it.
*/
#ifdef INVARIANTS
- struct inpcb *tmp;
+ struct tcpcb *tmp;
- TAILQ_FOREACH(tmp, &hptsh->head, inp_hpts)
- MPASS(tmp != inp);
+ TAILQ_FOREACH(tmp, &hptsh->head, t_hpts)
+ MPASS(tmp != tp);
#endif
- inp->inp_in_hpts = IHPTS_MOVING;
- inp->inp_hptsslot = -1;
+ tp->t_in_hpts = IHPTS_MOVING;
+ tp->t_hpts_slot = -1;
}
- } else if (inp->inp_in_hpts == IHPTS_MOVING) {
+ } else if (tp->t_in_hpts == IHPTS_MOVING) {
/*
* Handle a special race condition:
* tcp_hptsi() moves inpcb to detached tailq
@@ -585,18 +603,11 @@ tcp_hpts_remove(struct inpcb *inp)
* tcp_hpts_remove() again (we are here!), then in_pcbdrop()
* tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED
*/
- inp->inp_hptsslot = -1;
+ tp->t_hpts_slot = -1;
}
HPTS_UNLOCK(hpts);
}
-bool
-tcp_in_hpts(struct inpcb *inp)
-{
-
- return (inp->inp_in_hpts == IHPTS_ONQUEUE);
-}
-
static inline int
hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
@@ -762,15 +773,15 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
#ifdef INVARIANTS
static void
-check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
+ uint32_t hptsslot, int line)
{
/*
* Sanity checks for the pacer with invariants
* on insert.
*/
- KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
- ("hpts:%p inp:%p slot:%d > max",
- hpts, inp, inp_hptsslot));
+ KASSERT(hptsslot < NUM_OF_HPTSI_SLOTS,
+ ("hpts:%p tp:%p slot:%d > max", hpts, tp, hptsslot));
if ((hpts->p_hpts_active) &&
(hpts->p_wheel_complete == 0)) {
/*
@@ -781,22 +792,21 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uin
*/
int distance, yet_to_run;
- distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
+ distance = hpts_slots_diff(hpts->p_runningslot, hptsslot);
if (hpts->p_runningslot != hpts->p_cur_slot)
yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
else
yet_to_run = 0; /* processing last slot */
- KASSERT(yet_to_run <= distance,
- ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
- hpts, inp, inp_hptsslot,
- distance, yet_to_run,
- hpts->p_runningslot, hpts->p_cur_slot));
+ KASSERT(yet_to_run <= distance, ("hpts:%p tp:%p slot:%d "
+ "distance:%d yet_to_run:%d rs:%d cs:%d", hpts, tp,
+ hptsslot, distance, yet_to_run, hpts->p_runningslot,
+ hpts->p_cur_slot));
}
}
#endif
uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
{
struct tcp_hpts_entry *hpts;
struct timeval tv;
@@ -804,16 +814,16 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
int32_t wheel_slot, maxslots;
bool need_wakeup = false;
- INP_WLOCK_ASSERT(inp);
- MPASS(!tcp_in_hpts(inp));
- MPASS(!(inp->inp_flags & INP_DROPPED));
+ INP_WLOCK_ASSERT(tptoinpcb(tp));
+ MPASS(!(tptoinpcb(tp)->inp_flags & INP_DROPPED));
+ MPASS(!tcp_in_hpts(tp));
/*
* We now return the next-slot the hpts will be on, beyond its
* current run (if up) or where it was when it stopped if it is
* sleeping.
*/
- hpts = tcp_hpts_lock(inp);
+ hpts = tcp_hpts_lock(tp);
microuptime(&tv);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
@@ -830,20 +840,20 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
}
if (slot == 0) {
/* Ok we need to set it on the hpts in the current slot */
- inp->inp_hpts_request = 0;
+ tp->t_hpts_request = 0;
if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) {
/*
* A sleeping hpts we want in next slot to run
* note that in this state p_prev_slot == p_cur_slot
*/
- inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
+ tp->t_hpts_slot = hpts_slot(hpts->p_prev_slot, 1);
if ((hpts->p_on_min_sleep == 0) &&
(hpts->p_hpts_active == 0))
need_wakeup = true;
} else
- inp->inp_hptsslot = hpts->p_runningslot;
- if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
- inp_hpts_insert(inp, hpts);
+ tp->t_hpts_slot = hpts->p_runningslot;
+ if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
+ tcp_hpts_insert_internal(tp, hpts);
if (need_wakeup) {
/*
* Activate the hpts if it is sleeping and its
@@ -880,28 +890,28 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
*/
slot--;
}
- inp->inp_hptsslot = last_slot;
- inp->inp_hpts_request = slot;
+ tp->t_hpts_slot = last_slot;
+ tp->t_hpts_request = slot;
} else if (maxslots >= slot) {
/* It all fits on the wheel */
- inp->inp_hpts_request = 0;
- inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
+ tp->t_hpts_request = 0;
+ tp->t_hpts_slot = hpts_slot(wheel_slot, slot);
} else {
/* It does not fit */
- inp->inp_hpts_request = slot - maxslots;
- inp->inp_hptsslot = last_slot;
+ tp->t_hpts_request = slot - maxslots;
+ tp->t_hpts_slot = last_slot;
}
if (diag) {
- diag->slot_remaining = inp->inp_hpts_request;
- diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->slot_remaining = tp->t_hpts_request;
+ diag->inp_hptsslot = tp->t_hpts_slot;
}
#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
+ check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
#endif
- if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
- inp_hpts_insert(inp, hpts);
+ if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
+ tcp_hpts_insert_internal(tp, hpts);
if ((hpts->p_hpts_active == 0) &&
- (inp->inp_hpts_request == 0) &&
+ (tp->t_hpts_request == 0) &&
(hpts->p_on_min_sleep == 0)) {
/*
* The hpts is sleeping and NOT on a minimum
@@ -972,54 +982,35 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
return (slot_on);
}
-uint16_t
-hpts_random_cpu(struct inpcb *inp){
- /*
- * No flow type set distribute the load randomly.
- */
- uint16_t cpuid;
- uint32_t ran;
-
- /*
- * Shortcut if it is already set. XXXGL: does it happen?
- */
- if (inp->inp_hpts_cpu_set) {
- return (inp->inp_hpts_cpu);
- }
- /* Nothing set use a random number */
- ran = arc4random();
- cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
- return (cpuid);
-}
-
static uint16_t
-hpts_cpuid(struct inpcb *inp, int *failed)
+hpts_cpuid(struct tcpcb *tp, int *failed)
{
+ struct inpcb *inp = tptoinpcb(tp);
u_int cpuid;
#ifdef NUMA
struct hpts_domain_info *di;
#endif
*failed = 0;
- if (inp->inp_hpts_cpu_set) {
- return (inp->inp_hpts_cpu);
+ if (tp->t_flags2 & TF2_HPTS_CPU_SET) {
+ return (tp->t_hpts_cpu);
}
/*
* If we are using the irq cpu set by LRO or
* the driver then it overrides all other domains.
*/
if (tcp_use_irq_cpu) {
- if (inp->inp_irq_cpu_set == 0) {
+ if (tp->t_lro_cpu == HPTS_CPU_NONE) {
*failed = 1;
- return(0);
+ return (0);
}
- return(inp->inp_irq_cpu);
+ return (tp->t_lro_cpu);
}
/* If one is set the other must be the same */
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
if (cpuid == NETISR_CPUID_NONE)
- return (hpts_random_cpu(inp));
+ return (hpts_random_cpu());
else
return (cpuid);
#endif
@@ -1030,7 +1021,7 @@ hpts_cpuid(struct inpcb *inp, int *failed)
*/
if (inp->inp_flowtype == M_HASHTYPE_NONE) {
counter_u64_add(cpu_uses_random, 1);
- return (hpts_random_cpu(inp));
+ return (hpts_random_cpu());
}
/*
* Hash to a thread based on the flowid. If we are using numa,
@@ -1081,12 +1072,10 @@ static int32_t
tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
{
struct tcpcb *tp;
- struct inpcb *inp;
struct timeval tv;
int32_t slots_to_run, i, error;
int32_t loop_cnt = 0;
int32_t did_prefetch = 0;
- int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
int32_t wrap_loop_cnt = 0;
int32_t slot_pos_of_endpoint = 0;
@@ -1154,25 +1143,25 @@ again:
* run them, the extra 10usecs of late (by being
* put behind) does not really matter in this situation.
*/
- TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot].head,
- inp_hpts) {
- MPASS(inp->inp_hptsslot == hpts->p_nxt_slot);
- MPASS(inp->inp_hpts_gencnt ==
+ TAILQ_FOREACH(tp, &hpts->p_hptss[hpts->p_nxt_slot].head,
+ t_hpts) {
+ MPASS(tp->t_hpts_slot == hpts->p_nxt_slot);
+ MPASS(tp->t_hpts_gencnt ==
hpts->p_hptss[hpts->p_nxt_slot].gencnt);
- MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
+ MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
/*
* Update gencnt and nextslot accordingly to match
* the new location. This is safe since it takes both
* the INP lock and the pacer mutex to change the
- * inp_hptsslot and inp_hpts_gencnt.
+ * t_hpts_slot and t_hpts_gencnt.
*/
- inp->inp_hpts_gencnt =
+ tp->t_hpts_gencnt =
hpts->p_hptss[hpts->p_runningslot].gencnt;
- inp->inp_hptsslot = hpts->p_runningslot;
+ tp->t_hpts_slot = hpts->p_runningslot;
}
TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head,
- &hpts->p_hptss[hpts->p_nxt_slot].head, inp_hpts);
+ &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts);
hpts->p_hptss[hpts->p_runningslot].count +=
hpts->p_hptss[hpts->p_nxt_slot].count;
hpts->p_hptss[hpts->p_nxt_slot].count = 0;
@@ -1191,8 +1180,8 @@ again:
goto no_one;
}
for (i = 0; i < slots_to_run; i++) {
- struct inpcb *inp, *ninp;
- TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
+ struct tcpcb *tp, *ntp;
+ TAILQ_HEAD(, tcpcb) head = TAILQ_HEAD_INITIALIZER(head);
struct hptsh *hptsh;
uint32_t runningslot;
@@ -1205,20 +1194,54 @@ again:
runningslot = hpts->p_runningslot;
hptsh = &hpts->p_hptss[runningslot];
- TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts);
+ TAILQ_SWAP(&head, &hptsh->head, tcpcb, t_hpts);
hpts->p_on_queue_cnt -= hptsh->count;
hptsh->count = 0;
hptsh->gencnt++;
HPTS_UNLOCK(hpts);
- TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) {
+ TAILQ_FOREACH_SAFE(tp, &head, t_hpts, ntp) {
+ struct inpcb *inp = tptoinpcb(tp);
bool set_cpu;
- if (ninp != NULL) {
- /* We prefetch the next inp if possible */
- kern_prefetch(ninp, &prefetch_ninp);
- prefetch_ninp = 1;
+ if (ntp != NULL) {
+ /*
+ * If we have a next tcpcb, see if we can
+ * prefetch it. Note this may seem
+ * "risky" since we have no locks (other
+ * than the previous inp) and there is no
+ * assurance that ntp was not pulled while
+ * we were processing tp and freed. If this
+ * occurred it could mean that either:
+ *
+ * a) Its NULL (which is fine we won't go
+ * here) <or> b) Its valid (which is cool we
+ * will prefetch it) <or> c) The inp got
+ * freed back to the slab which was
+ * reallocated. Then the piece of memory was
+ * re-used and something else (not an
+ * address) is in inp_ppcb. If that occurs
+ * we don't crash, but take a TLB shootdown
+ * performance hit (same as if it was NULL
+ * and we tried to pre-fetch it).
+ *
+ * Considering that the likelihood of <c> is
+ * quite rare we will take a risk on doing
+ * this. If performance drops after testing
+ * we can always take this out. NB: the
+ * kern_prefetch on amd64 actually has
+ * protection against a bad address now via
+ * the DMAP_() tests. This will prevent the
+ * TLB hit, and instead if <c> occurs just
+ * cause us to load cache with a useless
+ * address (to us).
+ *
+ * XXXGL: this comment and the prefetch action
+ * could be outdated after tp == inp change.
+ */
+ kern_prefetch(ntp, &prefetch_tp);
+ prefetch_tp = 1;
}
/* For debugging */
@@ -1232,33 +1255,33 @@ again:
}
INP_WLOCK(inp);
- if (inp->inp_hpts_cpu_set == 0) {
+ if ((tp->t_flags2 & TF2_HPTS_CPU_SET) == 0) {
set_cpu = true;
} else {
set_cpu = false;
}
- if (__predict_false(inp->inp_in_hpts == IHPTS_MOVING)) {
- if (inp->inp_hptsslot == -1) {
- inp->inp_in_hpts = IHPTS_NONE;
+ if (__predict_false(tp->t_in_hpts == IHPTS_MOVING)) {
+ if (tp->t_hpts_slot == -1) {
+ tp->t_in_hpts = IHPTS_NONE;
if (in_pcbrele_wlocked(inp) == false)
INP_WUNLOCK(inp);
} else {
HPTS_LOCK(hpts);
- inp_hpts_insert(inp, hpts);
+ tcp_hpts_insert_internal(tp, hpts);
HPTS_UNLOCK(hpts);
INP_WUNLOCK(inp);
}
continue;
}
- MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
+ MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
MPASS(!(inp->inp_flags & INP_DROPPED));
- KASSERT(runningslot == inp->inp_hptsslot,
+ KASSERT(runningslot == tp->t_hpts_slot,
("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, runningslot, inp->inp_hptsslot));
+ hpts, inp, runningslot, tp->t_hpts_slot));
- if (inp->inp_hpts_request) {
+ if (tp->t_hpts_request) {
/*
* This guy is deferred out further in time
* than our wheel had available on it.
@@ -1268,38 +1291,36 @@ again:
uint32_t maxslots, last_slot, remaining_slots;
remaining_slots = slots_to_run - (i + 1);
- if (inp->inp_hpts_request > remaining_slots) {
+ if (tp->t_hpts_request > remaining_slots) {
HPTS_LOCK(hpts);
/*
* How far out can we go?
*/
maxslots = max_slots_available(hpts,
hpts->p_cur_slot, &last_slot);
- if (maxslots >= inp->inp_hpts_request) {
+ if (maxslots >= tp->t_hpts_request) {
/* We can place it finally to
* be processed. */
- inp->inp_hptsslot = hpts_slot(
+ tp->t_hpts_slot = hpts_slot(
hpts->p_runningslot,
- inp->inp_hpts_request);
- inp->inp_hpts_request = 0;
+ tp->t_hpts_request);
+ tp->t_hpts_request = 0;
} else {
/* Work off some more time */
- inp->inp_hptsslot = last_slot;
- inp->inp_hpts_request -=
+ tp->t_hpts_slot = last_slot;
+ tp->t_hpts_request -=
maxslots;
}
- inp_hpts_insert(inp, hpts);
+ tcp_hpts_insert_internal(tp, hpts);
HPTS_UNLOCK(hpts);
INP_WUNLOCK(inp);
continue;
}
- inp->inp_hpts_request = 0;
+ tp->t_hpts_request = 0;
/* Fall through, we will do it now */
}
- inp_hpts_release(inp);
- tp = intotcpcb(inp);
- MPASS(tp);
+ tcp_hpts_release(tp);
if (set_cpu) {
/*
* Setup so the next time we will move to
@@ -1318,7 +1339,7 @@ again:
* gets added to the hpts (not this one)
* :-)
*/
- tcp_set_hpts(inp);
+ tcp_set_hpts(tp);
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
@@ -1331,16 +1352,17 @@ again:
did_prefetch = 1;
}
/*
- * We set inp_hpts_calls to 1 before any possible output.
- * The contract with the transport is that if it cares about
- * hpts calling it should clear the flag. That way next time
- * it is called it will know it is hpts.
+ * We set TF2_HPTS_CALLS before any possible output.
+ * The contract with the transport is that, if it cares
+ * about hpts calling, it should clear the flag. That
+ * way, the next time it is called, it will know hpts is
+ * the caller.
*
- * We also only call tfb_do_queued_segments() <or> tcp_output()
- * it is expected that if segments are queued and come in that
- * the final input mbuf will cause a call to output if it is needed.
+ * We also only call tfb_do_queued_segments() <or>
+ * tcp_output(). It is expected that, if segments are
+ * queued and come in, the final input mbuf will
+ * cause a call to output if it is needed.
*/
- inp->inp_hpts_calls = 1;
+ tp->t_flags2 |= TF2_HPTS_CALLS;
if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) &&
!STAILQ_EMPTY(&tp->t_inqueue)) {
error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0);
@@ -1352,44 +1374,6 @@ again:
error = tcp_output(tp);
if (error < 0)
goto skip_pacing;
- if (ninp) {
- /*
- * If we have a nxt inp, see if we can
- * prefetch it. Note this may seem
- * "risky" since we have no locks (other
- * than the previous inp) and there no
- * assurance that ninp was not pulled while
- * we were processing inp and freed. If this
- * occurred it could mean that either:
- *
- * a) Its NULL (which is fine we won't go
- * here) <or> b) Its valid (which is cool we
- * will prefetch it) <or> c) The inp got
- * freed back to the slab which was
- * reallocated. Then the piece of memory was
- * re-used and something else (not an
- * address) is in inp_ppcb. If that occurs
- * we don't crash, but take a TLB shootdown
- * performance hit (same as if it was NULL
- * and we tried to pre-fetch it).
- *
- * Considering that the likelyhood of <c> is
- * quite rare we will take a risk on doing
- * this. If performance drops after testing
- * we can always take this out. NB: the
- * kern_prefetch on amd64 actually has
- * protection against a bad address now via
- * the DMAP_() tests. This will prevent the
- * TLB hit, and instead if <c> occurs just
- * cause us to load cache with a useless
- * address (to us).
- *
- * XXXGL: with tcpcb == inpcb, I'm unsure this
- * prefetch is still correct and useful.
- */
- kern_prefetch(ninp, &prefetch_tp);
- prefetch_tp = 1;
- }
INP_WUNLOCK(inp);
skip_pacing:
CURVNET_RESTORE();
@@ -1491,18 +1475,18 @@ no_run:
}
void
-__tcp_set_hpts(struct inpcb *inp, int32_t line)
+__tcp_set_hpts(struct tcpcb *tp, int32_t line)
{
struct tcp_hpts_entry *hpts;
int failed;
- INP_WLOCK_ASSERT(inp);
- hpts = tcp_hpts_lock(inp);
- if ((inp->inp_in_hpts == 0) &&
- (inp->inp_hpts_cpu_set == 0)) {
- inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
+ INP_WLOCK_ASSERT(tptoinpcb(tp));
+
+ hpts = tcp_hpts_lock(tp);
+ if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
+ tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
if (failed == 0)
- inp->inp_hpts_cpu_set = 1;
+ tp->t_flags2 |= TF2_HPTS_CPU_SET;
}
mtx_unlock(&hpts->p_mtx);
}
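
Stepping back from the hunks above: the TF2_HPTS_CALLS contract reduces
to a two-sided handshake. A minimal sketch, condensed from tcp_hptsi()
above and mirrored by the bbr.c/rack.c consumers below (not verbatim
patch text):

	/* Producer (tcp_hptsi), set before any possible output: */
	tp->t_flags2 |= TF2_HPTS_CALLS;
	error = tcp_output(tp);

	/* Consumer (a stack's output path), sample and clear: */
	hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
	tp->t_flags2 &= ~TF2_HPTS_CALLS;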
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index 9bceca0fd340..dfa6eaf79bdc 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -111,10 +111,14 @@ struct hpts_diag {
*
*/
-
#ifdef _KERNEL
-void tcp_hpts_remove(struct inpcb *);
-bool tcp_in_hpts(struct inpcb *);
+void tcp_hpts_init(struct tcpcb *);
+void tcp_hpts_remove(struct tcpcb *);
+static bool
+tcp_in_hpts(struct tcpcb *tp)
+{
+ return (tp->t_in_hpts == IHPTS_ONQUEUE);
+}
/*
* To insert a TCB on the hpts you *must* be holding the
@@ -140,20 +144,18 @@ bool tcp_in_hpts(struct inpcb *);
* that INP_WLOCK() or from destroying your TCB where again
* you should already have the INP_WLOCK().
*/
-uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line,
+uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
struct hpts_diag *diag);
#define tcp_hpts_insert(inp, slot) \
tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
-void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
void tcp_run_hpts(void);
-uint16_t hpts_random_cpu(struct inpcb *inp);
-
extern int32_t tcp_min_hptsi_time;
#endif /* _KERNEL */
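
As a usage illustration only (example_reschedule() is hypothetical and
not part of this patch), a caller that reschedules a connection on the
wheel now operates purely on the tcpcb, holding the INP_WLOCK() as the
header comment above requires:

	static void
	example_reschedule(struct tcpcb *tp, uint32_t usecs)
	{
		struct hpts_diag diag;

		INP_WLOCK_ASSERT(tptoinpcb(tp));
		if (tcp_in_hpts(tp))		/* already on the wheel */
			tcp_hpts_remove(tp);
		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(usecs),
		    __LINE__, &diag);
	}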
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 7cbf535a9263..76c345add1f8 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1380,10 +1380,8 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
- if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
- inp->inp_irq_cpu = lc->lro_last_cpu;
- inp->inp_irq_cpu_set = 1;
- }
+ if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
+ tp->t_lro_cpu = lc->lro_last_cpu;
/* Check if the transport doesn't support the needed optimizations. */
if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
INP_WUNLOCK(inp);
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index f5cf362a57dc..f8c7557150dd 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -739,7 +739,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
int32_t delay_calc = 0;
uint32_t prev_delay = 0;
- if (tcp_in_hpts(inp)) {
+ if (tcp_in_hpts(tp)) {
/* A previous call is already set up */
return;
}
@@ -904,14 +904,14 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
bbr->rc_pacer_started = cts;
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
__LINE__, &diag);
bbr->rc_timer_first = 0;
bbr->bbr_timer_src = frm;
bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
bbr_log_hpts_diag(bbr, cts, &diag);
} else if (hpts_timeout) {
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
/*
* We add the flag here as well if the slot is set,
@@ -1050,8 +1050,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock
*/
wrong_timer:
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
- if (tcp_in_hpts(inp))
- tcp_hpts_remove(inp);
+ if (tcp_in_hpts(tp))
+ tcp_hpts_remove(tp);
bbr_timer_cancel(bbr, __LINE__, cts);
bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
0);
@@ -1875,7 +1875,7 @@ bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t ct
l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
- l->inhpts = tcp_in_hpts(bbr->rc_inp);
+ l->inhpts = tcp_in_hpts(bbr->rc_tp);
l->use_lt_bw = bbr->rc_lt_use_bw;
l->pkts_out = bbr->r_ctl.rc_flight_at_input;
l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
@@ -2496,7 +2496,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = slot;
- log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
+ log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot;
log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
log.u_bbr.flex8 = which;
@@ -3953,7 +3953,7 @@ bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_s
bbr->rc_tlp_rtx_out = 0;
bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate;
tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
- if (tcp_in_hpts(bbr->rc_inp) &&
+ if (tcp_in_hpts(bbr->rc_tp) &&
((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) {
/*
* When we enter recovery, we need to restart
@@ -5209,7 +5209,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t
left = bbr->r_ctl.rc_timer_exp - cts;
ret = -3;
bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
- tcp_hpts_insert(tptoinpcb(tp), HPTS_USEC_TO_SLOTS(left));
+ tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left));
return (1);
}
bbr->rc_tmr_stopped = 0;
@@ -5240,7 +5240,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
uint8_t hpts_removed = 0;
- if (tcp_in_hpts(bbr->rc_inp) &&
+ if (tcp_in_hpts(bbr->rc_tp) &&
(bbr->rc_timer_first == 1)) {
/*
* If we are canceling timer's when we have the
@@ -5248,7 +5248,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
* must remove ourselves from the hpts.
*/
hpts_removed = 1;
- tcp_hpts_remove(bbr->rc_inp);
+ tcp_hpts_remove(bbr->rc_tp);
if (bbr->r_ctl.rc_last_delay_val) {
/* Update the last hptsi delay too */
uint32_t time_since_send;
@@ -7920,8 +7920,8 @@ bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t li
* don't want to transfer forward the time
* for our sum's calculations.
*/
- if (tcp_in_hpts(bbr->rc_inp)) {
- tcp_hpts_remove(bbr->rc_inp);
+ if (tcp_in_hpts(bbr->rc_tp)) {
+ tcp_hpts_remove(bbr->rc_tp);
bbr->rc_timer_first = 0;
bbr->r_ctl.rc_hpts_flags = 0;
bbr->r_ctl.rc_last_delay_val = 0;
@@ -9854,8 +9854,8 @@ bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr)
/* We enter in persists, set the flag appropriately */
bbr->rc_in_persist = 1;
}
- if (tcp_in_hpts(bbr->rc_inp)) {
- tcp_hpts_remove(bbr->rc_inp);
+ if (tcp_in_hpts(bbr->rc_tp)) {
+ tcp_hpts_remove(bbr->rc_tp);
}
}
@@ -11437,7 +11437,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
}
/* Set the flag */
bbr->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0;
- tcp_set_hpts(inp);
+ tcp_set_hpts(tp);
sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack);
}
if (thflags & TH_ACK) {
@@ -11546,7 +11546,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
*/
if ((tp->snd_max == tp->snd_una) &&
((tp->t_flags & TF_DELACK) == 0) &&
- (tcp_in_hpts(bbr->rc_inp)) &&
+ (tcp_in_hpts(tp)) &&
(bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
/*
* keep alive not needed if we are hptsi
@@ -11554,8 +11554,8 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
*/
;
} else {
- if (tcp_in_hpts(bbr->rc_inp)) {
- tcp_hpts_remove(bbr->rc_inp);
+ if (tcp_in_hpts(tp)) {
+ tcp_hpts_remove(tp);
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
(TSTMP_GT(lcts, bbr->rc_pacer_started))) {
uint32_t del;
@@ -11582,8 +11582,8 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
}
/* Clear the flag, it may have been cleared by output but we may not have */
- if ((nxt_pkt == 0) && (inp->inp_hpts_calls))
- inp->inp_hpts_calls = 0;
+ if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
+ tp->t_flags2 &= ~TF2_HPTS_CALLS;
/* Do we have a new state */
if (bbr->r_state != tp->t_state)
bbr_set_state(tp, bbr, tiwin);
@@ -11842,7 +11842,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
int32_t slot = 0;
struct inpcb *inp;
struct sockbuf *sb;
- uint32_t hpts_calling;
+ bool hpts_calling;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int32_t isipv6;
@@ -11853,8 +11853,8 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
cts = tcp_tv_to_usectick(&bbr->rc_tv);
inp = bbr->rc_inp;
- hpts_calling = inp->inp_hpts_calls;
- inp->inp_hpts_calls = 0;
+ hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
+ tp->t_flags2 &= ~TF2_HPTS_CALLS;
so = inp->inp_socket;
sb = &so->so_snd;
if (tp->t_nic_ktls_xmit)
@@ -11884,7 +11884,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
}
#endif
if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
- tcp_in_hpts(inp)) {
+ tcp_in_hpts(tp)) {
/*
* We are on the hpts for some timer but not hptsi output.
* Possibly remove from the hpts so we can send/recv etc.
@@ -11913,7 +11913,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
return (0);
}
}
- tcp_hpts_remove(inp);
+ tcp_hpts_remove(tp);
bbr_timer_cancel(bbr, __LINE__, cts);
}
if (bbr->r_ctl.rc_last_delay_val) {
@@ -11929,9 +11929,9 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
if ((bbr->r_timer_override) ||
(tp->t_state < TCPS_ESTABLISHED)) {
/* Timeouts or early states are exempt */
- if (tcp_in_hpts(inp))
- tcp_hpts_remove(inp);
- } else if (tcp_in_hpts(inp)) {
+ if (tcp_in_hpts(tp))
+ tcp_hpts_remove(tp);
+ } else if (tcp_in_hpts(tp)) {
if ((bbr->r_ctl.rc_last_delay_val) &&
(bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
delay_calc) {
@@ -11943,10 +11943,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
*/
counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
bbr->r_ctl.rc_last_delay_val = 0;
- tcp_hpts_remove(inp);
+ tcp_hpts_remove(tp);
} else if (tp->t_state == TCPS_CLOSED) {
bbr->r_ctl.rc_last_delay_val = 0;
- tcp_hpts_remove(inp);
+ tcp_hpts_remove(tp);
} else {
/*
* On the hpts, you shall not pass! even if ACKNOW
@@ -14088,7 +14088,7 @@ bbr_switch_failed(struct tcpcb *tp)
inp->inp_flags2 |= INP_CANNOT_DO_ECN;
inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
- if (inp->inp_in_hpts) {
+ if (tp->t_in_hpts > IHPTS_NONE) {
return;
}
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
@@ -14109,7 +14109,7 @@ bbr_switch_failed(struct tcpcb *tp)
}
} else
toval = HPTS_TICKS_PER_SLOT;
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
__LINE__, &diag);
bbr_log_hpts_diag(bbr, cts, &diag);
}
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index e0130e7fea2a..9e531a1d3182 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -2579,7 +2579,7 @@ log_anyway:
log.u_bbr.flex5 = rsm->r_start;
log.u_bbr.flex6 = rsm->r_end;
log.u_bbr.flex8 = mod;
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2605,7 +2605,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = slot;
- log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
+ log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;
log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
log.u_bbr.flex7 = rack->rc_in_persist;
log.u_bbr.flex8 = which;
@@ -2613,7 +2613,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
log.u_bbr.pkts_out = 0;
else
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2640,7 +2640,7 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex8 = to_num;
log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
log.u_bbr.flex2 = rack->rc_rack_rtt;
@@ -2678,7 +2678,7 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex8 = flag;
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.cur_del_rate = (uint64_t)prev;
log.u_bbr.delRate = (uint64_t)rsm;
log.u_bbr.rttProp = (uint64_t)next;
@@ -2722,7 +2722,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l
union tcp_log_stackspecific log;
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = t;
log.u_bbr.flex2 = len;
log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
@@ -2894,7 +2894,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = tick;
log.u_bbr.flex3 = tp->t_maxunacktime;
@@ -2920,7 +2920,7 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_
union tcp_log_stackspecific log;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = slot;
if (rack->rack_no_prr)
log.u_bbr.flex2 = 0;
@@ -2968,7 +2968,7 @@ rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */
log.u_bbr.flex8 = rack->rc_in_persist;
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
@@ -3021,7 +3021,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = slot;
log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = reason;
@@ -3054,7 +3054,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32
union tcp_log_stackspecific log;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
log.u_bbr.flex3 = flags_on_entry;
@@ -4904,7 +4904,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
rack->r_ctl.rc_app_limited_cnt,
0, 0, 10, __LINE__, NULL, quality);
}
- if (tcp_in_hpts(rack->rc_inp) &&
+ if (tcp_in_hpts(rack->rc_tp) &&
(rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
/*
* Ok we can't trust the pacer in this case
@@ -4914,7 +4914,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
* Stop the pacer and clear up all the aggregate
* delays etc.
*/
- tcp_hpts_remove(rack->rc_inp);
+ tcp_hpts_remove(rack->rc_tp);
rack->r_ctl.rc_hpts_flags = 0;
rack->r_ctl.rc_last_output_to = 0;
}
@@ -6506,8 +6506,8 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
struct timeval tv;
uint32_t t_time;
- if (tcp_in_hpts(rack->rc_inp)) {
- tcp_hpts_remove(rack->rc_inp);
+ if (tcp_in_hpts(rack->rc_tp)) {
+ tcp_hpts_remove(rack->rc_tp);
rack->r_ctl.rc_hpts_flags = 0;
}
#ifdef NETFLIX_SHARED_CWND
@@ -6645,7 +6645,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
(tp->t_state == TCPS_LISTEN)) {
return;
}
- if (tcp_in_hpts(inp)) {
+ if (tcp_in_hpts(tp)) {
/* Already on the pacer */
return;
}
@@ -6896,12 +6896,12 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* Arrange for the hpts to kick back in after the
* t-o if the t-o does not cause a send.
*/
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
} else {
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
@@ -6916,7 +6916,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* at the start of this block) are good enough.
*/
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
@@ -8039,7 +8039,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8
rack->rc_inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
ret = -3;
left = rack->r_ctl.rc_timer_exp - cts;
- tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left));
+ tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left));
rack_log_to_processing(rack, cts, ret, left);
return (1);
}
@@ -8080,7 +8080,7 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin
if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
((tp->snd_max - tp->snd_una) == 0))) {
- tcp_hpts_remove(rack->rc_inp);
+ tcp_hpts_remove(rack->rc_tp);
hpts_removed = 1;
/* If we were not delayed cancel out the flag. */
if ((tp->snd_max - tp->snd_una) == 0)
@@ -8089,14 +8089,14 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin
}
if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
- if (tcp_in_hpts(rack->rc_inp) &&
+ if (tcp_in_hpts(rack->rc_tp) &&
((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
/*
* Canceling timer's when we have no output being
* paced. We also must remove ourselves from the
* hpts.
*/
- tcp_hpts_remove(rack->rc_inp);
+ tcp_hpts_remove(rack->rc_tp);
hpts_removed = 1;
}
rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
@@ -8124,8 +8124,8 @@ rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
/* We enter in persists, set the flag appropriately */
rack->rc_in_persist = 1;
}
- if (tcp_in_hpts(rack->rc_inp)) {
- tcp_hpts_remove(rack->rc_inp);
+ if (tcp_in_hpts(rack->rc_tp)) {
+ tcp_hpts_remove(rack->rc_tp);
}
}
@@ -11394,7 +11394,7 @@ out:
(entered_recovery == 0)) {
rack_update_prr(tp, rack, changed, th_ack);
if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
- ((tcp_in_hpts(rack->rc_inp) == 0) &&
+ ((tcp_in_hpts(rack->rc_tp) == 0) &&
((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
/*
* If you are pacing output you don't want
@@ -14583,7 +14583,7 @@ rack_switch_failed(struct tcpcb *tp)
inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
- if (inp->inp_in_hpts) {
+ if (tp->t_in_hpts > IHPTS_NONE) {
/* Strange */
return;
}
@@ -14604,7 +14604,7 @@ rack_switch_failed(struct tcpcb *tp)
}
} else
toval = HPTS_TICKS_PER_SLOT;
- (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
__LINE__, &diag);
rack_log_hpts_diag(rack, cts, &diag, &tv);
}
@@ -15201,7 +15201,7 @@ rack_init(struct tcpcb *tp, void **ptr)
if (tov) {
struct hpts_diag diag;
- (void)tcp_hpts_insert_diag(rack->rc_inp, HPTS_USEC_TO_SLOTS(tov),
+ (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
}
@@ -15487,7 +15487,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
* We will force the hpts to be stopped if any, and restart
* with the slot set to what was in the saved slot.
*/
- if (tcp_in_hpts(rack->rc_inp)) {
+ if (tcp_in_hpts(rack->rc_tp)) {
if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
uint32_t us_cts;
@@ -15498,7 +15498,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
}
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
}
- tcp_hpts_remove(rack->rc_inp);
+ tcp_hpts_remove(rack->rc_tp);
}
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
@@ -15579,7 +15579,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
}
#endif
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr == 0)
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
else
@@ -16438,8 +16438,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
}
did_out = 1;
}
- if (rack->rc_inp->inp_hpts_calls)
- rack->rc_inp->inp_hpts_calls = 0;
+ if (tp->t_flags2 & TF2_HPTS_CALLS)
+ tp->t_flags2 &= ~TF2_HPTS_CALLS;
rack_free_trim(rack);
#ifdef TCP_ACCOUNTING
sched_unpin();
@@ -16673,7 +16673,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
}
#endif
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr == 0)
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
else
@@ -16900,7 +16900,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
#endif
return (1);
}
- tcp_set_hpts(inp);
+ tcp_set_hpts(tp);
sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
}
if (thflags & TH_FIN)
@@ -16999,7 +16999,7 @@ do_output_now:
rack_free_trim(rack);
} else if ((no_output == 1) &&
(nxt_pkt == 0) &&
- (tcp_in_hpts(rack->rc_inp) == 0)) {
+ (tcp_in_hpts(rack->rc_tp) == 0)) {
/*
* We are not in hpts and we had a pacing timer up. Use
* the remaining time (slot_remaining) to restart the timer.
@@ -17009,8 +17009,8 @@ do_output_now:
rack_free_trim(rack);
}
/* Clear the flag, it may have been cleared by output but we may not have */
- if ((nxt_pkt == 0) && (inp->inp_hpts_calls))
- inp->inp_hpts_calls = 0;
+ if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
+ tp->t_flags2 &= ~TF2_HPTS_CALLS;
/* Update any rounds needed */
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
rack_log_hystart_event(rack, high_seq, 8);
@@ -17044,13 +17044,13 @@ do_output_now:
/* We could not send (probably in the hpts but stopped the timer earlier)? */
if ((tp->snd_max == tp->snd_una) &&
((tp->t_flags & TF_DELACK) == 0) &&
- (tcp_in_hpts(rack->rc_inp)) &&
+ (tcp_in_hpts(rack->rc_tp)) &&
(rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
/* keep alive not needed if we are hptsi output yet */
;
} else {
int late = 0;
- if (tcp_in_hpts(inp)) {
+ if (tcp_in_hpts(tp)) {
if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
us_cts = tcp_get_usecs(NULL);
if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
@@ -17060,7 +17060,7 @@ do_output_now:
late = 1;
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
}
- tcp_hpts_remove(inp);
+ tcp_hpts_remove(tp);
}
if (late && (did_out == 0)) {
/*
@@ -18074,7 +18074,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = error;
log.u_bbr.flex2 = flags;
log.u_bbr.flex3 = rsm_is_null;
@@ -18339,7 +18339,7 @@ rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack,
err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
#endif
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = p_rate;
log.u_bbr.flex2 = p_queue;
log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
@@ -18404,7 +18404,7 @@ rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
out:
if (tcp_bblogging_on(tp)) {
memset(&log, 0, sizeof(log));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
log.u_bbr.flex1 = p_rate;
log.u_bbr.flex2 = p_queue;
log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
@@ -18769,7 +18769,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
}
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
else
@@ -19302,7 +19302,7 @@ again:
union tcp_log_stackspecific log;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
else
@@ -19634,7 +19634,7 @@ rack_output(struct tcpcb *tp)
uint32_t cts, ms_cts, delayed, early;
uint16_t add_flag = RACK_SENT_SP;
/* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
- uint8_t hpts_calling, doing_tlp = 0;
+ uint8_t doing_tlp = 0;
uint32_t cwnd_to_use, pace_max_seg;
int32_t do_a_prefetch = 0;
int32_t prefetch_rsm = 0;
@@ -19652,7 +19652,7 @@ rack_output(struct tcpcb *tp)
struct ip6_hdr *ip6 = NULL;
int32_t isipv6;
#endif
- bool hw_tls = false;
+ bool hpts_calling, hw_tls = false;
NET_EPOCH_ASSERT();
INP_WLOCK_ASSERT(inp);
@@ -19663,8 +19663,8 @@ rack_output(struct tcpcb *tp)
sched_pin();
ts_val = get_cyclecount();
#endif
- hpts_calling = inp->inp_hpts_calls;
- rack->rc_inp->inp_hpts_calls = 0;
+ hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
+ tp->t_flags2 &= ~TF2_HPTS_CALLS;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
#ifdef TCP_ACCOUNTING
@@ -19707,7 +19707,7 @@ rack_output(struct tcpcb *tp)
cts = tcp_get_usecs(&tv);
ms_cts = tcp_tv_to_mssectick(&tv);
if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
- tcp_in_hpts(rack->rc_inp)) {
+ tcp_in_hpts(rack->rc_tp)) {
/*
* We are on the hpts for some timer but not hptsi output.
* Remove from the hpts unconditionally.
@@ -19741,7 +19741,7 @@ rack_output(struct tcpcb *tp)
}
}
if (rack->rc_in_persist) {
- if (tcp_in_hpts(rack->rc_inp) == 0) {
+ if (tcp_in_hpts(rack->rc_tp) == 0) {
/* Timer is not running */
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
}
@@ -19753,7 +19753,7 @@ rack_output(struct tcpcb *tp)
if ((rack->rc_ack_required == 1) &&
(rack->r_timer_override == 0)){
/* A timeout occurred and no ack has arrived */
- if (tcp_in_hpts(rack->rc_inp) == 0) {
+ if (tcp_in_hpts(rack->rc_tp) == 0) {
/* Timer is not running */
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
}
@@ -19767,9 +19767,9 @@ rack_output(struct tcpcb *tp)
(delayed) ||
(tp->t_state < TCPS_ESTABLISHED)) {
rack->rc_ack_can_sendout_data = 0;
- if (tcp_in_hpts(rack->rc_inp))
- tcp_hpts_remove(rack->rc_inp);
- } else if (tcp_in_hpts(rack->rc_inp)) {
+ if (tcp_in_hpts(rack->rc_tp))
+ tcp_hpts_remove(rack->rc_tp);
+ } else if (tcp_in_hpts(rack->rc_tp)) {
/*
* On the hpts you can't pass even if ACKNOW is on, we will
* when the hpts fires.
@@ -21683,7 +21683,7 @@ send:
union tcp_log_stackspecific log;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
else
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index c57eedef151f..40dd9b7f3aa9 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2148,7 +2148,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = inp->inp_in_hpts;
+ log.u_bbr.inhpts = tcp_in_hpts(tp);
log.u_bbr.flex8 = 4;
log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
@@ -2315,11 +2315,7 @@ tcp_newtcpcb(struct inpcb *inp)
*/
inp->inp_ip_ttl = V_ip_defttl;
#ifdef TCPHPTS
- /*
- * If using hpts lets drop a random number in so
- * not all new connections fall on the same CPU.
- */
- inp->inp_hpts_cpu = hpts_random_cpu(inp);
+ tcp_hpts_init(tp);
#endif
#ifdef TCPPCAP
/*
@@ -2434,6 +2430,7 @@ tcp_discardcb(struct tcpcb *tp)
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
+ MPASS(!tcp_in_hpts(tp));
#ifdef TCP_BLACKBOX
tcp_log_tcpcbfini(tp);
#endif
@@ -2529,7 +2526,7 @@ tcp_close(struct tcpcb *tp)
tp->t_tfo_pending = NULL;
}
#ifdef TCPHPTS
- tcp_hpts_remove(inp);
+ tcp_hpts_remove(tp);
#endif
in_pcbdrop(inp);
TCPSTAT_INC(tcps_closed);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 124a254cae3d..0d144cb04e55 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -80,7 +80,6 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
-#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/udp.h>
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index a613e5fbf2b7..d23dd9f97222 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1712,7 +1712,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
*/
#ifdef TCPHPTS
/* Assure that we are not on any hpts */
- tcp_hpts_remove(tptoinpcb(tp));
+ tcp_hpts_remove(tp);
#endif
if (blk->tfb_tcp_fb_init) {
error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index f80e9fc37ff4..a3016a143b93 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -314,6 +314,23 @@ struct tcpcb {
sbintime_t t_timers[TT_N];
sbintime_t t_precisions[TT_N];
+ /* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */
+ TAILQ_ENTRY(tcpcb) t_hpts; /* linkage to HPTS ring */
+ STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input packets queue */
+ uint32_t t_hpts_request; /* Current hpts request, zero if
+ * fits in the pacing window. */
+ uint32_t t_hpts_slot; /* HPTS wheel slot this tcb is on. */
+ uint32_t t_hpts_drop_reas; /* Reason we are dropping the pcb. */
+ uint32_t t_hpts_gencnt;
+ uint16_t t_hpts_cpu; /* CPU chosen by hpts_cpuid(). */
+ uint16_t t_lro_cpu; /* CPU derived from LRO. */
+#define HPTS_CPU_NONE ((uint16_t)-1)
+ enum {
+ IHPTS_NONE = 0,
+ IHPTS_ONQUEUE,
+ IHPTS_MOVING,
+ } t_in_hpts; /* Is it linked into HPTS? */
+
uint32_t t_maxseg:24, /* maximum segment size */
_t_logstate:8; /* State of "black box" logging */
uint32_t t_port:16, /* Tunneling (over udp) port */
@@ -355,7 +372,6 @@ struct tcpcb {
int t_segqlen; /* segment reassembly queue length */
uint32_t t_segqmbuflen; /* total reassembly queue byte length */
struct tsegqe_head t_segq; /* segment reassembly queue */
- STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input queue */
uint32_t snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
@@ -832,9 +848,11 @@ tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack)
#define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */
#define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */
#define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */
+#define TF2_HPTS_CPU_SET 0x00000200 /* t_hpts_cpu is not random */
#define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */
#define TF2_ECN_USE_ECT1 0x00000800 /* Use ECT(1) marking on session */
#define TF2_TCP_ACCOUNTING 0x00001000 /* Do TCP accounting */
+#define TF2_HPTS_CALLS 0x00002000 /* tcp_output() called via HPTS */
/*
* Structure to hold TCP options that are only used during segment