aboutsummaryrefslogtreecommitdiff
path: root/sys/netinet
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet')
-rw-r--r--sys/netinet/igmp.c8
-rw-r--r--sys/netinet/in.c13
-rw-r--r--sys/netinet/in_fib_dxr.c209
-rw-r--r--sys/netinet/in_jail.c139
-rw-r--r--sys/netinet/in_kdtrace.c2
-rw-r--r--sys/netinet/in_mcast.c49
-rw-r--r--sys/netinet/in_pcb.c154
-rw-r--r--sys/netinet/in_pcb.h93
-rw-r--r--sys/netinet/in_pcb_var.h3
-rw-r--r--sys/netinet/in_proto.c8
-rw-r--r--sys/netinet/ip_carp.c27
-rw-r--r--sys/netinet/ip_divert.c30
-rw-r--r--sys/netinet/ip_fastfwd.c101
-rw-r--r--sys/netinet/ip_input.c38
-rw-r--r--sys/netinet/ip_mroute.c88
-rw-r--r--sys/netinet/ip_mroute.h4
-rw-r--r--sys/netinet/ip_var.h7
-rw-r--r--sys/netinet/raw_ip.c44
-rw-r--r--sys/netinet/sctp_constants.h2
-rw-r--r--sys/netinet/sctp_input.c8
-rw-r--r--sys/netinet/sctp_module.c2
-rw-r--r--sys/netinet/sctp_os_bsd.h42
-rw-r--r--sys/netinet/sctp_output.c118
-rw-r--r--sys/netinet/sctp_output.h2
-rw-r--r--sys/netinet/sctp_pcb.c24
-rw-r--r--sys/netinet/sctp_usrreq.c107
-rw-r--r--sys/netinet/sctp_var.h3
-rw-r--r--sys/netinet/sctputil.c40
-rw-r--r--sys/netinet/sctputil.h3
-rw-r--r--sys/netinet/tcp_hpts.c215
-rw-r--r--sys/netinet/tcp_hpts.h6
-rw-r--r--sys/netinet/tcp_input.c40
-rw-r--r--sys/netinet/tcp_log_buf.h2
-rw-r--r--sys/netinet/tcp_lro.c24
-rw-r--r--sys/netinet/tcp_lro.h4
-rw-r--r--sys/netinet/tcp_output.c2
-rw-r--r--sys/netinet/tcp_ratelimit.c2
-rw-r--r--sys/netinet/tcp_sack.c5
-rw-r--r--sys/netinet/tcp_stacks/bbr.c100
-rw-r--r--sys/netinet/tcp_stacks/rack.c228
-rw-r--r--sys/netinet/tcp_stacks/rack_bbr_common.c9
-rw-r--r--sys/netinet/tcp_stacks/tcp_bbr.h2
-rw-r--r--sys/netinet/tcp_subr.c172
-rw-r--r--sys/netinet/tcp_syncache.c30
-rw-r--r--sys/netinet/tcp_timer.c24
-rw-r--r--sys/netinet/tcp_timer.h3
-rw-r--r--sys/netinet/tcp_timewait.c26
-rw-r--r--sys/netinet/tcp_usrreq.c99
-rw-r--r--sys/netinet/tcp_var.h121
-rw-r--r--sys/netinet/toecore.c3
-rw-r--r--sys/netinet/udp.h2
-rw-r--r--sys/netinet/udp_usrreq.c69
-rw-r--r--sys/netinet/udp_var.h5
53 files changed, 1260 insertions, 1301 deletions
diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c
index e7636330d267..58d66ebafe64 100644
--- a/sys/netinet/igmp.c
+++ b/sys/netinet/igmp.c
@@ -482,6 +482,7 @@ out_locked:
static int
sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
{
+ struct epoch_tracker et;
int *name;
int error;
u_int namelen;
@@ -504,14 +505,11 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
IN_MULTI_LIST_LOCK();
IGMP_LOCK();
- if (name[0] <= 0 || name[0] > V_if_index) {
- error = ENOENT;
- goto out_locked;
- }
-
error = ENOENT;
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(name[0]);
+ NET_EPOCH_EXIT(et);
if (ifp == NULL)
goto out_locked;
diff --git a/sys/netinet/in.c b/sys/netinet/in.c
index 70fbe32c05ac..a504f54a026e 100644
--- a/sys/netinet/in.c
+++ b/sys/netinet/in.c
@@ -35,6 +35,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+
#define IN_HISTORICAL_NETS /* include class masks */
#include <sys/param.h>
@@ -1702,6 +1704,17 @@ in_lltattach(struct ifnet *ifp)
return (llt);
}
+struct lltable *
+in_lltable_get(struct ifnet *ifp)
+{
+ struct lltable *llt = NULL;
+
+ void *afdata_ptr = ifp->if_afdata[AF_INET];
+ if (afdata_ptr != NULL)
+ llt = ((struct in_ifinfo *)afdata_ptr)->ii_llt;
+ return (llt);
+}
+
void *
in_domifattach(struct ifnet *ifp)
{
diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c
index f23db925444f..47771187fd6d 100644
--- a/sys/netinet/in_fib_dxr.c
+++ b/sys/netinet/in_fib_dxr.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
- * Copyright (c) 2012-2021 Marko Zec
+ * Copyright (c) 2012-2022 Marko Zec
* Copyright (c) 2005, 2018 University of Zagreb
* Copyright (c) 2005 International Computer Science Institute
*
@@ -78,7 +78,6 @@ CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
#else
#define DXR_D (DXR_TRIE_BITS - 1)
#endif
-#define DXR_X (DXR_TRIE_BITS - DXR_D)
#define D_TBL_SIZE (1 << DXR_D)
#define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
@@ -211,9 +210,6 @@ struct dxr_aux {
struct dxr {
/* Lookup tables */
- uint16_t d_shift;
- uint16_t x_shift;
- uint32_t x_mask;
void *d;
void *x;
void *r;
@@ -224,6 +220,9 @@ struct dxr {
struct fib_data *fd;
struct epoch_context epoch_ctx;
uint32_t fibnum;
+ uint16_t d_shift;
+ uint16_t x_shift;
+ uint32_t x_mask;
};
static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
@@ -235,46 +234,6 @@ uma_zone_t trie_zone;
VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define V_frag_limit VNET(frag_limit)
-SYSCTL_DECL(_net_route_algo);
-SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "DXR tunables");
-
-static int
-sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
-{
- char buf[8];
- int error, new, i;
-
- snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
- V_frag_limit % 100);
- error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (!isdigit(*buf) && *buf != '.')
- return (EINVAL);
- for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
- new = new * 10 + buf[i] - '0';
- new *= 100;
- if (buf[i++] == '.') {
- if (!isdigit(buf[i]))
- return (EINVAL);
- new += (buf[i++] - '0') * 10;
- if (isdigit(buf[i]))
- new += buf[i++] - '0';
- }
- if (new > 1000)
- return (EINVAL);
- V_frag_limit = new;
- snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
- V_frag_limit % 100);
- return (0);
-}
-
-SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
- CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
- 0, 0, sysctl_dxr_frag_limit, "A",
- "Fragmentation threshold to full rebuild");
-
/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE \
if (masked_dst < range[middle].start) { \
@@ -290,35 +249,14 @@ SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
return (range[lowerbound].nexthop);
static int
-dxr_lookup(struct dxr *dxr, uint32_t dst)
+range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
-#ifdef DXR2
- uint16_t *dt = dxr->d;
- struct direct_entry *xt = dxr->x;
- int xi;
-#else
- struct direct_entry *dt = dxr->d;
-#endif
- struct direct_entry de;
- struct range_entry_long *rt;
uint32_t base;
uint32_t upperbound;
uint32_t middle;
uint32_t lowerbound;
uint32_t masked_dst;
-#ifdef DXR2
- xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
- ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
- de = xt[xi];
-#else
- de = dt[dst >> DXR_RANGE_SHIFT];
-#endif
-
- if (__predict_true(de.fragments == FRAGS_MARK_HIT))
- return (de.base);
-
- rt = dxr->r;
base = de.base;
lowerbound = 0;
masked_dst = dst & DXR_RANGE_MASK;
@@ -355,6 +293,65 @@ dxr_lookup(struct dxr *dxr, uint32_t dst)
}
}
+#define DXR_LOOKUP_DEFINE(D) \
+ static int inline \
+ dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \
+ { \
+ struct direct_entry de; \
+ uint16_t *dt = dxr->d; \
+ struct direct_entry *xt = dxr->x; \
+ \
+ de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
+ + ((dst >> DXR_RANGE_SHIFT) & \
+ (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \
+ if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \
+ return (de.base); \
+ return (range_lookup(dxr->r, de, dst)); \
+ } \
+ \
+ static struct nhop_object * \
+ dxr_fib_lookup_##D(void *algo_data, \
+ const struct flm_lookup_key key, uint32_t scopeid __unused) \
+ { \
+ struct dxr *dxr = algo_data; \
+ \
+ return (dxr->nh_tbl[dxr_lookup_##D(dxr, \
+ ntohl(key.addr4.s_addr))]); \
+ }
+
+#ifdef DXR2
+#if DXR_TRIE_BITS > 16
+DXR_LOOKUP_DEFINE(16)
+#endif
+DXR_LOOKUP_DEFINE(15)
+DXR_LOOKUP_DEFINE(14)
+DXR_LOOKUP_DEFINE(13)
+DXR_LOOKUP_DEFINE(12)
+DXR_LOOKUP_DEFINE(11)
+DXR_LOOKUP_DEFINE(10)
+DXR_LOOKUP_DEFINE(9)
+#endif /* DXR2 */
+
+static int inline
+dxr_lookup(struct dxr *dxr, uint32_t dst)
+{
+ struct direct_entry de;
+#ifdef DXR2
+ uint16_t *dt = dxr->d;
+ struct direct_entry *xt = dxr->x;
+
+ de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
+ ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
+#else /* !DXR2 */
+ struct direct_entry *dt = dxr->d;
+
+ de = dt[dst >> DXR_RANGE_SHIFT];
+#endif /* !DXR2 */
+ if (__predict_true(de.fragments == FRAGS_MARK_HIT))
+ return (de.base);
+ return (range_lookup(dxr->r, de, dst));
+}
+
static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
@@ -1111,11 +1108,8 @@ dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
uint32_t scopeid)
{
struct dxr *dxr = algo_data;
- uint32_t nh;
-
- nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));
- return (dxr->nh_tbl[nh]);
+ return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}
static enum flm_op_result
@@ -1183,6 +1177,35 @@ epoch_dxr_destroy(epoch_context_t ctx)
dxr_destroy(dxr);
}
+static void *
+choose_lookup_fn(struct dxr_aux *da)
+{
+
+#ifdef DXR2
+ switch (da->d_bits) {
+#if DXR_TRIE_BITS > 16
+ case 16:
+ return (dxr_fib_lookup_16);
+#endif
+ case 15:
+ return (dxr_fib_lookup_15);
+ case 14:
+ return (dxr_fib_lookup_14);
+ case 13:
+ return (dxr_fib_lookup_13);
+ case 12:
+ return (dxr_fib_lookup_12);
+ case 11:
+ return (dxr_fib_lookup_11);
+ case 10:
+ return (dxr_fib_lookup_10);
+ case 9:
+ return (dxr_fib_lookup_9);
+ }
+#endif /* DXR2 */
+ return (dxr_fib_lookup);
+}
+
static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
@@ -1203,7 +1226,7 @@ dxr_dump_end(void *data, struct fib_dp *dp)
if (dxr->d == NULL)
return (FLM_REBUILD);
- dp->f = dxr_fib_lookup;
+ dp->f = choose_lookup_fn(da);
dp->arg = dxr;
return (FLM_SUCCESS);
@@ -1300,7 +1323,7 @@ dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
return (FLM_REBUILD);
}
- new_dp.f = dxr_fib_lookup;
+ new_dp.f = choose_lookup_fn(da);
new_dp.arg = new_dxr;
if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
fib_set_algo_ptr(dxr->fd, new_dxr);
@@ -1320,6 +1343,46 @@ dxr_get_pref(const struct rib_rtable_info *rinfo)
return (251);
}
+SYSCTL_DECL(_net_route_algo);
+SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "DXR tunables");
+
+static int
+sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
+{
+ char buf[8];
+ int error, new, i;
+
+ snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
+ V_frag_limit % 100);
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (!isdigit(*buf) && *buf != '.')
+ return (EINVAL);
+ for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
+ new = new * 10 + buf[i] - '0';
+ new *= 100;
+ if (buf[i++] == '.') {
+ if (!isdigit(buf[i]))
+ return (EINVAL);
+ new += (buf[i++] - '0') * 10;
+ if (isdigit(buf[i]))
+ new += buf[i++] - '0';
+ }
+ if (new > 1000)
+ return (EINVAL);
+ V_frag_limit = new;
+ snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
+ V_frag_limit % 100);
+ return (0);
+}
+
+SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
+ 0, 0, sysctl_dxr_frag_limit, "A",
+ "Fragmentation threshold to full rebuild");
+
static struct fib_lookup_module fib_dxr_mod = {
.flm_name = "dxr",
.flm_family = AF_INET,
diff --git a/sys/netinet/in_jail.c b/sys/netinet/in_jail.c
index 11891b56ecbe..d9f864f369cf 100644
--- a/sys/netinet/in_jail.c
+++ b/sys/netinet/in_jail.c
@@ -65,6 +65,13 @@ __FBSDID("$FreeBSD$");
#include <netinet/in.h>
+static in_addr_t
+prison_primary_ip4(const struct prison *pr)
+{
+
+ return (((const struct in_addr *)prison_ip_get0(pr, PR_INET))->s_addr);
+}
+
int
prison_qcmp_v4(const void *ip1, const void *ip2)
{
@@ -90,88 +97,16 @@ prison_qcmp_v4(const void *ip1, const void *ip2)
return (0);
}
-/*
- * Restrict a prison's IP address list with its parent's, possibly replacing
- * it. Return true if the replacement buffer was used (or would have been).
- */
-int
-prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
+bool
+prison_valid_v4(const void *ip)
{
- int ii, ij, used;
- struct prison *ppr;
-
- ppr = pr->pr_parent;
- if (!(pr->pr_flags & PR_IP4_USER)) {
- /* This has no user settings, so just copy the parent's list. */
- if (pr->pr_ip4s < ppr->pr_ip4s) {
- /*
- * There's no room for the parent's list. Use the
- * new list buffer, which is assumed to be big enough
- * (if it was passed). If there's no buffer, try to
- * allocate one.
- */
- used = 1;
- if (newip4 == NULL) {
- newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
- M_PRISON, M_NOWAIT);
- if (newip4 != NULL)
- used = 0;
- }
- if (newip4 != NULL) {
- bcopy(ppr->pr_ip4, newip4,
- ppr->pr_ip4s * sizeof(*newip4));
- free(pr->pr_ip4, M_PRISON);
- pr->pr_ip4 = newip4;
- pr->pr_ip4s = ppr->pr_ip4s;
- }
- return (used);
- }
- pr->pr_ip4s = ppr->pr_ip4s;
- if (pr->pr_ip4s > 0)
- bcopy(ppr->pr_ip4, pr->pr_ip4,
- pr->pr_ip4s * sizeof(*newip4));
- else if (pr->pr_ip4 != NULL) {
- free(pr->pr_ip4, M_PRISON);
- pr->pr_ip4 = NULL;
- }
- } else if (pr->pr_ip4s > 0) {
- /* Remove addresses that aren't in the parent. */
- for (ij = 0; ij < ppr->pr_ip4s; ij++)
- if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
- break;
- if (ij < ppr->pr_ip4s)
- ii = 1;
- else {
- bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
- --pr->pr_ip4s * sizeof(*pr->pr_ip4));
- ii = 0;
- }
- for (ij = 1; ii < pr->pr_ip4s; ) {
- if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
- ii++;
- continue;
- }
- switch (ij >= ppr->pr_ip4s ? -1 :
- prison_qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
- case -1:
- bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
- (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
- break;
- case 0:
- ii++;
- ij++;
- break;
- case 1:
- ij++;
- break;
- }
- }
- if (pr->pr_ip4s == 0) {
- free(pr->pr_ip4, M_PRISON);
- pr->pr_ip4 = NULL;
- }
- }
- return (0);
+ in_addr_t ia = ((const struct in_addr *)ip)->s_addr;
+
+ /*
+ * We do not have to care about byte order for these
+ * checks so we will do them in NBO.
+ */
+ return (ia != INADDR_ANY && ia != INADDR_BROADCAST);
}
/*
@@ -199,12 +134,12 @@ prison_get_ip4(struct ucred *cred, struct in_addr *ia)
mtx_unlock(&pr->pr_mtx);
return (0);
}
- if (pr->pr_ip4 == NULL) {
+ if (pr->pr_addrs[PR_INET] == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
- ia->s_addr = pr->pr_ip4[0].s_addr;
+ ia->s_addr = prison_primary_ip4(pr);
mtx_unlock(&pr->pr_mtx);
return (0);
}
@@ -299,7 +234,7 @@ prison_local_ip4(struct ucred *cred, struct in_addr *ia)
mtx_unlock(&pr->pr_mtx);
return (0);
}
- if (pr->pr_ip4 == NULL) {
+ if (pr->pr_addrs[PR_INET] == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
@@ -310,15 +245,15 @@ prison_local_ip4(struct ucred *cred, struct in_addr *ia)
/*
* In case there is only 1 IPv4 address, bind directly.
*/
- if (pr->pr_ip4s == 1)
- ia->s_addr = pr->pr_ip4[0].s_addr;
+ if (prison_ip_cnt(pr, PR_INET) == 1)
+ ia->s_addr = prison_primary_ip4(pr);
mtx_unlock(&pr->pr_mtx);
return (0);
}
error = prison_check_ip4_locked(pr, ia);
if (error == EADDRNOTAVAIL && ia0.s_addr == INADDR_LOOPBACK) {
- ia->s_addr = pr->pr_ip4[0].s_addr;
+ ia->s_addr = prison_primary_ip4(pr);
error = 0;
}
@@ -348,14 +283,14 @@ prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
mtx_unlock(&pr->pr_mtx);
return (0);
}
- if (pr->pr_ip4 == NULL) {
+ if (pr->pr_addrs[PR_INET] == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (ntohl(ia->s_addr) == INADDR_LOOPBACK &&
prison_check_ip4_locked(pr, ia) == EADDRNOTAVAIL) {
- ia->s_addr = pr->pr_ip4[0].s_addr;
+ ia->s_addr = prison_primary_ip4(pr);
mtx_unlock(&pr->pr_mtx);
return (0);
}
@@ -376,31 +311,11 @@ prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
int
prison_check_ip4_locked(const struct prison *pr, const struct in_addr *ia)
{
- int i, a, z, d;
- /*
- * Check the primary IP.
- */
- if (pr->pr_ip4[0].s_addr == ia->s_addr)
+ if (!(pr->pr_flags & PR_IP4))
return (0);
- /*
- * All the other IPs are sorted so we can do a binary search.
- */
- a = 0;
- z = pr->pr_ip4s - 2;
- while (a <= z) {
- i = (a + z) / 2;
- d = prison_qcmp_v4(&pr->pr_ip4[i+1], ia);
- if (d > 0)
- z = i - 1;
- else if (d < 0)
- a = i + 1;
- else
- return (0);
- }
-
- return (EADDRNOTAVAIL);
+ return (prison_ip_check(pr, PR_INET, ia));
}
int
@@ -420,7 +335,7 @@ prison_check_ip4(const struct ucred *cred, const struct in_addr *ia)
mtx_unlock(&pr->pr_mtx);
return (0);
}
- if (pr->pr_ip4 == NULL) {
+ if (pr->pr_addrs[PR_INET] == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c
index 2cb6748eacdb..a7314a27dbac 100644
--- a/sys/netinet/in_kdtrace.c
+++ b/sys/netinet/in_kdtrace.c
@@ -111,7 +111,7 @@ SDT_PROBE_DEFINE1_XLATE(tcp, , , siftr,
SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__input,
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfoh_t *",
- "uint8_t *", "ipinfo_t *");
+ "struct mbuf *", "ipinfo_t *");
SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__output,
"struct tcpcb *", "tcpsinfo_t *" ,
diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c
index 6ac81aa98e44..3f25471f0858 100644
--- a/sys/netinet/in_mcast.c
+++ b/sys/netinet/in_mcast.c
@@ -1376,6 +1376,7 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
static int
inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
{
+ struct epoch_tracker et;
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
@@ -1414,8 +1415,6 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
ssa->sin.sin_addr = mreqs.imr_sourceaddr;
if (!in_nullhost(mreqs.imr_interface)) {
- struct epoch_tracker et;
-
NET_EPOCH_ENTER(et);
INADDR_TO_IFP(mreqs.imr_interface, ifp);
/* XXXGL: ifref? */
@@ -1445,10 +1444,11 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
ssa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
- if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
- return (EADDRNOTAVAIL);
-
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(gsr.gsr_interface);
+ NET_EPOCH_EXIT(et);
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
doblock = 1;
@@ -1624,6 +1624,7 @@ inp_freemoptions(struct ip_moptions *imo)
static int
inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
+ struct epoch_tracker et;
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
@@ -1649,10 +1650,9 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
if (error)
return (error);
- if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
- return (EINVAL);
-
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(msfr.msfr_ifindex);
+ NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifnet pointer left */
if (ifp == NULL)
return (EINVAL);
@@ -2026,11 +2026,11 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
return (EINVAL);
- if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
- return (EADDRNOTAVAIL);
NET_EPOCH_ENTER(et);
ifp = ifnet_byindex_ref(gsr.gsr_interface);
NET_EPOCH_EXIT(et);
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
break;
default:
@@ -2243,6 +2243,7 @@ out_inp_unlocked:
static int
inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
{
+ struct epoch_tracker et;
struct group_source_req gsr;
struct ip_mreq_source mreqs;
sockunion_t *gsa, *ssa;
@@ -2304,8 +2305,6 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
* using an IPv4 address as a key is racy.
*/
if (!in_nullhost(mreqs.imr_interface)) {
- struct epoch_tracker et;
-
NET_EPOCH_ENTER(et);
INADDR_TO_IFP(mreqs.imr_interface, ifp);
/* XXXGL ifref? */
@@ -2340,11 +2339,9 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
return (EINVAL);
}
- if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
- return (EADDRNOTAVAIL);
-
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(gsr.gsr_interface);
-
+ NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */
if (ifp == NULL)
return (EADDRNOTAVAIL);
break;
@@ -2481,13 +2478,17 @@ inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
if (error)
return (error);
- if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex)
+ if (mreqn.imr_ifindex < 0)
return (EINVAL);
if (mreqn.imr_ifindex == 0) {
ifp = NULL;
} else {
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(mreqn.imr_ifindex);
+ NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */
if (ifp == NULL)
return (EADDRNOTAVAIL);
}
@@ -2536,6 +2537,7 @@ inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
static int
inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
+ struct epoch_tracker et;
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
@@ -2566,10 +2568,9 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
gsa->sin.sin_port = 0; /* ignore port */
- if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
- return (EADDRNOTAVAIL);
-
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(msfr.msfr_ifindex);
+ NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */
if (ifp == NULL)
return (EADDRNOTAVAIL);
@@ -2881,13 +2882,6 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
if (namelen != 2)
return (EINVAL);
- ifindex = name[0];
- if (ifindex <= 0 || ifindex > V_if_index) {
- CTR2(KTR_IGMPV3, "%s: ifindex %u out of range",
- __func__, ifindex);
- return (ENOENT);
- }
-
group.s_addr = name[1];
if (!IN_MULTICAST(ntohl(group.s_addr))) {
CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast",
@@ -2895,6 +2889,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
return (EINVAL);
}
+ ifindex = name[0];
NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(ifindex);
if (ifp == NULL) {
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 081d204f559c..97471ae7800c 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -49,7 +49,9 @@ __FBSDID("$FreeBSD$");
#include "opt_rss.h"
#include <sys/param.h>
+#include <sys/hash.h>
#include <sys/systm.h>
+#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
@@ -246,6 +248,16 @@ SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
#endif /* INET */
+VNET_DEFINE(uint32_t, in_pcbhashseed);
+static void
+in_pcbhashseed_init(void)
+{
+
+ V_in_pcbhashseed = arc4random();
+}
+VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
+ in_pcbhashseed_init, 0);
+
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
@@ -502,33 +514,20 @@ abort_with_hash_wlock:
return (err);
}
-/*
- * Different protocols initialize their inpcbs differently - giving
- * different name to the lock. But they all are disposed the same.
- */
-static void
-inpcb_fini(void *mem, int size)
-{
- struct inpcb *inp = mem;
-
- INP_LOCK_DESTROY(inp);
-}
-
/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
/*
- * Initialize an inpcbinfo -- we should be able to reduce the number of
- * arguments in time.
+ * Initialize an inpcbinfo - a per-VNET instance of connections db.
*/
void
-in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
- u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
- uma_init inpcbzone_init)
+in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
+ u_int hash_nelements, u_int porthash_nelements)
{
- mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
- mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
+ mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
+ mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
+ NULL, MTX_DEF);
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
@@ -541,16 +540,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
&pcbinfo->ipi_porthashmask);
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_lbgrouphashmask);
- pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
- NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
- UMA_ZONE_SMR);
- uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
- uma_zone_set_warning(pcbinfo->ipi_zone,
- "kern.ipc.maxsockets limit reached");
+ pcbinfo->ipi_zone = pcbstor->ips_zone;
+ pcbinfo->ipi_portzone = pcbstor->ips_portzone;
pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
- pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
- sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
}
/*
@@ -568,12 +560,42 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
pcbinfo->ipi_porthashmask);
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
pcbinfo->ipi_lbgrouphashmask);
- uma_zdestroy(pcbinfo->ipi_zone);
mtx_destroy(&pcbinfo->ipi_hash_lock);
mtx_destroy(&pcbinfo->ipi_lock);
}
/*
+ * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
+ */
+static void inpcb_dtor(void *, int, void *);
+static void inpcb_fini(void *, int);
+void
+in_pcbstorage_init(void *arg)
+{
+ struct inpcbstorage *pcbstor = arg;
+
+ pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
+ sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
+ inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
+ pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
+ sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(pcbstor->ips_portzone,
+ uma_zone_get_smr(pcbstor->ips_zone));
+}
+
+/*
+ * Destroy a pcbstorage - used by unloadable protocols.
+ */
+void
+in_pcbstorage_destroy(void *arg)
+{
+ struct inpcbstorage *pcbstor = arg;
+
+ uma_zdestroy(pcbstor->ips_zone);
+ uma_zdestroy(pcbstor->ips_portzone);
+}
+
+/*
* Allocate a PCB and associate it with the socket.
* On success return with the PCB locked.
*/
@@ -629,7 +651,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
* If using hpts lets drop a random number in so
* not all new connections fall on the same CPU.
*/
- inp->inp_hpts_cpu = inp->inp_dropq_cpu = hpts_random_cpu(inp);
+ inp->inp_hpts_cpu = hpts_random_cpu(inp);
#endif
refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */
INP_WLOCK(inp);
@@ -644,7 +666,6 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
- crfree(inp->inp_cred);
uma_zfree_smr(pcbinfo->ipi_zone, inp);
return (error);
#endif
@@ -748,7 +769,7 @@ in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
if (first == last)
dorandom = 0;
/* Make sure to not include UDP(-Lite) packets in the count. */
- if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
+ if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
V_ipport_tcpallocs++;
/*
* Instead of having two loops further down counting up or down
@@ -1654,6 +1675,8 @@ inp_next(struct inpcb_iterator *ii)
smr_enter(ipi->ipi_smr);
MPASS(inp != II_LIST_FIRST(ipi, hash));
inp = II_LIST_FIRST(ipi, hash);
+ if (inp == NULL)
+ break;
}
}
@@ -1760,7 +1783,6 @@ in_pcbrele_rlocked(struct inpcb *inp)
MPASS(inp->inp_flags & INP_FREED);
MPASS(inp->inp_socket == NULL);
MPASS(inp->inp_in_hpts == 0);
- MPASS(inp->inp_in_dropq == 0);
INP_RUNLOCK(inp);
uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
return (true);
@@ -1778,7 +1800,6 @@ in_pcbrele_wlocked(struct inpcb *inp)
MPASS(inp->inp_flags & INP_FREED);
MPASS(inp->inp_socket == NULL);
MPASS(inp->inp_in_hpts == 0);
- MPASS(inp->inp_in_dropq == 0);
INP_WUNLOCK(inp);
uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
return (true);
@@ -1833,7 +1854,6 @@ in_pcbfree(struct inpcb *inp)
inp->inp_flags &= ~INP_INHASHLIST;
}
- crfree(inp->inp_cred);
RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
mac_inpcb_destroy(inp);
@@ -1864,6 +1884,30 @@ in_pcbfree(struct inpcb *inp)
#ifdef INET
inp_freemoptions(imo);
#endif
+ /* Destruction is finalized in inpcb_dtor(). */
+}
+
+static void
+inpcb_dtor(void *mem, int size, void *arg)
+{
+ struct inpcb *inp = mem;
+
+ crfree(inp->inp_cred);
+#ifdef INVARIANTS
+ inp->inp_cred = NULL;
+#endif
+}
+
+/*
+ * Different protocols initialize their inpcbs differently - giving
+ * different name to the lock. But they all are disposed the same.
+ */
+static void
+inpcb_fini(void *mem, int size)
+{
+ struct inpcb *inp = mem;
+
+ INP_LOCK_DESTROY(inp);
}
/*
@@ -2074,8 +2118,8 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
* Look for an unconnected (wildcard foreign addr) PCB that
* matches the local address and port we're looking for.
*/
- head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
- 0, pcbinfo->ipi_hashmask)];
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
+ pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
@@ -2203,7 +2247,7 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
if (grp->il_lport != lport)
continue;
- idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
+ idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
grp->il_inpcnt;
if (grp->il_laddr.s_addr == laddr->s_addr) {
if (numa_domain == M_NODOM ||
@@ -2249,7 +2293,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
* First look for an exact match.
*/
tmpinp = NULL;
- head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
@@ -2304,8 +2348,8 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
* 4. non-jailed, wild.
*/
- head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
- 0, pcbinfo->ipi_hashmask)];
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
+ pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
@@ -2318,8 +2362,8 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
- if (prison_check_ip4(inp->inp_cred,
- &laddr) != 0)
+ if (prison_check_ip4_locked(
+ inp->inp_cred->cr_prison, &laddr) != 0)
continue;
} else {
if (local_exact != NULL)
@@ -2428,7 +2472,6 @@ in_pcbinshash(struct inpcb *inp)
struct inpcbporthead *pcbporthash;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
- u_int32_t hashkey_faddr;
int so_options;
INP_WLOCK_ASSERT(inp);
@@ -2439,13 +2482,12 @@ in_pcbinshash(struct inpcb *inp)
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
- hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
+ pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
else
#endif
- hashkey_faddr = inp->inp_faddr.s_addr;
-
- pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
- inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
+ pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
@@ -2505,7 +2547,6 @@ in_pcbrehash(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbhead *head;
- u_int32_t hashkey_faddr;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2515,13 +2556,12 @@ in_pcbrehash(struct inpcb *inp)
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
- hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
+ head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
else
#endif
- hashkey_faddr = inp->inp_faddr.s_addr;
-
- head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
- inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
+ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
CK_LIST_REMOVE(inp, inp_hash);
CK_LIST_INSERT_HEAD(head, inp, inp_hash);
@@ -2577,8 +2617,8 @@ ipport_tick(void *xtp)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
- if (V_ipport_tcpallocs <=
- V_ipport_tcplastcount + V_ipport_randomcps) {
+ if (V_ipport_tcpallocs - V_ipport_tcplastcount <=
+ V_ipport_randomcps) {
if (V_ipport_stoprandom > 0)
V_ipport_stoprandom--;
} else
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 77dd85241e01..49b891e33b15 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -73,7 +73,8 @@ typedef uint64_t inp_gen_t;
/*
* PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
* So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
- * the following structure.
+ * the following structure. This requires padding always be zeroed out,
+ * which is done right after inpcb allocation and stays through its lifetime.
*/
struct in_addr_4in6 {
u_int32_t ia46_pad32[3];
@@ -235,19 +236,15 @@ struct inpcb {
*/
#if defined(__amd64__) || defined(__i386__)
uint8_t inp_in_hpts; /* on output hpts (lock b) */
- uint8_t inp_in_dropq; /* on input hpts (lock b) */
#else
uint32_t inp_in_hpts; /* on output hpts (lock b) */
- uint32_t inp_in_dropq; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
volatile uint16_t inp_irq_cpu; /* Set by LRO in behalf of or the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
- uint16_t inp_dropq_cpu; /* Lock (i) */
uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
- inp_dropq_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_irq_cpu_set :1, /* (i) from LRO/Driver */
inp_spare_bits2 : 3;
@@ -256,8 +253,6 @@ struct inpcb {
struct socket *inp_socket; /* (i) back pointer to socket */
int32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */
uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
- uint32_t inp_dropq_gencnt;
- TAILQ_ENTRY(inpcb) inp_dropq; /* hpts drop queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
@@ -380,8 +375,8 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
#ifdef _KERNEL
/*
- * Global data structure for each high-level protocol (UDP, TCP, ...) in both
- * IPv4 and IPv6. Holds inpcb lists and information for managing them.
+ * Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both
+ * IPv4 and IPv6.
*
* The pcbs are protected with SMR section and thus all lists in inpcbinfo
* are CK-lists. Locking is required to insert a pcb into database. Two
@@ -451,6 +446,41 @@ struct inpcbinfo {
};
/*
+ * Global allocation storage for each high-level protocol (UDP, TCP, ...).
+ * Each corresponding per-VNET inpcbinfo points into this one.
+ */
+struct inpcbstorage {
+ uma_zone_t ips_zone;
+ uma_zone_t ips_portzone;
+ uma_init ips_pcbinit;
+ const char * ips_zone_name;
+ const char * ips_portzone_name;
+ const char * ips_infolock_name;
+ const char * ips_hashlock_name;
+};
+
+#define INPCBSTORAGE_DEFINE(prot, lname, zname, iname, hname) \
+static int \
+prot##_inpcb_init(void *mem, int size __unused, int flags __unused) \
+{ \
+ struct inpcb *inp = mem; \
+ \
+ rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK); \
+ return (0); \
+} \
+static struct inpcbstorage prot = { \
+ .ips_pcbinit = prot##_inpcb_init, \
+ .ips_zone_name = zname, \
+ .ips_portzone_name = zname " ports", \
+ .ips_infolock_name = iname, \
+ .ips_hashlock_name = hname, \
+}; \
+SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN, \
+ SI_ORDER_SECOND, in_pcbstorage_init, &prot); \
+SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \
+ SI_ORDER_SECOND, in_pcbstorage_destroy, &prot)
+
+/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
* INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
@@ -471,8 +501,6 @@ struct inpcblbgroup {
struct inpcb *il_inp[]; /* (h) */
};
-#define INP_LOCK_INIT(inp, d, t) \
- rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock)
#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock)
@@ -536,13 +564,36 @@ int inp_so_options(const struct inpcb *inp);
#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \
MA_OWNED)
-#define INP_PCBHASH(faddr, lport, fport, mask) \
- (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
-#define INP_PCBPORTHASH(lport, mask) \
- (ntohs((lport)) & (mask))
-#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
- ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
-#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
+/*
+ * Wildcard matching hash is not just a microoptimisation! The hash for
+ * wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6
+ * wildcard bound pcb won't be able to receive AF_INET connections, while:
+ * jenkins_hash(&zeroes, 1, s) != jenkins_hash(&zeroes, 4, s)
+ * See also comment above struct in_addr_4in6.
+ */
+#define IN_ADDR_JHASH32(addr) \
+ ((addr)->s_addr == INADDR_ANY ? V_in_pcbhashseed : \
+ jenkins_hash32((&(addr)->s_addr), 1, V_in_pcbhashseed))
+#define IN6_ADDR_JHASH32(addr) \
+ (memcmp((addr), &in6addr_any, sizeof(in6addr_any)) == 0 ? \
+ V_in_pcbhashseed : \
+ jenkins_hash32((addr)->__u6_addr.__u6_addr32, \
+ nitems((addr)->__u6_addr.__u6_addr32), V_in_pcbhashseed))
+
+#define INP_PCBHASH(faddr, lport, fport, mask) \
+ ((IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
+#define INP6_PCBHASH(faddr, lport, fport, mask) \
+ ((IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
+
+#define INP_PCBHASH_WILD(lport, mask) \
+ ((V_in_pcbhashseed ^ ntohs(lport)) & (mask))
+
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ (IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
+#define INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ (IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
+
+#define INP_PCBPORTHASH(lport, mask) (ntohs((lport)) & (mask))
/*
* Flags for inp_vflags -- historically version flags only
@@ -670,9 +721,11 @@ VNET_DECLARE(int, ipport_tcpallocs);
#define V_ipport_stoprandom VNET(ipport_stoprandom)
#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
+void in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *,
+ u_int, u_int);
void in_pcbinfo_destroy(struct inpcbinfo *);
-void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *,
- uma_init);
+void in_pcbstorage_init(void *);
+void in_pcbstorage_destroy(void *);
int in_pcbbind_check_bindmulti(const struct inpcb *ni,
const struct inpcb *oi);
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index 4db20418708d..31214b6092f3 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -44,6 +44,9 @@
* Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c
*/
+VNET_DECLARE(uint32_t, in_pcbhashseed);
+#define V_in_pcbhashseed VNET(in_pcbhashseed)
+
bool inp_smr_lock(struct inpcb *, const inp_lookup_t);
int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
struct ucred *, int);
diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c
index 351c90699fc2..b9f506518cce 100644
--- a/sys/netinet/in_proto.c
+++ b/sys/netinet/in_proto.c
@@ -113,7 +113,6 @@ struct protosw inetsw[] = {
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_IP,
.pr_flags = PR_CAPATTACH,
- .pr_init = ip_init,
.pr_slowtimo = ip_slowtimo,
.pr_drain = ip_drain,
.pr_usrreqs = &nousrreqs
@@ -126,7 +125,6 @@ struct protosw inetsw[] = {
.pr_input = udp_input,
.pr_ctlinput = udp_ctlinput,
.pr_ctloutput = udp_ctloutput,
- .pr_init = udp_init,
.pr_usrreqs = &udp_usrreqs
},
{
@@ -138,7 +136,6 @@ struct protosw inetsw[] = {
.pr_input = tcp_input,
.pr_ctlinput = tcp_ctlinput,
.pr_ctloutput = tcp_ctloutput,
- .pr_init = tcp_init,
.pr_slowtimo = tcp_slowtimo,
.pr_drain = tcp_drain,
.pr_usrreqs = &tcp_usrreqs
@@ -152,7 +149,6 @@ struct protosw inetsw[] = {
.pr_input = sctp_input,
.pr_ctlinput = sctp_ctlinput,
.pr_ctloutput = sctp_ctloutput,
- .pr_init = sctp_init,
.pr_drain = sctp_drain,
.pr_usrreqs = &sctp_usrreqs
},
@@ -176,7 +172,6 @@ struct protosw inetsw[] = {
.pr_input = udp_input,
.pr_ctlinput = udplite_ctlinput,
.pr_ctloutput = udp_ctloutput,
- .pr_init = udplite_init,
.pr_usrreqs = &udp_usrreqs
},
{
@@ -290,7 +285,6 @@ IPPROTOSPACER,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = rip_input,
.pr_ctloutput = rip_ctloutput,
- .pr_init = rip_init,
.pr_usrreqs = &rip_usrreqs
},
};
@@ -308,7 +302,7 @@ struct domain inetdomain = {
.dom_ifdetach = in_domifdetach
};
-VNET_DOMAIN_SET(inet);
+DOMAIN_SET(inet);
#endif /* INET */
SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c
index 7554becb974e..71558be619d8 100644
--- a/sys/netinet/ip_carp.c
+++ b/sys/netinet/ip_carp.c
@@ -854,6 +854,13 @@ static void
carp_send_ad_error(struct carp_softc *sc, int error)
{
+ /*
+	 * We track errors and successful sends with this logic:
+ * - Any error resets success counter to 0.
+ * - MAX_ERRORS triggers demotion.
+ * - MIN_SUCCESS successes resets error counter to 0.
+ * - MIN_SUCCESS reverts demotion, if it was triggered before.
+ */
if (error) {
if (sc->sc_sendad_errors < INT_MAX)
sc->sc_sendad_errors++;
@@ -865,17 +872,17 @@ carp_send_ad_error(struct carp_softc *sc, int error)
carp_demote_adj(V_carp_senderr_adj, msg);
}
sc->sc_sendad_success = 0;
- } else {
- if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS &&
- ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
- static const char fmt[] = "send ok on %s";
- char msg[sizeof(fmt) + IFNAMSIZ];
-
- sprintf(msg, fmt, sc->sc_carpdev->if_xname);
- carp_demote_adj(-V_carp_senderr_adj, msg);
- sc->sc_sendad_errors = 0;
- } else
+ } else if (sc->sc_sendad_errors > 0) {
+ if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
+ if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
+ static const char fmt[] = "send ok on %s";
+ char msg[sizeof(fmt) + IFNAMSIZ];
+
+ sprintf(msg, fmt, sc->sc_carpdev->if_xname);
+ carp_demote_adj(-V_carp_senderr_adj, msg);
+ }
sc->sc_sendad_errors = 0;
+ }
}
}
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index cd0034008dc2..6c4d85b03e6a 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -117,8 +117,6 @@ VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */
-static eventhandler_tag ip_divert_event_tag;
-
static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
struct sockaddr_in *sin);
static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
@@ -126,24 +124,10 @@ static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
/*
* Initialize divert connection block queue.
*/
-static void
-div_zone_change(void *tag)
-{
-
- uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
-}
-
-static int
-div_inpcb_init(void *mem, int size, int flags)
-{
- struct inpcb *inp = mem;
-
- INP_LOCK_INIT(inp, "inp", "divinp");
- return (0);
-}
+INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
static void
-div_init(void)
+div_init(void *arg __unused)
{
/*
@@ -151,8 +135,9 @@ div_init(void)
* allocate one-entry hash lists than it is to check all over the
* place for hashbase == NULL.
*/
- in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init);
+ in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
}
+VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
static void
div_destroy(void *unused __unused)
@@ -160,8 +145,7 @@ div_destroy(void *unused __unused)
in_pcbinfo_destroy(&V_divcbinfo);
}
-VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
- div_destroy, NULL);
+VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL);
/*
* IPPROTO_DIVERT is not in the real IP protocol number space; this
@@ -775,7 +759,6 @@ struct protosw div_protosw = {
.pr_protocol = IPPROTO_DIVERT,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = div_input,
- .pr_init = div_init,
.pr_usrreqs = &div_usrreqs
};
@@ -795,8 +778,6 @@ div_modevent(module_t mod, int type, void *unused)
if (err != 0)
return (err);
ip_divert_ptr = divert_packet;
- ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change,
- div_zone_change, NULL, EVENTHANDLER_PRI_ANY);
break;
case MOD_QUIESCE:
/*
@@ -830,7 +811,6 @@ div_modevent(module_t mod, int type, void *unused)
#ifndef VIMAGE
div_destroy(NULL);
#endif
- EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag);
break;
default:
err = EOPNOTSUPP;
diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c
index facf876f18cc..95a601ced3ef 100644
--- a/sys/netinet/ip_fastfwd.c
+++ b/sys/netinet/ip_fastfwd.c
@@ -60,14 +60,6 @@
*
* We take full advantage of hardware support for IP checksum and
* fragmentation offloading.
- *
- * We don't do ICMP redirect in the fast forwarding path. I have had my own
- * cases where two core routers with Zebra routing suite would send millions
- * ICMP redirects to connected hosts if the destination router was not the
- * default gateway. In one case it was filling the routing table of a host
- * with approximately 300.000 cloned redirect entries until it ran out of
- * kernel memory. However the networking code proved very robust and it didn't
- * crash or fail in other ways.
*/
/*
@@ -114,11 +106,68 @@ __FBSDID("$FreeBSD$");
#define V_ipsendredirects VNET(ipsendredirects)
static struct mbuf *
-ip_redir_alloc(struct mbuf *m, struct nhop_object *nh,
- struct ip *ip, in_addr_t *addr)
+ip_redir_alloc(struct mbuf *m, struct nhop_object *nh, u_short ip_len,
+ struct in_addr *osrc, struct in_addr *newgw)
{
- struct mbuf *mcopy = m_gethdr(M_NOWAIT, m->m_type);
+ struct in_ifaddr *nh_ia;
+ struct mbuf *mcopy;
+
+ KASSERT(nh != NULL, ("%s: m %p nh is NULL\n", __func__, m));
+
+ /*
+ * Only send a redirect if:
+ * - Redirects are not disabled (must be checked by caller),
+ * - We have not applied NAT (must be checked by caller as possible),
+	 * - Neither an MCAST nor a BCAST packet (must be checked by caller)
+ * [RFC1009 Appendix A.2].
+ * - The packet does not do IP source routing or having any other
+ * IP options (this case was handled already by ip_input() calling
+ * ip_dooptions() [RFC792, p13],
+ * - The packet is being forwarded out the same physical interface
+ * that it was received from [RFC1812, 5.2.7.2].
+ */
+
+ /*
+ * - The forwarding route was not created by a redirect
+ * [RFC1812, 5.2.7.2], or
+ * if it was to follow a default route (see below).
+ * - The next-hop is reachable by us [RFC1009 Appendix A.2].
+ */
+ if ((nh->nh_flags & (NHF_DEFAULT | NHF_REDIRECT |
+ NHF_BLACKHOLE | NHF_REJECT)) != 0)
+ return (NULL);
+
+ /* Get the new gateway. */
+ if ((nh->nh_flags & NHF_GATEWAY) == 0 || nh->gw_sa.sa_family != AF_INET)
+ return (NULL);
+ newgw->s_addr = nh->gw4_sa.sin_addr.s_addr;
+
+ /*
+ * - The resulting forwarding destination is not "This host on this
+ * network" [RFC1122, Section 3.2.1.3] (default route check above).
+ */
+ if (newgw->s_addr == 0)
+ return (NULL);
+
+ /*
+ * - We know how to reach the sender and the source address is
+ * directly connected to us [RFC792, p13].
+ * + The new gateway address and the source address are on the same
+ * subnet [RFC1009 Appendix A.2, RFC1122 3.2.2.2, RFC1812, 5.2.7.2].
+ * NB: if you think multiple logical subnets on the same wire should
+ * receive redirects read [RFC1812, APPENDIX C (14->15)].
+ */
+ nh_ia = (struct in_ifaddr *)nh->nh_ifa;
+ if ((ntohl(osrc->s_addr) & nh_ia->ia_subnetmask) != nh_ia->ia_subnet)
+ return (NULL);
+
+ /* Prepare for sending the redirect. */
+ /*
+ * Make a copy of as much as we need of the packet as the original
+ * one will be forwarded but we need (a portion) for icmp_error().
+ */
+ mcopy = m_gethdr(M_NOWAIT, m->m_type);
if (mcopy == NULL)
return (NULL);
@@ -132,23 +181,10 @@ ip_redir_alloc(struct mbuf *m, struct nhop_object *nh,
m_free(mcopy);
return (NULL);
}
- mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
+ mcopy->m_len = min(ip_len, M_TRAILINGSPACE(mcopy));
mcopy->m_pkthdr.len = mcopy->m_len;
m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
- if (nh != NULL &&
- ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) {
- struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa);
- u_long src = ntohl(ip->ip_src.s_addr);
-
- if (nh_ia != NULL &&
- (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) {
- if (nh->nh_flags & NHF_GATEWAY)
- *addr = nh->gw4_sa.sin_addr.s_addr;
- else
- *addr = ip->ip_dst.s_addr;
- }
- }
return (mcopy);
}
@@ -202,7 +238,7 @@ ip_tryforward(struct mbuf *m)
struct route ro;
struct sockaddr_in *dst;
const struct sockaddr *gw;
- struct in_addr dest, odest, rtdest;
+ struct in_addr dest, odest, rtdest, osrc;
uint16_t ip_len, ip_off;
int error = 0;
struct m_tag *fwd_tag = NULL;
@@ -274,6 +310,7 @@ ip_tryforward(struct mbuf *m)
*/
odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
+ osrc.s_addr = ip->ip_src.s_addr;
/*
* Run through list of ipfilter hooks for input packets
@@ -434,13 +471,11 @@ passout:
} else
gw = (const struct sockaddr *)dst;
- /*
- * Handle redirect case.
- */
+ /* Handle redirect case. */
redest.s_addr = 0;
- if (V_ipsendredirects && (nh->nh_ifp == m->m_pkthdr.rcvif) &&
- gw->sa_family == AF_INET)
- mcopy = ip_redir_alloc(m, nh, ip, &redest.s_addr);
+ if (V_ipsendredirects && osrc.s_addr == ip->ip_src.s_addr &&
+ nh->nh_ifp == m->m_pkthdr.rcvif)
+ mcopy = ip_redir_alloc(m, nh, ip_len, &osrc, &redest);
/*
* Check if packet fits MTU or if hardware will fragment for us
@@ -514,7 +549,7 @@ passout:
/* Send required redirect */
if (mcopy != NULL) {
icmp_error(mcopy, ICMP_REDIRECT, ICMP_REDIRECT_HOST, redest.s_addr, 0);
- mcopy = NULL; /* Freed by caller */
+ mcopy = NULL; /* Was consumed by callee. */
}
consumed:
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index 44500c46b0d8..8fd26e4ca861 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -301,12 +301,10 @@ SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops,
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
*/
-void
-ip_init(void)
+static void
+ip_vnet_init(void *arg __unused)
{
struct pfil_head_args args;
- struct protosw *pr;
- int i;
CK_STAILQ_INIT(&V_in_ifaddrhead);
V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
@@ -332,23 +330,27 @@ ip_init(void)
printf("%s: WARNING: unable to register output helper hook\n",
__func__);
- /* Skip initialization of globals for non-default instances. */
#ifdef VIMAGE
- if (!IS_DEFAULT_VNET(curvnet)) {
- netisr_register_vnet(&ip_nh);
+ netisr_register_vnet(&ip_nh);
#ifdef RSS
- netisr_register_vnet(&ip_direct_nh);
+ netisr_register_vnet(&ip_direct_nh);
#endif
- return;
- }
#endif
+}
+VNET_SYSINIT(ip_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
+ ip_vnet_init, NULL);
+
+
+static void
+ip_init(const void *unused __unused)
+{
+ struct protosw *pr;
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
- if (pr == NULL)
- panic("ip_init: PF_INET not found");
+ KASSERT(pr, ("%s: PF_INET not found", __func__));
/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
- for (i = 0; i < IPPROTO_MAX; i++)
+ for (int i = 0; i < IPPROTO_MAX; i++)
ip_protox[i] = pr - inetsw;
/*
* Cycle through IP protocols and put them into the appropriate place
@@ -368,6 +370,7 @@ ip_init(void)
netisr_register(&ip_direct_nh);
#endif
}
+SYSINIT(ip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_init, NULL);
#ifdef VIMAGE
static void
@@ -560,8 +563,9 @@ tooshort:
/*
* Try to forward the packet, but if we fail continue.
- * ip_tryforward() does not generate redirects, so fall
- * through to normal processing if redirects are required.
+ * ip_tryforward() may generate redirects these days.
+ * XXX the logic below falling through to normal processing
+ * if redirects are required should be revisited as well.
* ip_tryforward() does inbound and outbound packet firewall
* processing. If firewall has decided that destination becomes
* our local address, it sets M_FASTFWD_OURS flag. In this
@@ -574,6 +578,10 @@ tooshort:
IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0)
#endif
) {
+ /*
+ * ip_dooptions() was run so we can ignore the source route (or
+		 * any IP options) case for redirects in ip_tryforward().
+ */
if ((m = ip_tryforward(m)) == NULL)
return;
if (m->m_flags & M_FASTFWD_OURS) {
diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
index 02738616d56e..fdca39b5dbb9 100644
--- a/sys/netinet/ip_mroute.c
+++ b/sys/netinet/ip_mroute.c
@@ -99,6 +99,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
+#include <sys/taskqueue.h>
#include <sys/time.h>
#include <sys/counter.h>
#include <machine/atomic.h>
@@ -177,6 +178,10 @@ VNET_DEFINE_STATIC(u_char *, nexpire); /* 0..mfchashsize-1 */
#define V_nexpire VNET(nexpire)
VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
#define V_mfchashtbl VNET(mfchashtbl)
+VNET_DEFINE_STATIC(struct taskqueue *, task_queue);
+#define V_task_queue VNET(task_queue)
+VNET_DEFINE_STATIC(struct task, task);
+#define V_task VNET(task)
VNET_DEFINE_STATIC(vifi_t, numvifs);
#define V_numvifs VNET(numvifs)
@@ -188,12 +193,6 @@ static eventhandler_tag if_detach_event_tag = NULL;
VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
#define V_expire_upcalls_ch VNET(expire_upcalls_ch)
-VNET_DEFINE_STATIC(struct mtx, upcall_thread_mtx);
-#define V_upcall_thread_mtx VNET(upcall_thread_mtx)
-
-VNET_DEFINE_STATIC(struct cv, upcall_thread_cv);
-#define V_upcall_thread_cv VNET(upcall_thread_cv)
-
VNET_DEFINE_STATIC(struct mtx, buf_ring_mtx);
#define V_buf_ring_mtx VNET(buf_ring_mtx)
@@ -232,8 +231,6 @@ SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
&pim_squelch_wholepkt, 0,
"Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
-static volatile int upcall_thread_shutdown = 0;
-
static const struct encaptab *pim_encap_cookie;
static int pim_encapcheck(const struct mbuf *, int, int, void *);
static int pim_input(struct mbuf *, int, int, void *);
@@ -303,7 +300,7 @@ VNET_DEFINE_STATIC(struct ifnet *, multicast_register_if);
static u_long X_ip_mcast_src(int);
static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
struct ip_moptions *);
-static int X_ip_mrouter_done(void);
+static int X_ip_mrouter_done(void *);
static int X_ip_mrouter_get(struct socket *, struct sockopt *);
static int X_ip_mrouter_set(struct socket *, struct sockopt *);
static int X_legal_vif_num(int);
@@ -434,7 +431,7 @@ X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
break;
case MRT_DONE:
- error = ip_mrouter_done();
+ error = ip_mrouter_done(NULL);
break;
case MRT_ADD_VIF:
@@ -660,22 +657,15 @@ if_detached_event(void *arg __unused, struct ifnet *ifp)
}
static void
-ip_mrouter_upcall_thread(void *arg)
+ip_mrouter_upcall_thread(void *arg, int pending __unused)
{
CURVNET_SET((struct vnet *) arg);
- while (upcall_thread_shutdown == 0) {
- /* START: Event loop */
-
- /* END: Event loop */
- mtx_lock(&V_upcall_thread_mtx);
- cv_timedwait(&V_upcall_thread_cv, &V_upcall_thread_mtx, hz);
- mtx_unlock(&V_upcall_thread_mtx);
- }
+ MRW_WLOCK();
+ bw_upcalls_send();
+ MRW_WUNLOCK();
- upcall_thread_shutdown = 0;
CURVNET_RESTORE();
- kthread_exit();
}
/*
@@ -718,12 +708,9 @@ ip_mrouter_init(struct socket *so, int version)
return (ENOMEM);
}
- /* Create upcall thread */
- upcall_thread_shutdown = 0;
- mtx_init(&V_upcall_thread_mtx, "ip_mroute upcall thread mtx", NULL, MTX_DEF);
- cv_init(&V_upcall_thread_cv, "ip_mroute upcall cv");
- kthread_add(ip_mrouter_upcall_thread, curvnet,
- NULL, NULL, 0, 0, "ip_mroute upcall thread");
+ TASK_INIT(&V_task, 0, ip_mrouter_upcall_thread, curvnet);
+ taskqueue_cancel(V_task_queue, &V_task, NULL);
+ taskqueue_unblock(V_task_queue);
callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
curvnet);
@@ -747,15 +734,20 @@ ip_mrouter_init(struct socket *so, int version)
* Disable multicast forwarding.
*/
static int
-X_ip_mrouter_done(void)
+X_ip_mrouter_done(void *locked)
{
struct ifnet *ifp;
u_long i;
vifi_t vifi;
struct bw_upcall *bu;
- if (V_ip_mrouter == NULL)
+ if (V_ip_mrouter == NULL) {
+ if (locked) {
+ struct epoch_tracker *mrouter_et = locked;
+ MROUTER_RUNLOCK_PARAM(mrouter_et);
+ }
return EINVAL;
+ }
/*
* Detach/disable hooks to the reset of the system.
@@ -764,19 +756,21 @@ X_ip_mrouter_done(void)
atomic_subtract_int(&ip_mrouter_cnt, 1);
V_mrt_api_config = 0;
- MROUTER_WAIT();
-
- MRW_WLOCK();
+ if (locked) {
+ struct epoch_tracker *mrouter_et = locked;
+ MROUTER_RUNLOCK_PARAM(mrouter_et);
+ }
- upcall_thread_shutdown = 1;
- mtx_lock(&V_upcall_thread_mtx);
- cv_signal(&V_upcall_thread_cv);
- mtx_unlock(&V_upcall_thread_mtx);
+ MROUTER_WAIT();
- /* Wait for thread shutdown */
- while (upcall_thread_shutdown == 1) {};
+ /* Stop and drain task queue */
+ taskqueue_block(V_task_queue);
+ while (taskqueue_cancel(V_task_queue, &V_task, NULL)) {
+ taskqueue_drain(V_task_queue, &V_task);
+ }
- mtx_destroy(&V_upcall_thread_mtx);
+ MRW_WLOCK();
+ taskqueue_cancel(V_task_queue, &V_task, NULL);
/* Destroy upcall ring */
while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) {
@@ -1848,9 +1842,7 @@ expire_bw_meter_leq(void *arg)
}
/* Send all upcalls that are pending delivery */
- mtx_lock(&V_upcall_thread_mtx);
- cv_signal(&V_upcall_thread_cv);
- mtx_unlock(&V_upcall_thread_mtx);
+ taskqueue_enqueue(V_task_queue, &V_task);
/* Reset counters */
x->bm_start_time = now;
@@ -2154,9 +2146,7 @@ bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
if (buf_ring_enqueue(V_bw_upcalls_ring, u))
log(LOG_WARNING, "bw_meter_prepare_upcall: cannot enqueue upcall\n");
if (buf_ring_count(V_bw_upcalls_ring) > (BW_UPCALLS_MAX / 2)) {
- mtx_lock(&V_upcall_thread_mtx);
- cv_signal(&V_upcall_thread_cv);
- mtx_unlock(&V_upcall_thread_mtx);
+ taskqueue_enqueue(V_task_queue, &V_task);
}
}
/*
@@ -2753,6 +2743,11 @@ vnet_mroute_init(const void *unused __unused)
callout_init_rw(&V_expire_upcalls_ch, &mrouter_mtx, 0);
callout_init_rw(&V_bw_upcalls_ch, &mrouter_mtx, 0);
+
+ /* Prepare taskqueue */
+ V_task_queue = taskqueue_create_fast("ip_mroute_tskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &V_task_queue);
+ taskqueue_start_threads(&V_task_queue, 1, PI_NET, "ip_mroute_tskq task");
}
VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
@@ -2762,6 +2757,9 @@ static void
vnet_mroute_uninit(const void *unused __unused)
{
+ /* Taskqueue should be cancelled and drained before freeing */
+ taskqueue_free(V_task_queue);
+
free(V_viftable, M_MRTABLE);
free(V_nexpire, M_MRTABLE);
V_nexpire = NULL;
diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h
index 65c5bdd3a025..016d026d184c 100644
--- a/sys/netinet/ip_mroute.h
+++ b/sys/netinet/ip_mroute.h
@@ -363,12 +363,14 @@ struct sockopt;
extern int (*ip_mrouter_set)(struct socket *, struct sockopt *);
extern int (*ip_mrouter_get)(struct socket *, struct sockopt *);
-extern int (*ip_mrouter_done)(void);
+extern int (*ip_mrouter_done)(void *);
extern int (*mrt_ioctl)(u_long, caddr_t, int);
#define MROUTER_RLOCK_TRACKER struct epoch_tracker mrouter_et
+#define MROUTER_RLOCK_PARAM_PTR &mrouter_et
#define MROUTER_RLOCK() epoch_enter_preempt(net_epoch_preempt, &mrouter_et)
#define MROUTER_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &mrouter_et)
+#define MROUTER_RUNLOCK_PARAM(param) epoch_exit_preempt(net_epoch_preempt, param)
#define MROUTER_WAIT() epoch_wait_preempt(net_epoch_preempt)
#endif /* _KERNEL */
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
index 77b6ee88507a..4eaaef5c6991 100644
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -35,8 +35,11 @@
#ifndef _NETINET_IP_VAR_H_
#define _NETINET_IP_VAR_H_
-#include <sys/queue.h>
#include <sys/epoch.h>
+#include <sys/queue.h>
+#include <sys/types.h>
+
+#include <netinet/in.h>
/*
* Overlay for ip header used by other protocols (tcp, udp).
@@ -216,7 +219,6 @@ void ip_drain(void);
int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags);
void ip_forward(struct mbuf *m, int srcrt);
-void ip_init(void);
extern int
(*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
struct ip_moptions *);
@@ -233,7 +235,6 @@ void ip_slowtimo(void);
void ip_fillid(struct ip *);
int rip_ctloutput(struct socket *, struct sockopt *);
void rip_ctlinput(int, struct sockaddr *, void *);
-void rip_init(void);
int rip_input(struct mbuf **, int *, int);
int rip_output(struct mbuf *, struct socket *, ...);
int ipip_input(struct mbuf **, int *, int);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index de4e6e851c32..229716918875 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -119,7 +119,7 @@ VNET_DEFINE(struct socket *, ip_mrouter);
*/
int (*ip_mrouter_set)(struct socket *, struct sockopt *);
int (*ip_mrouter_get)(struct socket *, struct sockopt *);
-int (*ip_mrouter_done)(void);
+int (*ip_mrouter_done)(void *locked);
int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
struct ip_moptions *);
int (*mrt_ioctl)(u_long, caddr_t, int);
@@ -182,38 +182,15 @@ rip_delhash(struct inpcb *inp)
}
#endif /* INET */
-/*
- * Raw interface to IP protocol.
- */
+INPCBSTORAGE_DEFINE(ripcbstor, "rawinp", "ripcb", "rip", "riphash");
-/*
- * Initialize raw connection block q.
- */
static void
-rip_zone_change(void *tag)
-{
-
- uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
-}
-
-static int
-rip_inpcb_init(void *mem, int size, int flags)
+rip_init(void *arg __unused)
{
- struct inpcb *inp = mem;
- INP_LOCK_INIT(inp, "inp", "rawinp");
- return (0);
-}
-
-void
-rip_init(void)
-{
-
- in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb",
- rip_inpcb_init);
- EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
- EVENTHANDLER_PRI_ANY);
+ in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
}
+VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);
#ifdef VIMAGE
static void
@@ -902,18 +879,25 @@ static void
rip_detach(struct socket *so)
{
struct inpcb *inp;
+ MROUTER_RLOCK_TRACKER;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("rip_detach: not closed"));
+ /* Disable mrouter first, lock released inside ip_mrouter_done */
+ MROUTER_RLOCK();
+ if (so == V_ip_mrouter && ip_mrouter_done)
+ ip_mrouter_done(MROUTER_RLOCK_PARAM_PTR);
+ else
+ MROUTER_RUNLOCK();
+
INP_WLOCK(inp);
INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
INP_HASH_WUNLOCK(&V_ripcbinfo);
- if (so == V_ip_mrouter && ip_mrouter_done)
- ip_mrouter_done();
+
if (ip_rsvp_force_done)
ip_rsvp_force_done(so);
if (so == V_ip_rsvpd)
diff --git a/sys/netinet/sctp_constants.h b/sys/netinet/sctp_constants.h
index 1ff3f3918ef6..66f2cca5ab6d 100644
--- a/sys/netinet/sctp_constants.h
+++ b/sys/netinet/sctp_constants.h
@@ -673,8 +673,6 @@ __FBSDID("$FreeBSD$");
/* amount peer is obligated to have in rwnd or I will abort */
#define SCTP_MIN_RWND 1500
-#define SCTP_DEFAULT_MAXSEGMENT 65535
-
#define SCTP_CHUNK_BUFFER_SIZE 512
#define SCTP_PARAM_BUFFER_SIZE 512
diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c
index bdb126cbb50f..222b69102bef 100644
--- a/sys/netinet/sctp_input.c
+++ b/sys/netinet/sctp_input.c
@@ -5291,7 +5291,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
/* UDP encapsulation turned on. */
net->mtu -= sizeof(struct udphdr);
if (stcb->asoc.smallest_mtu > net->mtu) {
- sctp_pathmtu_adjustment(stcb, net->mtu);
+ sctp_pathmtu_adjustment(stcb, net->mtu, true);
}
} else if (port == 0) {
/* UDP encapsulation turned off. */
@@ -5331,7 +5331,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
/* UDP encapsulation turned on. */
net->mtu -= sizeof(struct udphdr);
if (stcb->asoc.smallest_mtu > net->mtu) {
- sctp_pathmtu_adjustment(stcb, net->mtu);
+ sctp_pathmtu_adjustment(stcb, net->mtu, true);
}
} else if (port == 0) {
/* UDP encapsulation turned off. */
@@ -5352,12 +5352,14 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
goto out;
}
if (ch->chunk_type == SCTP_SHUTDOWN_ACK) {
+ SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks);
sctp_send_shutdown_complete2(src, dst, sh,
mflowtype, mflowid, fibnum,
vrf_id, port);
goto out;
}
if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) {
+ SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks);
goto out;
}
if (ch->chunk_type != SCTP_ABORT_ASSOCIATION) {
@@ -5426,7 +5428,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
/* UDP encapsulation turned on. */
net->mtu -= sizeof(struct udphdr);
if (stcb->asoc.smallest_mtu > net->mtu) {
- sctp_pathmtu_adjustment(stcb, net->mtu);
+ sctp_pathmtu_adjustment(stcb, net->mtu, true);
}
} else if (port == 0) {
/* UDP encapsulation turned off. */
diff --git a/sys/netinet/sctp_module.c b/sys/netinet/sctp_module.c
index 50b09eb6f930..70a9daeffc2a 100644
--- a/sys/netinet/sctp_module.c
+++ b/sys/netinet/sctp_module.c
@@ -91,7 +91,6 @@ struct protosw sctp6_stream_protosw = {
.pr_input = sctp6_input,
.pr_ctlinput = sctp6_ctlinput,
.pr_ctloutput = sctp_ctloutput,
- .pr_init = sctp_init,
.pr_drain = sctp_drain,
.pr_usrreqs = &sctp6_usrreqs,
};
@@ -105,7 +104,6 @@ struct protosw sctp6_seqpacket_protosw = {
.pr_ctlinput = sctp6_ctlinput,
.pr_ctloutput = sctp_ctloutput,
#ifndef INET /* Do not call initialization and drain routines twice. */
- .pr_init = sctp_init,
.pr_drain = sctp_drain,
#endif
.pr_usrreqs = &sctp6_usrreqs,
diff --git a/sys/netinet/sctp_os_bsd.h b/sys/netinet/sctp_os_bsd.h
index 12a666b8eead..39c0a8cb5c2a 100644
--- a/sys/netinet/sctp_os_bsd.h
+++ b/sys/netinet/sctp_os_bsd.h
@@ -411,28 +411,30 @@ typedef struct route sctp_route_t;
/*
* IP output routines
*/
-#define SCTP_IP_OUTPUT(result, o_pak, ro, stcb, vrf_id) \
-{ \
- int o_flgs = IP_RAWOUTPUT; \
- struct sctp_tcb *local_stcb = stcb; \
- if (local_stcb && \
- local_stcb->sctp_ep && \
- local_stcb->sctp_ep->sctp_socket) \
- o_flgs |= local_stcb->sctp_ep->sctp_socket->so_options & SO_DONTROUTE; \
- m_clrprotoflags(o_pak); \
- result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \
+#define SCTP_IP_OUTPUT(result, o_pak, ro, _inp, vrf_id) \
+{ \
+ struct sctp_inpcb *local_inp = _inp; \
+ int o_flgs = IP_RAWOUTPUT; \
+ \
+ m_clrprotoflags(o_pak); \
+ if ((local_inp != NULL) && (local_inp->sctp_socket != NULL)) { \
+ o_flgs |= local_inp->sctp_socket->so_options & SO_DONTROUTE; \
+ } \
+ result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \
}
-#define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, stcb, vrf_id) \
-{ \
- struct sctp_tcb *local_stcb = stcb; \
- m_clrprotoflags(o_pak); \
- if (local_stcb && local_stcb->sctp_ep) \
- result = ip6_output(o_pak, \
- ((struct inpcb *)(local_stcb->sctp_ep))->in6p_outputopts, \
- (ro), 0, 0, ifp, NULL); \
- else \
- result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \
+#define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, _inp, vrf_id) \
+{ \
+ struct sctp_inpcb *local_inp = _inp; \
+ \
+ m_clrprotoflags(o_pak); \
+ if (local_inp != NULL) { \
+ result = ip6_output(o_pak, \
+ local_inp->ip_inp.inp.in6p_outputopts, \
+ (ro), 0, 0, ifp, NULL); \
+ } else { \
+ result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \
+ } \
}
struct mbuf *
diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c
index 9b4c9093512a..4c48d4787a69 100644
--- a/sys/netinet/sctp_output.c
+++ b/sys/netinet/sctp_output.c
@@ -4220,7 +4220,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
#endif
/* send it out. table id is taken from stcb */
SCTP_PROBE5(send, NULL, stcb, ip, stcb, sctphdr);
- SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id);
+ SCTP_IP_OUTPUT(ret, o_pak, ro, inp, vrf_id);
if (port) {
UDPSTAT_INC(udps_opackets);
}
@@ -4244,10 +4244,10 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
mtu -= sizeof(struct udphdr);
}
if (mtu < net->mtu) {
+ net->mtu = mtu;
if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) {
- sctp_mtu_size_reset(inp, &stcb->asoc, mtu);
+ sctp_pathmtu_adjustment(stcb, mtu, true);
}
- net->mtu = mtu;
}
}
} else if (ro->ro_nh == NULL) {
@@ -4544,7 +4544,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
sctp_packet_log(o_pak);
#endif
SCTP_PROBE5(send, NULL, stcb, ip6h, stcb, sctphdr);
- SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id);
+ SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, inp, vrf_id);
if (net) {
/* for link local this must be done */
sin6->sin6_scope_id = prev_scope;
@@ -4586,18 +4586,16 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
mtu -= sizeof(struct udphdr);
}
if (mtu < net->mtu) {
+ net->mtu = mtu;
if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) {
- sctp_mtu_size_reset(inp, &stcb->asoc, mtu);
+ sctp_pathmtu_adjustment(stcb, mtu, false);
}
- net->mtu = mtu;
}
}
- } else if (ifp) {
- if (ND_IFINFO(ifp)->linkmtu &&
+ } else if (ifp != NULL) {
+ if ((ND_IFINFO(ifp)->linkmtu > 0) &&
(stcb->asoc.smallest_mtu > ND_IFINFO(ifp)->linkmtu)) {
- sctp_mtu_size_reset(inp,
- &stcb->asoc,
- ND_IFINFO(ifp)->linkmtu);
+ sctp_pathmtu_adjustment(stcb, ND_IFINFO(ifp)->linkmtu, false);
}
}
}
@@ -6217,43 +6215,49 @@ sctp_prune_prsctp(struct sctp_tcb *stcb,
} /* if enabled in asoc */
}
-int
-sctp_get_frag_point(struct sctp_tcb *stcb,
- struct sctp_association *asoc)
+uint32_t
+sctp_get_frag_point(struct sctp_tcb *stcb)
{
- int siz, ovh;
+ struct sctp_association *asoc;
+ uint32_t frag_point, overhead;
- /*
- * For endpoints that have both v6 and v4 addresses we must reserve
- * room for the ipv6 header, for those that are only dealing with V4
- * we use a larger frag point.
- */
+ asoc = &stcb->asoc;
+ /* Consider IP header and SCTP common header. */
if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MIN_OVERHEAD;
+ overhead = SCTP_MIN_OVERHEAD;
} else {
- ovh = SCTP_MIN_V4_OVERHEAD;
+ overhead = SCTP_MIN_V4_OVERHEAD;
}
- ovh += SCTP_DATA_CHUNK_OVERHEAD(stcb);
- if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu)
- siz = asoc->smallest_mtu - ovh;
- else
- siz = (stcb->asoc.sctp_frag_point - ovh);
- /*
- * if (siz > (MCLBYTES-sizeof(struct sctp_data_chunk))) {
- */
- /* A data chunk MUST fit in a cluster */
- /* siz = (MCLBYTES - sizeof(struct sctp_data_chunk)); */
- /* } */
-
- /* adjust for an AUTH chunk if DATA requires auth */
- if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks))
- siz -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
-
- if (siz % 4) {
- /* make it an even word boundary please */
- siz -= (siz % 4);
- }
- return (siz);
+ /* Consider DATA/IDATA chunk header and AUTH header, if needed. */
+ if (asoc->idata_supported) {
+ overhead += sizeof(struct sctp_idata_chunk);
+ if (sctp_auth_is_required_chunk(SCTP_IDATA, asoc->peer_auth_chunks)) {
+ overhead += sctp_get_auth_chunk_len(asoc->peer_hmac_id);
+ }
+ } else {
+ overhead += sizeof(struct sctp_data_chunk);
+ if (sctp_auth_is_required_chunk(SCTP_DATA, asoc->peer_auth_chunks)) {
+ overhead += sctp_get_auth_chunk_len(asoc->peer_hmac_id);
+ }
+ }
+ KASSERT(overhead % 4 == 0,
+ ("overhead (%u) not a multiple of 4", overhead));
+ /* Consider padding. */
+ if (asoc->smallest_mtu % 4 > 0) {
+ overhead += (asoc->smallest_mtu % 4);
+ }
+ KASSERT(asoc->smallest_mtu > overhead,
+ ("Association MTU (%u) too small for overhead (%u)",
+ asoc->smallest_mtu, overhead));
+ frag_point = asoc->smallest_mtu - overhead;
+ KASSERT(frag_point % 4 == 0,
+ ("frag_point (%u) not a multiple of 4", frag_point));
+ /* Honor MAXSEG socket option. */
+ if ((asoc->sctp_frag_point > 0) &&
+ (asoc->sctp_frag_point < frag_point)) {
+ frag_point = asoc->sctp_frag_point;
+ }
+ return (frag_point);
}
static void
@@ -6571,7 +6575,8 @@ sctp_med_chunk_output(struct sctp_inpcb *inp,
int *num_out,
int *reason_code,
int control_only, int from_where,
- struct timeval *now, int *now_filled, int frag_point, int so_locked);
+ struct timeval *now, int *now_filled,
+ uint32_t frag_point, int so_locked);
static void
sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
@@ -6740,13 +6745,13 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
if (do_chunk_output)
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED);
else if (added_control) {
- int num_out, reason, now_filled = 0;
struct timeval now;
- int frag_point;
+ int num_out, reason, now_filled = 0;
- frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
(void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out,
- &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_NOT_LOCKED);
+ &reason, 1, 1, &now, &now_filled,
+ sctp_get_frag_point(stcb),
+ SCTP_SO_NOT_LOCKED);
}
no_chunk_output:
if (ret) {
@@ -7674,8 +7679,9 @@ out_of:
}
static void
-sctp_fill_outqueue(struct sctp_tcb *stcb, struct sctp_nets *net, int frag_point,
- int eeor_mode, int *quit_now, int so_locked)
+sctp_fill_outqueue(struct sctp_tcb *stcb, struct sctp_nets *net,
+ uint32_t frag_point, int eeor_mode, int *quit_now,
+ int so_locked)
{
struct sctp_association *asoc;
struct sctp_stream_out *strq;
@@ -7794,7 +7800,8 @@ sctp_med_chunk_output(struct sctp_inpcb *inp,
int *num_out,
int *reason_code,
int control_only, int from_where,
- struct timeval *now, int *now_filled, int frag_point, int so_locked)
+ struct timeval *now, int *now_filled,
+ uint32_t frag_point, int so_locked)
{
/**
* Ok this is the generic chunk service queue. we must do the
@@ -9975,7 +9982,7 @@ sctp_chunk_output(struct sctp_inpcb *inp,
struct timeval now;
int now_filled = 0;
int nagle_on;
- int frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
+ uint32_t frag_point = sctp_get_frag_point(stcb);
int un_sent = 0;
int fr_done;
unsigned int tot_frs = 0;
@@ -13663,16 +13670,17 @@ skip_out_eof:
}
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
} else if (some_on_control) {
- int num_out, reason, frag_point;
+ int num_out, reason;
/* Here we do control only */
if (hold_tcblock == 0) {
hold_tcblock = 1;
SCTP_TCB_LOCK(stcb);
}
- frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
(void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out,
- &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_LOCKED);
+ &reason, 1, 1, &now, &now_filled,
+ sctp_get_frag_point(stcb),
+ SCTP_SO_LOCKED);
}
NET_EPOCH_EXIT(et);
SCTPDBG(SCTP_DEBUG_OUTPUT1, "USR Send complete qo:%d prw:%d unsent:%d tf:%d cooq:%d toqs:%d err:%d\n",
diff --git a/sys/netinet/sctp_output.h b/sys/netinet/sctp_output.h
index 7d2cdc4071d8..e6ee80c41f1a 100644
--- a/sys/netinet/sctp_output.h
+++ b/sys/netinet/sctp_output.h
@@ -117,7 +117,7 @@ void sctp_send_asconf(struct sctp_tcb *, struct sctp_nets *, int addr_locked);
void sctp_send_asconf_ack(struct sctp_tcb *);
-int sctp_get_frag_point(struct sctp_tcb *, struct sctp_association *);
+uint32_t sctp_get_frag_point(struct sctp_tcb *);
void sctp_toss_old_cookies(struct sctp_tcb *, struct sctp_association *);
diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c
index 0b26ea8c1944..e1006255204a 100644
--- a/sys/netinet/sctp_pcb.c
+++ b/sys/netinet/sctp_pcb.c
@@ -2422,7 +2422,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
#endif
inp->sctp_associd_counter = 1;
inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT;
- inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
+ inp->sctp_frag_point = 0;
inp->max_cwnd = 0;
inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off);
inp->ecn_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_ecn_enable);
@@ -2498,7 +2498,8 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
SCTP_INP_INFO_WLOCK();
SCTP_INP_LOCK_INIT(inp);
- INP_LOCK_INIT(&inp->ip_inp.inp, "inp", "sctpinp");
+ rw_init_flags(&inp->ip_inp.inp.inp_lock, "sctpinp",
+ RW_RECURSE | RW_DUPOK);
SCTP_INP_READ_INIT(inp);
SCTP_ASOC_CREATE_LOCK_INIT(inp);
/* lock the new ep */
@@ -2622,6 +2623,23 @@ sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp,
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
+#ifdef INET6
+ if (old_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ new_inp->ip_inp.inp.inp_flags |= old_inp->ip_inp.inp.inp_flags & INP_CONTROLOPTS;
+ if (old_inp->ip_inp.inp.in6p_outputopts) {
+ new_inp->ip_inp.inp.in6p_outputopts = ip6_copypktopts(old_inp->ip_inp.inp.in6p_outputopts, M_NOWAIT);
+ }
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ new_inp->ip_inp.inp.inp_ip_tos = old_inp->ip_inp.inp.inp_ip_tos;
+ new_inp->ip_inp.inp.inp_ip_ttl = old_inp->ip_inp.inp.inp_ip_ttl;
+ }
+#endif
new_inp->sctp_ep.time_of_secret_change =
old_inp->sctp_ep.time_of_secret_change;
memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key,
@@ -4008,7 +4026,7 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
stcb->asoc.smallest_mtu = net->mtu;
}
if (stcb->asoc.smallest_mtu > net->mtu) {
- sctp_pathmtu_adjustment(stcb, net->mtu);
+ sctp_pathmtu_adjustment(stcb, net->mtu, true);
}
#ifdef INET6
if (newaddr->sa_family == AF_INET6) {
diff --git a/sys/netinet/sctp_usrreq.c b/sys/netinet/sctp_usrreq.c
index c5c45a2f2072..464337c534f2 100644
--- a/sys/netinet/sctp_usrreq.c
+++ b/sys/netinet/sctp_usrreq.c
@@ -58,8 +58,8 @@ __FBSDID("$FreeBSD$");
extern const struct sctp_cc_functions sctp_cc_functions[];
extern const struct sctp_ss_functions sctp_ss_functions[];
-void
-sctp_init(void)
+static void
+sctp_init(void *arg SCTP_UNUSED)
{
u_long sb_max_adj;
@@ -73,7 +73,7 @@ sctp_init(void)
*/
sb_max_adj = (u_long)((u_quad_t)(SB_MAX) * MCLBYTES / (MSIZE + MCLBYTES));
SCTP_BASE_SYSCTL(sctp_sendspace) = min(sb_max_adj,
- (((uint32_t)nmbclusters / 2) * SCTP_DEFAULT_MAXSEGMENT));
+ (((uint32_t)nmbclusters / 2) * MCLBYTES));
/*
* Now for the recv window, should we take the same amount? or
* should I do 1/2 the SB_MAX instead in the SB_MAX min above. For
@@ -92,6 +92,8 @@ sctp_init(void)
sctp_addr_change_event_handler, NULL, EVENTHANDLER_PRI_FIRST);
}
+VNET_SYSINIT(sctp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, sctp_init, NULL);
+
#ifdef VIMAGE
static void
sctp_finish(void *unused __unused)
@@ -104,35 +106,51 @@ VNET_SYSUNINIT(sctp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, sctp_finish, NULL);
#endif
void
-sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz)
+sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint32_t mtu, bool resend)
{
+ struct sctp_association *asoc;
struct sctp_tmit_chunk *chk;
- uint16_t overhead;
-
- /* Adjust that too */
- stcb->asoc.smallest_mtu = nxtsz;
- /* now off to subtract IP_DF flag if needed */
- overhead = IP_HDR_SIZE + sizeof(struct sctphdr);
- if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) {
- overhead += sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
+ uint32_t overhead;
+
+ asoc = &stcb->asoc;
+ KASSERT(mtu < asoc->smallest_mtu,
+ ("Currently only reducing association MTU %u supported (MTU %u)",
+ asoc->smallest_mtu, mtu));
+ asoc->smallest_mtu = mtu;
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ overhead = SCTP_MIN_OVERHEAD;
+ } else {
+ overhead = SCTP_MIN_V4_OVERHEAD;
}
- TAILQ_FOREACH(chk, &stcb->asoc.send_queue, sctp_next) {
- if ((chk->send_size + overhead) > nxtsz) {
+ if (asoc->idata_supported) {
+ if (sctp_auth_is_required_chunk(SCTP_IDATA, asoc->peer_auth_chunks)) {
+ overhead += sctp_get_auth_chunk_len(asoc->peer_hmac_id);
+ }
+ } else {
+ if (sctp_auth_is_required_chunk(SCTP_DATA, asoc->peer_auth_chunks)) {
+ overhead += sctp_get_auth_chunk_len(asoc->peer_hmac_id);
+ }
+ }
+ KASSERT(overhead % 4 == 0,
+ ("overhead (%u) not a multiple of 4", overhead));
+ TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
+ if (((uint32_t)chk->send_size + overhead) > mtu) {
chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
}
}
- TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
- if ((chk->send_size + overhead) > nxtsz) {
- /*
- * For this guy we also mark for immediate resend
- * since we sent to big of chunk
- */
+ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
+ if (((uint32_t)chk->send_size + overhead) > mtu) {
chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
- if (chk->sent < SCTP_DATAGRAM_RESEND) {
+ if (resend && chk->sent < SCTP_DATAGRAM_RESEND) {
+ /*
+ * If requested, mark the chunk for
+ * immediate resend, since we sent it being
+ * too big.
+ */
sctp_flight_size_decrease(chk);
sctp_total_flight_decrease(stcb, chk);
chk->sent = SCTP_DATAGRAM_RESEND;
- sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ sctp_ucount_incr(asoc->sent_queue_retran_cnt);
chk->rec.data.doing_fast_retransmit = 0;
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU,
@@ -141,7 +159,7 @@ sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz)
(uint32_t)(uintptr_t)chk->whoTo,
chk->rec.data.tsn);
}
- /* Clear any time so NO RTT is being done */
+ /* Clear any time, so NO RTT is being done. */
if (chk->do_rtt == 1) {
chk->do_rtt = 0;
chk->whoTo->rto_needed = 1;
@@ -229,7 +247,7 @@ sctp_notify(struct sctp_inpcb *inp,
}
/* Update the association MTU */
if (stcb->asoc.smallest_mtu > next_mtu) {
- sctp_pathmtu_adjustment(stcb, next_mtu);
+ sctp_pathmtu_adjustment(stcb, next_mtu, true);
}
/* Finally, start the PMTU timer if it was running before. */
if (timer_stopped) {
@@ -2032,13 +2050,12 @@ flags_out:
case SCTP_MAXSEG:
{
struct sctp_assoc_value *av;
- int ovh;
SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
SCTP_FIND_STCB(inp, stcb, av->assoc_id);
if (stcb) {
- av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc);
+ av->assoc_value = stcb->asoc.sctp_frag_point;
SCTP_TCB_UNLOCK(stcb);
} else {
if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
@@ -2046,15 +2063,7 @@ flags_out:
((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
(av->assoc_id == SCTP_FUTURE_ASSOC))) {
SCTP_INP_RLOCK(inp);
- if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MED_OVERHEAD;
- } else {
- ovh = SCTP_MED_V4_OVERHEAD;
- }
- if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT)
- av->assoc_value = 0;
- else
- av->assoc_value = inp->sctp_frag_point - ovh;
+ av->assoc_value = inp->sctp_frag_point;
SCTP_INP_RUNLOCK(inp);
} else {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
@@ -2623,7 +2632,7 @@ flags_out:
stcb->asoc.cnt_on_all_streams);
sstat->sstat_instrms = stcb->asoc.streamincnt;
sstat->sstat_outstrms = stcb->asoc.streamoutcnt;
- sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb, &stcb->asoc);
+ sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb);
net = stcb->asoc.primary_destination;
if (net != NULL) {
memcpy(&sstat->sstat_primary.spinfo_address,
@@ -4977,22 +4986,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
case SCTP_MAXSEG:
{
struct sctp_assoc_value *av;
- int ovh;
SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
SCTP_FIND_STCB(inp, stcb, av->assoc_id);
- if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MED_OVERHEAD;
- } else {
- ovh = SCTP_MED_V4_OVERHEAD;
- }
if (stcb) {
- if (av->assoc_value) {
- stcb->asoc.sctp_frag_point = (av->assoc_value + ovh);
- } else {
- stcb->asoc.sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
- }
+ stcb->asoc.sctp_frag_point = av->assoc_value;
SCTP_TCB_UNLOCK(stcb);
} else {
if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
@@ -5000,15 +4999,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
(av->assoc_id == SCTP_FUTURE_ASSOC))) {
SCTP_INP_WLOCK(inp);
- /*
- * FIXME MT: I think this is not in
- * tune with the API ID
- */
- if (av->assoc_value) {
- inp->sctp_frag_point = (av->assoc_value + ovh);
- } else {
- inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
- }
+ inp->sctp_frag_point = av->assoc_value;
SCTP_INP_WUNLOCK(inp);
} else {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
@@ -5390,7 +5381,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
break;
}
if (net->mtu < stcb->asoc.smallest_mtu) {
- sctp_pathmtu_adjustment(stcb, net->mtu);
+ sctp_pathmtu_adjustment(stcb, net->mtu, true);
}
}
}
@@ -5534,7 +5525,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
break;
}
if (net->mtu < stcb->asoc.smallest_mtu) {
- sctp_pathmtu_adjustment(stcb, net->mtu);
+ sctp_pathmtu_adjustment(stcb, net->mtu, true);
}
}
}
diff --git a/sys/netinet/sctp_var.h b/sys/netinet/sctp_var.h
index ed01de7d7014..b45c4ac410f9 100644
--- a/sys/netinet/sctp_var.h
+++ b/sys/netinet/sctp_var.h
@@ -328,9 +328,8 @@ int sctp_ctloutput(struct socket *, struct sockopt *);
void sctp_input_with_port(struct mbuf *, int, uint16_t);
int sctp_input(struct mbuf **, int *, int);
#endif
-void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t);
+void sctp_pathmtu_adjustment(struct sctp_tcb *, uint32_t, bool);
void sctp_drain(void);
-void sctp_init(void);
void
sctp_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *,
uint8_t, uint8_t, uint16_t, uint32_t);
diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c
index 8322603438c3..04a7e12f10eb 100644
--- a/sys/netinet/sctputil.c
+++ b/sys/netinet/sctputil.c
@@ -1248,7 +1248,7 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
asoc->my_rwnd = max(SCTP_SB_LIMIT_RCV(inp->sctp_socket), SCTP_MINIMAL_RWND);
asoc->peers_rwnd = SCTP_SB_LIMIT_RCV(inp->sctp_socket);
- asoc->smallest_mtu = inp->sctp_frag_point;
+ asoc->smallest_mtu = 0;
asoc->minrto = inp->sctp_ep.sctp_minrto;
asoc->maxrto = inp->sctp_ep.sctp_maxrto;
@@ -2895,48 +2895,16 @@ sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
uint32_t
sctp_calculate_len(struct mbuf *m)
{
- uint32_t tlen = 0;
struct mbuf *at;
+ uint32_t tlen;
- at = m;
- while (at) {
+ tlen = 0;
+ for (at = m; at != NULL; at = SCTP_BUF_NEXT(at)) {
tlen += SCTP_BUF_LEN(at);
- at = SCTP_BUF_NEXT(at);
}
return (tlen);
}
-void
-sctp_mtu_size_reset(struct sctp_inpcb *inp,
- struct sctp_association *asoc, uint32_t mtu)
-{
- /*
- * Reset the P-MTU size on this association, this involves changing
- * the asoc MTU, going through ANY chunk+overhead larger than mtu to
- * allow the DF flag to be cleared.
- */
- struct sctp_tmit_chunk *chk;
- unsigned int eff_mtu, ovh;
-
- asoc->smallest_mtu = mtu;
- if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MIN_OVERHEAD;
- } else {
- ovh = SCTP_MIN_V4_OVERHEAD;
- }
- eff_mtu = mtu - ovh;
- TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
- if (chk->send_size > eff_mtu) {
- chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
- }
- }
- TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
- if (chk->send_size > eff_mtu) {
- chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
- }
- }
-}
-
/*
* Given an association and starting time of the current RTT period, update
* RTO in number of msecs. net should point to the current network.
diff --git a/sys/netinet/sctputil.h b/sys/netinet/sctputil.h
index 3319eb4f455b..8253fde829e1 100644
--- a/sys/netinet/sctputil.h
+++ b/sys/netinet/sctputil.h
@@ -109,9 +109,6 @@ int
sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id);
void
- sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t);
-
-void
sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
int so_locked
SCTP_UNUSED
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 9bf6e6773cca..40747a238918 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -100,25 +100,6 @@ __FBSDID("$FreeBSD$");
* function (ctf_do_queued_segments()) requires that
* you have defined the tfb_do_segment_nounlock() as
* described above.
- *
- * Now the second function the tcp_hpts system provides is the ability
- * to abort a connection later. Why would you want to do this?
- * To not have to worry about untangling any recursive locks.
- *
- * The second feature of the input side of hpts is the
- * dropping of a connection. This is due to the way that
- * locking may have occured on the INP_WLOCK. So if
- * a stack wants to drop a connection it calls:
- *
- * tcp_set_inp_to_drop(tp, ETIMEDOUT)
- *
- * To schedule the tcp_hpts system to call
- *
- * tcp_drop(tp, drop_reason)
- *
- * at a future point. This is quite handy to prevent locking
- * issues when dropping connections.
- *
*/
#include <sys/param.h>
@@ -222,14 +203,11 @@ struct tcp_hpts_entry {
p_avail:5;
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
- TAILQ_HEAD(, inpcb) p_dropq; /* Delayed drop queue */
struct hptsh {
TAILQ_HEAD(, inpcb) head;
uint32_t count;
uint32_t gencnt;
} *p_hptss; /* Hptsi wheel */
- uint32_t p_dropq_cnt; /* Count on drop queue */
- uint32_t p_dropq_gencnt;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
@@ -473,7 +451,7 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
* Unused logs are
* 64 bit - delRate, rttProp, bw_inuse
* 16 bit - cwnd_gain
- * 8 bit - bbr_state, bbr_substate, inhpts, ininput;
+ * 8 bit - bbr_state, bbr_substate, inhpts;
*/
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = hpts->p_nxt_slot;
@@ -565,19 +543,6 @@ tcp_hpts_lock(struct inpcb *inp)
return (hpts);
}
-static struct tcp_hpts_entry *
-tcp_dropq_lock(struct inpcb *inp)
-{
- struct tcp_hpts_entry *hpts;
-
- INP_LOCK_ASSERT(inp);
-
- hpts = tcp_pace.rp_ent[inp->inp_dropq_cpu];
- HPTS_LOCK(hpts);
-
- return (hpts);
-}
-
static void
inp_hpts_release(struct inpcb *inp)
{
@@ -588,69 +553,20 @@ inp_hpts_release(struct inpcb *inp)
MPASS(released == false);
}
-static void
-tcp_dropq_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp)
-{
- bool released __diagused;
-
- HPTS_MTX_ASSERT(hpts);
- INP_WLOCK_ASSERT(inp);
-
- if (inp->inp_in_dropq != IHPTS_ONQUEUE)
- return;
-
- MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
- if (__predict_true(inp->inp_dropq_gencnt == hpts->p_dropq_gencnt)) {
- TAILQ_REMOVE(&hpts->p_dropq, inp, inp_dropq);
- MPASS(hpts->p_dropq_cnt > 0);
- hpts->p_dropq_cnt--;
- inp->inp_in_dropq = IHPTS_NONE;
- released = in_pcbrele_wlocked(inp);
- MPASS(released == false);
- } else {
- /*
- * tcp_delayed_drop() now owns the TAILQ head of this inp.
- * Can't TAILQ_REMOVE, just mark it.
- */
-#ifdef INVARIANTS
- struct inpcb *tmp;
-
- TAILQ_FOREACH(tmp, &hpts->p_dropq, inp_dropq)
- MPASS(tmp != inp);
-#endif
- inp->inp_in_dropq = IHPTS_MOVING;
- }
-
-}
-
/*
* Called normally with the INP_LOCKED but it
* does not matter, the hpts lock is the key
* but the lock order allows us to hold the
* INP lock and then get the hpts lock.
- *
- * Valid values in the flags are
- * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
- * HPTS_REMOVE_DROPQ - remove from the drop queue of the hpts.
- * Note that you can use one or both values together
- * and get two actions.
*/
void
-__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
+tcp_hpts_remove(struct inpcb *inp)
{
struct tcp_hpts_entry *hpts;
struct hptsh *hptsh;
INP_WLOCK_ASSERT(inp);
- if (flags & HPTS_REMOVE_DROPQ) {
- hpts = tcp_dropq_lock(inp);
- tcp_dropq_remove(hpts, inp);
- mtx_unlock(&hpts->p_mtx);
- }
-
- MPASS(flags & HPTS_REMOVE_OUTPUT);
-
hpts = tcp_hpts_lock(inp);
if (inp->inp_in_hpts == IHPTS_ONQUEUE) {
hptsh = &hpts->p_hptss[inp->inp_hptsslot];
@@ -1074,32 +990,6 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
return (slot_on);
}
-void
-tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason)
-{
- struct tcp_hpts_entry *hpts;
- struct tcpcb *tp = intotcpcb(inp);
-
- INP_WLOCK_ASSERT(inp);
- inp->inp_hpts_drop_reas = reason;
- if (inp->inp_in_dropq != IHPTS_NONE)
- return;
- hpts = tcp_dropq_lock(tp->t_inpcb);
- MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
-
- TAILQ_INSERT_TAIL(&hpts->p_dropq, inp, inp_dropq);
- inp->inp_in_dropq = IHPTS_ONQUEUE;
- inp->inp_dropq_gencnt = hpts->p_dropq_gencnt;
- hpts->p_dropq_cnt++;
- in_pcbref(inp);
-
- if ((hpts->p_hpts_active == 0) && (hpts->p_on_min_sleep == 0)){
- hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
- }
- HPTS_UNLOCK(hpts);
-}
-
uint16_t
hpts_random_cpu(struct inpcb *inp){
/*
@@ -1109,12 +999,9 @@ hpts_random_cpu(struct inpcb *inp){
uint32_t ran;
/*
- * If one has been set use it i.e. we want both in and out on the
- * same hpts.
+ * Shortcut if it is already set. XXXGL: does it happen?
*/
- if (inp->inp_dropq_cpu_set) {
- return (inp->inp_dropq_cpu);
- } else if (inp->inp_hpts_cpu_set) {
+ if (inp->inp_hpts_cpu_set) {
return (inp->inp_hpts_cpu);
}
/* Nothing set use a random number */
@@ -1132,13 +1019,7 @@ hpts_cpuid(struct inpcb *inp, int *failed)
#endif
*failed = 0;
- /*
- * If one has been set use it i.e. we want both in and out on the
- * same hpts.
- */
- if (inp->inp_dropq_cpu_set) {
- return (inp->inp_dropq_cpu);
- } else if (inp->inp_hpts_cpu_set) {
+ if (inp->inp_hpts_cpu_set) {
return (inp->inp_hpts_cpu);
}
/*
@@ -1204,57 +1085,6 @@ tcp_drop_in_pkts(struct tcpcb *tp)
}
}
-/*
- * Delayed drop functionality is factored out into separate function,
- * but logic is similar to the logic of tcp_hptsi().
- */
-static void
-tcp_delayed_drop(struct tcp_hpts_entry *hpts)
-{
- TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
- struct inpcb *inp, *tmp;
- struct tcpcb *tp;
-
- HPTS_MTX_ASSERT(hpts);
- NET_EPOCH_ASSERT();
-
- TAILQ_SWAP(&head, &hpts->p_dropq, inpcb, inp_dropq);
- hpts->p_dropq_cnt = 0;
- hpts->p_dropq_gencnt++;
- HPTS_UNLOCK(hpts);
-
- TAILQ_FOREACH_SAFE(inp, &head, inp_dropq, tmp) {
- INP_WLOCK(inp);
- MPASS(inp->inp_hpts_drop_reas != 0);
- if (__predict_false(inp->inp_in_dropq == IHPTS_MOVING)) {
- inp->inp_in_dropq = IHPTS_NONE;
- if (in_pcbrele_wlocked(inp) == false)
- INP_WUNLOCK(inp);
- continue;
- }
- MPASS(inp->inp_in_dropq == IHPTS_ONQUEUE);
- inp->inp_in_dropq = IHPTS_NONE;
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
- if (in_pcbrele_wlocked(inp) == false)
- INP_WUNLOCK(inp);
- continue;
- }
- CURVNET_SET(inp->inp_vnet);
- if (__predict_true((tp = intotcpcb(inp)) != NULL)) {
- MPASS(tp->t_inpcb == inp);
- tcp_drop_in_pkts(tp);
- tp = tcp_drop(tp, inp->inp_hpts_drop_reas);
- if (tp == NULL)
- INP_WLOCK(inp);
- }
- if (in_pcbrele_wlocked(inp) == false)
- INP_WUNLOCK(inp);
- CURVNET_RESTORE();
- }
-
- mtx_lock(&hpts->p_mtx); /* XXXGL */
-}
-
static void
tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
{
@@ -1392,10 +1222,6 @@ again:
hpts->p_nxt_slot = hpts->p_prev_slot;
hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
}
- KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) ||
- ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))),
- ("%s hpts:%p in_hpts cnt:%d and queue state mismatch",
- __FUNCTION__, hpts, hpts->p_dropq_cnt));
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
@@ -1403,7 +1229,7 @@ again:
struct inpcb *inp, *ninp;
TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
struct hptsh *hptsh;
- uint32_t runningslot, gencnt;
+ uint32_t runningslot;
/*
* Calculate our delay, if there are no extra ticks there
@@ -1417,7 +1243,7 @@ again:
TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts);
hpts->p_on_queue_cnt -= hptsh->count;
hptsh->count = 0;
- gencnt = hptsh->gencnt++;
+ hptsh->gencnt++;
HPTS_UNLOCK(hpts);
@@ -1549,7 +1375,9 @@ again:
}
}
inp->inp_hpts_calls = 1;
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output(tp);
+ if (error < 0)
+ goto skip_pacing;
inp->inp_hpts_calls = 0;
if (ninp && ninp->inp_ppcb) {
/*
@@ -1613,10 +1441,6 @@ no_one:
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
- KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) ||
- ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))),
- ("%s hpts:%p in_hpts cnt:%d queue state mismatch",
- __FUNCTION__, hpts, hpts->p_dropq_cnt));
hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_lasttick = hpts->p_curtick;
if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
@@ -1659,11 +1483,6 @@ no_run:
*/
hpts->p_wheel_complete = 1;
/*
- * Run any input that may be there not covered
- * in running data.
- */
- tcp_delayed_drop(hpts);
- /*
* Now did we spend too long running input and need to run more ticks?
* Note that if wrap_loop_cnt < 2 then we should have the conditions
* in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
@@ -1710,14 +1529,6 @@ __tcp_set_hpts(struct inpcb *inp, int32_t line)
inp->inp_hpts_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
- hpts = tcp_dropq_lock(inp);
- if ((inp->inp_dropq_cpu_set == 0) &&
- (inp->inp_in_dropq == 0)) {
- inp->inp_dropq_cpu = hpts_cpuid(inp, &failed);
- if (failed == 0)
- inp->inp_dropq_cpu_set = 1;
- }
- mtx_unlock(&hpts->p_mtx);
}
static void
@@ -2035,7 +1846,6 @@ tcp_init_hptsi(void *st)
*/
mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
"hpts", MTX_DEF | MTX_DUPOK);
- TAILQ_INIT(&hpts->p_dropq);
for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
TAILQ_INIT(&hpts->p_hptss[j].head);
hpts->p_hptss[j].count = 0;
@@ -2051,11 +1861,6 @@ tcp_init_hptsi(void *st)
"");
SYSCTL_ADD_INT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "in_qcnt", CTLFLAG_RD,
- &hpts->p_dropq_cnt, 0,
- "Count TCB's awaiting delayed drop");
- SYSCTL_ADD_INT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "out_qcnt", CTLFLAG_RD,
&hpts->p_on_queue_cnt, 0,
"Count TCB's awaiting output processing");
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index ac99296e34f3..161cf9721051 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -114,11 +114,7 @@ struct hpts_diag {
#ifdef _KERNEL
-#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
-void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
-#define HPTS_REMOVE_DROPQ 0x01
-#define HPTS_REMOVE_OUTPUT 0x02
-#define HPTS_REMOVE_ALL (HPTS_REMOVE_DROPQ | HPTS_REMOVE_OUTPUT)
+void tcp_hpts_remove(struct inpcb *);
bool tcp_in_hpts(struct inpcb *);
/*
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index ee514a11eef6..110cb99df7bb 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -258,28 +258,6 @@ SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
"TCP connection counts by TCP state");
-static void
-tcp_vnet_init(const void *unused)
-{
-
- COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
- VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
-}
-VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
- tcp_vnet_init, NULL);
-
-#ifdef VIMAGE
-static void
-tcp_vnet_uninit(const void *unused)
-{
-
- COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
- VNET_PCPUSTAT_FREE(tcpstat);
-}
-VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
- tcp_vnet_uninit, NULL);
-#endif /* VIMAGE */
-
/*
* Kernel module interface for updating tcpstat. The first argument is an index
* into tcpstat treated as an array.
@@ -1911,7 +1889,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_rxtcur);
sowwakeup(so);
if (sbavail(&so->so_snd))
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
goto check_delack;
}
} else if (th->th_ack == tp->snd_una &&
@@ -1980,7 +1958,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_flags |= TF_DELACK;
} else {
tp->t_flags |= TF_ACKNOW;
- tp->t_fb->tfb_tcp_output(tp);
+ tcp_output(tp);
}
goto check_delack;
}
@@ -2651,7 +2629,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
} else
tp->snd_cwnd += maxseg;
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh ||
(tp->t_flags & TF_SACK_PERMIT &&
@@ -2720,14 +2698,14 @@ enter_recovery:
tcps_sack_recovery_episode);
tp->snd_recover = tp->snd_nxt;
tp->snd_cwnd = maxseg;
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
if (SEQ_GT(th->th_ack, tp->snd_una))
goto resume_partialack;
goto drop;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = maxseg;
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
KASSERT(tp->snd_limited <= 2,
("%s: tp->snd_limited too big",
__func__));
@@ -2775,7 +2753,7 @@ enter_recovery:
(tp->snd_nxt - tp->snd_una);
SOCKBUF_UNLOCK(&so->so_snd);
if (avail > 0)
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
sent = tp->snd_max - oldsndmax;
if (sent > maxseg) {
KASSERT((tp->t_dupacks == 2 &&
@@ -3327,7 +3305,7 @@ dodata: /* XXX */
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
check_delack:
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -3368,7 +3346,7 @@ dropafterack:
#endif
TCP_PROBE3(debug__input, tp, th, m);
tp->t_flags |= TF_ACKNOW;
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@@ -4071,7 +4049,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
*/
tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 820f345a758f..1290a8ce6b29 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -95,7 +95,7 @@ struct tcp_log_bbr {
uint8_t bbr_state;
uint8_t bbr_substate;
uint8_t inhpts;
- uint8_t ininput;
+ uint8_t __spare;
uint8_t use_lt_bw;
uint8_t flex8;
uint32_t pkt_epoch;
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 215b9097a4fd..b973c788d23e 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -820,7 +820,7 @@ static void
tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le)
{
/* Check if we need to recompute any checksums. */
- if (le->m_head->m_pkthdr.lro_nsegs > 1) {
+ if (le->needs_merge) {
uint16_t csum;
switch (le->inner.data.lro_type) {
@@ -921,6 +921,8 @@ tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le,
le->next_seq = ntohl(th->th_seq) + tcp_data_len;
le->ack_seq = th->th_ack;
le->window = th->th_win;
+ le->flags = th->th_flags;
+ le->needs_merge = 0;
/* Setup new data pointers. */
le->m_head = m;
@@ -962,10 +964,12 @@ tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
}
static void
-tcp_lro_mbuf_append_pkthdr(struct mbuf *m, const struct mbuf *p)
+tcp_lro_mbuf_append_pkthdr(struct lro_entry *le, const struct mbuf *p)
{
+ struct mbuf *m;
uint32_t csum;
+ m = le->m_head;
if (m->m_pkthdr.lro_nsegs == 1) {
/* Compute relative checksum. */
csum = p->m_pkthdr.lro_tcp_d_csum;
@@ -982,6 +986,7 @@ tcp_lro_mbuf_append_pkthdr(struct mbuf *m, const struct mbuf *p)
m->m_pkthdr.lro_tcp_d_csum = csum;
m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len;
m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs;
+ le->needs_merge = 1;
}
static void
@@ -1088,10 +1093,12 @@ again:
}
/* Try to append the new segment. */
if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
+ ((th->th_flags & TH_ACK) !=
+ (le->flags & TH_ACK)) ||
(tcp_data_len == 0 &&
le->ack_seq == th->th_ack &&
le->window == th->th_win))) {
- /* Out of order packet or duplicate ACK. */
+ /* Out of order packet, non-ACK + ACK or dup ACK. */
tcp_push_and_replace(lc, le, m);
goto again;
}
@@ -1100,8 +1107,12 @@ again:
le->next_seq += tcp_data_len;
le->ack_seq = th->th_ack;
le->window = th->th_win;
+ le->needs_merge = 1;
} else if (th->th_ack == le->ack_seq) {
- le->window = WIN_MAX(le->window, th->th_win);
+ if (WIN_GT(th->th_win, le->window)) {
+ le->window = th->th_win;
+ le->needs_merge = 1;
+ }
}
if (tcp_data_len == 0) {
@@ -1110,7 +1121,7 @@ again:
}
/* Merge TCP data checksum and length to head mbuf. */
- tcp_lro_mbuf_append_pkthdr(le->m_head, m);
+ tcp_lro_mbuf_append_pkthdr(le, m);
/*
* Adjust the mbuf so that m_data points to the first byte of
@@ -1353,8 +1364,7 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
/* Check if any data mbufs left. */
if (le->m_head != NULL) {
counter_u64_add(tcp_inp_lro_direct_queue, 1);
- tcp_lro_log(tp, lc, le, NULL, 22, 1,
- inp->inp_flags2, inp->inp_in_dropq, 1);
+ tcp_lro_log(tp, lc, le, NULL, 22, 1, inp->inp_flags2, 0, 1);
tcp_queue_pkts(inp, tp, le);
}
if (should_wake) {
diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h
index 3eefd4f0537c..b8abc2fa1ab3 100644
--- a/sys/netinet/tcp_lro.h
+++ b/sys/netinet/tcp_lro.h
@@ -146,7 +146,9 @@ struct lro_entry {
uint16_t compressed;
uint16_t uncompressed;
uint16_t window;
- uint16_t timestamp; /* flag, not a TCP hdr field. */
+ uint8_t flags;
+ uint8_t timestamp : 1;
+ uint8_t needs_merge : 1;
struct bintime alloc_time; /* time when entry was allocated */
};
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index d0b56072e9af..ff40e67767ab 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -194,7 +194,7 @@ cc_after_idle(struct tcpcb *tp)
* Tcp output routine: figure out what should be sent and send it.
*/
int
-tcp_output(struct tcpcb *tp)
+tcp_default_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
int32_t len;
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
index 528dc062fd97..96a38b6afd54 100644
--- a/sys/netinet/tcp_ratelimit.c
+++ b/sys/netinet/tcp_ratelimit.c
@@ -1529,10 +1529,8 @@ tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
struct timeval tv;
- uint32_t cts;
memset(&log, 0, sizeof(log));
- cts = tcp_get_usecs(&tv);
log.u_bbr.flex1 = segsiz;
log.u_bbr.flex2 = new_tso;
log.u_bbr.flex3 = time_between;
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 25eb633fbbd4..c38b9dd1d006 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -902,7 +902,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
highdata - maxseg), highdata, NULL);
}
}
- (void) tp->t_fb->tfb_tcp_output(tp);
+ (void) tcp_output(tp);
}
#if 0
@@ -1013,7 +1013,7 @@ void
tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th)
{
struct sackhole *temp;
- uint32_t prev_cwnd;
+
if (IN_RECOVERY(tp->t_flags) &&
SEQ_GT(tp->snd_fack, tp->snd_recover) &&
((temp = TAILQ_FIRST(&tp->snd_holes)) != NULL) &&
@@ -1037,7 +1037,6 @@ tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th)
* prior to invoking another cwnd reduction by the CC
* module, to not shrink it excessively.
*/
- prev_cwnd = tp->snd_cwnd;
tp->snd_cwnd = tp->snd_ssthresh;
/*
* Formally exit recovery, and let the CC module adjust
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 24d238bbd04e..8d19f2fe0bde 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -191,7 +191,7 @@ static int32_t bbr_hptsi_max_div = 2; /* time, 0 means turned off. We need this
static int32_t bbr_policer_call_from_rack_to = 0;
static int32_t bbr_policer_detection_enabled = 1;
static int32_t bbr_min_measurements_req = 1; /* We need at least 2
- * measurments before we are
+ * measurements before we are
* "good" note that 2 == 1.
* This is because we use a >
* comparison. This means if
@@ -1059,7 +1059,7 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock
wrong_timer:
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
if (tcp_in_hpts(inp))
- tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(inp);
bbr_timer_cancel(bbr, __LINE__, cts);
bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
0);
@@ -1402,7 +1402,7 @@ bbr_init_sysctls(void)
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
&bbr_min_measurements_req, 1,
- "What is the minimum measurment count we need before we switch to our b/w estimate");
+ "What is the minimum measurement count we need before we switch to our b/w estimate");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_measure),
OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
@@ -1558,7 +1558,7 @@ bbr_init_sysctls(void)
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "tar_rtt", CTLFLAG_RW,
&bbr_cwndtarget_rtt_touse, 0,
- "Target cwnd rtt measurment to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
+ "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
SYSCTL_ADD_S32(&bbr_sysctl_ctx,
SYSCTL_CHILDREN(bbr_cwnd),
OID_AUTO, "may_shrink", CTLFLAG_RW,
@@ -1884,7 +1884,6 @@ bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t ct
l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
l->inhpts = tcp_in_hpts(bbr->rc_inp);
- l->ininput = bbr->rc_inp->inp_in_dropq;
l->use_lt_bw = bbr->rc_lt_use_bw;
l->pkts_out = bbr->r_ctl.rc_flight_at_input;
l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
@@ -2972,7 +2971,7 @@ use_initial_window:
rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
if (rtt && (rtt < 0xffffffff)) {
/*
- * We have an RTT measurment. Use that in
+ * We have an RTT measurement. Use that in
* combination with our initial window to calculate
* a b/w.
*/
@@ -4580,8 +4579,7 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
}
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
- return (1);
+ return (-ETIMEDOUT); /* tcp_drop() */
}
/* Did we somehow get into persists? */
if (bbr->rc_in_persist) {
@@ -4773,8 +4771,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
*/
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
- goto out;
+ return (-ETIMEDOUT); /* tcp_drop() */
}
/*
* Hack: if the peer is dead/unreachable, we do not time out if the
@@ -4787,8 +4784,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
- tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
- goto out;
+ return (-ETIMEDOUT); /* tcp_drop() */
}
if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) &&
tp->snd_una == tp->snd_max) {
@@ -4804,8 +4800,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
(ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
- tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
- goto out;
+ return (-ETIMEDOUT); /* tcp_drop() */
}
t_template = tcpip_maketemplate(bbr->rc_inp);
if (t_template) {
@@ -4877,8 +4872,7 @@ bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
dropit:
KMOD_TCPSTAT_INC(tcps_keepdrops);
tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
- tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
- return (1);
+ return (-ETIMEDOUT); /* tcp_drop() */
}
/*
@@ -4998,10 +4992,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
* and retransmit one segment.
*/
if (ctf_progress_timeout_check(tp, true)) {
- retval = 1;
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
- goto out;
+ return (-ETIMEDOUT); /* tcp_drop() */
}
bbr_remxt_tmr(tp);
if ((bbr->r_ctl.rc_resend == NULL) ||
@@ -5017,11 +5009,11 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
KMOD_TCPSTAT_INC(tcps_timeoutdrop);
- retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
- tcp_set_inp_to_drop(bbr->rc_inp,
- (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
- goto out;
+ /* XXXGL: previously t_softerror was cast to uint16_t */
+ MPASS(tp->t_softerror >= 0);
+ retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
+ return (retval); /* tcp_drop() */
}
if (tp->t_state == TCPS_SYN_SENT) {
/*
@@ -5194,7 +5186,7 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
tp->snd_recover = tp->snd_max;
tp->t_flags |= TF_ACKNOW;
tp->t_rtttime = 0;
-out:
+
return (retval);
}
@@ -5272,7 +5264,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
* must remove ourselves from the hpts.
*/
hpts_removed = 1;
- tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(bbr->rc_inp);
if (bbr->r_ctl.rc_last_delay_val) {
/* Update the last hptsi delay too */
uint32_t time_since_send;
@@ -6500,7 +6492,7 @@ bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, u
/* We log only when not in persist */
/* Translate to a Bytes Per Second */
uint64_t tim, bw, ts_diff, ts_bw;
- uint32_t upper, lower, delivered;
+ uint32_t delivered;
if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
@@ -6519,8 +6511,6 @@ bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, u
/* We must have a calculatable amount */
return;
}
- upper = (bw >> 32) & 0x00000000ffffffff;
- lower = bw & 0x00000000ffffffff;
/*
* If we are using this b/w shove it in now so we
* can see in the trace viewer if it gets over-ridden.
@@ -6616,7 +6606,7 @@ bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rt
/* We log only when not in persist */
/* Translate to a Bytes Per Second */
uint64_t tim, bw;
- uint32_t upper, lower, delivered;
+ uint32_t delivered;
int no_apply = 0;
if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
@@ -6638,8 +6628,6 @@ bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rt
no_apply = 1;
}
- upper = (bw >> 32) & 0x00000000ffffffff;
- lower = bw & 0x00000000ffffffff;
/*
* If we are using this b/w shove it in now so we
* can see in the trace viewer if it gets over-ridden.
@@ -7007,12 +6995,11 @@ bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack,
struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts)
{
int32_t times = 0;
- uint32_t start, end, maxseg, changed = 0;
+ uint32_t start, end, changed = 0;
struct bbr_sendmap *rsm, *nrsm;
int32_t used_ref = 1;
uint8_t went_back = 0, went_fwd = 0;
- maxseg = tp->t_maxseg - bbr->rc_last_options;
start = sack->start;
end = sack->end;
rsm = *prsm;
@@ -7973,7 +7960,7 @@ bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t li
* for our sum's calculations.
*/
if (tcp_in_hpts(bbr->rc_inp)) {
- tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(bbr->rc_inp);
bbr->rc_timer_first = 0;
bbr->r_ctl.rc_hpts_flags = 0;
bbr->r_ctl.rc_last_delay_val = 0;
@@ -10367,10 +10354,9 @@ bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog)
* Now what state are we going into now? Is there adjustments
* needed?
*/
- int32_t old_state, old_gain;
+ int32_t old_state;
old_state = bbr_state_val(bbr);
- old_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) {
/* Save the lowest srtt we saw in our end of the sub-state */
bbr->rc_hit_state_1 = 0;
@@ -11369,7 +11355,6 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct bbr_sendmap *rsm;
struct timeval ltv;
int32_t did_out = 0;
- int32_t in_recovery;
uint16_t nsegs;
int32_t prev_state;
uint32_t lost;
@@ -11589,7 +11574,6 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
- in_recovery = IN_RECOVERY(tp->t_flags);
if (tiwin > bbr->r_ctl.rc_high_rwnd)
bbr->r_ctl.rc_high_rwnd = tiwin;
#ifdef BBR_INVARIANTS
@@ -11637,7 +11621,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (bbr->r_wanted_output != 0) {
bbr->rc_output_starts_timer = 0;
did_out = 1;
- (void)tp->t_fb->tfb_tcp_output(tp);
+ if (tcp_output(tp) < 0)
+ return (1);
} else
bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0);
}
@@ -11662,7 +11647,7 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
;
} else {
if (tcp_in_hpts(bbr->rc_inp)) {
- tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(bbr->rc_inp);
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
(TSTMP_GT(lcts, bbr->rc_pacer_started))) {
uint32_t del;
@@ -11676,7 +11661,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* We are late */
bbr->r_ctl.rc_last_delay_val = 0;
BBR_STAT_INC(bbr_force_output);
- (void)tp->t_fb->tfb_tcp_output(tp);
+ if (tcp_output(tp) < 0)
+ return (1);
}
}
}
@@ -11953,7 +11939,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
uint8_t more_to_rxt=0;
int32_t prefetch_so_done = 0;
int32_t prefetch_rsm = 0;
- uint32_t what_we_can = 0;
uint32_t tot_len = 0;
uint32_t rtr_cnt = 0;
uint32_t maxseg, pace_max_segs, p_maxseg;
@@ -12040,7 +12025,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
return (0);
}
}
- tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(inp);
bbr_timer_cancel(bbr, __LINE__, cts);
}
if (bbr->r_ctl.rc_last_delay_val) {
@@ -12057,7 +12042,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
(tp->t_state < TCPS_ESTABLISHED)) {
/* Timeouts or early states are exempt */
if (tcp_in_hpts(inp))
- tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(inp);
} else if (tcp_in_hpts(inp)) {
if ((bbr->r_ctl.rc_last_delay_val) &&
(bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
@@ -12070,10 +12055,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
*/
counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
bbr->r_ctl.rc_last_delay_val = 0;
- tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(inp);
} else if (tp->t_state == TCPS_CLOSED) {
bbr->r_ctl.rc_last_delay_val = 0;
- tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(inp);
} else {
/*
* On the hpts, you shall not pass! even if ACKNOW
@@ -12163,9 +12148,16 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
hpts_calling = inp->inp_hpts_calls;
inp->inp_hpts_calls = 0;
if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
- if (bbr_process_timers(tp, bbr, cts, hpts_calling)) {
+ int retval;
+
+ retval = bbr_process_timers(tp, bbr, cts, hpts_calling);
+ if (retval != 0) {
counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1);
- return (0);
+ /*
+ * If timers want tcp_drop(), then pass error out,
+ * otherwise suppress it.
+ */
+ return (retval < 0 ? retval : 0);
}
}
bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
@@ -12427,7 +12419,7 @@ recheck_resend:
}
bbr->rc_tlp_new_data = 0;
} else {
- what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts);
+ len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts);
if ((len < p_maxseg) &&
(bbr->rc_in_persist == 0) &&
(ctf_outstanding(tp) >= (2 * p_maxseg)) &&
@@ -13171,7 +13163,6 @@ send:
*/
if (len) {
uint32_t moff;
- uint32_t orig_len;
/*
* We place a limit on sending with hptsi.
@@ -13234,10 +13225,9 @@ send:
* is the only thing to do.
*/
BBR_STAT_INC(bbr_offset_drop);
- tcp_set_inp_to_drop(inp, EFAULT);
SOCKBUF_UNLOCK(sb);
(void)m_free(m);
- return (0);
+ return (-EFAULT); /* tcp_drop() */
}
len = rsm->r_end - rsm->r_start;
}
@@ -13276,7 +13266,6 @@ send:
}
}
#endif
- orig_len = len;
m->m_next = tcp_m_copym(
mb, moff, &len,
if_hw_tsomaxsegcount,
@@ -13891,7 +13880,7 @@ nomore:
bbr->oerror_cnt++;
if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) {
/* drop the session */
- tcp_set_inp_to_drop(inp, ENETDOWN);
+ return (-ENETDOWN);
}
switch (error) {
case ENOBUFS:
@@ -14162,11 +14151,9 @@ bbr_output(struct tcpcb *tp)
{
int32_t ret;
struct timeval tv;
- struct tcp_bbr *bbr;
NET_EPOCH_ASSERT();
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
INP_WLOCK_ASSERT(tp->t_inpcb);
(void)tcp_get_usecs(&tv);
ret = bbr_output_wtime(tp, &tv);
@@ -14238,6 +14225,7 @@ struct tcp_function_block __tcp_bbr = {
.tfb_tcp_handoff_ok = bbr_handoff_ok,
.tfb_tcp_mtu_chg = bbr_mtu_chg,
.tfb_pru_options = bbr_pru_options,
+ .tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
};
/*
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 7391734a9786..0810d3e53eb0 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -2295,7 +2295,6 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
log.u_bbr.flex6 = rsm->r_end;
log.u_bbr.flex8 = mod;
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2330,7 +2329,6 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
else
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2355,7 +2353,6 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex8 = to_num;
log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
log.u_bbr.flex2 = rack->rc_rack_rtt;
@@ -2394,7 +2391,6 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex8 = flag;
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.cur_del_rate = (uint64_t)prev;
log.u_bbr.delRate = (uint64_t)rsm;
log.u_bbr.rttProp = (uint64_t)next;
@@ -2439,7 +2435,6 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex1 = t;
log.u_bbr.flex2 = len;
log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
@@ -2589,7 +2584,6 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = tick;
log.u_bbr.flex3 = tp->t_maxunacktime;
@@ -2616,7 +2610,6 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex1 = slot;
if (rack->rack_no_prr)
log.u_bbr.flex2 = 0;
@@ -2684,10 +2677,8 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
struct timeval tv;
- uint32_t cts;
memset(&log, 0, sizeof(log));
- cts = tcp_get_usecs(&tv);
log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
log.u_bbr.flex4 = arg1;
@@ -2718,7 +2709,6 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex1 = slot;
log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = reason;
@@ -2751,7 +2741,6 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex1 = line;
log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
log.u_bbr.flex3 = flags_on_entry;
@@ -4476,7 +4465,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
* Stop the pacer and clear up all the aggregate
* delays etc.
*/
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(rack->rc_inp);
rack->r_ctl.rc_hpts_flags = 0;
rack->r_ctl.rc_last_output_to = 0;
}
@@ -5679,7 +5668,7 @@ static void
rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
if (tcp_in_hpts(rack->rc_inp)) {
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(rack->rc_inp);
rack->r_ctl.rc_hpts_flags = 0;
}
#ifdef NETFLIX_SHARED_CWND
@@ -6239,8 +6228,9 @@ rack_merge_rsm(struct tcp_rack *rack,
* is any reason we need to try to find
* the oldest (or last oldest retransmitted).
*/
+#ifdef INVARIANTS
struct rack_sendmap *rm;
-
+#endif
rack_log_map_chg(rack->rc_tp, rack, NULL,
l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
l_rsm->r_end = r_rsm->r_end;
@@ -6273,8 +6263,10 @@ rack_merge_rsm(struct tcp_rack *rack,
if (r_rsm == rack->r_ctl.rc_first_appl)
rack->r_ctl.rc_first_appl = l_rsm;
}
+#ifndef INVARIANTS
+ (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
+#else
rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
-#ifdef INVARIANTS
if (rm != r_rsm) {
panic("removing head in rack:%p rsm:%p rm:%p",
rack, r_rsm, rm);
@@ -6304,7 +6296,9 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
* Tail Loss Probe.
*/
struct rack_sendmap *rsm = NULL;
+#ifdef INVARIANTS
struct rack_sendmap *insret;
+#endif
struct socket *so;
uint32_t amm;
uint32_t out, avail;
@@ -6319,8 +6313,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
}
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- return (1);
+ return (-ETIMEDOUT); /* tcp_drop() */
}
/*
* A TLP timer has expired. We have been idle for 2 rtts. So we now
@@ -6466,8 +6459,10 @@ need_retran:
rack_clone_rsm(rack, nrsm, rsm,
(rsm->r_end - ctf_fixed_maxseg(tp)));
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -6538,9 +6533,8 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
if (ctf_progress_timeout_check(tp, false)) {
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(inp, ETIMEDOUT);
counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
- return (1);
+ return (-ETIMEDOUT); /* tcp_drop() */
}
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
/*
@@ -6558,10 +6552,9 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
(ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
KMOD_TCPSTAT_INC(tcps_persistdrop);
- retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
- tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
+ retval = -ETIMEDOUT; /* tcp_drop() */
goto out;
}
if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6574,11 +6567,10 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
*/
if (tp->t_state > TCPS_CLOSE_WAIT &&
(ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
- retval = 1;
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
- tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
+ retval = -ETIMEDOUT; /* tcp_drop() */
goto out;
}
t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6669,8 +6661,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
dropit:
KMOD_TCPSTAT_INC(tcps_keepdrops);
tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
- tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
- return (1);
+ return (-ETIMEDOUT); /* tcp_drop() */
}
/*
@@ -6849,11 +6840,9 @@ static int
rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
int32_t rexmt;
- struct inpcb *inp;
int32_t retval = 0;
bool isipv6;
- inp = tp->t_inpcb;
if (tp->t_timers->tt_flags & TT_STOPPED) {
return (1);
}
@@ -6874,8 +6863,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
if (ctf_progress_timeout_check(tp, false)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(inp, ETIMEDOUT);
- return (1);
+ return (-ETIMEDOUT); /* tcp_drop() */
}
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
rack->r_ctl.retran_during_recovery = 0;
@@ -6944,10 +6932,10 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
drop_it:
tp->t_rxtshift = TCP_MAXRXTSHIFT;
KMOD_TCPSTAT_INC(tcps_timeoutdrop);
- retval = 1;
- tcp_set_inp_to_drop(rack->rc_inp,
- (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
- goto out;
+ /* XXXGL: previously t_softerror was cast to uint16_t */
+ MPASS(tp->t_softerror >= 0);
+ retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
+ goto out; /* tcp_drop() */
}
if (tp->t_state == TCPS_SYN_SENT) {
/*
@@ -7235,7 +7223,7 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin
if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
((tp->snd_max - tp->snd_una) == 0))) {
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(rack->rc_inp);
hpts_removed = 1;
/* If we were not delayed cancel out the flag. */
if ((tp->snd_max - tp->snd_una) == 0)
@@ -7251,7 +7239,7 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin
* paced. We also must remove ourselves from the
* hpts.
*/
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(rack->rc_inp);
hpts_removed = 1;
}
rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
@@ -7357,7 +7345,10 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
* We (re-)transmitted starting at rsm->r_start for some length
* (possibly less than r_end.
*/
- struct rack_sendmap *nrsm, *insret;
+ struct rack_sendmap *nrsm;
+#ifdef INVARIANTS
+ struct rack_sendmap *insret;
+#endif
uint32_t c_end;
int32_t len;
@@ -7404,8 +7395,10 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
rack_clone_rsm(rack, nrsm, rsm, c_end);
nrsm->r_dupack = 0;
rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -7429,7 +7422,10 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls)
{
struct tcp_rack *rack;
- struct rack_sendmap *rsm, *nrsm, *insret, fe;
+ struct rack_sendmap *rsm, *nrsm, fe;
+#ifdef INVARIANTS
+ struct rack_sendmap *insret;
+#endif
register uint32_t snd_max, snd_una;
/*
@@ -7587,8 +7583,10 @@ again:
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
/* Log a new rsm */
rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -7661,9 +7659,11 @@ refind:
* to not include this part.
*/
rack_clone_rsm(rack, nrsm, rsm, seq_out);
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
-#ifdef INVARIANTS
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#else
+ insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -7790,8 +7790,6 @@ static void
tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
{
int32_t delta;
- uint32_t o_srtt, o_var;
- int32_t hrtt_up = 0;
int32_t rtt;
if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
@@ -7834,7 +7832,6 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
*/
if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
- hrtt_up = 1;
}
if (rack->rc_highly_buffered == 0) {
/*
@@ -7865,8 +7862,6 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
rack->r_ctl.rc_lowest_us_rtt = 1;
}
}
- o_srtt = tp->t_srtt;
- o_var = tp->t_rttvar;
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (tp->t_srtt != 0) {
/*
@@ -8417,7 +8412,10 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack
{
uint32_t start, end, changed = 0;
struct rack_sendmap stack_map;
- struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
+ struct rack_sendmap *rsm, *nrsm, fe, *prev, *next;
+#ifdef INVARIANTS
+ struct rack_sendmap *insret;
+#endif
int32_t used_ref = 1;
int moved = 0;
@@ -8608,8 +8606,10 @@ do_rest_ofb:
counter_u64_add(rack_sack_splits, 1);
rack_clone_rsm(rack, nrsm, rsm, start);
rsm->r_just_ret = 0;
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -8904,8 +8904,10 @@ do_rest_ofb:
rack_clone_rsm(rack, nrsm, rsm, end);
rsm->r_flags &= (~RACK_HAS_FIN);
rsm->r_just_ret = 0;
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -9056,9 +9058,11 @@ rack_do_decay(struct tcp_rack *rack)
* we want all SAD detection metrics to
* decay 1/4 per second (or more) passed.
*/
+#ifdef NETFLIX_EXP_DETECTION
uint32_t pkt_delta;
pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
+#endif
/* Update our saved tracking values */
rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
@@ -9089,7 +9093,10 @@ rack_do_decay(struct tcp_rack *rack)
static void
rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
{
- struct rack_sendmap *rsm, *rm;
+ struct rack_sendmap *rsm;
+#ifdef INVARIANTS
+ struct rack_sendmap *rm;
+#endif
/*
* The ACK point is advancing to th_ack, we must drop off
@@ -9233,8 +9240,10 @@ more:
rsm->r_rtr_bytes = 0;
/* Record the time of highest cumack sent */
rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+#ifndef INVARIANTS
+ (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#else
rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#ifdef INVARIANTS
if (rm != rsm) {
panic("removing head in rack:%p rsm:%p rm:%p",
rack, rsm, rm);
@@ -9638,7 +9647,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
register uint32_t th_ack;
int32_t i, j, k, num_sack_blks = 0;
- uint32_t cts, acked, ack_point, sack_changed = 0;
+ uint32_t cts, acked, ack_point;
int loop_start = 0, moved_two = 0;
uint32_t tsused;
@@ -9815,7 +9824,6 @@ do_sack_work:
if (acked) {
rack->r_wanted_output = 1;
changed += acked;
- sack_changed += acked;
}
if (num_sack_blks == 1) {
/*
@@ -9879,7 +9887,6 @@ do_sack_work:
if (acked) {
rack->r_wanted_output = 1;
changed += acked;
- sack_changed += acked;
}
if (moved_two) {
/*
@@ -10357,7 +10364,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
acked = BYTES_THIS_ACK(tp, th);
if (acked) {
- /*
+ /*
* Any time we move the cum-ack forward clear
* keep-alive tied probe-not-answered. The
* persists clears its own on entry.
@@ -10526,7 +10533,10 @@ rack_collapsed_window(struct tcp_rack *rack)
* did not send those segments something
* won't work.
*/
- struct rack_sendmap *rsm, *nrsm, fe, *insret;
+ struct rack_sendmap *rsm, *nrsm, fe;
+#ifdef INVARIANTS
+ struct rack_sendmap *insret;
+#endif
tcp_seq max_seq;
max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
@@ -10552,8 +10562,10 @@ rack_collapsed_window(struct tcp_rack *rack)
}
/* Clone it */
rack_clone_rsm(rack, nrsm, rsm, max_seq);
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
nrsm, insret, rack, rsm);
@@ -11853,8 +11865,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -11955,8 +11966,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -12112,8 +12122,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -12227,8 +12236,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -12342,8 +12350,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -12454,8 +12461,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
}
@@ -12628,7 +12634,9 @@ static int
rack_init(struct tcpcb *tp)
{
struct tcp_rack *rack = NULL;
+#ifdef INVARIANTS
struct rack_sendmap *insret;
+#endif
uint32_t iwin, snt, us_cts;
int err;
@@ -12831,8 +12839,10 @@ rack_init(struct tcpcb *tp)
rsm->orig_m_len = 0;
rsm->soff = 0;
}
+#ifndef INVARIANTS
+ (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#else
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
insret, rack, rsm);
@@ -12956,7 +12966,10 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
if (tp->t_fb_ptr) {
struct tcp_rack *rack;
- struct rack_sendmap *rsm, *nrsm, *rm;
+ struct rack_sendmap *rsm, *nrsm;
+#ifdef INVARIANTS
+ struct rack_sendmap *rm;
+#endif
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (tp->t_in_pkt) {
@@ -13078,8 +13091,10 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
tcp_log_flowend(tp);
#endif
RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
+#ifndef INVARIANTS
+ (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+#else
rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#ifdef INVARIANTS
if (rm != rsm) {
panic("At fini, rack:%p rsm:%p rm:%p",
rack, rsm, rm);
@@ -13243,7 +13258,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
}
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
}
- tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(tp->t_inpcb);
}
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
@@ -13329,7 +13344,6 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
#endif
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
if (rack->rack_no_prr == 0)
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
else
@@ -13535,7 +13549,9 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
int cnt, i, did_out, ourfinisacked = 0;
struct tcpopt to_holder, *to = NULL;
+#ifdef TCP_ACCOUNTING
int win_up_req = 0;
+#endif
int nsegs = 0;
int under_pacing = 1;
int recovery = 0;
@@ -13741,7 +13757,9 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
rack_handle_probe_response(rack, tiwin,
tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
}
+#ifdef TCP_ACCOUNTING
win_up_req = 1;
+#endif
win_upd_ack = ae->ack;
win_seq = ae->seq;
the_win = tiwin;
@@ -13822,7 +13840,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
#endif
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
- /*
+ /*
* Clear the probe not answered flag
* since cum-ack moved forward.
*/
@@ -14049,7 +14067,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
/*
* We cheat here and don't send a RST, we should send one
* when the pacer drops the connection.
@@ -14065,7 +14082,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
}
sched_unpin();
#endif
- INP_WUNLOCK(rack->rc_inp);
+ (void)tcp_drop(tp, ETIMEDOUT);
m_freem(m);
return (1);
}
@@ -14171,7 +14188,12 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
ctf_calc_rwin(so, tp);
if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
send_out_a_rst:
- (void)tp->t_fb->tfb_tcp_output(tp);
+ if (tcp_output(tp) < 0) {
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ return (1);
+ }
did_out = 1;
}
rack_free_trim(rack);
@@ -14321,7 +14343,6 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
#endif
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
if (rack->rack_no_prr == 0)
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
else
@@ -14656,8 +14677,9 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (nxt_pkt == 0) {
if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
do_output_now:
+ if (tcp_output(tp) < 0)
+ return (1);
did_out = 1;
- (void)tp->t_fb->tfb_tcp_output(tp);
}
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
rack_free_trim(rack);
@@ -14695,7 +14717,7 @@ do_output_now:
late = 1;
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
}
- tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(tp->t_inpcb);
}
if (late && (did_out == 0)) {
/*
@@ -15612,7 +15634,6 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
log.u_bbr.flex1 = error;
log.u_bbr.flex2 = flags;
log.u_bbr.flex3 = rsm_is_null;
@@ -15643,8 +15664,10 @@ rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
{
#ifdef KERN_TLS
struct ktls_session *tls, *ntls;
+#ifdef INVARIANTS
struct mbuf *start;
#endif
+#endif
struct mbuf *m, *n, **np, *smb;
struct mbuf *top;
int32_t off, soff;
@@ -15662,8 +15685,10 @@ rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
tls = m->m_epg_tls;
else
tls = NULL;
+#ifdef INVARIANTS
start = m;
#endif
+#endif
while (len > 0) {
if (m == NULL) {
*plen = len_cp;
@@ -15860,7 +15885,6 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
u_char opt[TCP_MAXOLEN];
uint32_t hdrlen, optlen;
int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0;
- uint32_t us_cts;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
@@ -15971,7 +15995,6 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
if ((tso == 0) && (len > segsiz))
len = segsiz;
- us_cts = tcp_get_usecs(tv);
if ((len == 0) ||
(len <= MHLEN - hdrlen - max_linkhdr)) {
goto failed;
@@ -16128,7 +16151,6 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
else
@@ -16239,7 +16261,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
if ((SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) ||
((rsm->r_flags & RACK_MUST_RXT) == 0)) {
/*
- * We have retransmitted all we need. If
+ * We have retransmitted all we need. If
* RACK_MUST_RXT is not set then we need to
* not retransmit this guy.
*/
@@ -16363,7 +16385,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
uint32_t hdrlen, optlen;
int cnt_thru = 1;
int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0;
- uint32_t us_cts, s_soff;
+ uint32_t s_soff;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
uint16_t add_flag = RACK_SENT_FP;
@@ -16459,7 +16481,6 @@ again:
}
if ((tso == 0) && (len > segsiz))
len = segsiz;
- us_cts = tcp_get_usecs(tv);
if ((len == 0) ||
(len <= MHLEN - hdrlen - max_linkhdr)) {
goto failed;
@@ -16629,7 +16650,6 @@ again:
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
else
@@ -16884,12 +16904,20 @@ rack_output(struct tcpcb *tp)
}
/* Do the timers, which may override the pacer */
if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
- if (rack_process_timers(tp, rack, cts, hpts_calling, &doing_tlp)) {
+ int retval;
+
+ retval = rack_process_timers(tp, rack, cts, hpts_calling,
+ &doing_tlp);
+ if (retval != 0) {
counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
#ifdef TCP_ACCOUNTING
sched_unpin();
#endif
- return (0);
+ /*
+ * If timers want tcp_drop(), then pass error out,
+ * otherwise suppress it.
+ */
+ return (retval < 0 ? retval : 0);
}
}
if (rack->rc_in_persist) {
@@ -16908,7 +16936,7 @@ rack_output(struct tcpcb *tp)
(tp->t_state < TCPS_ESTABLISHED)) {
rack->rc_ack_can_sendout_data = 0;
if (tcp_in_hpts(rack->rc_inp))
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ tcp_hpts_remove(rack->rc_inp);
} else if (tcp_in_hpts(rack->rc_inp)) {
/*
* On the hpts you can't pass even if ACKNOW is on, we will
@@ -17139,9 +17167,10 @@ again:
len = cwin;
}
if (rack->r_must_retran &&
+ (doing_tlp == 0) &&
(rsm == NULL)) {
/*
- * Non-Sack and we had a RTO or Sack/non-Sack and a
+ * Non-Sack and we had a RTO or Sack/non-Sack and a
* MTU change, we need to retransmit until we reach
* the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
*/
@@ -17176,7 +17205,7 @@ again:
sb_offset = rsm->r_start - tp->snd_una;
if (len >= segsiz)
len = segsiz;
- /*
+ /*
* Delay removing the flag RACK_MUST_RXT so
* that the fastpath for retransmit will
* work with this rsm.
@@ -17230,7 +17259,7 @@ again:
return (0);
}
if (rsm && (rsm->r_flags & RACK_MUST_RXT)) {
- /*
+ /*
* Clear the flag in prep for the send
* note that if we can't get an mbuf
* and fail, we won't retransmit this
@@ -17328,6 +17357,7 @@ again:
} else {
len = rack->r_ctl.rc_tlp_new_data;
}
+ rack->r_ctl.rc_tlp_new_data = 0;
} else {
len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
}
@@ -18801,7 +18831,6 @@ send:
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
- log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
if (rack->rack_no_prr)
log.u_bbr.flex1 = 0;
else
@@ -18971,10 +19000,6 @@ out:
rack->rc_gp_saw_ss = 1;
}
}
- if (doing_tlp && (rsm == NULL)) {
- /* Make sure new data TLP cnt is clear */
- rack->r_ctl.rc_tlp_new_data = 0;
- }
if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->t_flags & TF_SACK_PERMIT) &&
tp->rcv_numsacks > 0)
@@ -20402,6 +20427,7 @@ static struct tcp_function_block __tcp_rack = {
.tfb_tcp_mtu_chg = rack_mtu_change,
.tfb_pru_options = rack_pru_options,
.tfb_hwtls_change = rack_hw_tls_change,
+ .tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
};
/*
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index 88e028109c95..1be426927073 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -381,7 +381,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
* 1) It returns 0 if all went well and you (the caller) need
* to release the lock.
* 2) If nxt_pkt is set, then the function will surpress calls
- * to tfb_tcp_output() since you are promising to call again
+ * to tcp_output() since you are promising to call again
* with another packet.
* 3) If it returns 1, then you must free all the packets being
* shipped in, the tcb has been destroyed (or about to be destroyed).
@@ -969,11 +969,10 @@ ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
int32_t rstreason, int32_t tlen)
{
- if (tp->t_inpcb) {
- tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
- }
tcp_dropwithreset(m, th, tp, tlen, rstreason);
- INP_WUNLOCK(tp->t_inpcb);
+ tp = tcp_drop(tp, ETIMEDOUT);
+ if (tp)
+ INP_WUNLOCK(tp->t_inpcb);
}
uint32_t
diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h
index b5fd3e9b946b..1e63c2118d70 100644
--- a/sys/netinet/tcp_stacks/tcp_bbr.h
+++ b/sys/netinet/tcp_stacks/tcp_bbr.h
@@ -269,7 +269,7 @@ struct bbr_log {
uint8_t n_sackblks;
uint8_t applied; /* UU */
uint8_t inhpts; /* UU */
- uint8_t ininput; /* UU */
+ uint8_t __spare; /* UU */
uint8_t use_lt_bw; /* UU */
};
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 5b5df6821e6a..47b6ff173afe 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -375,13 +375,13 @@ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
-static void tcp_mtudisc(struct inpcb *, int);
+static struct inpcb *tcp_mtudisc(struct inpcb *, int);
static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
void *ip4hdr, const void *ip6hdr);
static struct tcp_function_block tcp_def_funcblk = {
.tfb_tcp_block_name = "freebsd",
- .tfb_tcp_output = tcp_output,
+ .tfb_tcp_output = tcp_default_output,
.tfb_tcp_do_segment = tcp_do_segment,
.tfb_tcp_ctloutput = tcp_default_ctloutput,
.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
@@ -1146,26 +1146,7 @@ static struct mtx isn_mtx;
#define ISN_LOCK() mtx_lock(&isn_mtx)
#define ISN_UNLOCK() mtx_unlock(&isn_mtx)
-/*
- * TCP initialization.
- */
-static void
-tcp_zone_change(void *tag)
-{
-
- uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
- uma_zone_set_max(V_tcpcb_zone, maxsockets);
- tcp_tw_zone_change();
-}
-
-static int
-tcp_inpcb_init(void *mem, int size, int flags)
-{
- struct inpcb *inp = mem;
-
- INP_LOCK_INIT(inp, "inp", "tcpinp");
- return (0);
-}
+INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash");
/*
* Take a value and get the next power of 2 that doesn't overflow.
@@ -1422,13 +1403,9 @@ deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
return (0);
}
-void
-tcp_init(void)
+static void
+tcp_vnet_init(void *arg __unused)
{
- const char *tcbhash_tuneable;
- int hashsize;
-
- tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
#ifdef TCP_HHOOK
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
@@ -1443,47 +1420,8 @@ tcp_init(void)
printf("%s: WARNING: unable to initialise TCP stats\n",
__func__);
#endif
- hashsize = TCBHASHSIZE;
- TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
- if (hashsize == 0) {
- /*
- * Auto tune the hash size based on maxsockets.
- * A perfect hash would have a 1:1 mapping
- * (hashsize = maxsockets) however it's been
- * suggested that O(2) average is better.
- */
- hashsize = maketcp_hashsize(maxsockets / 4);
- /*
- * Our historical default is 512,
- * do not autotune lower than this.
- */
- if (hashsize < 512)
- hashsize = 512;
- if (bootverbose && IS_DEFAULT_VNET(curvnet))
- printf("%s: %s auto tuned to %d\n", __func__,
- tcbhash_tuneable, hashsize);
- }
- /*
- * We require a hashsize to be a power of two.
- * Previously if it was not a power of two we would just reset it
- * back to 512, which could be a nasty surprise if you did not notice
- * the error message.
- * Instead what we do is clip it to the closest power of two lower
- * than the specified hash value.
- */
- if (!powerof2(hashsize)) {
- int oldhashsize = hashsize;
-
- hashsize = maketcp_hashsize(hashsize);
- /* prevent absurdly low value */
- if (hashsize < 16)
- hashsize = 16;
- printf("%s: WARNING: TCB hash size not a power of 2, "
- "clipped from %d to %d.\n", __func__, oldhashsize,
- hashsize);
- }
- in_pcbinfo_init(&V_tcbinfo, "tcp", hashsize, hashsize,
- "tcp_inpcb", tcp_inpcb_init);
+ in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
+ tcp_tcbhashsize);
/*
* These have to be type stable for the benefit of the timers.
@@ -1503,19 +1441,28 @@ tcp_init(void)
tcp_fastopen_init();
- /* Skip initialization of globals for non-default instances. */
- if (!IS_DEFAULT_VNET(curvnet))
- return;
+ COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
+ VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
+
+ V_tcp_msl = TCPTV_MSL;
+}
+VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
+ tcp_vnet_init, NULL);
+
+static void
+tcp_init(void *arg __unused)
+{
+ const char *tcbhash_tuneable;
+ int hashsize;
tcp_reass_global_init();
- /* XXX virtualize those bellow? */
+ /* XXX virtualize those below? */
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
tcp_keepidle = TCPTV_KEEP_IDLE;
tcp_keepintvl = TCPTV_KEEPINTVL;
tcp_maxpersistidle = TCPTV_KEEP_IDLE;
- tcp_msl = TCPTV_MSL;
tcp_rexmit_initial = TCPTV_RTOBASE;
if (tcp_rexmit_initial < 1)
tcp_rexmit_initial = 1;
@@ -1526,7 +1473,6 @@ tcp_init(void)
tcp_persmax = TCPTV_PERSMAX;
tcp_rexmit_slop = TCPTV_CPU_VAR;
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
- tcp_tcbhashsize = hashsize;
/* Setup the tcp function block list */
TAILQ_INIT(&t_functions);
@@ -1561,8 +1507,6 @@ tcp_init(void)
ISN_LOCK_INIT();
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
- EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
- EVENTHANDLER_PRI_ANY);
tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
@@ -1576,7 +1520,50 @@ tcp_init(void)
#ifdef TCPPCAP
tcp_pcap_init();
#endif
+
+ hashsize = TCBHASHSIZE;
+ tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
+ TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
+ if (hashsize == 0) {
+ /*
+ * Auto tune the hash size based on maxsockets.
+ * A perfect hash would have a 1:1 mapping
+ * (hashsize = maxsockets) however it's been
+ * suggested that O(2) average is better.
+ */
+ hashsize = maketcp_hashsize(maxsockets / 4);
+ /*
+ * Our historical default is 512,
+ * do not autotune lower than this.
+ */
+ if (hashsize < 512)
+ hashsize = 512;
+ if (bootverbose)
+ printf("%s: %s auto tuned to %d\n", __func__,
+ tcbhash_tuneable, hashsize);
+ }
+ /*
+ * We require a hashsize to be a power of two.
+ * Previously if it was not a power of two we would just reset it
+ * back to 512, which could be a nasty surprise if you did not notice
+ * the error message.
+ * Instead what we do is clip it to the closest power of two lower
+ * than the specified hash value.
+ */
+ if (!powerof2(hashsize)) {
+ int oldhashsize = hashsize;
+
+ hashsize = maketcp_hashsize(hashsize);
+ /* prevent absurdly low value */
+ if (hashsize < 16)
+ hashsize = 16;
+ printf("%s: WARNING: TCB hash size not a power of 2, "
+ "clipped from %d to %d.\n", __func__, oldhashsize,
+ hashsize);
+ }
+ tcp_tcbhashsize = hashsize;
}
+SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL);
#ifdef VIMAGE
static void
@@ -1614,6 +1601,9 @@ tcp_destroy(void *unused __unused)
*/
tcp_fastopen_destroy();
+ COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
+ VNET_PCPUSTAT_FREE(tcpstat);
+
#ifdef TCP_HHOOK
error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
if (error != 0) {
@@ -2096,7 +2086,6 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
- log.u_bbr.ininput = tp->t_inpcb->inp_in_dropq;
log.u_bbr.flex8 = 4;
log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
@@ -2303,8 +2292,6 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
struct inpcb *inp;
struct tcpcb *tp;
VNET_ITERATOR_DECL(vnet_iter);
- struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
- INPLOOKUP_WLOCKPCB);
/*
* Check all active control blocks across all network stacks and change
@@ -2314,6 +2301,8 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_WLOCKPCB);
/*
* XXXGL: would new accept(2)d connections use algo being
* unloaded?
@@ -2391,7 +2380,8 @@ tcp_drop(struct tcpcb *tp, int errno)
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tcp_state_change(tp, TCPS_CLOSED);
- (void) tp->t_fb->tfb_tcp_output(tp);
+ /* Don't use tcp_output() here due to possible recursion. */
+ (void)tcp_output_nodrop(tp);
TCPSTAT_INC(tcps_drops);
} else
TCPSTAT_INC(tcps_conndrops);
@@ -2588,7 +2578,7 @@ tcp_close(struct tcpcb *tp)
tp->t_tfo_pending = NULL;
}
#ifdef TCPHPTS
- tcp_hpts_remove(inp, HPTS_REMOVE_ALL);
+ tcp_hpts_remove(inp);
#endif
in_pcbdrop(inp);
TCPSTAT_INC(tcps_closed);
@@ -2613,8 +2603,6 @@ tcp_close(struct tcpcb *tp)
void
tcp_drain(void)
{
- struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
- INPLOOKUP_WLOCKPCB);
VNET_ITERATOR_DECL(vnet_iter);
if (!do_tcpdrain)
@@ -2623,6 +2611,8 @@ tcp_drain(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_WLOCKPCB);
struct inpcb *inpb;
struct tcpcb *tcpb;
@@ -3019,7 +3009,7 @@ tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port)
inc.inc_fibnum =
inp->inp_inc.inc_fibnum;
tcp_hc_updatemtu(&inc, mtu);
- tcp_mtudisc(inp, mtu);
+ inp = tcp_mtudisc(inp, mtu);
}
} else
inp = (*notify)(inp,
@@ -3467,11 +3457,10 @@ static struct inpcb *
tcp_mtudisc_notify(struct inpcb *inp, int error)
{
- tcp_mtudisc(inp, -1);
- return (inp);
+ return (tcp_mtudisc(inp, -1));
}
-static void
+static struct inpcb *
tcp_mtudisc(struct inpcb *inp, int mtuoffer)
{
struct tcpcb *tp;
@@ -3480,7 +3469,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
(inp->inp_flags & INP_DROPPED))
- return;
+ return (inp);
tp = intotcpcb(inp);
KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
@@ -3510,7 +3499,10 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
*/
tp->t_fb->tfb_tcp_mtu_chg(tp);
}
- tp->t_fb->tfb_tcp_output(tp);
+ if (tcp_output(tp) < 0)
+ return (NULL);
+ else
+ return (inp);
}
#ifdef INET
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 7dd8443cad65..32ca3bc2209b 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1514,19 +1514,25 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
/*
- * If listening socket requested TCP digests, check that received
- * SYN has signature and it is correct. If signature doesn't match
- * or TCP_SIGNATURE support isn't enabled, drop the packet.
+ * When the socket is TCP-MD5 enabled check that,
+ * - a signed packet is valid
+ * - a non-signed packet does not have a security association
+ *
+ * If a signed packet fails validation or a non-signed packet has a
+ * security association, the packet will be dropped.
*/
if (ltflags & TF_SIGNATURE) {
- if ((to->to_flags & TOF_SIGNATURE) == 0) {
- TCPSTAT_INC(tcps_sig_err_nosigopt);
- goto done;
+ if (to->to_flags & TOF_SIGNATURE) {
+ if (!TCPMD5_ENABLED() ||
+ TCPMD5_INPUT(m, th, to->to_signature) != 0)
+ goto done;
+ } else {
+ if (TCPMD5_ENABLED() &&
+ TCPMD5_INPUT(m, NULL, NULL) != ENOENT)
+ goto done;
}
- if (!TCPMD5_ENABLED() ||
- TCPMD5_INPUT(m, th, to->to_signature) != 0)
- goto done;
- }
+ } else if (to->to_flags & TOF_SIGNATURE)
+ goto done;
#endif /* TCP_SIGNATURE */
/*
* See if we already have an entry for this connection.
@@ -1724,11 +1730,11 @@ skip_alloc:
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
/*
- * If listening socket requested TCP digests, flag this in the
+ * If incoming packet has an MD5 signature, flag this in the
* syncache so that syncache_respond() will do the right thing
* with the SYN+ACK.
*/
- if (ltflags & TF_SIGNATURE)
+ if (to->to_flags & TOF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
#endif /* TCP_SIGNATURE */
if (to->to_flags & TOF_SACKPERM)
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 67e550b83bce..a3cb16869dc9 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -117,10 +117,10 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
&tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
"Time before a delayed ACK is sent");
-int tcp_msl;
+VNET_DEFINE(int, tcp_msl);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
- &tcp_msl, 0, sysctl_msec_to_ticks, "I",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
+ &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
"Maximum segment lifetime");
int tcp_rexmit_initial;
@@ -292,8 +292,7 @@ tcp_timer_delack(void *xtp)
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
NET_EPOCH_ENTER(et);
- (void) tp->t_fb->tfb_tcp_output(tp);
- INP_WUNLOCK(inp);
+ (void) tcp_output_unlock(tp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
}
@@ -502,6 +501,7 @@ tcp_timer_persist(void *xtp)
struct tcpcb *tp = xtp;
struct inpcb *inp;
struct epoch_tracker et;
+ int outrv;
CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
int ostate;
@@ -563,8 +563,7 @@ tcp_timer_persist(void *xtp)
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
NET_EPOCH_ENTER(et);
- (void) tp->t_fb->tfb_tcp_output(tp);
- NET_EPOCH_EXIT(et);
+ outrv = tcp_output_nodrop(tp);
tp->t_flags &= ~TF_FORCEDATA;
#ifdef TCPDEBUG
@@ -572,7 +571,8 @@ tcp_timer_persist(void *xtp)
tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
- INP_WUNLOCK(inp);
+ (void) tcp_unlock_or_drop(tp, outrv);
+ NET_EPOCH_EXIT(et);
out:
CURVNET_RESTORE();
}
@@ -582,7 +582,7 @@ tcp_timer_rexmt(void * xtp)
{
struct tcpcb *tp = xtp;
CURVNET_SET(tp->t_vnet);
- int rexmt;
+ int rexmt, outrv;
struct inpcb *inp;
struct epoch_tracker et;
bool isipv6;
@@ -843,15 +843,15 @@ tcp_timer_rexmt(void * xtp)
cc_cong_signal(tp, NULL, CC_RTO);
NET_EPOCH_ENTER(et);
- (void) tp->t_fb->tfb_tcp_output(tp);
- NET_EPOCH_EXIT(et);
+ outrv = tcp_output_nodrop(tp);
#ifdef TCPDEBUG
if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
- INP_WUNLOCK(inp);
+ (void) tcp_unlock_or_drop(tp, outrv);
+ NET_EPOCH_EXIT(et);
out:
CURVNET_RESTORE();
}
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index c5317d1a4155..c0382d68c13c 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -195,7 +195,6 @@ extern int tcp_maxpersistidle;
extern int tcp_rexmit_initial;
extern int tcp_rexmit_min;
extern int tcp_rexmit_slop;
-extern int tcp_msl;
extern int tcp_ttl; /* time to live for TCP segs */
extern int tcp_backoff[];
extern int tcp_totbackoff;
@@ -212,6 +211,8 @@ VNET_DECLARE(int, tcp_pmtud_blackhole_mss);
#define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss)
VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss);
#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss)
+VNET_DECLARE(int, tcp_msl);
+#define V_tcp_msl VNET(tcp_msl)
void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index b0ab3e02c61f..1efc93aef1f9 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
@@ -96,6 +97,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+
+#include <netipsec/ipsec_support.h>
+
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
@@ -324,10 +328,8 @@ tcp_twstart(struct tcpcb *tp)
tw->snd_nxt = tp->snd_nxt;
tw->t_port = tp->t_port;
tw->rcv_nxt = tp->rcv_nxt;
- tw->iss = tp->iss;
- tw->irs = tp->irs;
- tw->t_starttime = tp->t_starttime;
tw->tw_time = 0;
+ tw->tw_flags = tp->t_flags;
/* XXX
* If this code will
@@ -345,7 +347,7 @@ tcp_twstart(struct tcpcb *tp)
* and might not be needed here any longer.
*/
#ifdef TCPHPTS
- tcp_hpts_remove(inp, HPTS_REMOVE_ALL);
+ tcp_hpts_remove(inp);
#endif
tcp_discardcb(tp);
soisdisconnected(so);
@@ -465,6 +467,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
INP_TRY_UPGRADE(inp) == 0)
goto drop;
tcp_twclose(tw, 0);
+ TCPSTAT_INC(tcps_tw_recycles);
return (1);
}
@@ -484,6 +487,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK);
}
INP_UNLOCK(inp);
+ TCPSTAT_INC(tcps_tw_resets);
return (0);
}
@@ -522,6 +526,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) {
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
tcp_twrespond(tw, TH_ACK);
+ TCPSTAT_INC(tcps_tw_responds);
goto dropnoprobe;
}
drop:
@@ -669,6 +674,10 @@ tcp_twrespond(struct tcptw *tw, int flags)
to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
to.to_tsecr = tw->t_recent;
}
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tw->tw_flags & TF_SIGNATURE)
+ to.to_flags |= TOF_SIGNATURE;
+#endif
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
if (udp) {
@@ -686,6 +695,13 @@ tcp_twrespond(struct tcptw *tw, int flags)
th->th_flags = flags;
th->th_win = htons(tw->last_win);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tw->tw_flags & TF_SIGNATURE) {
+ if (!TCPMD5_ENABLED() ||
+ TCPMD5_OUTPUT(m, th, to.to_signature) != 0)
+ return (-1);
+ }
+#endif
#ifdef INET6
if (isipv6) {
if (tw->t_port) {
@@ -749,7 +765,7 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
- tw->tw_time = ticks + 2 * tcp_msl;
+ tw->tw_time = ticks + 2 * V_tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
TW_WUNLOCK(V_tw_lock);
}
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 198852cc8fac..db3f85b43acc 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -593,7 +593,9 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out_in_epoch;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output(tp);
+ KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
+ ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
out_in_epoch:
NET_EPOCH_EXIT(et);
out:
@@ -690,7 +692,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
(error = tcp_offload_connect(so, nam)) == 0)
goto out_in_epoch;
#endif
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output(tp);
goto out_in_epoch;
} else {
if ((inp->inp_vflag & INP_IPV6) == 0) {
@@ -714,12 +716,14 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
NET_EPOCH_ENTER(et);
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output(tp);
#ifdef INET
out_in_epoch:
#endif
NET_EPOCH_EXIT(et);
out:
+ KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
+ ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
/*
* If the implicit bind in the connect call fails, restore
* the flags we modified.
@@ -892,25 +896,23 @@ tcp_usr_shutdown(struct socket *so)
struct epoch_tracker et;
TCPDEBUG0;
- NET_EPOCH_ENTER(et);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNRESET;
- goto out;
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
}
tp = intotcpcb(inp);
+ NET_EPOCH_ENTER(et);
TCPDEBUG1();
socantsendmore(so);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- error = tp->t_fb->tfb_tcp_output(tp);
-
-out:
+ error = tcp_output_nodrop(tp);
TCPDEBUG2(PRU_SHUTDOWN);
TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
- INP_WUNLOCK(inp);
+ error = tcp_unlock_or_drop(tp, error);
NET_EPOCH_EXIT(et);
return (error);
@@ -925,17 +927,18 @@ tcp_usr_rcvd(struct socket *so, int flags)
struct epoch_tracker et;
struct inpcb *inp;
struct tcpcb *tp = NULL;
- int error = 0;
+ int outrv = 0, error = 0;
TCPDEBUG0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNRESET;
- goto out;
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
}
tp = intotcpcb(inp);
+ NET_EPOCH_ENTER(et);
TCPDEBUG1();
/*
* For passively-created TFO connections, don't attempt a window
@@ -947,18 +950,17 @@ tcp_usr_rcvd(struct socket *so, int flags)
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED))
goto out;
- NET_EPOCH_ENTER(et);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
tcp_offload_rcvd(tp);
else
#endif
- tp->t_fb->tfb_tcp_output(tp);
- NET_EPOCH_EXIT(et);
+ outrv = tcp_output_nodrop(tp);
out:
TCPDEBUG2(PRU_RCVD);
TCP_PROBE2(debug__user, tp, PRU_RCVD);
- INP_WUNLOCK(inp);
+ (void) tcp_unlock_or_drop(tp, outrv);
+ NET_EPOCH_EXIT(et);
return (error);
}
@@ -991,34 +993,31 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
bool restoreflags;
TCPDEBUG0;
- /*
- * We require the pcbinfo "read lock" if we will close the socket
- * as part of this call.
- */
- NET_EPOCH_ENTER(et);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
- INP_WLOCK(inp);
- vflagsav = inp->inp_vflag;
- incflagsav = inp->inp_inc.inc_flags;
- restoreflags = false;
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- if (control)
- m_freem(control);
- error = ECONNRESET;
- goto out;
- }
if (control != NULL) {
/* TCP doesn't do control messages (rights, creds, etc) */
if (control->m_len) {
m_freem(control);
- error = EINVAL;
- goto out;
+ return (EINVAL);
}
m_freem(control); /* empty control, just free it */
- control = NULL;
}
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ if (m != NULL && (flags & PRUS_NOTREADY) == 0)
+ m_freem(m);
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
+ }
+
+ vflagsav = inp->inp_vflag;
+ incflagsav = inp->inp_inc.inc_flags;
+ restoreflags = false;
tp = intotcpcb(inp);
+
+ NET_EPOCH_ENTER(et);
if ((flags & PRUS_OOB) != 0 &&
(error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
goto out;
@@ -1188,7 +1187,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
!(flags & PRUS_NOTREADY)) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output_nodrop(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
@@ -1255,7 +1254,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
if ((flags & PRUS_NOTREADY) == 0) {
tp->t_flags |= TF_FORCEDATA;
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output_nodrop(tp);
tp->t_flags &= ~TF_FORCEDATA;
}
}
@@ -1285,7 +1284,7 @@ out:
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
- INP_WUNLOCK(inp);
+ error = tcp_unlock_or_drop(tp, error);
NET_EPOCH_EXIT(et);
return (error);
}
@@ -1310,12 +1309,13 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
SOCKBUF_LOCK(&so->so_snd);
error = sbready(&so->so_snd, m, count);
SOCKBUF_UNLOCK(&so->so_snd);
- if (error == 0) {
- NET_EPOCH_ENTER(et);
- error = tp->t_fb->tfb_tcp_output(tp);
- NET_EPOCH_EXIT(et);
+ if (error) {
+ INP_WUNLOCK(inp);
+ return (error);
}
- INP_WUNLOCK(inp);
+ NET_EPOCH_ENTER(et);
+ error = tcp_output_unlock(tp);
+ NET_EPOCH_EXIT(et);
return (error);
}
@@ -1841,7 +1841,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
}
#ifdef TCPHPTS
/* Assure that we are not on any hpts */
- tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+ tcp_hpts_remove(tp->t_inpcb);
#endif
if (blk->tfb_tcp_fb_init) {
error = (*blk->tfb_tcp_fb_init)(tp);
@@ -1935,10 +1935,8 @@ tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
- int error;
struct inpcb *inp;
- error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
@@ -2238,7 +2236,7 @@ unlock_and_done:
struct epoch_tracker et;
NET_EPOCH_ENTER(et);
- error = tp->t_fb->tfb_tcp_output(tp);
+ error = tcp_output_nodrop(tp);
NET_EPOCH_EXIT(et);
}
}
@@ -2767,7 +2765,8 @@ tcp_disconnect(struct tcpcb *tp)
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- tp->t_fb->tfb_tcp_output(tp);
+ /* Ignore stack's drop request, we already at it. */
+ (void)tcp_output_nodrop(tp);
}
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 57d7352b8f11..e9d021fb4684 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -316,7 +316,8 @@ struct tcptemp {
* function below.
*/
/* Flags for tcp functions */
-#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
+#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
+#define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */
/*
* If defining the optional tcp_timers, in the
@@ -385,6 +386,108 @@ struct tcp_function {
};
TAILQ_HEAD(tcp_funchead, tcp_function);
+
+struct tcpcb * tcp_drop(struct tcpcb *, int);
+
+#ifdef _NETINET_IN_PCB_H_
+/*
+ * tcp_output()
+ * Handles tcp_drop request from advanced stacks and reports that inpcb is
+ * gone with negative return code.
+ * Drop in replacement for the default stack.
+ */
+static inline int
+tcp_output(struct tcpcb *tp)
+{
+ int rv;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ rv = tp->t_fb->tfb_tcp_output(tp);
+ if (rv < 0) {
+ KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+ ("TCP stack %s requested tcp_drop(%p)",
+ tp->t_fb->tfb_tcp_block_name, tp));
+ tp = tcp_drop(tp, -rv);
+ if (tp)
+ INP_WUNLOCK(tp->t_inpcb);
+ }
+
+ return (rv);
+}
+
+/*
+ * tcp_output_unlock()
+ * Always returns unlocked, handles drop request from advanced stacks.
+ * Always returns positive error code.
+ */
+static inline int
+tcp_output_unlock(struct tcpcb *tp)
+{
+ int rv;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ rv = tp->t_fb->tfb_tcp_output(tp);
+ if (rv < 0) {
+ KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+ ("TCP stack %s requested tcp_drop(%p)",
+ tp->t_fb->tfb_tcp_block_name, tp));
+ rv = -rv;
+ tp = tcp_drop(tp, rv);
+ if (tp)
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ INP_WUNLOCK(tp->t_inpcb);
+
+ return (rv);
+}
+
+/*
+ * tcp_output_nodrop()
+ * Always returns locked. It is caller's responsibility to run tcp_drop()!
+ * Useful in syscall implementations, when we want to perform some logging
+ * and/or tracing with tcpcb before calling tcp_drop(). To be used with
+ * tcp_unlock_or_drop() later.
+ *
+ * XXXGL: maybe don't allow stacks to return a drop request at certain
+ * TCP states? Why would it do in connect(2)? In recv(2)?
+ */
+static inline int
+tcp_output_nodrop(struct tcpcb *tp)
+{
+ int rv;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ rv = tp->t_fb->tfb_tcp_output(tp);
+ KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+ ("TCP stack %s requested tcp_drop(%p)",
+ tp->t_fb->tfb_tcp_block_name, tp));
+ return (rv);
+}
+
+/*
+ * tcp_unlock_or_drop()
+ * Handle return code from tfb_tcp_output() after we have logged/traced,
+ * to be used with tcp_output_nodrop().
+ */
+static inline int
+tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tcp_output_retval < 0) {
+ tcp_output_retval = -tcp_output_retval;
+ if (tcp_drop(tp, tcp_output_retval) != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ INP_WUNLOCK(tp->t_inpcb);
+
+ return (tcp_output_retval);
+}
+#endif /* _NETINET_IN_PCB_H_ */
#endif /* _KERNEL */
/*
@@ -528,18 +631,14 @@ struct tcptw {
t_unused:16;
tcp_seq snd_nxt;
tcp_seq rcv_nxt;
- tcp_seq iss;
- tcp_seq irs;
u_short last_win; /* cached window value */
short tw_so_options; /* copy of so_options */
struct ucred *tw_cred; /* user credentials */
u_int32_t t_recent;
u_int32_t ts_offset; /* our timestamp offset */
- u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
- void *tw_pspare; /* TCP_SIGNATURE */
- u_int *tw_spare; /* TCP_SIGNATURE */
+ u_int tw_flags; /* tcpcb t_flags */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
@@ -711,6 +810,11 @@ struct tcpstat {
uint64_t tcps_dsack_bytes; /* Number of bytes DSACK'ed no TLP */
uint64_t tcps_dsack_tlp_bytes; /* Number of bytes DSACK'ed due to TLPs */
+ /* TCPS_TIME_WAIT usage stats */
+ uint64_t tcps_tw_recycles; /* Times time-wait was recycled. */
+ uint64_t tcps_tw_resets; /* Times time-wait sent a reset. */
+ uint64_t tcps_tw_responds; /* Times time-wait sent a valid ack. */
+
uint64_t _pad[6]; /* 3 UTO, 3 TBD */
};
@@ -971,10 +1075,7 @@ void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
void tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *);
-struct tcpcb *
- tcp_drop(struct tcpcb *, int);
void tcp_drain(void);
-void tcp_init(void);
void tcp_fini(void *);
char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
const void *);
@@ -1063,7 +1164,7 @@ struct inpcb *
tcp_drop_syn_sent(struct inpcb *, int);
struct tcpcb *
tcp_newtcpcb(struct inpcb *);
-int tcp_output(struct tcpcb *);
+int tcp_default_output(struct tcpcb *);
void tcp_state_change(struct tcpcb *, int);
void tcp_respond(struct tcpcb *, void *,
struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 0bf55958c618..676eca5462bd 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -532,7 +532,8 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
KASSERT(!(tp->t_flags & TF_TOE),
("%s: tp %p still offloaded.", __func__, tp));
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- (void) tp->t_fb->tfb_tcp_output(tp);
+ if (tcp_output(tp) < 0)
+ INP_WLOCK(inp); /* re-acquire */
} else {
tp = tcp_drop(tp, err);
if (tp == NULL)
diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h
index 263a64fbe588..d7def4e41fc8 100644
--- a/sys/netinet/udp.h
+++ b/sys/netinet/udp.h
@@ -36,6 +36,8 @@
#ifndef _NETINET_UDP_H_
#define _NETINET_UDP_H_
+#include <sys/types.h>
+
/*
* UDP protocol header.
* Per RFC 768, September, 1981.
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index a6be949286b8..ad5a2df7d4aa 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -170,36 +170,12 @@ static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
struct mbuf *, struct thread *, int);
#endif
-static void
-udp_zone_change(void *tag)
-{
-
- uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
- uma_zone_set_max(V_udpcb_zone, maxsockets);
-}
-
-static int
-udp_inpcb_init(void *mem, int size, int flags)
-{
- struct inpcb *inp;
+INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash");
+INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite",
+ "udplitehash");
- inp = mem;
- INP_LOCK_INIT(inp, "inp", "udpinp");
- return (0);
-}
-
-static int
-udplite_inpcb_init(void *mem, int size, int flags)
-{
- struct inpcb *inp;
-
- inp = mem;
- INP_LOCK_INIT(inp, "inp", "udpliteinp");
- return (0);
-}
-
-void
-udp_init(void)
+static void
+udp_init(void *arg __unused)
{
/*
@@ -209,23 +185,17 @@ udp_init(void)
* Once we can calculate the flowid that way and re-establish
* a 4-tuple, flip this to 4-tuple.
*/
- in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE,
- "udp_inpcb", udp_inpcb_init);
+ in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_udpcb_zone, maxsockets);
uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
- EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
- EVENTHANDLER_PRI_ANY);
-}
-
-void
-udplite_init(void)
-{
- in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE,
- UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init);
+ /* Additional pcbinfo for UDP-Lite */
+ in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
+ UDBHASHSIZE);
}
+VNET_SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);
/*
* Kernel module interface for updating udpstat. The argument is an index
@@ -491,7 +461,6 @@ udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
break;
}
}
- m_freem(m);
if (appends == 0) {
/*
@@ -505,6 +474,7 @@ udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
else
UDPSTAT_INC(udps_noportbcast);
}
+ m_freem(m);
return (IPPROTO_DONE);
}
@@ -637,7 +607,7 @@ udp_input(struct mbuf **mp, int *offp, int proto)
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
in_broadcast(ip->ip_dst, ifp))
- return (udp_multi_input(m, proto, udp_in));
+ return (udp_multi_input(m, proto, udp_in));
pcbinfo = udp_get_inpcbinfo(proto);
@@ -780,7 +750,7 @@ udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
if (PRC_IS_REDIRECT(cmd)) {
/* signal EHOSTDOWN, as it flushes the cached route */
- in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify);
+ in_pcbnotifyall(pcbinfo, faddr, EHOSTDOWN, udp_notify);
return;
}
@@ -1100,15 +1070,16 @@ udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
return (EINVAL);
/* Validate the interface index if specified. */
- if (pktinfo->ipi6_ifindex > V_if_index)
- return (ENXIO);
-
- ifp = NULL;
if (pktinfo->ipi6_ifindex) {
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
+ NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */
if (ifp == NULL)
return (ENXIO);
- }
+ } else
+ ifp = NULL;
if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
if (in_ifhasaddr(ifp, ia) == 0)
@@ -1695,10 +1666,8 @@ static void
udp_detach(struct socket *so)
{
struct inpcb *inp;
- struct inpcbinfo *pcbinfo;
struct udpcb *up;
- pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h
index 9a15016b37e3..cd9c4fd47e4f 100644
--- a/sys/netinet/udp_var.h
+++ b/sys/netinet/udp_var.h
@@ -36,6 +36,9 @@
#ifndef _NETINET_UDP_VAR_H_
#define _NETINET_UDP_VAR_H_
+#include <netinet/ip_var.h>
+#include <netinet/udp.h>
+
/*
* UDP kernel structures and variables.
*/
@@ -167,8 +170,6 @@ void udp_discardcb(struct udpcb *);
void udp_ctlinput(int, struct sockaddr *, void *);
void udplite_ctlinput(int, struct sockaddr *, void *);
int udp_ctloutput(struct socket *, struct sockopt *);
-void udp_init(void);
-void udplite_init(void);
int udp_input(struct mbuf **, int *, int);
void udplite_input(struct mbuf *, int);
struct inpcb *udp_notify(struct inpcb *inp, int errno);