aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRandall Stewart <rrs@FreeBSD.org>2022-02-21 11:30:17 +0000
committerRandall Stewart <rrs@FreeBSD.org>2022-02-21 11:30:17 +0000
commitea9017fb25ff51fcb022a5835b1e472e57490d34 (patch)
tree22be61dbd0d596f6b81d88caae5cbe74b42213f2
parent5ccf909af9c1117172ff0742515da2d2e0cef89e (diff)
downloadsrc-ea9017fb25ff51fcb022a5835b1e472e57490d34.tar.gz
src-ea9017fb25ff51fcb022a5835b1e472e57490d34.zip
tcp: Congestion control move to using reference counting.
In the transport call on 12/3 Gleb asked to move the CC modules towards using reference counting to prevent folks from unloading a module in use. It was also agreed that Michael would do a user space utility like tcp_drop that could be used to move all connections that are using a specific CC to some other CC. This is the half I committed to doing, making it so that we maintain a refcount on a cc module every time a pcb refers to it and decrementing that every time a pcb no longer uses a cc module. This also helps us simplify the whole unloading process by getting rid of tcp_ccunload() which munged through all the tcb's. Instead we mark a module as being removed and prevent further references to it. We also make sure that if a module is marked as being removed it cannot be made as the default and also the opposite of that, if its a default it fails and does not mark it as being removed. Reviewed by: Michael Tuexen, Gleb Smirnoff Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D33249
-rw-r--r--sys/netinet/cc/cc.c196
-rw-r--r--sys/netinet/cc/cc.h11
-rw-r--r--sys/netinet/tcp_subr.c95
-rw-r--r--sys/netinet/tcp_usrreq.c48
-rw-r--r--sys/netinet/tcp_var.h1
5 files changed, 171 insertions, 180 deletions
diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c
index 647ecb47f18d..55a5f6ef652e 100644
--- a/sys/netinet/cc/cc.c
+++ b/sys/netinet/cc/cc.c
@@ -107,6 +107,45 @@ VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
VNET_DEFINE(uint32_t, newreno_beta) = 50;
#define V_newreno_beta VNET(newreno_beta)
+void
+cc_refer(struct cc_algo *algo)
+{
+ CC_LIST_LOCK_ASSERT();
+ refcount_acquire(&algo->cc_refcount);
+}
+
+void
+cc_release(struct cc_algo *algo)
+{
+ CC_LIST_LOCK_ASSERT();
+ refcount_release(&algo->cc_refcount);
+}
+
+
+void
+cc_attach(struct tcpcb *tp, struct cc_algo *algo)
+{
+ /*
+ * Attach the tcpcb to the algorithm.
+ */
+ CC_LIST_RLOCK();
+ CC_ALGO(tp) = algo;
+ cc_refer(algo);
+ CC_LIST_RUNLOCK();
+}
+
+void
+cc_detach(struct tcpcb *tp)
+{
+ struct cc_algo *algo;
+
+ CC_LIST_RLOCK();
+ algo = CC_ALGO(tp);
+ CC_ALGO(tp) = NULL;
+ cc_release(algo);
+ CC_LIST_RUNLOCK();
+}
+
/*
* Sysctl handler to show and change the default CC algorithm.
*/
@@ -137,6 +176,10 @@ cc_default_algo(SYSCTL_HANDLER_ARGS)
STAILQ_FOREACH(funcs, &cc_list, entries) {
if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
continue;
+ if (funcs->flags & CC_MODULE_BEING_REMOVED) {
+ /* Its being removed, its not eligible */
+ continue;
+ }
V_default_cc_ptr = funcs;
error = 0;
break;
@@ -153,12 +196,12 @@ static int
cc_list_available(SYSCTL_HANDLER_ARGS)
{
struct cc_algo *algo;
- struct sbuf *s;
- int err, first, nalgos;
-
- err = nalgos = 0;
- first = 1;
+ int error, nalgos;
+ int linesz;
+ char *buffer, *cp;
+ size_t bufsz, outsz;
+ error = nalgos = 0;
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries) {
nalgos++;
@@ -167,37 +210,34 @@ cc_list_available(SYSCTL_HANDLER_ARGS)
if (nalgos == 0) {
return (ENOENT);
}
- s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
-
- if (s == NULL)
- return (ENOMEM);
-
- /*
- * It is theoretically possible for the CC list to have grown in size
- * since the call to sbuf_new() and therefore for the sbuf to be too
- * small. If this were to happen (incredibly unlikely), the sbuf will
- * reach an overflow condition, sbuf_printf() will return an error and
- * the sysctl will fail gracefully.
- */
+ bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
+ buffer = malloc(bufsz, M_TEMP, M_WAITOK);
+ cp = buffer;
+
+ linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
+ "PCB count");
+ cp += linesz;
+ bufsz -= linesz;
+ outsz = linesz;
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries) {
- err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
- if (err) {
- /* Sbuf overflow condition. */
- err = EOVERFLOW;
+ linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
+ algo->name,
+ (algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
+ algo->cc_refcount);
+ if (linesz >= bufsz) {
+ error = EOVERFLOW;
break;
}
- first = 0;
+ cp += linesz;
+ bufsz -= linesz;
+ outsz += linesz;
}
CC_LIST_RUNLOCK();
-
- if (!err) {
- sbuf_finish(s);
- err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
- }
-
- sbuf_delete(s);
- return (err);
+ if (error == 0)
+ error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
+ free(buffer, M_TEMP);
+ return (error);
}
/*
@@ -243,41 +283,36 @@ cc_init(void)
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
- struct cc_algo *funcs, *tmpfuncs;
- int err;
-
- err = ENOENT;
+ struct cc_algo *funcs;
+ int found = 0;
/* Remove algo from cc_list so that new connections can't use it. */
CC_LIST_WLOCK();
- STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
- if (funcs == remove_cc) {
- if (cc_check_default(remove_cc)) {
- CC_LIST_WUNLOCK();
- return(EBUSY);
- }
- break;
- }
+
+ /* This is unlikely to fail */
+ STAILQ_FOREACH(funcs, &cc_list, entries) {
+ if (funcs == remove_cc)
+ found = 1;
}
- remove_cc->flags |= CC_MODULE_BEING_REMOVED;
- CC_LIST_WUNLOCK();
- err = tcp_ccalgounload(remove_cc);
- /*
- * Now back through and we either remove the temp flag
- * or pull the registration.
- */
- CC_LIST_WLOCK();
- STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
- if (funcs == remove_cc) {
- if (err == 0)
- STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
- else
- funcs->flags &= ~CC_MODULE_BEING_REMOVED;
- break;
- }
+ if (found == 0) {
+ /* Nothing to remove? */
+ CC_LIST_WUNLOCK();
+ return (ENOENT);
+ }
+ /* We assert it should have been MOD_QUIESCE'd */
+ KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
+ ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
+ if (cc_check_default(remove_cc)) {
+ CC_LIST_WUNLOCK();
+ return(EBUSY);
+ }
+ if (remove_cc->cc_refcount != 0) {
+ CC_LIST_WUNLOCK();
+ return (EBUSY);
}
+ STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
CC_LIST_WUNLOCK();
- return (err);
+ return (0);
}
/*
@@ -304,6 +339,9 @@ cc_register_algo(struct cc_algo *add_cc)
break;
}
}
+ /* Init its reference count */
+ if (err == 0)
+ refcount_init(&add_cc->cc_refcount, 0);
/*
* The first loaded congestion control module will become
* the default until we find the "CC_DEFAULT" defined in
@@ -526,6 +564,20 @@ newreno_cc_ack_received(struct cc_var *ccv, uint16_t type)
}
}
+static int
+cc_stop_new_assignments(struct cc_algo *algo)
+{
+ CC_LIST_WLOCK();
+ if (cc_check_default(algo)) {
+ /* A default cannot be removed */
+ CC_LIST_WUNLOCK();
+ return (EBUSY);
+ }
+ algo->flags |= CC_MODULE_BEING_REMOVED;
+ CC_LIST_WUNLOCK();
+ return (0);
+}
+
/*
* Handles kld related events. Returns 0 on success, non-zero on failure.
*/
@@ -555,16 +607,32 @@ cc_modevent(module_t mod, int event_type, void *data)
err = cc_register_algo(algo);
break;
- case MOD_QUIESCE:
case MOD_SHUTDOWN:
+ break;
+ case MOD_QUIESCE:
+ /* Stop any new assigments */
+ err = cc_stop_new_assignments(algo);
+ break;
case MOD_UNLOAD:
+ /*
+ * Deregister and remove the module from the list
+ */
+ CC_LIST_WLOCK();
+ /* Even with -f we can't unload if its the default */
+ if (cc_check_default(algo)) {
+ /* A default cannot be removed */
+ CC_LIST_WUNLOCK();
+ return (EBUSY);
+ }
+ /*
+ * If -f was used and users are still attached to
+ * the algorithm things are going to go boom.
+ */
err = cc_deregister_algo(algo);
- if (!err && algo->mod_destroy != NULL)
+ if ((err == 0) && (algo->mod_destroy != NULL)) {
algo->mod_destroy();
- if (err == ENOENT)
- err = 0;
+ }
break;
-
default:
err = EINVAL;
break;
diff --git a/sys/netinet/cc/cc.h b/sys/netinet/cc/cc.h
index 6f942da7aa83..1906207bddb4 100644
--- a/sys/netinet/cc/cc.h
+++ b/sys/netinet/cc/cc.h
@@ -200,6 +200,7 @@ struct cc_algo {
int (*ctl_output)(struct cc_var *, struct sockopt *, void *);
STAILQ_ENTRY (cc_algo) entries;
+ u_int cc_refcount;
uint8_t flags;
};
@@ -236,5 +237,15 @@ void newreno_cc_after_idle(struct cc_var *);
void newreno_cc_cong_signal(struct cc_var *, uint32_t );
void newreno_cc_ack_received(struct cc_var *, uint16_t);
+/* Called to temporarily keep an algo from going away during change */
+void cc_refer(struct cc_algo *algo);
+/* Called to release the temporary hold */
+void cc_release(struct cc_algo *algo);
+
+/* Called to attach a CC algorithm to a tcpcb */
+void cc_attach(struct tcpcb *, struct cc_algo *);
+/* Called to detach a CC algorithm from a tcpcb */
+void cc_detach(struct tcpcb *);
+
#endif /* _KERNEL */
#endif /* _NETINET_CC_CC_H_ */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 298ae38d5b27..95c34c581e59 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2171,10 +2171,7 @@ tcp_newtcpcb(struct inpcb *inp)
/*
* Use the current system default CC algorithm.
*/
- CC_LIST_RLOCK();
- KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
- CC_ALGO(tp) = CC_DEFAULT_ALGO();
- CC_LIST_RUNLOCK();
+ cc_attach(tp, CC_DEFAULT_ALGO());
/*
* The tcpcb will hold a reference on its inpcb until tcp_discardcb()
@@ -2185,6 +2182,7 @@ tcp_newtcpcb(struct inpcb *inp)
if (CC_ALGO(tp)->cb_init != NULL)
if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
+ cc_detach(tp);
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
in_pcbrele_wlocked(inp);
@@ -2277,93 +2275,6 @@ tcp_newtcpcb(struct inpcb *inp)
}
/*
- * Switch the congestion control algorithm back to Vnet default for any active
- * control blocks using an algorithm which is about to go away. If the algorithm
- * has a cb_init function and it fails (no memory) then the operation fails and
- * the unload will not succeed.
- *
- */
-int
-tcp_ccalgounload(struct cc_algo *unload_algo)
-{
- struct cc_algo *oldalgo, *newalgo;
- struct inpcb *inp;
- struct tcpcb *tp;
- VNET_ITERATOR_DECL(vnet_iter);
-
- /*
- * Check all active control blocks across all network stacks and change
- * any that are using "unload_algo" back to its default. If "unload_algo"
- * requires cleanup code to be run, call it.
- */
- VNET_LIST_RLOCK();
- VNET_FOREACH(vnet_iter) {
- CURVNET_SET(vnet_iter);
- struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
- INPLOOKUP_WLOCKPCB);
- /*
- * XXXGL: would new accept(2)d connections use algo being
- * unloaded?
- */
- newalgo = CC_DEFAULT_ALGO();
- while ((inp = inp_next(&inpi)) != NULL) {
- /* Important to skip tcptw structs. */
- if (!(inp->inp_flags & INP_TIMEWAIT) &&
- (tp = intotcpcb(inp)) != NULL) {
- /*
- * By holding INP_WLOCK here, we are assured
- * that the connection is not currently
- * executing inside the CC module's functions.
- * We attempt to switch to the Vnets default,
- * if the init fails then we fail the whole
- * operation and the module unload will fail.
- */
- if (CC_ALGO(tp) == unload_algo) {
- struct cc_var cc_mem;
- int err;
-
- oldalgo = CC_ALGO(tp);
- memset(&cc_mem, 0, sizeof(cc_mem));
- cc_mem.ccvc.tcp = tp;
- if (newalgo->cb_init == NULL) {
- /*
- * No init we can skip the
- * dance around a possible failure.
- */
- CC_DATA(tp) = NULL;
- goto proceed;
- }
- err = (newalgo->cb_init)(&cc_mem, NULL);
- if (err) {
- /*
- * Presumably no memory the caller will
- * need to try again.
- */
- INP_WUNLOCK(inp);
- CURVNET_RESTORE();
- VNET_LIST_RUNLOCK();
- return (err);
- }
-proceed:
- if (oldalgo->cb_destroy != NULL)
- oldalgo->cb_destroy(tp->ccv);
- CC_ALGO(tp) = newalgo;
- memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
- if (TCPS_HAVEESTABLISHED(tp->t_state) &&
- (CC_ALGO(tp)->conn_init != NULL)) {
- /* Yep run the connection init for the new CC */
- CC_ALGO(tp)->conn_init(tp->ccv);
- }
- }
- }
- }
- CURVNET_RESTORE();
- }
- VNET_LIST_RUNLOCK();
- return (0);
-}
-
-/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
@@ -2443,6 +2354,8 @@ tcp_discardcb(struct tcpcb *tp)
if (CC_ALGO(tp)->cb_destroy != NULL)
CC_ALGO(tp)->cb_destroy(tp->ccv);
CC_DATA(tp) = NULL;
+ /* Detach from the CC algorithm */
+ cc_detach(tp);
#ifdef TCP_HHOOK
khelp_destroy_osd(tp->osd);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index ac9e79aed917..8c8cba8ea5ad 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2004,7 +2004,7 @@ copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls)
extern struct cc_algo newreno_cc_algo;
static int
-tcp_congestion(struct inpcb *inp, struct sockopt *sopt)
+tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
{
struct cc_algo *algo;
void *ptr = NULL;
@@ -2020,7 +2020,7 @@ tcp_congestion(struct inpcb *inp, struct sockopt *sopt)
return(error);
buf[sopt->sopt_valsize] = '\0';
CC_LIST_RLOCK();
- STAILQ_FOREACH(algo, &cc_list, entries)
+ STAILQ_FOREACH(algo, &cc_list, entries) {
if (strncmp(buf, algo->name,
TCP_CA_NAME_MAX) == 0) {
if (algo->flags & CC_MODULE_BEING_REMOVED) {
@@ -2029,30 +2029,24 @@ tcp_congestion(struct inpcb *inp, struct sockopt *sopt)
}
break;
}
+ }
if (algo == NULL) {
CC_LIST_RUNLOCK();
return(ESRCH);
}
-do_over:
+ /*
+ * With a reference the algorithm cannot be removed
+ * so we hold a reference through the change process.
+ */
+ cc_refer(algo);
+ CC_LIST_RUNLOCK();
if (algo->cb_init != NULL) {
/* We can now pre-get the memory for the CC */
mem_sz = (*algo->cc_data_sz)();
if (mem_sz == 0) {
goto no_mem_needed;
}
- CC_LIST_RUNLOCK();
ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
- CC_LIST_RLOCK();
- STAILQ_FOREACH(algo, &cc_list, entries)
- if (strncmp(buf, algo->name,
- TCP_CA_NAME_MAX) == 0)
- break;
- if (algo == NULL) {
- if (ptr)
- free(ptr, M_CC_MEM);
- CC_LIST_RUNLOCK();
- return(ESRCH);
- }
} else {
no_mem_needed:
mem_sz = 0;
@@ -2063,22 +2057,20 @@ no_mem_needed:
* back the inplock.
*/
memset(&cc_mem, 0, sizeof(cc_mem));
- if (mem_sz != (*algo->cc_data_sz)()) {
- if (ptr)
- free(ptr, M_CC_MEM);
- goto do_over;
- }
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
+ if (ptr)
+ free(ptr, M_CC_MEM);
+ /* Release our temp reference */
+ CC_LIST_RLOCK();
+ cc_release(algo);
CC_LIST_RUNLOCK();
- free(ptr, M_CC_MEM);
return (ECONNRESET);
}
tp = intotcpcb(inp);
if (ptr != NULL)
memset(ptr, 0, mem_sz);
- CC_LIST_RUNLOCK();
cc_mem.ccvc.tcp = tp;
/*
* We once again hold a write lock over the tcb so it's
@@ -2103,8 +2095,12 @@ no_mem_needed:
*/
if (CC_ALGO(tp)->cb_destroy != NULL)
CC_ALGO(tp)->cb_destroy(tp->ccv);
+ /* Detach the old CC from the tcpcb */
+ cc_detach(tp);
+ /* Copy in our temp memory that was inited */
memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
- tp->cc_algo = algo;
+ /* Now attach the new, which takes a reference */
+ cc_attach(tp, algo);
/* Ok now are we where we have gotten past any conn_init? */
if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
/* Yep run the connection init for the new CC */
@@ -2113,6 +2109,10 @@ no_mem_needed:
} else if (ptr)
free(ptr, M_CC_MEM);
INP_WUNLOCK(inp);
+ /* Now lets release our temp reference */
+ CC_LIST_RLOCK();
+ cc_release(algo);
+ CC_LIST_RUNLOCK();
return (error);
}
@@ -2336,7 +2336,7 @@ unlock_and_done:
break;
case TCP_CONGESTION:
- error = tcp_congestion(inp, sopt);
+ error = tcp_set_cc_mod(inp, sopt);
break;
case TCP_REUSPORT_LB_NUMA:
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 6b1f79a0b9ed..b9d37471771c 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1076,7 +1076,6 @@ VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
#endif
int tcp_addoptions(struct tcpopt *, u_char *);
-int tcp_ccalgounload(struct cc_algo *unload_algo);
struct tcpcb *
tcp_close(struct tcpcb *);
void tcp_discardcb(struct tcpcb *);