aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Gallatin <gallatin@FreeBSD.org>2021-07-06 14:17:33 +0000
committerAndrew Gallatin <gallatin@FreeBSD.org>2021-07-06 14:28:32 +0000
commit28d0a740dd9a67e4a4fa9fda5bb39b5963316f35 (patch)
tree20f30d10ba70a01a4a37a50617d92a8570062394
parentc9144ec14d2a5a53cfe91ada1b3b9c06b78dc999 (diff)
downloadsrc-28d0a740dd9a67e4a4fa9fda5bb39b5963316f35.tar.gz
src-28d0a740dd9a67e4a4fa9fda5bb39b5963316f35.zip
ktls: auto-disable ifnet (inline hw) kTLS
Ifnet (inline) hw kTLS NICs typically keep state within a TLS record, so that when transmitting in-order, they can continue encryption on each segment sent without DMA'ing extra state from the host. This breaks down when transmits are out of order (eg, TCP retransmits). In this case, the NIC must re-DMA the entire TLS record up to and including the segment being retransmitted. This means that when re-transmitting the last 1448 byte segment of a TLS record, the NIC will have to re-DMA the entire 16KB TLS record. This can lead to the NIC running out of PCIe bus bandwidth well before it saturates the network link if a lot of TCP connections have a high retransmoit rate. This change introduces a new sysctl (kern.ipc.tls.ifnet_max_rexmit_pct), where TCP connections with higher retransmit rate will be switched to SW kTLS so as to conserve PCIe bandwidth. Reviewed by: hselasky, markj, rrs Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D30908
-rw-r--r--sys/kern/uipc_ktls.c107
-rw-r--r--sys/netinet/tcp_var.h13
-rw-r--r--sys/sys/ktls.h15
3 files changed, 133 insertions, 2 deletions
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 7e87e7c740e3..88e29157289d 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_kern_tls.h"
#include "opt_ratelimit.h"
#include "opt_rss.h"
@@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
&ktls_number_threads, 0,
"Number of TLS threads in thread-pool");
+unsigned int ktls_ifnet_max_rexmit_pct = 2;
+SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
+ &ktls_ifnet_max_rexmit_pct, 2,
+ "Max percent bytes retransmitted before ifnet TLS is disabled");
+
static bool ktls_offload_enable;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
&ktls_offload_enable, 0,
@@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
&ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
+static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
+ &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
+
+static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
+ &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
+
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"Software TLS session stats");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
}
}
}
+
+static void
+ktls_disable_ifnet_help(void *context, int pending __unused)
+{
+ struct ktls_session *tls;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct socket *so;
+ int err;
+
+ tls = context;
+ inp = tls->inp;
+ if (inp == NULL)
+ return;
+ INP_WLOCK(inp);
+ so = inp->inp_socket;
+ MPASS(so != NULL);
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ goto out;
+ }
+
+ if (so->so_snd.sb_tls_info != NULL)
+ err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
+ else
+ err = ENXIO;
+ if (err == 0) {
+ counter_u64_add(ktls_ifnet_disable_ok, 1);
+ /* ktls_set_tx_mode() drops inp wlock, so recheck flags */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
+ (inp->inp_flags2 & INP_FREED) == 0 &&
+ (tp = intotcpcb(inp)) != NULL &&
+ tp->t_fb->tfb_hwtls_change != NULL)
+ (*tp->t_fb->tfb_hwtls_change)(tp, 0);
+ } else {
+ counter_u64_add(ktls_ifnet_disable_fail, 1);
+ }
+
+out:
+ SOCK_LOCK(so);
+ sorele(so);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
+ ktls_free(tls);
+}
+
+/*
+ * Called when re-transmits are becoming a substantial portion of the
+ * sends on this connection. When this happens, we transition the
+ * connection to software TLS. This is needed because most inline TLS
+ * NICs keep crypto state only for in-order transmits. This means
+ * that to handle a TCP rexmit (which is out-of-order), the NIC must
+ * re-DMA the entire TLS record up to and including the current
+ * segment. This means that when re-transmitting the last ~1448 byte
+ * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
+ * of magnitude more data than we are sending. This can cause the
+ * PCIe link to saturate well before the network, which can cause
+ * output drops, and a general loss of capacity.
+ */
+void
+ktls_disable_ifnet(void *arg)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ struct socket *so;
+ struct ktls_session *tls;
+
+ tp = arg;
+ inp = tp->t_inpcb;
+ INP_WLOCK_ASSERT(inp);
+ so = inp->inp_socket;
+ SOCK_LOCK(so);
+ tls = so->so_snd.sb_tls_info;
+ if (tls->disable_ifnet_pending) {
+ SOCK_UNLOCK(so);
+ return;
+ }
+
+ /*
+ * note that disable_ifnet_pending is never cleared; disabling
+ * ifnet can only be done once per session, so we never want
+ * to do it again
+ */
+
+ (void)ktls_hold(tls);
+ in_pcbref(inp);
+ soref(so);
+ tls->disable_ifnet_pending = true;
+ tls->inp = inp;
+ SOCK_UNLOCK(so);
+ TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
+ (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
+}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index dd30f89896d2..3f72a821e71f 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -39,8 +39,10 @@
#include <netinet/tcp_fsm.h>
#ifdef _KERNEL
+#include "opt_kern_tls.h"
#include <net/vnet.h>
#include <sys/mbuf.h>
+#include <sys/ktls.h>
#endif
#define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */
@@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
static inline void
tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
- uint8_t is_tlp, int hw_tls __unused)
+ uint8_t is_tlp, int hw_tls)
{
+ uint64_t rexmit_percent;
+
if (is_tlp) {
tp->t_sndtlppack++;
tp->t_sndtlpbyte += len;
@@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
tp->t_snd_rxt_bytes += len;
else
tp->t_sndbytes += len;
+
+ if (hw_tls && is_rxt) {
+ rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes));
+ if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
+ ktls_disable_ifnet(tp);
+ }
+
}
#endif /* _KERNEL */
diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index b28c94965c97..7fd8831878b4 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -189,10 +189,12 @@ struct ktls_session {
u_int wq_index;
volatile u_int refcount;
int mode;
- bool reset_pending;
struct task reset_tag_task;
+ struct task disable_ifnet_task;
struct inpcb *inp;
+ bool reset_pending;
+ bool disable_ifnet_pending;
} __aligned(CACHE_LINE_SIZE);
void ktls_check_rx(struct sockbuf *sb);
@@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
ktls_destroy(tls);
}
+#ifdef KERN_TLS
+extern unsigned int ktls_ifnet_max_rexmit_pct;
+void ktls_disable_ifnet(void *arg);
+#else
+#define ktls_ifnet_max_rexmit_pct 1
+inline void
+ktls_disable_ifnet(void *arg __unused)
+{
+}
+#endif
+
#endif /* !_KERNEL */
#endif /* !_SYS_KTLS_H_ */