 sys/net/iflib.c  | 454 +++++++++++++++++++++++++++++++++++----------------
 sys/sys/cpuset.h |   1 +
 2 files changed, 294 insertions(+), 161 deletions(-)
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index fc0814d0fc19..faf58917c96b 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -195,6 +195,8 @@ struct iflib_ctx {
uint16_t ifc_sysctl_core_offset;
#define CORE_OFFSET_UNSPECIFIED 0xffff
uint8_t ifc_sysctl_separate_txrx;
+ uint8_t ifc_sysctl_use_logical_cores;
+ bool ifc_cpus_are_physical_cores;
qidx_t ifc_sysctl_ntxds[8];
qidx_t ifc_sysctl_nrxds[8];
@@ -725,7 +727,7 @@ struct cpu_offset {
SLIST_ENTRY(cpu_offset) entries;
cpuset_t set;
unsigned int refcount;
- uint16_t offset;
+ uint16_t next_cpuid;
};
static struct mtx cpu_offset_mtx;
MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
@@ -4682,41 +4684,291 @@ iflib_rem_pfil(if_ctx_t ctx)
pfil_head_unregister(pfil);
}
+
+/*
+ * Advance forward by n members of the cpuset ctx->ifc_cpus starting from
+ * cpuid and wrapping as necessary.
+ */
+static unsigned int
+cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
+{
+ unsigned int first_valid;
+ unsigned int last_valid;
+
+ /* cpuid should always be in the valid set */
+ MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
+
+ /* valid set should never be empty */
+ MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
+
+ first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
+ last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
+ n = n % CPU_COUNT(&ctx->ifc_cpus);
+ while (n > 0) {
+ do {
+ cpuid++;
+ if (cpuid > last_valid)
+ cpuid = first_valid;
+ } while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
+ n--;
+ }
+
+ return (cpuid);
+}
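
The wrap-and-skip walk above is easy to exercise on its own. Here is a
minimal userland sketch of the same logic (a plain uint64_t bitmask
stands in for cpuset_t; mask_advance() is a name local to the sketch,
not kernel API). With the valid set {1, 3, 6}, advancing from CPU 3 by
2 wraps past CPU 6 back to CPU 1:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static unsigned int
	mask_advance(uint64_t mask, unsigned int cpuid, unsigned int n)
	{
		unsigned int first_valid, last_valid;

		assert(mask != 0 && (mask & (1ULL << cpuid)) != 0);
		first_valid = __builtin_ffsll(mask) - 1;
		last_valid = 63 - __builtin_clzll(mask);
		n %= __builtin_popcountll(mask);
		while (n > 0) {
			do {
				cpuid++;
				if (cpuid > last_valid)
					cpuid = first_valid;
			} while ((mask & (1ULL << cpuid)) == 0);
			n--;
		}
		return (cpuid);
	}

	int
	main(void)
	{
		uint64_t mask = (1ULL << 1) | (1ULL << 3) | (1ULL << 6);

		printf("%u\n", mask_advance(mask, 3, 2)); /* prints 1 */
		return (0);
	}
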
+
+#if defined(SMP) && defined(SCHED_ULE)
+extern struct cpu_group *cpu_top; /* CPU topology */
+
+static int
+find_child_with_core(int cpu, struct cpu_group *grp)
+{
+ int i;
+
+ if (grp->cg_children == 0)
+ return -1;
+
+ MPASS(grp->cg_child);
+ for (i = 0; i < grp->cg_children; i++) {
+ if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
+ return i;
+ }
+
+ return -1;
+}
+
+/*
+ * Find an L2 neighbor of the given CPU or return -1 if none found. This
+ * does not distinguish among multiple L2 neighbors if the given CPU has
+ * more than one (it will always return the same result in that case).
+ */
+static int
+find_l2_neighbor(int cpu)
+{
+ struct cpu_group *grp;
+ int i;
+
+ grp = cpu_top;
+ if (grp == NULL)
+ return -1;
+
+ /*
+ * Find the smallest CPU group that contains the given core.
+ */
+ i = 0;
+ while ((i = find_child_with_core(cpu, grp)) != -1) {
+ /*
+ * If the smallest group containing the given CPU has fewer
+ * than two members, we conclude the given CPU has no
+ * L2 neighbor.
+ */
+ if (grp->cg_child[i].cg_count <= 1)
+ return (-1);
+ grp = &grp->cg_child[i];
+ }
+
+ /* Must share L2. */
+ if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
+ return -1;
+
+ /*
+ * Select the first member of the set that isn't the reference
+ * CPU, which at this point is guaranteed to exist.
+ */
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
+ return (i);
+ }
+
+ /* Should never be reached */
+ return (-1);
+}
+
+#else
+static int
+find_l2_neighbor(int cpu)
+{
+
+ return (-1);
+}
+#endif
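
find_l2_neighbor() descends cpu_top to the smallest group containing
the CPU, then requires that group to share at least an L2 cache. The
following toy model of that descent uses simplified structs local to
the sketch (not the kernel's struct cpu_group) and a hypothetical
4-core/8-thread topology in which CPUs (0,4), (1,5), (2,6) and (3,7)
are SMT siblings sharing an L2; the lookup for CPU 2 returns 6:

	#include <stdio.h>

	struct toy_group {
		unsigned int	 mask;		/* CPUs in this group */
		int		 level;		/* 2 = shares L2, 3 = L3 */
		struct toy_group *child;
		int		 children;
		int		 count;
	};

	static int
	toy_child_with_cpu(int cpu, struct toy_group *grp)
	{
		for (int i = 0; i < grp->children; i++)
			if (grp->child[i].mask & (1u << cpu))
				return (i);
		return (-1);
	}

	static int
	toy_l2_neighbor(struct toy_group *top, int cpu)
	{
		struct toy_group *grp = top;
		int i;

		/* Descend to the smallest group containing cpu. */
		while ((i = toy_child_with_cpu(cpu, grp)) != -1) {
			if (grp->child[i].count <= 1)
				return (-1);
			grp = &grp->child[i];
		}
		/* Must share at least an L2 cache. */
		if (grp->level > 2)
			return (-1);
		for (i = 0; i < 32; i++)
			if ((grp->mask & (1u << i)) && i != cpu)
				return (i);
		return (-1);
	}

	int
	main(void)
	{
		/* SMT pairs (0,4) (1,5) (2,6) (3,7), each sharing an L2. */
		struct toy_group pairs[4] = {
			{ 0x11, 2, NULL, 0, 2 }, { 0x22, 2, NULL, 0, 2 },
			{ 0x44, 2, NULL, 0, 2 }, { 0x88, 2, NULL, 0, 2 },
		};
		struct toy_group top = { 0xff, 3, pairs, 4, 8 };

		printf("%d\n", toy_l2_neighbor(&top, 2)); /* prints 6 */
		return (0);
	}
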
+
+/*
+ * CPU mapping behaviors
+ * ---------------------
+ * 'separate txrx' refers to the separate_txrx sysctl
+ * 'use logical' refers to the use_logical_cores sysctl
+ * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
+ *
+ *  separate     use     INTR
+ *    txrx     logical   CPUS   result
+ * ---------- --------- ------ ------------------------------------------------
+ *     -          -       X     RX and TX queues mapped to consecutive physical
+ *                              cores with RX/TX pairs on same core and excess
+ *                              of either following
+ *     -          X       X     RX and TX queues mapped to consecutive cores
+ *                              of any type with RX/TX pairs on same core and
+ *                              excess of either following
+ *     X          -       X     RX and TX queues mapped to consecutive physical
+ *                              cores; all RX then all TX
+ *     X          X       X     RX queues mapped to consecutive physical cores
+ *                              first, then TX queues mapped to L2 neighbor of
+ *                              the corresponding RX queue if one exists,
+ *                              otherwise to consecutive physical cores
+ *     -         n/a      -     RX and TX queues mapped to consecutive cores of
+ *                              any type with RX/TX pairs on same core and excess
+ *                              of either following
+ *     X         n/a      -     RX and TX queues mapped to consecutive cores of
+ *                              any type; all RX then all TX
+ */
+static unsigned int
+get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
+ bool is_tx)
+{
+ if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
+ unsigned int core_index;
+
+ if (ctx->ifc_sysctl_separate_txrx) {
+ /*
+ * When using separate CPUs for TX and RX, the assignment
+ * will always be of a consecutive CPU out of the set of
+ * context CPUs, except for the specific case where the
+ * context CPUs are physical cores, the use of logical cores
+ * has been enabled, the assignment is for TX, the TX qid
+ * corresponds to an RX qid, and the CPU assigned to the
+ * corresponding RX queue has an L2 neighbor.
+ */
+ if (ctx->ifc_sysctl_use_logical_cores &&
+ ctx->ifc_cpus_are_physical_cores &&
+ is_tx && qid < scctx->isc_nrxqsets) {
+ int l2_neighbor;
+ unsigned int rx_cpuid;
+
+ rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
+ l2_neighbor = find_l2_neighbor(rx_cpuid);
+ if (l2_neighbor != -1) {
+ return (l2_neighbor);
+ }
+ /*
+ * ... else fall through to the normal
+ * consecutive-after-RX assignment scheme.
+ *
+ * Note that we are assuming that all RX queue CPUs
+ * have an L2 neighbor, or all do not. If a mixed
+ * scenario is possible, we will have to keep track
+ * separately of how many queues prior to this one
+ * were not able to be assigned to an L2 neighbor.
+ */
+ }
+ if (is_tx)
+ core_index = scctx->isc_nrxqsets + qid;
+ else
+ core_index = qid;
+ } else {
+ core_index = qid;
+ }
+
+ return (cpuid_advance(ctx, base_cpuid, core_index));
+}
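
Tracing the table with hypothetical numbers: suppose
bus_get_cpus(INTR_CPUS) returned the physical cores {0, 2, 4, 6}, each
with an SMT sibling at CPU ID + 1, isc_nrxqsets = isc_ntxqsets = 4,
base_cpuid = 0, and both separate_txrx and use_logical_cores set. Then:

	RX queues 0-3 -> CPUs 0, 2, 4, 6   (consecutive context CPUs)
	TX queues 0-3 -> CPUs 1, 3, 5, 7   (L2 neighbor of each RX CPU)

With use_logical_cores clear, TX queue qid instead gets core_index =
isc_nrxqsets + qid, so TX queues 0-3 wrap around the four-member
context set and land back on CPUs 0, 2, 4, 6.
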
+
static uint16_t
get_ctx_core_offset(if_ctx_t ctx)
{
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
struct cpu_offset *op;
- uint16_t qc;
- uint16_t ret = ctx->ifc_sysctl_core_offset;
+ cpuset_t assigned_cpus;
+ unsigned int cores_consumed;
+ unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
+ unsigned int first_valid;
+ unsigned int last_valid;
+ unsigned int i;
- if (ret != CORE_OFFSET_UNSPECIFIED)
- return (ret);
+ first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
+ last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
- if (ctx->ifc_sysctl_separate_txrx)
- qc = scctx->isc_ntxqsets + scctx->isc_nrxqsets;
- else
- qc = max(scctx->isc_ntxqsets, scctx->isc_nrxqsets);
+ if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
+ /*
+ * Align the user-chosen base CPU ID to the next valid CPU
+ * for this device. If the chosen base CPU ID is smaller
+ * than the first valid CPU or larger than the last valid
+ * CPU, we assume the user does not know what the valid
+ * range is for this device and is thinking in terms of a
+ * zero-based reference frame, and so we shift the given
+ * value into the valid range (and wrap accordingly) so the
+ * intent is translated to the proper frame of reference.
+ * If the base CPU ID is within the valid first/last, but
+ * does not correspond to a valid CPU, it is advanced to the
+ * next valid CPU (wrapping if necessary).
+ */
+ if (base_cpuid < first_valid || base_cpuid > last_valid) {
+ /* shift from zero-based to first_valid-based */
+ base_cpuid += first_valid;
+ /* wrap to range [first_valid, last_valid] */
+ base_cpuid = (base_cpuid - first_valid) %
+ (last_valid - first_valid + 1);
+ }
+ if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
+ /*
+ * base_cpuid is in [first_valid, last_valid], but
+ * not a member of the valid set. In this case,
+ * there will always be a member of the valid set
+ * with a CPU ID that is greater than base_cpuid,
+ * and we simply advance to it.
+ */
+ while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
+ base_cpuid++;
+ }
+ return (base_cpuid);
+ }
+
+ /*
+ * Determine how many cores will be consumed by performing the CPU
+ * assignments and counting how many of the assigned CPUs correspond
+ * to CPUs in the set of context CPUs. This is done using the CPU
+ * ID first_valid as the base CPU ID, as the base CPU must be within
+ * the set of context CPUs.
+ *
+ * Note that not all assigned CPUs will be in the set of context
+ * CPUs when separate CPUs are being allocated to TX and RX queues,
+ * assignment to logical cores has been enabled, the set of context
+ * CPUs contains only physical CPUs, and TX queues are mapped to L2
+ * neighbors of the CPUs to which the RX queues have been mapped.
+ * In that case we only want to count how many CPUs in the set of
+ * context CPUs have been consumed, as that determines the next CPU
+ * in that set at which to start allocating for the next device for
+ * which core_offset is not set.
+ */
+ CPU_ZERO(&assigned_cpus);
+ for (i = 0; i < scctx->isc_ntxqsets; i++)
+ CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
+ &assigned_cpus);
+ for (i = 0; i < scctx->isc_nrxqsets; i++)
+ CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
+ &assigned_cpus);
+ CPU_AND(&assigned_cpus, &ctx->ifc_cpus);
+ cores_consumed = CPU_COUNT(&assigned_cpus);
mtx_lock(&cpu_offset_mtx);
SLIST_FOREACH(op, &cpu_offsets, entries) {
if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
- ret = op->offset;
- op->offset += qc;
+ base_cpuid = op->next_cpuid;
+ op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
+ cores_consumed);
MPASS(op->refcount < UINT_MAX);
op->refcount++;
break;
}
}
- if (ret == CORE_OFFSET_UNSPECIFIED) {
- ret = 0;
+ if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
+ base_cpuid = first_valid;
op = malloc(sizeof(struct cpu_offset), M_IFLIB,
M_NOWAIT | M_ZERO);
if (op == NULL) {
device_printf(ctx->ifc_dev,
"allocation for cpu offset failed.\n");
} else {
- op->offset = qc;
+ op->next_cpuid = cpuid_advance(ctx, base_cpuid,
+ cores_consumed);
op->refcount = 1;
CPU_COPY(&ctx->ifc_cpus, &op->set);
SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
@@ -4724,7 +4976,7 @@ get_ctx_core_offset(if_ctx_t ctx)
}
mtx_unlock(&cpu_offset_mtx);
- return (ret);
+ return (base_cpuid);
}
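
Two details of get_ctx_core_offset() are worth tracing with
hypothetical numbers. First, the alignment of a user-supplied
core_offset: with a context CPU set of {0, 2, 4, 6} (first_valid = 0,
last_valid = 6), a core_offset of 10 is out of range, wraps to
10 % 7 = 3, and is then advanced to CPU 4, the next member of the set.
Second, cores_consumed counts only assigned CPUs that are also context
CPUs: in the L2-neighbor scenario sketched earlier (RX on 0, 2, 4, 6;
TX on the siblings 1, 3, 5, 7) only the four RX CPUs count, so the
stored next_cpuid wraps all the way around and the next device without
a core_offset starts allocating at CPU 0 again.
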
static void
@@ -4855,7 +5107,9 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct
if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
device_printf(dev, "Unable to fetch CPU list\n");
CPU_COPY(&all_cpus, &ctx->ifc_cpus);
- }
+ ctx->ifc_cpus_are_physical_cores = false;
+ } else
+ ctx->ifc_cpus_are_physical_cores = true;
MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
/*
@@ -5970,128 +6224,6 @@ iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
}
-#ifdef SMP
-static int
-find_nth(if_ctx_t ctx, int qid)
-{
- cpuset_t cpus;
- int i, cpuid, eqid, count;
-
- CPU_COPY(&ctx->ifc_cpus, &cpus);
- count = CPU_COUNT(&cpus);
- eqid = qid % count;
- /* clear up to the qid'th bit */
- for (i = 0; i < eqid; i++) {
- cpuid = CPU_FFS(&cpus);
- MPASS(cpuid != 0);
- CPU_CLR(cpuid-1, &cpus);
- }
- cpuid = CPU_FFS(&cpus);
- MPASS(cpuid != 0);
- return (cpuid-1);
-}
-
-#ifdef SCHED_ULE
-extern struct cpu_group *cpu_top; /* CPU topology */
-
-static int
-find_child_with_core(int cpu, struct cpu_group *grp)
-{
- int i;
-
- if (grp->cg_children == 0)
- return -1;
-
- MPASS(grp->cg_child);
- for (i = 0; i < grp->cg_children; i++) {
- if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
- return i;
- }
-
- return -1;
-}
-
-/*
- * Find the nth "close" core to the specified core
- * "close" is defined as the deepest level that shares
- * at least an L2 cache. With threads, this will be
- * threads on the same core. If the shared cache is L3
- * or higher, simply returns the same core.
- */
-static int
-find_close_core(int cpu, int core_offset)
-{
- struct cpu_group *grp;
- int i;
- int fcpu;
- cpuset_t cs;
-
- grp = cpu_top;
- if (grp == NULL)
- return cpu;
- i = 0;
- while ((i = find_child_with_core(cpu, grp)) != -1) {
- /* If the child only has one cpu, don't descend */
- if (grp->cg_child[i].cg_count <= 1)
- break;
- grp = &grp->cg_child[i];
- }
-
- /* If they don't share at least an L2 cache, use the same CPU */
- if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
- return cpu;
-
- /* Now pick one */
- CPU_COPY(&grp->cg_mask, &cs);
-
- /* Add the selected CPU offset to core offset. */
- for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) {
- if (fcpu - 1 == cpu)
- break;
- CPU_CLR(fcpu - 1, &cs);
- }
- MPASS(fcpu);
-
- core_offset += i;
-
- CPU_COPY(&grp->cg_mask, &cs);
- for (i = core_offset % grp->cg_count; i > 0; i--) {
- MPASS(CPU_FFS(&cs));
- CPU_CLR(CPU_FFS(&cs) - 1, &cs);
- }
- MPASS(CPU_FFS(&cs));
- return CPU_FFS(&cs) - 1;
-}
-#else
-static int
-find_close_core(int cpu, int core_offset __unused)
-{
- return cpu;
-}
-#endif
-
-static int
-get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid)
-{
- switch (type) {
- case IFLIB_INTR_TX:
- /* TX queues get cores which share at least an L2 cache with the corresponding RX queue */
- /* XXX handle multiple RX threads per core and more than two core per L2 group */
- return qid / CPU_COUNT(&ctx->ifc_cpus) + 1;
- case IFLIB_INTR_RX:
- case IFLIB_INTR_RXTX:
- /* RX queues get the specified core */
- return qid / CPU_COUNT(&ctx->ifc_cpus);
- default:
- return -1;
- }
-}
-#else
-#define get_core_offset(ctx, type, qid) CPU_FIRST()
-#define find_close_core(cpuid, tid) CPU_FIRST()
-#define find_nth(ctx, gid) CPU_FIRST()
-#endif
-
/* Just to avoid copy/paste */
static inline int
iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
@@ -6099,21 +6231,14 @@ iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
const char *name)
{
device_t dev;
- int co, cpuid, err, tid;
+ unsigned int base_cpuid, cpuid;
+ int err;
dev = ctx->ifc_dev;
- co = ctx->ifc_sysctl_core_offset;
- if (ctx->ifc_sysctl_separate_txrx && type == IFLIB_INTR_TX)
- co += ctx->ifc_softc_ctx.isc_nrxqsets;
- cpuid = find_nth(ctx, qid + co);
- tid = get_core_offset(ctx, type, qid);
- if (tid < 0) {
- device_printf(dev, "get_core_offset failed\n");
- return (EOPNOTSUPP);
- }
- cpuid = find_close_core(cpuid, tid);
- err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev, irq->ii_res,
- name);
+ base_cpuid = ctx->ifc_sysctl_core_offset;
+ cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
+ err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
+ irq ? irq->ii_res : NULL, name);
if (err) {
device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
return (err);
@@ -6202,8 +6327,8 @@ iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
return (0);
if (tqrid != -1) {
- err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
- q, name);
+ err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
+ name);
if (err)
return (err);
} else {
@@ -6216,6 +6341,7 @@ iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
void
iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
{
+ device_t dev;
struct grouptask *gtask;
struct taskqgroup *tqg;
gtask_fn_t *fn;
@@ -6247,14 +6373,11 @@ iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
default:
panic("unknown net intr type");
}
- if (irq != NULL) {
- err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
- q, name);
- if (err)
- taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev,
- irq->ii_res, name);
- } else {
- taskqgroup_attach(tqg, gtask, q, NULL, NULL, name);
+ err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
+ if (err) {
+ dev = ctx->ifc_dev;
+ taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
+ name);
}
}
@@ -6736,6 +6859,9 @@ iflib_add_device_sysctl_pre(if_ctx_t ctx)
SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
"use separate cores for TX and RX");
+ SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
+ CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
+ "try to make use of logical cores for TX and RX");
/* XXX change for per-queue sizes */
SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
@@ -6780,6 +6906,9 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
queue_list = SYSCTL_CHILDREN(queue_node);
+ SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
+ CTLFLAG_RD,
+ &txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
#if MEMORY_LOGGING
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
CTLFLAG_RD,
@@ -6862,6 +6991,9 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
queue_list = SYSCTL_CHILDREN(queue_node);
+ SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
+ CTLFLAG_RD,
+ &rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
CTLFLAG_RD,
diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h
index c48a0440d808..1a96bb4766ce 100644
--- a/sys/sys/cpuset.h
+++ b/sys/sys/cpuset.h
@@ -65,6 +65,7 @@
#define CPU_OR_ATOMIC(d, s) BIT_OR_ATOMIC(CPU_SETSIZE, d, s)
#define CPU_COPY_STORE_REL(f, t) BIT_COPY_STORE_REL(CPU_SETSIZE, f, t)
#define CPU_FFS(p) BIT_FFS(CPU_SETSIZE, p)
+#define CPU_FLS(p) BIT_FLS(CPU_SETSIZE, p)
#define CPU_COUNT(p) ((int)BIT_COUNT(CPU_SETSIZE, p))
#define CPUSET_FSET BITSET_FSET(_NCPUWORDS)
#define CPUSET_T_INITIALIZER BITSET_T_INITIALIZER
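
For reference, the new CPU_FLS() mirrors CPU_FFS(): BIT_FLS() returns
the one-based index of the highest set bit, or 0 for an empty set,
which is why the iflib code above subtracts 1 from both. A minimal
usage sketch, assuming a context where the cpuset macros and printf
are available:

	cpuset_t set;

	CPU_ZERO(&set);
	CPU_SET(5, &set);
	CPU_SET(9, &set);
	printf("%d %d\n", CPU_FFS(&set), CPU_FLS(&set)); /* prints "6 10" */
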