aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeff Roberson <jeff@FreeBSD.org>2020-01-31 00:49:51 +0000
committerJeff Roberson <jeff@FreeBSD.org>2020-01-31 00:49:51 +0000
commitd4665eaa663886465163c4c40133a3278bb59ea4 (patch)
tree56b3821745ada1e4eb7238b59c0d4b5539d59887
parent5a02cd314d7cff8bcecb38c647255b99888275a9 (diff)
downloadsrc-d4665eaa663886465163c4c40133a3278bb59ea4.tar.gz
src-d4665eaa663886465163c4c40133a3278bb59ea4.zip
Implement a safe memory reclamation feature that is tightly coupled with UMA.
This is in the same family of algorithms as Epoch/QSBR/RCU/PARSEC but is a unique algorithm. This has 3x the performance of epoch in a write heavy workload with less than half of the read side cost. The memory overhead is significantly lessened by limiting the free-to-use latency. A synthetic test uses 1/20th of the memory vs Epoch. There is significant further discussion in the comments and code review. This code should be considered experimental. I will write a man page after it has settled. After further validation the VM will begin using this feature to permit lockless page lookups. Both markj and cperciva tested on arm64 at large core counts to verify fences on weaker ordering architectures. I will commit a stress testing tool in a follow-up. Reviewed by: mmacy, markj, rlibby, hselasky Discussed with: sbahara Differential Revision: https://reviews.freebsd.org/D22586
Notes
Notes: svn path=/head/; revision=357314
-rw-r--r--sys/conf/files1
-rw-r--r--sys/kern/subr_smr.c387
-rw-r--r--sys/sys/_smr.h38
-rw-r--r--sys/sys/smr.h180
-rw-r--r--sys/vm/uma.h41
-rw-r--r--sys/vm/uma_core.c422
-rw-r--r--sys/vm/uma_int.h10
7 files changed, 972 insertions, 107 deletions
diff --git a/sys/conf/files b/sys/conf/files
index 7de5055b5f00..09282c8acb53 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3848,6 +3848,7 @@ kern/subr_scanf.c standard
kern/subr_sglist.c standard
kern/subr_sleepqueue.c standard
kern/subr_smp.c standard
+kern/subr_smr.c standard
kern/subr_stack.c optional ddb | stack | ktr
kern/subr_stats.c optional stats
kern/subr_taskqueue.c standard
diff --git a/sys/kern/subr_smr.c b/sys/kern/subr_smr.c
new file mode 100644
index 000000000000..24df86651043
--- /dev/null
+++ b/sys/kern/subr_smr.c
@@ -0,0 +1,387 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 Jeffrey Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/smr.h>
+
+#include <vm/uma.h>
+
+/*
+ * This is a novel safe memory reclamation technique inspired by
+ * epoch based reclamation from Samy Al Bahra's concurrency kit which
+ * in turn was based on work described in:
+ * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
+ * of Cambridge Computing Laboratory.
+ * And shares some similarities with:
+ * Wang, Stamler, Parmer. 2016 Parallel Sections: Scaling System-Level
+ * Data-Structures
+ *
+ * This is not an implementation of hazard pointers or related
+ * techniques. The term safe memory reclamation is used as a
+ * generic descriptor for algorithms that defer frees to avoid
+ * use-after-free errors with lockless datastructures.
+ *
+ * The basic approach is to maintain a monotonic write sequence
+ * number that is updated on some application defined granularity.
+ * Readers record the most recent write sequence number they have
+ * observed. A shared read sequence number records the lowest
+ * sequence number observed by any reader as of the last poll. Any
+ * write older than this value has been observed by all readers
+ * and memory can be reclaimed. Like Epoch we also detect idle
+ * readers by storing an invalid sequence number in the per-cpu
+ * state when the read section exits. Like Parsec we establish
+ * a global write clock that is used to mark memory on free.
+ *
+ * The write and read sequence numbers can be thought of as a two
+ * handed clock with readers always advancing towards writers. SMR
+ * maintains the invariant that all readers can safely access memory
+ * that was visible at the time they loaded their copy of the sequence
+ * number. Periodically the read sequence or hand is polled and
+ * advanced as far towards the write sequence as active readers allow.
+ * Memory which was freed between the old and new global read sequence
+ * number can now be reclaimed. When the system is idle the two hands
+ * meet and no deferred memory is outstanding. Readers never advance
+ * any sequence number, they only observe them. The shared read
+ * sequence number is consequently never higher than the write sequence.
+ * A stored sequence number that falls outside of this range has expired
+ * and needs no scan to reclaim.
+ *
+ * A notable distinction between this SMR and Epoch, qsbr, rcu, etc. is
+ * that advancing the sequence number is decoupled from detecting its
+ * observation. This results in a more granular assignment of sequence
+ * numbers even as read latencies prohibit all or some expiration.
+ * It also allows writers to advance the sequence number and save the
+ * poll for expiration until a later time when it is likely to
+ * complete without waiting. The batch granularity and free-to-use
+ * latency is dynamic and can be significantly smaller than in more
+ * strict systems.
+ *
+ * This mechanism is primarily intended to be used in coordination with
+ * UMA. By integrating with the allocator we avoid all of the callout
+ * queue machinery and are provided with an efficient way to batch
+ * sequence advancement and waiting. The allocator accumulates a full
+ * per-cpu cache of memory before advancing the sequence. It then
+ * delays waiting for this sequence to expire until the memory is
+ * selected for reuse. In this way we only increment the sequence
+ * value once for n=cache-size frees and the waits are done long
+ * after the sequence has been expired so they need only be verified
+ * to account for pathological conditions and to advance the read
+ * sequence. Tying the sequence number to the bucket size has the
+ * nice property that as the zone gets busier the buckets get larger
+ * and the sequence writes become fewer. If the coherency of advancing
+ * the write sequence number becomes too costly we can advance
+ * it for every N buckets in exchange for higher free-to-use
+ * latency and consequently higher memory consumption.
+ *
+ * If the read overhead of accessing the shared cacheline becomes
+ * especially burdensome an invariant TSC could be used in place of the
+ * sequence. The algorithm would then only need to maintain the minimum
+ * observed tsc. This would trade potential cache synchronization
+ * overhead for local serialization and cpu timestamp overhead.
+ */
+
+/*
+ * A simplified diagram:
+ *
+ * 0 UINT_MAX
+ * | -------------------- sequence number space -------------------- |
+ * ^ rd seq ^ wr seq
+ * | ----- valid sequence numbers ---- |
+ * ^cpuA ^cpuC
+ * | -- free -- | --------- deferred frees -------- | ---- free ---- |
+ *
+ *
+ * In this example cpuA has the lowest sequence number and poll can
+ * advance rd seq. cpuB is not running and is considered to observe
+ * wr seq.
+ *
+ * Freed memory that is tagged with a sequence number between rd seq and
+ * wr seq can not be safely reclaimed because cpuA may hold a reference to
+ * it. Any other memory is guaranteed to be unreferenced.
+ *
+ * Any writer is free to advance wr seq at any time however it may busy
+ * poll in pathological cases.
+ */
+
+static uma_zone_t smr_shared_zone;
+static uma_zone_t smr_zone;
+
+#ifndef INVARIANTS
+/*
+ * The initial value is odd and the increment is even, so every valid
+ * sequence number is odd and can never equal SMR_SEQ_INVALID (0).
+ */
+#define SMR_SEQ_INIT 1 /* All valid sequence numbers are odd. */
+#define SMR_SEQ_INCR 2
+
+/*
+ * SMR_SEQ_MAX_DELTA is the maximum distance allowed between rd_seq and
+ * wr_seq. For the modular arithmetic to work a value of UINT_MAX / 2
+ * would be possible but it is checked after we increment the wr_seq so
+ * a safety margin is left to prevent overflow.
+ *
+ * We will block until SMR_SEQ_MAX_ADVANCE sequence numbers have progressed
+ * to prevent integer wrapping. See smr_advance() for more details.
+ */
+#define SMR_SEQ_MAX_DELTA (UINT_MAX / 4)
+#define SMR_SEQ_MAX_ADVANCE (SMR_SEQ_MAX_DELTA - 1024)
+#else
+/*
+ * We want to test the wrapping feature in invariants kernels. The
+ * initial value is still odd and the increment still even.
+ */
+#define SMR_SEQ_INCR (UINT_MAX / 10000)
+#define SMR_SEQ_INIT (UINT_MAX - 100000)
+/* Force extra polls to test the integer overflow detection. */
+#define SMR_SEQ_MAX_DELTA (1000)
+/* Parenthesized so the expansion is safe inside any expression. */
+#define SMR_SEQ_MAX_ADVANCE (SMR_SEQ_MAX_DELTA / 2)
+#endif /* !INVARIANTS */
+
+/*
+ * Advance the write sequence and return the new value for use as the
+ * wait goal. This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * The returned goal can be handed to smr_poll() or smr_wait() at a
+ * later time.
+ *
+ * This function may busy loop if the readers are at least
+ * SMR_SEQ_MAX_DELTA sequence numbers (roughly 1 billion in
+ * non-INVARIANTS kernels) behind the writers.
+ */
+smr_seq_t
+smr_advance(smr_t smr)
+{
+ smr_shared_t s;
+ smr_seq_t goal;
+
+ /*
+ * It is illegal to enter while in an smr section. smr_enter()
+ * holds a critical section for the duration of the read section,
+ * so the assertion below rejects callers inside one.
+ */
+ KASSERT(curthread->td_critnest == 0,
+ ("smr_advance: Not allowed in a critical section."));
+
+ /*
+ * Modifications not done in a smr section need to be visible
+ * before advancing the seq.
+ */
+ atomic_thread_fence_rel();
+
+ /*
+ * Increment the shared write sequence by SMR_SEQ_INCR. Since the
+ * initial value is odd and the increment is even the only valid
+ * values are odd and an observed value of 0 in a particular CPU
+ * means it is not currently in a read section.
+ *
+ * The fetchadd result plus the increment is used as the goal.
+ */
+ s = smr->c_shared;
+ goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
+
+ /*
+ * Force a synchronization here if the goal is getting too
+ * far ahead of the read sequence number. This keeps the
+ * wrap detecting arithmetic working in pathological cases.
+ * We only wait for a point SMR_SEQ_MAX_ADVANCE behind the new
+ * goal, not for the goal itself.
+ */
+ if (goal - atomic_load_int(&s->s_rd_seq) >= SMR_SEQ_MAX_DELTA)
+ smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
+
+ return (goal);
+}
+
+/*
+ * Poll to determine whether all readers have observed the 'goal' write
+ * sequence number.
+ *
+ * smr is the SMR context, goal is a sequence number previously returned
+ * by smr_advance(), and wait selects between spinning and a single pass.
+ *
+ * If wait is true this will spin until the goal is met.
+ *
+ * This routine will update the minimum observed read sequence number in
+ * s_rd_seq if it does a scan. It may not do a scan if another call has
+ * advanced s_rd_seq beyond the caller's goal already.
+ *
+ * Returns true if the goal is met and false if not.
+ */
+bool
+smr_poll(smr_t smr, smr_seq_t goal, bool wait)
+{
+ smr_shared_t s;
+ smr_t c;
+ smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
+ int i;
+ bool success;
+
+ /*
+ * It is illegal to wait while in an smr section. A non-blocking
+ * poll (wait == false) is not subject to this assertion.
+ */
+ KASSERT(!wait || curthread->td_critnest == 0,
+ ("smr_poll: Blocking not allowed in a critical section."));
+
+ /*
+ * Use a critical section so that we can avoid ABA races
+ * caused by long preemption sleeps.
+ */
+ success = true;
+ critical_enter();
+ s = smr->c_shared;
+
+ /*
+ * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
+ * observe an updated read sequence that is larger than write.
+ */
+ s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ s_wr_seq = smr_current(smr);
+
+ /*
+ * Detect whether the goal is valid and has already been observed.
+ *
+ * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
+ * it to be valid. If it is not then the caller held on to it and
+ * the integer wrapped. If we wrapped back within range the caller
+ * will harmlessly scan.
+ *
+ * A valid goal must be greater than s_rd_seq or we have not verified
+ * that it has been observed and must fall through to polling.
+ *
+ * Both early-exit cases return with success still set to true.
+ */
+ if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
+ goto out;
+
+ /*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive. Keep track of the oldest sequence currently
+ * active as rd_seq.
+ */
+ rd_seq = s_wr_seq;
+ CPU_FOREACH(i) {
+ c = zpcpu_get_cpu(smr, i);
+ c_seq = SMR_SEQ_INVALID;
+ for (;;) {
+ c_seq = atomic_load_int(&c->c_seq);
+ if (c_seq == SMR_SEQ_INVALID)
+ break;
+
+ /*
+ * There is a race described in smr.h:smr_enter that
+ * can lead to a stale seq value but not stale data
+ * access. If we find a value out of range here we
+ * pin it to the current min to prevent it from
+ * advancing until that stale section has expired.
+ *
+ * The race is created when a cpu loads the s_wr_seq
+ * value in a local register and then another thread
+ * advances s_wr_seq and calls smr_poll() which will
+ * observe no value yet in c_seq and advance s_rd_seq
+ * up to s_wr_seq which is beyond the register
+ * cached value. This is only likely to happen on
+ * hypervisor or with a system management interrupt.
+ */
+ if (SMR_SEQ_LT(c_seq, s_rd_seq))
+ c_seq = s_rd_seq;
+
+ /*
+ * If the sequence number meets the goal we are
+ * done with this cpu.
+ */
+ if (SMR_SEQ_GEQ(c_seq, goal))
+ break;
+
+ /*
+ * If we're not waiting we will still scan the rest
+ * of the cpus and update s_rd_seq before returning
+ * an error.
+ */
+ if (!wait) {
+ success = false;
+ break;
+ }
+ cpu_spinwait();
+ }
+
+ /*
+ * Limit the minimum observed rd_seq whether we met the goal
+ * or not. CPUs that are not in a read section do not
+ * constrain it.
+ */
+ if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
+ rd_seq = c_seq;
+ }
+
+ /*
+ * Advance the rd_seq as long as we observed the most recent one.
+ * The loop gives up as soon as the shared value is already at or
+ * beyond what this scan computed, so s_rd_seq never moves backwards.
+ */
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ do {
+ if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
+ break;
+ } while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);
+
+out:
+ critical_exit();
+
+ return (success);
+}
+
+/*
+ * Allocate and initialize a new SMR context.
+ *
+ * The 'name' pointer is stored as-is in the shared state for
+ * debugging/reporting; it is not copied. Both the read and the write
+ * sequence start at SMR_SEQ_INIT and every per-cpu slot starts outside
+ * of a read section.
+ *
+ * Returns the per-cpu SMR handle.
+ */
+smr_t
+smr_create(const char *name)
+{
+	smr_shared_t shared;
+	smr_t base, pcpu;
+	int cpu;
+
+	shared = uma_zalloc(smr_shared_zone, M_WAITOK);
+	base = uma_zalloc(smr_zone, M_WAITOK);
+
+	shared->s_name = name;
+	shared->s_wr_seq = SMR_SEQ_INIT;
+	shared->s_rd_seq = SMR_SEQ_INIT;
+
+	/* Cover every possible CPU id, not only the ones now running. */
+	cpu = 0;
+	while (cpu <= mp_maxid) {
+		pcpu = zpcpu_get_cpu(base, cpu);
+		pcpu->c_seq = SMR_SEQ_INVALID;
+		pcpu->c_shared = shared;
+		cpu++;
+	}
+
+	/* Order the initialization above before the handle is returned. */
+	atomic_thread_fence_seq_cst();
+
+	return (base);
+}
+
+/*
+ * Tear down an SMR context made by smr_create().
+ *
+ * The write sequence is advanced and all readers are waited for before
+ * the shared and per-cpu state are handed back to their zones.
+ */
+void
+smr_destroy(smr_t smr)
+{
+	smr_shared_t shared;
+
+	smr_synchronize(smr);
+
+	/* Pick up the shared state before the per-cpu memory is freed. */
+	shared = smr->c_shared;
+	uma_zfree(smr_shared_zone, shared);
+	uma_zfree(smr_zone, smr);
+}
+
+/*
+ * Create the two UMA zones that back SMR contexts: one for the shared
+ * state and a per-cpu (UMA_ZONE_PCPU) zone for the per-cpu state. Both
+ * use an alignment mask of two cache lines.
+ */
+void
+smr_init(void)
+{
+	const int align_mask = (CACHE_LINE_SIZE * 2) - 1;
+
+	smr_shared_zone = uma_zcreate("SMR SHARED",
+	    sizeof(struct smr_shared), NULL, NULL, NULL, NULL,
+	    align_mask, 0);
+	smr_zone = uma_zcreate("SMR CPU",
+	    sizeof(struct smr), NULL, NULL, NULL, NULL,
+	    align_mask, UMA_ZONE_PCPU);
+}
diff --git a/sys/sys/_smr.h b/sys/sys/_smr.h
new file mode 100644
index 000000000000..0a8804d3b4ac
--- /dev/null
+++ b/sys/sys/_smr.h
@@ -0,0 +1,38 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019,2020 Jeffrey Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _SYS__SMR_H_
+#define _SYS__SMR_H_
+
+/* SMR sequence number: a 32-bit unsigned value. */
+typedef uint32_t smr_seq_t;
+/* Handle to an SMR context; struct smr itself is defined in sys/smr.h. */
+typedef struct smr *smr_t;
+
+#endif /* _SYS__SMR_H_ */
diff --git a/sys/sys/smr.h b/sys/sys/smr.h
new file mode 100644
index 000000000000..4a6d8671bb7a
--- /dev/null
+++ b/sys/sys/smr.h
@@ -0,0 +1,180 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019,2020 Jeffrey Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _SYS_SMR_H_
+#define _SYS_SMR_H_
+
+#include <sys/_smr.h>
+
+/*
+ * Safe memory reclamation. See subr_smr.c for a description of the
+ * algorithm.
+ *
+ * Readers synchronize with smr_enter()/exit() and writers may either
+ * free directly to a SMR UMA zone or use smr_synchronize or wait.
+ */
+
+/*
+ * Modular arithmetic for comparing sequence numbers that have
+ * potentially wrapped. Copied from tcp_seq.h.
+ *
+ * Each macro evaluates the sign of the 32-bit difference (a - b), so
+ * the result is only meaningful while the two values are less than
+ * 2^31 apart.
+ */
+#define SMR_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0)
+#define SMR_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0)
+#define SMR_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0)
+#define SMR_SEQ_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
+
+/* Value held in a per-cpu c_seq while that CPU is not in a read section. */
+#define SMR_SEQ_INVALID 0
+
+/*
+ * Shared SMR state. One instance is referenced by every per-cpu
+ * struct smr of a context through c_shared.
+ */
+struct smr_shared {
+ const char *s_name; /* Name for debugging/reporting. */
+ smr_seq_t s_wr_seq; /* Current write sequence #. */
+ smr_seq_t s_rd_seq; /* Minimum observed read sequence. */
+};
+typedef struct smr_shared *smr_shared_t;
+
+/*
+ * Per-cpu SMR state. c_seq is SMR_SEQ_INVALID whenever the CPU is not
+ * inside a read section.
+ */
+struct smr {
+ smr_seq_t c_seq; /* Current observed sequence. */
+ smr_shared_t c_shared; /* Shared SMR state. */
+};
+
+/*
+ * Read the write sequence number currently published in the shared
+ * state of this SMR context.
+ */
+static inline smr_seq_t
+smr_current(smr_t smr)
+{
+	smr_shared_t shared;
+
+	shared = smr->c_shared;
+	return (atomic_load_int(&shared->s_wr_seq));
+}
+
+/*
+ * Enter a read section.
+ *
+ * A critical section is entered and held until the matching
+ * smr_exit(). Read sections do not nest: the per-cpu sequence must be
+ * SMR_SEQ_INVALID on entry.
+ */
+static inline void
+smr_enter(smr_t smr)
+{
+
+	critical_enter();
+	smr = zpcpu_get(smr);
+	KASSERT(smr->c_seq == SMR_SEQ_INVALID,
+	    ("smr_enter(%s) does not support recursion.",
+	    smr->c_shared->s_name));
+
+	/*
+	 * Store the current observed write sequence number in our
+	 * per-cpu state so that it can be queried via smr_poll().
+	 * Frees that are newer than this stored value will be
+	 * deferred until we call smr_exit().
+	 *
+	 * An acquire barrier is used to synchronize with smr_exit()
+	 * and smr_poll().
+	 *
+	 * It is possible that a long delay between loading the wr_seq
+	 * and storing the c_seq could create a situation where the
+	 * rd_seq advances beyond our stored c_seq. In this situation
+	 * only the observed wr_seq is stale, the fence still orders
+	 * the load. See smr_poll() for details on how this condition
+	 * is detected and handled there.
+	 */
+	/*
+	 * This is an add because we do not have atomic_store_acq_int.
+	 * c_seq is SMR_SEQ_INVALID (0) here, so the add leaves it
+	 * equal to the value just loaded.
+	 */
+	atomic_add_acq_int(&smr->c_seq, smr_current(smr));
+}
+
+/*
+ * Exit a read section.
+ *
+ * Must be paired with a preceding smr_enter() on the same CPU: the
+ * critical section entered there is asserted to still be held and is
+ * dropped here.
+ */
+static inline void
+smr_exit(smr_t smr)
+{
+
+ smr = zpcpu_get(smr);
+ CRITICAL_ASSERT(curthread);
+ KASSERT(smr->c_seq != SMR_SEQ_INVALID,
+ ("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
+
+ /*
+ * Clear the recorded sequence number. This allows poll() to
+ * detect CPUs not in read sections.
+ *
+ * Use release semantics to retire any stores before the sequence
+ * number is cleared.
+ */
+ atomic_store_rel_int(&smr->c_seq, SMR_SEQ_INVALID);
+ critical_exit();
+}
+
+/*
+ * Advances the write sequence number. Returns the sequence number
+ * required to ensure that all modifications are visible to readers.
+ */
+smr_seq_t smr_advance(smr_t smr);
+
+/*
+ * Returns true if a goal sequence has been reached. If
+ * wait is true this will busy loop until success.
+ */
+bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
+
+/* Create a new SMR context. */
+smr_t smr_create(const char *name);
+void smr_destroy(smr_t smr);
+
+/*
+ * Spin until every reader has observed 'goal'. This is smr_poll() with
+ * waiting enabled; its result is passed through to the caller.
+ */
+static inline bool
+smr_wait(smr_t smr, smr_seq_t goal)
+{
+	bool met;
+
+	met = smr_poll(smr, goal, true);
+	return (met);
+}
+
+/*
+ * Advance the write sequence and do not return until all readers have
+ * observed the new value.
+ *
+ * Callers able to hold on to the sequence number from smr_advance()
+ * and poll or wait for it later are less likely to busy loop waiting
+ * for readers than callers of this routine.
+ */
+static inline void
+smr_synchronize(smr_t smr)
+{
+	smr_seq_t goal;
+
+	goal = smr_advance(smr);
+	(void)smr_wait(smr, goal);
+}
+
+/* Only at startup. */
+void smr_init(void);
+
+#endif /* _SYS_SMR_H_ */
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index 4df225104682..77cc24077d1a 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -40,6 +40,7 @@
#include <sys/param.h> /* For NULL */
#include <sys/malloc.h> /* For M_* */
+#include <sys/_smr.h>
/* User visible parameters */
#define UMA_SMALLEST_UNIT 8 /* Smallest item allocated */
@@ -259,16 +260,27 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
* mini-dumps.
*/
#define UMA_ZONE_PCPU 0x8000 /*
- * Allocates mp_maxid + 1 slabs of PAGE_SIZE
+ * Allocates mp_maxid + 1 slabs of
+ * PAGE_SIZE
*/
#define UMA_ZONE_FIRSTTOUCH 0x10000 /* First touch NUMA policy */
#define UMA_ZONE_ROUNDROBIN 0x20000 /* Round-robin NUMA policy. */
+#define UMA_ZONE_SMR 0x40000 /*
+ * Safe memory reclamation defers
+ * frees until all read sections
+ * have exited. This flag creates
+ * a unique SMR context for this
+ * zone. To share contexts see
+ * uma_zone_set_smr() below.
+ *
+ * See sys/smr.h for more details.
+ */
/* In use by UMA_ZFLAGs: 0xffe00000 */
/*
- * These flags are shared between the keg and zone. In zones wishing to add
- * new kegs these flags must be compatible. Some are determined based on
- * physical parameters of the request and may not be provided by the consumer.
+ * These flags are shared between the keg and zone. Some are determined
+ * based on physical parameters of the request and may not be provided by
+ * the consumer.
*/
#define UMA_ZONE_INHERIT \
(UMA_ZONE_NOTOUCH | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \
@@ -310,8 +322,13 @@ void uma_zdestroy(uma_zone_t zone);
*/
void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags);
+
+/* Allocate per-cpu data. Access the correct data with zpcpu_get(). */
void *uma_zalloc_pcpu_arg(uma_zone_t zone, void *arg, int flags);
+/* Use with SMR zones. */
+void *uma_zalloc_smr(uma_zone_t zone, int flags);
+
/*
* Allocate an item from a specific NUMA domain. This uses a slow path in
* the allocator but is guaranteed to allocate memory from the requested
@@ -359,8 +376,13 @@ uma_zalloc_pcpu(uma_zone_t zone, int flags)
*/
void uma_zfree_arg(uma_zone_t zone, void *item, void *arg);
+
+/* Use with PCPU zones. */
void uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *arg);
+/* Use with SMR zones. */
+void uma_zfree_smr(uma_zone_t zone, void *item);
+
/*
* Frees an item back to the specified zone's domain specific pool.
*
@@ -601,6 +623,17 @@ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf);
void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
/*
+ * Associate a zone with a smr context that is allocated after creation
+ * so that multiple zones may share the same context.
+ */
+void uma_zone_set_smr(uma_zone_t zone, smr_t smr);
+
+/*
+ * Fetch the smr context that was set or made in uma_zcreate().
+ */
+smr_t uma_zone_get_smr(uma_zone_t zone);
+
+/*
* These flags are setable in the allocf and visible in the freef.
*/
#define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 1fadc3e17705..fa5cdc3366f1 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
+#include <sys/smr.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>
@@ -103,6 +104,12 @@ __FBSDID("$FreeBSD$");
#include <machine/md_var.h>
+#ifdef INVARIANTS
+#define UMA_ALWAYS_CTORDTOR 1
+#else
+#define UMA_ALWAYS_CTORDTOR 0
+#endif
+
/*
* This is the zone and keg from which all zones are spawned.
*/
@@ -273,6 +280,8 @@ static int keg_ctor(void *, int, void *, int);
static void keg_dtor(void *, int, void *);
static int zone_ctor(void *, int, void *, int);
static void zone_dtor(void *, int, void *);
+static inline void item_dtor(uma_zone_t zone, void *item, int size,
+ void *udata, enum zfreeskip skip);
static int zero_init(void *, int, int);
static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
@@ -454,9 +463,9 @@ bucket_alloc(uma_zone_t zone, void *udata, int flags)
uma_bucket_t bucket;
/*
- * Don't allocate buckets in low memory situations.
+ * Don't allocate buckets early in boot.
*/
- if (bucketdisable)
+ if (__predict_false(booted < BOOT_KVA))
return (NULL);
/*
@@ -488,6 +497,9 @@ bucket_alloc(uma_zone_t zone, void *udata, int flags)
#endif
bucket->ub_cnt = 0;
bucket->ub_entries = ubz->ubz_entries;
+ bucket->ub_seq = SMR_SEQ_INVALID;
+ CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
+ zone->uz_name, zone, bucket);
}
return (bucket);
@@ -500,6 +512,8 @@ bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
KASSERT(bucket->ub_cnt == 0,
("bucket_free: Freeing a non free bucket."));
+ KASSERT(bucket->ub_seq == SMR_SEQ_INVALID,
+ ("bucket_free: Freeing an SMR bucket."));
if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
udata = (void *)(uintptr_t)zone->uz_flags;
ubz = bucket_zone_lookup(bucket->ub_entries);
@@ -517,23 +531,39 @@ bucket_zone_drain(void)
/*
* Attempt to satisfy an allocation by retrieving a full bucket from one of the
- * zone's caches.
+ * zone's caches. If a bucket is found the zone is not locked on return.
*/
static uma_bucket_t
zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
{
uma_bucket_t bucket;
+ int i;
+ bool dtor = false;
ZONE_LOCK_ASSERT(zone);
- if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
- MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
- TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
- zdom->uzd_nitems -= bucket->ub_cnt;
- if (zdom->uzd_imin > zdom->uzd_nitems)
- zdom->uzd_imin = zdom->uzd_nitems;
- zone->uz_bkt_count -= bucket->ub_cnt;
+ if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
+ return (NULL);
+
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
+ bucket->ub_seq != SMR_SEQ_INVALID) {
+ if (!smr_poll(zone->uz_smr, bucket->ub_seq, false))
+ return (NULL);
+ bucket->ub_seq = SMR_SEQ_INVALID;
+ dtor = (zone->uz_dtor != NULL) | UMA_ALWAYS_CTORDTOR;
}
+ MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
+ TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
+ zdom->uzd_nitems -= bucket->ub_cnt;
+ if (zdom->uzd_imin > zdom->uzd_nitems)
+ zdom->uzd_imin = zdom->uzd_nitems;
+ zone->uz_bkt_count -= bucket->ub_cnt;
+ ZONE_UNLOCK(zone);
+ if (dtor)
+ for (i = 0; i < bucket->ub_cnt; i++)
+ item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
+ NULL, SKIP_NONE);
+
return (bucket);
}
@@ -551,7 +581,7 @@ zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max,
("%s: zone %p overflow", __func__, zone));
- if (ws)
+ if (ws && bucket->ub_seq == SMR_SEQ_INVALID)
TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
else
TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
@@ -941,12 +971,23 @@ bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
if (bucket == NULL || bucket->ub_cnt == 0)
return;
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
+ bucket->ub_seq != SMR_SEQ_INVALID) {
+ smr_wait(zone->uz_smr, bucket->ub_seq);
+ for (i = 0; i < bucket->ub_cnt; i++)
+ item_dtor(zone, bucket->ub_bucket[i],
+ zone->uz_size, NULL, SKIP_NONE);
+ bucket->ub_seq = SMR_SEQ_INVALID;
+ }
if (zone->uz_fini)
for (i = 0; i < bucket->ub_cnt; i++)
zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
if (zone->uz_max_items > 0)
zone_free_limit(zone, bucket->ub_cnt);
+#ifdef INVARIANTS
+ bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt);
+#endif
bucket->ub_cnt = 0;
}
@@ -1035,12 +1076,21 @@ cache_drain_safe_cpu(uma_zone_t zone, void *unused)
zone_put_bucket(zone, &zone->uz_domain[domain], b1, false);
b1 = NULL;
}
+
+ /*
+ * Don't flush SMR zone buckets. Flushing would leave the zone
+ * without a bucket and force every free to synchronize().
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ goto out;
b2 = cache_bucket_unload_free(cache);
if (b2 != NULL && b2->ub_cnt != 0) {
zone_put_bucket(zone, &zone->uz_domain[domain], b2, false);
b2 = NULL;
}
b3 = cache_bucket_unload_cross(cache);
+
+out:
critical_exit();
ZONE_UNLOCK(zone);
if (b1)
@@ -1135,7 +1185,7 @@ bucket_cache_reclaim(uma_zone_t zone, bool drain)
target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
zdom->uzd_imin);
while (zdom->uzd_nitems > target) {
- bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
+ bucket = TAILQ_FIRST(&zdom->uzd_buckets);
if (bucket == NULL)
break;
tofree = bucket->ub_cnt;
@@ -2294,7 +2344,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
zone->uz_bucket_size = 0;
zone->uz_bucket_size_min = 0;
zone->uz_bucket_size_max = BUCKET_MAX;
- zone->uz_flags = 0;
+ zone->uz_flags = (arg->flags & UMA_ZONE_SMR);
zone->uz_warning = NULL;
/* The domain structures follow the cpu structures. */
zone->uz_domain =
@@ -2375,7 +2425,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
karg.uminit = arg->uminit;
karg.fini = arg->fini;
karg.align = arg->align;
- karg.flags = arg->flags;
+ karg.flags = (arg->flags & ~UMA_ZONE_SMR);
karg.zone = zone;
error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
flags);
@@ -2399,6 +2449,10 @@ out:
zone->uz_fails = EARLY_COUNTER;
}
+ /* Caller requests a private SMR context. */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ zone->uz_smr = smr_create(zone->uz_name);
+
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
("Invalid zone flag combination"));
@@ -2600,6 +2654,7 @@ uma_startup1(vm_offset_t virtual_avail)
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
bucket_init();
+ smr_init();
}
#ifndef UMA_MD_SMALL_ALLOC
@@ -2844,14 +2899,9 @@ uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
uma_zfree_arg(zone, item, udata);
}
-#ifdef INVARIANTS
-#define UMA_ALWAYS_CTORDTOR 1
-#else
-#define UMA_ALWAYS_CTORDTOR 0
-#endif
-
-static void *
-item_ctor(uma_zone_t zone, int size, void *udata, int flags, void *item)
+static inline void *
+item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags,
+ void *item)
{
#ifdef INVARIANTS
bool skipdbg;
@@ -2861,7 +2911,9 @@ item_ctor(uma_zone_t zone, int size, void *udata, int flags, void *item)
zone->uz_ctor != trash_ctor)
trash_ctor(item, size, udata, flags);
#endif
- if (__predict_false(zone->uz_ctor != NULL) &&
+ /* Check flags before loading ctor pointer. */
+ if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) &&
+ __predict_false(zone->uz_ctor != NULL) &&
zone->uz_ctor(item, size, udata, flags) != 0) {
counter_u64_add(zone->uz_fails, 1);
zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
@@ -2903,57 +2955,150 @@ item_dtor(uma_zone_t zone, void *item, int size, void *udata,
}
}
-/* See uma.h */
-void *
-uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
+#if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS)
+#define UMA_ZALLOC_DEBUG
+/*
+ * Debug-only allocation checks shared by uma_zalloc_arg() and
+ * uma_zalloc_smr().
+ *
+ * Returns EJUSTRETURN when the request was completely handled here (the
+ * memguard path); *itemp then holds the item, or NULL if the zone's init
+ * or ctor rejected it.  Returns 0 when the caller should continue with the
+ * normal allocation path, in which case *itemp is not written.
+ */
+static int
+uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags)
{
- uma_cache_bucket_t bucket;
- uma_cache_t cache;
- void *item;
- int domain, size, uz_flags;
-
- /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
- random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
-
- /* This is the fast path allocation */
- CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
- zone, flags);
+ int error;
+ error = 0;
#ifdef WITNESS
if (flags & M_WAITOK) {
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
+ "uma_zalloc_debug: zone \"%s\"", zone->uz_name);
}
#endif
#ifdef INVARIANTS
- KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
+ KASSERT((flags & M_EXEC) == 0,
+ ("uma_zalloc_debug: called with M_EXEC"));
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
- ("uma_zalloc_arg: called with spinlock or critical section held"));
- if (zone->uz_flags & UMA_ZONE_PCPU)
- KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
- "with M_ZERO passed"));
+ ("uma_zalloc_debug: called within spinlock or critical section"));
+ KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0,
+ ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO"));
#endif
#ifdef DEBUG_MEMGUARD
- if (memguard_cmp_zone(zone)) {
+ /*
+ * SMR zones never use memguard.  The flag test must be parenthesized:
+ * '==' binds tighter than '&', so "flags & UMA_ZONE_SMR == 0" is
+ * always zero and would disable memguard for every zone.
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) {
+ void *item;
item = memguard_alloc(zone->uz_size, flags);
if (item != NULL) {
+ error = EJUSTRETURN;
if (zone->uz_init != NULL &&
- zone->uz_init(item, zone->uz_size, flags) != 0)
- return (NULL);
+ zone->uz_init(item, zone->uz_size, flags) != 0) {
+ *itemp = NULL;
+ return (error);
+ }
if (zone->uz_ctor != NULL &&
zone->uz_ctor(item, zone->uz_size, udata,
flags) != 0) {
counter_u64_add(zone->uz_fails, 1);
+ /* uz_fini is optional, like uz_init and uz_ctor. */
+ if (zone->uz_fini != NULL)
zone->uz_fini(item, zone->uz_size);
- return (NULL);
+ *itemp = NULL;
+ return (error);
}
- return (item);
+ *itemp = item;
+ return (error);
}
/* This is unfortunate but should not be fatal. */
}
#endif
+ return (error);
+}
+
+/*
+ * Debug-only free checks shared by uma_zfree_arg() and uma_zfree_smr().
+ *
+ * Returns EJUSTRETURN when the item was a memguard allocation and has been
+ * released here, after running the zone's dtor and fini when they are set.
+ * Returns 0 when the caller should continue with the normal free path.
+ */
+static int
+uma_zfree_debug(uma_zone_t zone, void *item, void *udata)
+{
+ KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+ ("uma_zfree_debug: called with spinlock or critical section held"));
+
+#ifdef DEBUG_MEMGUARD
+ /*
+ * Only non-SMR zones can hold memguard items.  The flag test must be
+ * parenthesized: '==' binds tighter than '&', so the unparenthesized
+ * form is always false and memguard items would never be caught here.
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) {
+ if (zone->uz_dtor != NULL)
+ zone->uz_dtor(item, zone->uz_size, udata);
+ if (zone->uz_fini != NULL)
+ zone->uz_fini(item, zone->uz_size);
+ memguard_free(item);
+ return (EJUSTRETURN);
+ }
+#endif
+ return (0);
+}
+#endif
+
+/*
+ * Fallback used when no bucket could be obtained: allocate a single item
+ * directly from the zone.  First-touch zones request the current CPU's
+ * domain; every other zone accepts any domain.
+ */
+static __noinline void *
+uma_zalloc_single(uma_zone_t zone, void *udata, int flags)
+{
+ int where;
+
+ where = UMA_ANYDOMAIN;
+ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
+ where = PCPU_GET(domain);
+
+ return (zone_alloc_item(zone, udata, where, flags));
+}
+
+/*
+ * See uma.h.
+ *
+ * Allocate an item from an SMR zone.  The current CPU's alloc bucket is
+ * tried inside a critical section; when it is empty cache_alloc() is asked
+ * to refill it and the attempt is repeated for as long as that succeeds.
+ * If no bucket can be had, a single item is allocated directly via
+ * uma_zalloc_single().  No udata is passed to the constructor.
+ */
+void *
+uma_zalloc_smr(uma_zone_t zone, int flags)
+{
+ uma_cache_bucket_t bucket;
+ uma_cache_t cache;
+ void *item;
+ int size, uz_flags;
+
+#ifdef UMA_ZALLOC_DEBUG
+ KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
+ ("uma_zalloc_smr: called with non-SMR zone.\n"));
+ if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN)
+ return (item);
+#endif
+
+ critical_enter();
+ do {
+ /*
+ * The per-CPU cache is looked up again on every pass;
+ * NOTE(review): presumably cache_alloc() can leave and re-enter
+ * the critical section, confirm against its definition.
+ */
+ cache = &zone->uz_cpu[curcpu];
+ bucket = &cache->uc_allocbucket;
+ size = cache_uz_size(cache);
+ uz_flags = cache_uz_flags(cache);
+ if (__predict_true(bucket->ucb_cnt != 0)) {
+ item = cache_bucket_pop(cache, bucket);
+ critical_exit();
+ return (item_ctor(zone, uz_flags, size, NULL, flags,
+ item));
+ }
+ } while (cache_alloc(zone, cache, NULL, flags));
+ critical_exit();
+
+ return (uma_zalloc_single(zone, NULL, flags));
+}
+
+/* See uma.h */
+void *
+uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
+{
+ uma_cache_bucket_t bucket;
+ uma_cache_t cache;
+ void *item;
+ int size, uz_flags;
+
+ /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
+
+ /* This is the fast path allocation */
+ CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
+ zone, flags);
+
+#ifdef UMA_ZALLOC_DEBUG
+ KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
+ ("uma_zalloc_arg: called with SMR zone.\n"));
+ if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN)
+ return (item);
+#endif
+
/*
* If possible, allocate from the per-CPU cache. There are two
* requirements for safe access to the per-CPU cache: (1) the thread
@@ -2974,24 +3119,13 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
if (__predict_true(bucket->ucb_cnt != 0)) {
item = cache_bucket_pop(cache, bucket);
critical_exit();
- if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
- UMA_ALWAYS_CTORDTOR))
- return (item_ctor(zone, size, udata, flags, item));
- if (flags & M_ZERO)
- bzero(item, size);
- return (item);
+ return (item_ctor(zone, uz_flags, size, udata, flags,
+ item));
}
} while (cache_alloc(zone, cache, udata, flags));
critical_exit();
- /*
- * We can not get a bucket so try to return a single item.
- */
- if (uz_flags & UMA_ZONE_FIRSTTOUCH)
- domain = PCPU_GET(domain);
- else
- domain = UMA_ANYDOMAIN;
- return (zone_alloc_item(zone, udata, domain, flags));
+ return (uma_zalloc_single(zone, udata, flags));
}
/*
@@ -3014,9 +3148,14 @@ cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
/*
* If we have run out of items in our alloc bucket see
* if we can switch with the free bucket.
+ *
+ * SMR Zones can't re-use the free bucket until the sequence has
+ * expired.
*/
- if (cache->uc_freebucket.ucb_cnt != 0) {
- cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket);
+ if ((zone->uz_flags & UMA_ZONE_SMR) == 0 &&
+ cache->uc_freebucket.ucb_cnt != 0) {
+ cache_bucket_swap(&cache->uc_freebucket,
+ &cache->uc_allocbucket);
return (true);
}
@@ -3070,7 +3209,6 @@ cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
}
if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
- ZONE_UNLOCK(zone);
KASSERT(bucket->ub_cnt != 0,
("uma_zalloc_arg: Returning an empty bucket."));
cache_bucket_load_alloc(cache, bucket);
@@ -3607,7 +3745,8 @@ zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
goto fail_cnt;
}
}
- item = item_ctor(zone, zone->uz_size, udata, flags, item);
+ item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags,
+ item);
if (item == NULL)
goto fail;
@@ -3630,6 +3769,54 @@ fail:
/* See uma.h */
void
+uma_zfree_smr(uma_zone_t zone, void *item)
+{
+ uma_cache_bucket_t cb;
+ uma_cache_t pcpu;
+ int curdom, itemdom, zflags;
+
+#ifdef UMA_ZALLOC_DEBUG
+ KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
+ ("uma_zfree_smr: called with non-SMR zone.\n"));
+ KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer."));
+ if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN)
+ return;
+#endif
+ /* Read the cached zone flags before entering the critical section. */
+ pcpu = &zone->uz_cpu[curcpu];
+ zflags = cache_uz_flags(pcpu);
+ itemdom = 0;
+ curdom = 0;
+#ifdef NUMA
+ if ((zflags & UMA_ZONE_FIRSTTOUCH) != 0)
+ itemdom = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
+#endif
+ critical_enter();
+ for (;;) {
+ pcpu = &zone->uz_cpu[curcpu];
+ /*
+ * An SMR zone only ever frees into the free bucket, or into
+ * the cross bucket when a first-touch item belongs to another
+ * domain.
+ */
+ cb = &pcpu->uc_freebucket;
+#ifdef NUMA
+ curdom = PCPU_GET(domain);
+ if ((zflags & UMA_ZONE_FIRSTTOUCH) != 0 &&
+ curdom != itemdom) {
+ cb = &pcpu->uc_crossbucket;
+ }
+#endif
+ if (__predict_true(cb->ucb_cnt < cb->ucb_entries)) {
+ cache_bucket_push(pcpu, cb, item);
+ critical_exit();
+ return;
+ }
+ /* Bucket is full: retry only if cache_free() made room. */
+ if (!cache_free(zone, pcpu, NULL, item, itemdom))
+ break;
+ }
+ critical_exit();
+
+ /* No bucket could take the item; free it to the zone directly. */
+ zone_free_item(zone, item, NULL, SKIP_NONE);
+}
+
+/* See uma.h */
+void
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
{
uma_cache_t cache;
@@ -3641,22 +3828,15 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone);
- KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
- ("uma_zfree_arg: called with spinlock or critical section held"));
-
+#ifdef UMA_ZALLOC_DEBUG
+ KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
+ ("uma_zfree_arg: called with SMR zone.\n"));
+ if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN)
+ return;
+#endif
/* uma_zfree(..., NULL) does nothing, to match free(9). */
if (item == NULL)
return;
-#ifdef DEBUG_MEMGUARD
- if (is_memguard_addr(item)) {
- if (zone->uz_dtor != NULL)
- zone->uz_dtor(item, zone->uz_size, udata);
- if (zone->uz_fini != NULL)
- zone->uz_fini(item, zone->uz_size);
- memguard_free(item);
- return;
- }
-#endif
/*
* We are accessing the per-cpu cache without a critical section to
@@ -3665,8 +3845,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
*/
cache = &zone->uz_cpu[curcpu];
uz_flags = cache_uz_flags(cache);
- if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
- UMA_ALWAYS_CTORDTOR))
+ if (UMA_ALWAYS_CTORDTOR ||
+ __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0))
item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
/*
@@ -3697,6 +3877,13 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
critical_enter();
do {
cache = &zone->uz_cpu[curcpu];
+ /*
+ * Try to free into the allocbucket first to give LIFO
+ * ordering for cache-hot datastructures. Spill over
+ * into the freebucket if necessary. Alloc will swap
+ * them if one runs dry.
+ */
+ bucket = &cache->uc_allocbucket;
#ifdef NUMA
domain = PCPU_GET(domain);
if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
@@ -3704,18 +3891,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
bucket = &cache->uc_crossbucket;
} else
#endif
- {
- /*
- * Try to free into the allocbucket first to give LIFO
- * ordering for cache-hot datastructures. Spill over
- * into the freebucket if necessary. Alloc will swap
- * them if one runs dry.
- */
- bucket = &cache->uc_allocbucket;
- if (__predict_false(bucket->ucb_cnt >=
- bucket->ucb_entries))
- bucket = &cache->uc_freebucket;
- }
+ if (bucket->ucb_cnt >= bucket->ucb_entries)
+ bucket = &cache->uc_freebucket;
if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
cache_bucket_push(cache, bucket, item);
critical_exit();
@@ -3778,6 +3955,8 @@ zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
if (!TAILQ_EMPTY(&fullbuckets)) {
ZONE_LOCK(zone);
while ((b = TAILQ_FIRST(&fullbuckets)) != NULL) {
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ bucket->ub_seq = smr_current(zone->uz_smr);
TAILQ_REMOVE(&fullbuckets, b, ub_link);
if (zone->uz_bkt_count >= zone->uz_bkt_max) {
ZONE_UNLOCK(zone);
@@ -3796,6 +3975,7 @@ zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
}
if (bucket->ub_cnt != 0)
bucket_drain(zone, bucket);
+ bucket->ub_seq = SMR_SEQ_INVALID;
bucket_free(zone, bucket, udata);
}
#endif
@@ -3862,15 +4042,16 @@ cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
int itemdomain)
{
uma_cache_bucket_t cbucket;
- uma_bucket_t bucket;
+ uma_bucket_t newbucket, bucket;
int domain;
CRITICAL_ASSERT(curthread);
- if (zone->uz_bucket_size == 0 || bucketdisable)
+ if (zone->uz_bucket_size == 0)
return false;
cache = &zone->uz_cpu[curcpu];
+ newbucket = NULL;
/*
* FIRSTTOUCH domains need to free to the correct zdom. When
@@ -3895,14 +4076,29 @@ cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
/* We are no longer associated with this CPU. */
critical_exit();
+ /*
+ * Don't let SMR zones operate without a free bucket. Force
+ * a synchronize and re-use this one. We will only degrade
+ * to a synchronize every bucket_size items rather than every
+ * item if we fail to allocate a bucket.
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
+ if (bucket != NULL)
+ bucket->ub_seq = smr_advance(zone->uz_smr);
+ newbucket = bucket_alloc(zone, udata, M_NOWAIT);
+ if (newbucket == NULL && bucket != NULL) {
+ bucket_drain(zone, bucket);
+ newbucket = bucket;
+ bucket = NULL;
+ }
+ } else if (!bucketdisable)
+ newbucket = bucket_alloc(zone, udata, M_NOWAIT);
+
if (bucket != NULL)
zone_free_bucket(zone, bucket, udata, domain, itemdomain);
- bucket = bucket_alloc(zone, udata, M_NOWAIT);
- CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
- zone->uz_name, zone, bucket);
critical_enter();
- if (bucket == NULL)
+ if ((bucket = newbucket) == NULL)
return (false);
cache = &zone->uz_cpu[curcpu];
#ifdef NUMA
@@ -4031,6 +4227,15 @@ static void
zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
{
+ /*
+ * If a free is sent directly to an SMR zone we have to
+ * synchronize immediately because the item can instantly
+ * be reallocated. This should only happen in degenerate
+ * cases when no memory is available for per-cpu caches.
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE)
+ smr_synchronize(zone->uz_smr);
+
item_dtor(zone, item, zone->uz_size, udata, skip);
if (skip < SKIP_FINI && zone->uz_fini)
@@ -4257,6 +4462,25 @@ uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
/* See uma.h */
void
+uma_zone_set_smr(uma_zone_t zone, smr_t smr)
+{
+
+ ZONE_ASSERT_COLD(zone);
+
+ /*
+ * Attach the caller's SMR context and mark the zone as SMR.
+ * NOTE(review): zone_update_caches() presumably refreshes the flags
+ * copied into the per-CPU caches, confirm against its definition.
+ */
+ zone->uz_smr = smr;
+ zone->uz_flags = zone->uz_flags | UMA_ZONE_SMR;
+ zone_update_caches(zone);
+}
+
+/* Return the SMR context currently stored in the zone. */
+smr_t
+uma_zone_get_smr(uma_zone_t zone)
+{
+ smr_t ctx;
+
+ ctx = zone->uz_smr;
+ return (ctx);
+}
+
+/* See uma.h */
+void
uma_zone_reserve(uma_zone_t zone, int items)
{
uma_keg_t keg;
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 9fc3c508236b..20645f975eeb 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -184,6 +184,7 @@
"\30VTOSLAB" \
"\27HASH" \
"\26OFFPAGE" \
+ "\23SMR" \
"\22ROUNDROBIN" \
"\21FIRSTTOUCH" \
"\20PCPU" \
@@ -245,9 +246,10 @@ struct uma_hash {
*/
struct uma_bucket {
TAILQ_ENTRY(uma_bucket) ub_link; /* Link into the zone */
- int16_t ub_cnt; /* Count of items in bucket. */
- int16_t ub_entries; /* Max items. */
- void *ub_bucket[]; /* actual allocation storage */
+ int16_t ub_cnt; /* Number of items currently in the bucket. */
+ int16_t ub_entries; /* Max items. */
+ smr_seq_t ub_seq; /* SMR sequence number; reset to SMR_SEQ_INVALID once drained. */
+ void *ub_bucket[]; /* Item pointers (flexible array member). */
};
typedef struct uma_bucket * uma_bucket_t;
@@ -484,7 +486,7 @@ struct uma_zone {
uint32_t uz_size; /* Size inherited from kegs */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
- uint64_t uz_spare0;
+ smr_t uz_smr; /* Safe memory reclaim context. */
uint64_t uz_max_items; /* Maximum number of items to alloc */
uint32_t uz_sleepers; /* Threads sleeping on limit */
uint16_t uz_bucket_size; /* Number of items in full bucket */