aboutsummaryrefslogtreecommitdiff
path: root/include/jemalloc/internal/tsd.h
diff options
context:
space:
mode:
authorWarner Losh <imp@FreeBSD.org>2025-02-05 23:20:13 +0000
committerWarner Losh <imp@FreeBSD.org>2025-02-05 23:20:13 +0000
commit48ec896efb0b78141df004eaa21288b84590c9da (patch)
tree33799792fd95c266d472ab1ae51d50ab4f942eb3 /include/jemalloc/internal/tsd.h
parentd28d7fbede216494aa3942af042cc084fcd6098a (diff)
jemalloc: Import 5.3.0 54eaed1d8b56b1aa528be3bdd1877e59c56fa90cvendor/jemalloc/5.3.0vendor/jemalloc
Import jemalloc 5.3.0. This import changes how manage the jemalloc vendor branch (which was just started anyway). Starting with 5.3.0, we import a clean tree from the upstream github, removing all the old files that are no longer upstream, or that we've kept around for some reason. We do this because we merge from this raw version of jemalloc into the FreeBSD contrib/jemalloc, then we run autogen stuff, generate all the generated .h files with gmake, then finally remove much of the generated files in contrib/jemalloc using an update script. Sponsored by: Netflix
Diffstat (limited to 'include/jemalloc/internal/tsd.h')
-rw-r--r--include/jemalloc/internal/tsd.h243
1 files changed, 173 insertions, 70 deletions
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 9ba2600453b6..66d688227037 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -1,10 +1,12 @@
#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H
+#include "jemalloc/internal/activity_callback.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
+#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
@@ -15,39 +17,30 @@
/*
* Thread-Specific-Data layout
- * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
- * s: state
- * e: tcache_enabled
- * m: thread_allocated (config_stats)
- * f: thread_deallocated (config_stats)
- * p: prof_tdata (config_prof)
- * c: rtree_ctx (rtree cache accessed on deallocation)
- * t: tcache
- * --- data not accessed on tcache fast path: arena-related fields ---
- * d: arenas_tdata_bypass
- * r: reentrancy_level
- * x: narenas_tdata
- * i: iarena
- * a: arena
- * o: arenas_tdata
- * Loading TSD data is on the critical path of basically all malloc operations.
- * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
- * Use a compact layout to reduce cache footprint.
- * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
- * |---------------------------- 1st cacheline ----------------------------|
- * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
- * |---------------------------- 2nd cacheline ----------------------------|
- * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
- * |---------------------------- 3nd cacheline ----------------------------|
- * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
- * +-------------------------------------------------------------------------+
- * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
*
- * The last 3 members (i, a and o) before tcache isn't really needed on tcache
- * fast path. However we have a number of unused tcache bins and witnesses
- * (never touched unless config_debug) at the end of tcache, so we place them
- * there to avoid breaking the cachelines and possibly paging in an extra page.
+ * At least some thread-local data gets touched on the fast-path of almost all
+ * malloc operations. But much of it is only necessary down slow-paths, or
+ * testing. We want to colocate the fast-path data so that it can live on the
+ * same cacheline if possible. So we define three tiers of hotness:
+ * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
+ * TSD_DATA_SLOW: Touched down slow paths. "Slow" here is sort of general;
+ * there are "semi-slow" paths like "not a sized deallocation, but can still
+ * live in the tcache". We'll want to keep these closer to the fast-path
+ * data.
+ * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
+ *
+ * An additional concern is that the larger tcache bins won't be used (we have a
+ * bin per size class, but by default only cache relatively small objects). So
+ * the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the
+ * TSD_DATA_SLOWER tier.
+ *
+ * As a result of all this, we put the slow data first, then the fast data, then
+ * the slower data, while keeping the tcache as the last element of the fast
+ * data (so that the fast -> slower transition happens midway through the
+ * tcache). While we don't yet play alignment tricks to guarantee it, this
+ * increases our odds of getting some cache/page locality on fast paths.
*/
+
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
@@ -60,50 +53,112 @@ typedef void (*test_callback_t)(int *);
# define MALLOC_TEST_TSD_INITIALIZER
#endif
-/* O(name, type, nullable type */
-#define MALLOC_TSD \
+typedef ql_elm(tsd_t) tsd_link_t;
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_SLOW \
O(tcache_enabled, bool, bool) \
- O(arenas_tdata_bypass, bool, bool) \
O(reentrancy_level, int8_t, int8_t) \
- O(narenas_tdata, uint32_t, uint32_t) \
- O(offset_state, uint64_t, uint64_t) \
- O(thread_allocated, uint64_t, uint64_t) \
- O(thread_deallocated, uint64_t, uint64_t) \
- O(bytes_until_sample, int64_t, int64_t) \
+ O(thread_allocated_last_event, uint64_t, uint64_t) \
+ O(thread_allocated_next_event, uint64_t, uint64_t) \
+ O(thread_deallocated_last_event, uint64_t, uint64_t) \
+ O(thread_deallocated_next_event, uint64_t, uint64_t) \
+ O(tcache_gc_event_wait, uint64_t, uint64_t) \
+ O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \
+ O(prof_sample_event_wait, uint64_t, uint64_t) \
+ O(prof_sample_last_event, uint64_t, uint64_t) \
+ O(stats_interval_event_wait, uint64_t, uint64_t) \
+ O(stats_interval_last_event, uint64_t, uint64_t) \
+ O(peak_alloc_event_wait, uint64_t, uint64_t) \
+ O(peak_dalloc_event_wait, uint64_t, uint64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
- O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
+ O(prng_state, uint64_t, uint64_t) \
+ O(san_extents_until_guard_small, uint64_t, uint64_t) \
+ O(san_extents_until_guard_large, uint64_t, uint64_t) \
O(iarena, arena_t *, arena_t *) \
O(arena, arena_t *, arena_t *) \
- O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
+ O(arena_decay_ticker, ticker_geom_t, ticker_geom_t) \
+ O(sec_shard, uint8_t, uint8_t) \
O(binshards, tsd_binshards_t, tsd_binshards_t)\
- O(tcache, tcache_t, tcache_t) \
+ O(tsd_link, tsd_link_t, tsd_link_t) \
+ O(in_hook, bool, bool) \
+ O(peak, peak_t, peak_t) \
+ O(activity_callback_thunk, activity_callback_thunk_t, \
+ activity_callback_thunk_t) \
+ O(tcache_slow, tcache_slow_t, tcache_slow_t) \
+ O(rtree_ctx, rtree_ctx_t, rtree_ctx_t)
+
+#define TSD_DATA_SLOW_INITIALIZER \
+ /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
+ /* reentrancy_level */ 0, \
+ /* thread_allocated_last_event */ 0, \
+ /* thread_allocated_next_event */ 0, \
+ /* thread_deallocated_last_event */ 0, \
+ /* thread_deallocated_next_event */ 0, \
+ /* tcache_gc_event_wait */ 0, \
+ /* tcache_gc_dalloc_event_wait */ 0, \
+ /* prof_sample_event_wait */ 0, \
+ /* prof_sample_last_event */ 0, \
+ /* stats_interval_event_wait */ 0, \
+ /* stats_interval_last_event */ 0, \
+ /* peak_alloc_event_wait */ 0, \
+ /* peak_dalloc_event_wait */ 0, \
+ /* prof_tdata */ NULL, \
+ /* prng_state */ 0, \
+ /* san_extents_until_guard_small */ 0, \
+ /* san_extents_until_guard_large */ 0, \
+ /* iarena */ NULL, \
+ /* arena */ NULL, \
+ /* arena_decay_ticker */ \
+ TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \
+ /* sec_shard */ (uint8_t)-1, \
+ /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
+ /* tsd_link */ {NULL}, \
+ /* in_hook */ false, \
+ /* peak */ PEAK_INITIALIZER, \
+ /* activity_callback_thunk */ \
+ ACTIVITY_CALLBACK_THUNK_INITIALIZER, \
+ /* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \
+ /* rtree_ctx */ RTREE_CTX_INITIALIZER,
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_FAST \
+ O(thread_allocated, uint64_t, uint64_t) \
+ O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
+ O(thread_deallocated, uint64_t, uint64_t) \
+ O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
+ O(tcache, tcache_t, tcache_t)
+
+#define TSD_DATA_FAST_INITIALIZER \
+ /* thread_allocated */ 0, \
+ /* thread_allocated_next_event_fast */ 0, \
+ /* thread_deallocated */ 0, \
+ /* thread_deallocated_next_event_fast */ 0, \
+ /* tcache */ TCACHE_ZERO_INITIALIZER,
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_SLOWER \
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
MALLOC_TEST_TSD
+#define TSD_DATA_SLOWER_INITIALIZER \
+ /* witness */ WITNESS_TSD_INITIALIZER \
+ /* test data */ MALLOC_TEST_TSD_INITIALIZER
+
+
#define TSD_INITIALIZER { \
- ATOMIC_INIT(tsd_state_uninitialized), \
- TCACHE_ENABLED_ZERO_INITIALIZER, \
- false, \
- 0, \
- 0, \
- 0, \
- 0, \
- 0, \
- 0, \
- NULL, \
- RTREE_CTX_ZERO_INITIALIZER, \
- NULL, \
- NULL, \
- NULL, \
- TSD_BINSHARDS_ZERO_INITIALIZER, \
- TCACHE_ZERO_INITIALIZER, \
- WITNESS_TSD_INITIALIZER \
- MALLOC_TEST_TSD_INITIALIZER \
+ TSD_DATA_SLOW_INITIALIZER \
+ /* state */ ATOMIC_INIT(tsd_state_uninitialized), \
+ TSD_DATA_FAST_INITIALIZER \
+ TSD_DATA_SLOWER_INITIALIZER \
}
+#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
+void _malloc_tsd_cleanup_register(bool (*f)(void));
+#endif
+
void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
-void malloc_tsd_cleanup_register(bool (*f)(void));
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
@@ -189,14 +244,17 @@ struct tsd_s {
* setters below.
*/
+#define O(n, t, nt) \
+ t TSD_MANGLE(n);
+
+ TSD_DATA_SLOW
/*
* We manually limit the state to just a single byte. Unless the 8-bit
* atomics are unavailable (which is rare).
*/
tsd_state_t state;
-#define O(n, t, nt) \
- t TSD_MANGLE(n);
-MALLOC_TSD
+ TSD_DATA_FAST
+ TSD_DATA_SLOWER
#undef O
};
@@ -262,7 +320,9 @@ JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
return &tsd->TSD_MANGLE(n); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
@@ -281,7 +341,9 @@ tsd_##n##p_get(tsd_t *tsd) { \
state == tsd_state_minimal_initialized); \
return tsd_##n##p_get_unsafe(tsd); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/*
@@ -297,7 +359,9 @@ tsdn_##n##p_get(tsdn_t *tsdn) { \
tsd_t *tsd = tsdn_tsd(tsdn); \
return (nt *)tsd_##n##p_get(tsd); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
@@ -306,7 +370,9 @@ JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
return *tsd_##n##p_get(tsd); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
@@ -317,7 +383,9 @@ tsd_##n##_set(tsd_t *tsd, t val) { \
tsd_state_get(tsd) != tsd_state_minimal_initialized); \
*tsd_##n##p_get(tsd) = val; \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
JEMALLOC_ALWAYS_INLINE void
@@ -382,7 +450,10 @@ tsd_fetch(void) {
static inline bool
tsd_nominal(tsd_t *tsd) {
- return (tsd_state_get(tsd) <= tsd_state_nominal_max);
+ bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
+ assert(nominal || tsd_reentrancy_level_get(tsd) > 0);
+
+ return nominal;
}
JEMALLOC_ALWAYS_INLINE tsdn_t *
@@ -412,4 +483,36 @@ tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
return tsd_rtree_ctx(tsdn_tsd(tsdn));
}
+static inline bool
+tsd_state_nocleanup(tsd_t *tsd) {
+ return tsd_state_get(tsd) == tsd_state_reincarnated ||
+ tsd_state_get(tsd) == tsd_state_minimal_initialized;
+}
+
+/*
+ * These "raw" tsd reentrancy functions don't have any debug checking to make
+ * sure that we're not touching arena 0. Better is to call pre_reentrancy and
+ * post_reentrancy if this is possible.
+ */
+static inline void
+tsd_pre_reentrancy_raw(tsd_t *tsd) {
+ bool fast = tsd_fast(tsd);
+ assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
+ ++*tsd_reentrancy_levelp_get(tsd);
+ if (fast) {
+ /* Prepare slow path for reentrancy. */
+ tsd_slow_update(tsd);
+ assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
+ }
+}
+
+static inline void
+tsd_post_reentrancy_raw(tsd_t *tsd) {
+ int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
+ assert(*reentrancy_level > 0);
+ if (--*reentrancy_level == 0) {
+ tsd_slow_update(tsd);
+ }
+}
+
#endif /* JEMALLOC_INTERNAL_TSD_H */