1 #ifndef JEMALLOC_INTERNAL_TSD_H
2 #define JEMALLOC_INTERNAL_TSD_H
3 
4 #include "jemalloc/internal/arena_types.h"
5 #include "jemalloc/internal/assert.h"
6 #include "jemalloc/internal/bin_types.h"
7 #include "jemalloc/internal/jemalloc_internal_externs.h"
8 #include "jemalloc/internal/prof_types.h"
9 #include "jemalloc/internal/ql.h"
10 #include "jemalloc/internal/rtree_tsd.h"
11 #include "jemalloc/internal/tcache_types.h"
12 #include "jemalloc/internal/tcache_structs.h"
13 #include "jemalloc/internal/util.h"
14 #include "jemalloc/internal/witness.h"
15 
16 /*
17  * Thread-Specific-Data layout
18  * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
19  * s: state
20  * e: tcache_enabled
21  * m: thread_allocated (config_stats)
22  * f: thread_deallocated (config_stats)
23  * p: prof_tdata (config_prof)
24  * c: rtree_ctx (rtree cache accessed on deallocation)
25  * t: tcache
26  * --- data not accessed on tcache fast path: arena-related fields ---
27  * d: arenas_tdata_bypass
28  * r: reentrancy_level
29  * x: narenas_tdata
30  * i: iarena
31  * a: arena
32  * o: arenas_tdata
33  * Loading TSD data is on the critical path of basically all malloc operations.
34  * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
35  * Use a compact layout to reduce cache footprint.
36  * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
37  * |----------------------------  1st cacheline  ----------------------------|
38  * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32  ........ ........ .......] |
39  * |----------------------------  2nd cacheline  ----------------------------|
40  * | [c * 64  ........ ........ ........ ........ ........ ........ .......] |
 * |----------------------------  3rd cacheline  ----------------------------|
42  * | [c * 32  ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
43  * +-------------------------------------------------------------------------+
44  * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
45  *
 * The last 3 members (i, a and o) before tcache aren't really needed on tcache
47  * fast path.  However we have a number of unused tcache bins and witnesses
48  * (never touched unless config_debug) at the end of tcache, so we place them
49  * there to avoid breaking the cachelines and possibly paging in an extra page.
50  */
/*
 * Extra TSD fields that exist only when JEMALLOC_JET is defined.
 * MALLOC_TEST_TSD contributes two additional O(...) entries (test_data and
 * test_callback) and MALLOC_TEST_TSD_INITIALIZER contributes their initial
 * values.  When JEMALLOC_JET is not defined both macros expand to nothing.
 */
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
#  define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
#  define MALLOC_TEST_TSD \
    O(test_data,		int,			int)		\
    O(test_callback,		test_callback_t,	int)
/*
 * Starts with a comma so that it can be placed directly after the last
 * regular value of TSD_INITIALIZER.
 */
#  define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
#  define MALLOC_TEST_TSD
#  define MALLOC_TEST_TSD_INITIALIZER
#endif
62 
/*
 * X-macro listing every TSD field other than the state word.  Define
 * O(name, type, nullable_type), expand MALLOC_TSD, then #undef O; this file
 * does so to generate the struct members and the per-field accessors.  The
 * third column is the pointee type returned by the tsdn_<name>p_get()
 * accessor.  Entries appear in the same order as the values in
 * TSD_INITIALIZER.
 *
 * NOTE(review): the layout legend at the top of this file does not mention
 * offset_state, bytes_until_sample or binshards -- confirm it is current.
 */
/*  O(name,			type,			nullable type) */
#define MALLOC_TSD							\
    O(tcache_enabled,		bool,			bool)		\
    O(arenas_tdata_bypass,	bool,			bool)		\
    O(reentrancy_level,		int8_t,			int8_t)		\
    O(narenas_tdata,		uint32_t,		uint32_t)	\
    O(offset_state,		uint64_t,		uint64_t)	\
    O(thread_allocated,		uint64_t,		uint64_t)	\
    O(thread_deallocated,	uint64_t,		uint64_t)	\
    O(bytes_until_sample,	int64_t,		int64_t)	\
    O(prof_tdata,		prof_tdata_t *,		prof_tdata_t *)	\
    O(rtree_ctx,		rtree_ctx_t,		rtree_ctx_t)	\
    O(iarena,			arena_t *,		arena_t *)	\
    O(arena,			arena_t *,		arena_t *)	\
    O(arenas_tdata,		arena_tdata_t *,	arena_tdata_t *)\
    O(binshards,		tsd_binshards_t,	tsd_binshards_t)\
    O(tcache,			tcache_t,		tcache_t)	\
    O(witness_tsd,              witness_tsd_t,		witness_tsdn_t)	\
    MALLOC_TEST_TSD
82 
/*
 * Positional aggregate initializer.  The first value is for the state field
 * and the remaining values follow the field order of MALLOC_TSD, followed by
 * the optional test-only values.  Because the initializer is positional, the
 * two lists must be kept in the same order.
 * NOTE(review): presumably used to initialize struct tsd_s; the use site is
 * not in this file -- confirm.
 */
#define TSD_INITIALIZER {						\
    /* state */ ATOMIC_INIT(tsd_state_uninitialized),			\
    /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER,		\
    /* arenas_tdata_bypass */ false,					\
    /* reentrancy_level */ 0,						\
    /* narenas_tdata */ 0,						\
    /* offset_state */ 0,						\
    /* thread_allocated */ 0,						\
    /* thread_deallocated */ 0,						\
    /* bytes_until_sample */ 0,						\
    /* prof_tdata */ NULL,						\
    /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER,				\
    /* iarena */ NULL,							\
    /* arena */ NULL,							\
    /* arenas_tdata */ NULL,						\
    /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER,			\
    /* tcache */ TCACHE_ZERO_INITIALIZER,				\
    /* witness_tsd */ WITNESS_TSD_INITIALIZER				\
    /* test fields, if any */ MALLOC_TEST_TSD_INITIALIZER		\
}
103 
104 void *malloc_tsd_malloc(size_t size);
105 void malloc_tsd_dalloc(void *wrapper);
106 void malloc_tsd_cleanup_register(bool (*f)(void));
107 tsd_t *malloc_tsd_boot0(void);
108 void malloc_tsd_boot1(void);
109 void tsd_cleanup(void *arg);
110 tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
111 void tsd_state_set(tsd_t *tsd, uint8_t new_state);
112 void tsd_slow_update(tsd_t *tsd);
113 void tsd_prefork(tsd_t *tsd);
114 void tsd_postfork_parent(tsd_t *tsd);
115 void tsd_postfork_child(tsd_t *tsd);
116 
117 /*
118  * Call ..._inc when your module wants to take all threads down the slow paths,
119  * and ..._dec when it no longer needs to.
120  */
121 void tsd_global_slow_inc(tsdn_t *tsdn);
122 void tsd_global_slow_dec(tsdn_t *tsdn);
123 bool tsd_global_slow();
124 
/* Values held in the tsd state field. */
enum {
	/* The common case; zero so that the check is a plain jnz. */
	tsd_state_nominal = 0,
	/* Fully initialized, but required to take the slow path. */
	tsd_state_nominal_slow = 1,
	/*
	 * Global state was changed by some thread in a way that obliges every
	 * nominal thread to re-evaluate whether it is fast or slow at its
	 * next opportunity.
	 *
	 * Moving a thread's status *to* recompute may be done by any thread;
	 * moving it *from* recompute is only ever done by the thread itself.
	 */
	tsd_state_nominal_recompute = 2,
	/*
	 * The nominal states above must be the numerically lowest values.
	 * tsd_state_nominal_max marks the boundary between nominal states and
	 * the states of threads that are being born or dying.
	 */
	tsd_state_nominal_max = tsd_state_nominal_recompute,

	/*
	 * free() may be the only allocator call a thread makes while it is
	 * dying; in that situation tsd is needed, but it is set up so that no
	 * cleanup is required afterwards.
	 */
	tsd_state_minimal_initialized = 3,
	/* States in which the thread is known to be dying. */
	tsd_state_purgatory = 4,
	tsd_state_reincarnated = 5,
	/*
	 * A tsd that has not been initialized.  Even when the tsd struct
	 * lives in TLS we need to keep track of things such as whether our
	 * pthread destructors have been scheduled, so this is genuinely
	 * distinct from the nominal state.
	 */
	tsd_state_uninitialized = 6
};
164 
/*
 * Some TSD accesses can only be done in a nominal state.  To enforce this, we
 * wrap TSD member access in a function that asserts on TSD state, and mangle
 * field names to prevent touching them accidentally.
 *
 * For example, TSD_MANGLE(arena) expands to
 * cant_access_tsd_items_directly_use_a_getter_or_setter_arena.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
171 
/*
 * Select the atomic type and the load / store / exchange operations used for
 * the tsd state field: the 8-bit variants when JEMALLOC_U8_ATOMICS is
 * defined, the 32-bit variants otherwise.
 */
#ifdef JEMALLOC_U8_ATOMICS
#  define tsd_state_t atomic_u8_t
#  define tsd_atomic_load atomic_load_u8
#  define tsd_atomic_store atomic_store_u8
#  define tsd_atomic_exchange atomic_exchange_u8
#else
#  define tsd_state_t atomic_u32_t
#  define tsd_atomic_load atomic_load_u32
#  define tsd_atomic_store atomic_store_u32
#  define tsd_atomic_exchange atomic_exchange_u32
#endif
183 
/*
 * The actual tsd: the state word followed by one member per MALLOC_TSD
 * entry, each stored under its TSD_MANGLE()d name.
 */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module.  Access any thread-local state through the getters and
	 * setters below.
	 */

	/*
	 * We manually limit the state to just a single byte.  Unless the 8-bit
	 * atomics are unavailable (which is rare).
	 */
	tsd_state_t state;
/* Emit "type mangled_name;" for every field listed in MALLOC_TSD, in order. */
#define O(n, t, nt)							\
	t TSD_MANGLE(n);
MALLOC_TSD
#undef O
/* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
} JEMALLOC_ALIGNED(16);
203 
/*
 * Return the current state of tsd as a uint8_t, read with a plain
 * (non-atomic) byte load from the address of the state field.
 */
JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
	/*
	 * This should be atomic.  Unfortunately, compilers right now can't tell
	 * that this can be done as a memory comparison, and forces a load into
	 * a register that hurts fast-path performance.
	 */
	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
	/*
	 * NOTE(review): when JEMALLOC_U8_ATOMICS is not defined the state is a
	 * 32-bit atomic and this reads only its first byte in memory, which
	 * looks like it assumes the low-order byte is stored first -- confirm
	 * for big-endian targets.
	 */
	return *(uint8_t *)&tsd->state;
}
214 
/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	/* The wrapped tsd; it is the only member of the wrapper. */
	tsd_t tsd;
};
/* A null pointer of type tsdn_t *. */
#define TSDN_NULL ((tsdn_t *)0)
224 JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t * tsd)225 tsd_tsdn(tsd_t *tsd) {
226 	return (tsdn_t *)tsd;
227 }
228 
229 JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t * tsdn)230 tsdn_null(const tsdn_t *tsdn) {
231 	return tsdn == NULL;
232 }
233 
234 JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t * tsdn)235 tsdn_tsd(tsdn_t *tsdn) {
236 	assert(!tsdn_null(tsdn));
237 
238 	return &tsdn->tsd;
239 }
240 
241 /*
242  * We put the platform-specific data declarations and inlines into their own
243  * header files to avoid cluttering this file.  They define tsd_boot0,
244  * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set.
245  */
246 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
247 #include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
248 #elif (defined(JEMALLOC_TLS))
249 #include "jemalloc/internal/tsd_tls.h"
250 #elif (defined(_WIN32))
251 #include "jemalloc/internal/tsd_win.h"
252 #else
253 #include "jemalloc/internal/tsd_generic.h"
254 #endif
255 
256 /*
257  * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
258  * foo.  This omits some safety checks, and so can be used during tsd
259  * initialization and cleanup.
260  */
261 #define O(n, t, nt)							\
262 JEMALLOC_ALWAYS_INLINE t *						\
263 tsd_##n##p_get_unsafe(tsd_t *tsd) {					\
264 	return &tsd->TSD_MANGLE(n);					\
265 }
266 MALLOC_TSD
267 #undef O
268 
/*
 * tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo.
 * One accessor is generated per MALLOC_TSD field.  Each asserts that tsd is
 * in one of the nominal states, reincarnated, or minimal_initialized, and
 * then defers to tsd_foop_get_unsafe().
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t *						\
tsd_##n##p_get(tsd_t *tsd) {						\
	/*								\
	 * Because the state might change asynchronously if it's	\
	 * nominal, we need to make sure that we only read it once.	\
	 */								\
	uint8_t state = tsd_state_get(tsd);				\
	assert(state == tsd_state_nominal ||				\
	    state == tsd_state_nominal_slow ||				\
	    state == tsd_state_nominal_recompute ||			\
	    state == tsd_state_reincarnated ||				\
	    state == tsd_state_minimal_initialized);			\
	return tsd_##n##p_get_unsafe(tsd);				\
}
MALLOC_TSD
#undef O
287 
288 /*
289  * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
290  * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
291  */
292 #define O(n, t, nt)							\
293 JEMALLOC_ALWAYS_INLINE nt *						\
294 tsdn_##n##p_get(tsdn_t *tsdn) {						\
295 	if (tsdn_null(tsdn)) {						\
296 		return NULL;						\
297 	}								\
298 	tsd_t *tsd = tsdn_tsd(tsdn);					\
299 	return (nt *)tsd_##n##p_get(tsd);				\
300 }
301 MALLOC_TSD
302 #undef O
303 
304 /* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
305 #define O(n, t, nt)							\
306 JEMALLOC_ALWAYS_INLINE t						\
307 tsd_##n##_get(tsd_t *tsd) {						\
308 	return *tsd_##n##p_get(tsd);					\
309 }
310 MALLOC_TSD
311 #undef O
312 
313 /* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
314 #define O(n, t, nt)							\
315 JEMALLOC_ALWAYS_INLINE void						\
316 tsd_##n##_set(tsd_t *tsd, t val) {					\
317 	assert(tsd_state_get(tsd) != tsd_state_reincarnated &&		\
318 	    tsd_state_get(tsd) != tsd_state_minimal_initialized);	\
319 	*tsd_##n##p_get(tsd) = val;					\
320 }
321 MALLOC_TSD
322 #undef O
323 
/*
 * Assert the per-thread conditions checked for a fast tsd: malloc_slow is
 * false, the tcache is enabled, and the reentrancy level is zero.
 */
JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	/*
	 * Note that our fastness assertion does *not* include global slowness
	 * counters; it's not in general possible to ensure that they won't
	 * change asynchronously from underneath us.
	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}
334 
335 JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t * tsd)336 tsd_fast(tsd_t *tsd) {
337 	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
338 	if (fast) {
339 		tsd_assert_fast(tsd);
340 	}
341 
342 	return fast;
343 }
344 
/*
 * Common implementation behind the tsd fetch helpers.
 *
 * init is passed through to tsd_get().  minimal is passed through to
 * tsd_fetch_slow() when the tsd is not in the nominal state.
 *
 * Returns NULL only when init is false, tsd_get_allocates() is true and
 * tsd_get() returned NULL.  Otherwise returns the tsd itself when it is
 * nominal, or whatever tsd_fetch_slow() returns when it is not.
 */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	/* The only case in which a missing tsd is tolerated. */
	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	/* Anything other than the nominal state is handled off the fast path. */
	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}
362 
363 /* Get a minimal TSD that requires no cleanup.  See comments in free(). */
364 JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void)365 tsd_fetch_min(void) {
366 	return tsd_fetch_impl(true, true);
367 }
368 
369 /* For internal background threads use only. */
370 JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void)371 tsd_internal_fetch(void) {
372 	tsd_t *tsd = tsd_fetch_min();
373 	/* Use reincarnated state to prevent full initialization. */
374 	tsd_state_set(tsd, tsd_state_reincarnated);
375 
376 	return tsd;
377 }
378 
379 JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void)380 tsd_fetch(void) {
381 	return tsd_fetch_impl(true, false);
382 }
383 
384 static inline bool
tsd_nominal(tsd_t * tsd)385 tsd_nominal(tsd_t *tsd) {
386 	return (tsd_state_get(tsd) <= tsd_state_nominal_max);
387 }
388 
389 JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void)390 tsdn_fetch(void) {
391 	if (!tsd_booted_get()) {
392 		return NULL;
393 	}
394 
395 	return tsd_tsdn(tsd_fetch_impl(false, false));
396 }
397 
398 JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t * tsd)399 tsd_rtree_ctx(tsd_t *tsd) {
400 	return tsd_rtree_ctxp_get(tsd);
401 }
402 
403 JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t * tsdn,rtree_ctx_t * fallback)404 tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
405 	/*
406 	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
407 	 * return a pointer to it.
408 	 */
409 	if (unlikely(tsdn_null(tsdn))) {
410 		rtree_ctx_data_init(fallback);
411 		return fallback;
412 	}
413 	return tsd_rtree_ctx(tsdn_tsd(tsdn));
414 }
415 
416 #endif /* JEMALLOC_INTERNAL_TSD_H */
417