1 #ifndef JEMALLOC_INTERNAL_TSD_H 2 #define JEMALLOC_INTERNAL_TSD_H 3 4 #include "jemalloc/internal/arena_types.h" 5 #include "jemalloc/internal/assert.h" 6 #include "jemalloc/internal/bin_types.h" 7 #include "jemalloc/internal/jemalloc_internal_externs.h" 8 #include "jemalloc/internal/prof_types.h" 9 #include "jemalloc/internal/ql.h" 10 #include "jemalloc/internal/rtree_tsd.h" 11 #include "jemalloc/internal/tcache_types.h" 12 #include "jemalloc/internal/tcache_structs.h" 13 #include "jemalloc/internal/util.h" 14 #include "jemalloc/internal/witness.h" 15 16 /* 17 * Thread-Specific-Data layout 18 * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof --- 19 * s: state 20 * e: tcache_enabled 21 * m: thread_allocated (config_stats) 22 * f: thread_deallocated (config_stats) 23 * p: prof_tdata (config_prof) 24 * c: rtree_ctx (rtree cache accessed on deallocation) 25 * t: tcache 26 * --- data not accessed on tcache fast path: arena-related fields --- 27 * d: arenas_tdata_bypass 28 * r: reentrancy_level 29 * x: narenas_tdata 30 * i: iarena 31 * a: arena 32 * o: arenas_tdata 33 * Loading TSD data is on the critical path of basically all malloc operations. 34 * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective. 35 * Use a compact layout to reduce cache footprint. 36 * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+ 37 * |---------------------------- 1st cacheline ----------------------------| 38 * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] | 39 * |---------------------------- 2nd cacheline ----------------------------| 40 * | [c * 64 ........ ........ ........ ........ ........ ........ .......] | 41 * |---------------------------- 3nd cacheline ----------------------------| 42 * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... | 43 * +-------------------------------------------------------------------------+ 44 * Note: the entire tcache is embedded into TSD and spans multiple cachelines. 45 * 46 * The last 3 members (i, a and o) before tcache isn't really needed on tcache 47 * fast path. However we have a number of unused tcache bins and witnesses 48 * (never touched unless config_debug) at the end of tcache, so we place them 49 * there to avoid breaking the cachelines and possibly paging in an extra page. 50 */ 51 #ifdef JEMALLOC_JET 52 typedef void (*test_callback_t)(int *); 53 # define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10 54 # define MALLOC_TEST_TSD \ 55 O(test_data, int, int) \ 56 O(test_callback, test_callback_t, int) 57 # define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL 58 #else 59 # define MALLOC_TEST_TSD 60 # define MALLOC_TEST_TSD_INITIALIZER 61 #endif 62 63 /* O(name, type, nullable type */ 64 #define MALLOC_TSD \ 65 O(tcache_enabled, bool, bool) \ 66 O(arenas_tdata_bypass, bool, bool) \ 67 O(reentrancy_level, int8_t, int8_t) \ 68 O(narenas_tdata, uint32_t, uint32_t) \ 69 O(offset_state, uint64_t, uint64_t) \ 70 O(thread_allocated, uint64_t, uint64_t) \ 71 O(thread_deallocated, uint64_t, uint64_t) \ 72 O(bytes_until_sample, int64_t, int64_t) \ 73 O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ 74 O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \ 75 O(iarena, arena_t *, arena_t *) \ 76 O(arena, arena_t *, arena_t *) \ 77 O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ 78 O(binshards, tsd_binshards_t, tsd_binshards_t)\ 79 O(tcache, tcache_t, tcache_t) \ 80 O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ 81 MALLOC_TEST_TSD 82 83 #define TSD_INITIALIZER { \ 84 ATOMIC_INIT(tsd_state_uninitialized), \ 85 TCACHE_ENABLED_ZERO_INITIALIZER, \ 86 false, \ 87 0, \ 88 0, \ 89 0, \ 90 0, \ 91 0, \ 92 0, \ 93 NULL, \ 94 RTREE_CTX_ZERO_INITIALIZER, \ 95 NULL, \ 96 NULL, \ 97 NULL, \ 98 TSD_BINSHARDS_ZERO_INITIALIZER, \ 99 TCACHE_ZERO_INITIALIZER, \ 100 WITNESS_TSD_INITIALIZER \ 101 MALLOC_TEST_TSD_INITIALIZER \ 102 } 103 104 void *malloc_tsd_malloc(size_t size); 105 void malloc_tsd_dalloc(void *wrapper); 106 void malloc_tsd_cleanup_register(bool (*f)(void)); 107 tsd_t *malloc_tsd_boot0(void); 108 void malloc_tsd_boot1(void); 109 void tsd_cleanup(void *arg); 110 tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal); 111 void tsd_state_set(tsd_t *tsd, uint8_t new_state); 112 void tsd_slow_update(tsd_t *tsd); 113 void tsd_prefork(tsd_t *tsd); 114 void tsd_postfork_parent(tsd_t *tsd); 115 void tsd_postfork_child(tsd_t *tsd); 116 117 /* 118 * Call ..._inc when your module wants to take all threads down the slow paths, 119 * and ..._dec when it no longer needs to. 120 */ 121 void tsd_global_slow_inc(tsdn_t *tsdn); 122 void tsd_global_slow_dec(tsdn_t *tsdn); 123 bool tsd_global_slow(); 124 125 enum { 126 /* Common case --> jnz. */ 127 tsd_state_nominal = 0, 128 /* Initialized but on slow path. */ 129 tsd_state_nominal_slow = 1, 130 /* 131 * Some thread has changed global state in such a way that all nominal 132 * threads need to recompute their fast / slow status the next time they 133 * get a chance. 134 * 135 * Any thread can change another thread's status *to* recompute, but 136 * threads are the only ones who can change their status *from* 137 * recompute. 138 */ 139 tsd_state_nominal_recompute = 2, 140 /* 141 * The above nominal states should be lower values. We use 142 * tsd_nominal_max to separate nominal states from threads in the 143 * process of being born / dying. 144 */ 145 tsd_state_nominal_max = 2, 146 147 /* 148 * A thread might free() during its death as its only allocator action; 149 * in such scenarios, we need tsd, but set up in such a way that no 150 * cleanup is necessary. 151 */ 152 tsd_state_minimal_initialized = 3, 153 /* States during which we know we're in thread death. */ 154 tsd_state_purgatory = 4, 155 tsd_state_reincarnated = 5, 156 /* 157 * What it says on the tin; tsd that hasn't been initialized. Note 158 * that even when the tsd struct lives in TLS, when need to keep track 159 * of stuff like whether or not our pthread destructors have been 160 * scheduled, so this really truly is different than the nominal state. 161 */ 162 tsd_state_uninitialized = 6 163 }; 164 165 /* 166 * Some TSD accesses can only be done in a nominal state. To enforce this, we 167 * wrap TSD member access in a function that asserts on TSD state, and mangle 168 * field names to prevent touching them accidentally. 169 */ 170 #define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n 171 172 #ifdef JEMALLOC_U8_ATOMICS 173 # define tsd_state_t atomic_u8_t 174 # define tsd_atomic_load atomic_load_u8 175 # define tsd_atomic_store atomic_store_u8 176 # define tsd_atomic_exchange atomic_exchange_u8 177 #else 178 # define tsd_state_t atomic_u32_t 179 # define tsd_atomic_load atomic_load_u32 180 # define tsd_atomic_store atomic_store_u32 181 # define tsd_atomic_exchange atomic_exchange_u32 182 #endif 183 184 /* The actual tsd. */ 185 struct tsd_s { 186 /* 187 * The contents should be treated as totally opaque outside the tsd 188 * module. Access any thread-local state through the getters and 189 * setters below. 190 */ 191 192 /* 193 * We manually limit the state to just a single byte. Unless the 8-bit 194 * atomics are unavailable (which is rare). 195 */ 196 tsd_state_t state; 197 #define O(n, t, nt) \ 198 t TSD_MANGLE(n); 199 MALLOC_TSD 200 #undef O 201 /* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */ 202 } JEMALLOC_ALIGNED(16); 203 204 JEMALLOC_ALWAYS_INLINE uint8_t 205 tsd_state_get(tsd_t *tsd) { 206 /* 207 * This should be atomic. Unfortunately, compilers right now can't tell 208 * that this can be done as a memory comparison, and forces a load into 209 * a register that hurts fast-path performance. 210 */ 211 /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */ 212 return *(uint8_t *)&tsd->state; 213 } 214 215 /* 216 * Wrapper around tsd_t that makes it possible to avoid implicit conversion 217 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be 218 * explicitly converted to tsd_t, which is non-nullable. 219 */ 220 struct tsdn_s { 221 tsd_t tsd; 222 }; 223 #define TSDN_NULL ((tsdn_t *)0) 224 JEMALLOC_ALWAYS_INLINE tsdn_t * 225 tsd_tsdn(tsd_t *tsd) { 226 return (tsdn_t *)tsd; 227 } 228 229 JEMALLOC_ALWAYS_INLINE bool 230 tsdn_null(const tsdn_t *tsdn) { 231 return tsdn == NULL; 232 } 233 234 JEMALLOC_ALWAYS_INLINE tsd_t * 235 tsdn_tsd(tsdn_t *tsdn) { 236 assert(!tsdn_null(tsdn)); 237 238 return &tsdn->tsd; 239 } 240 241 /* 242 * We put the platform-specific data declarations and inlines into their own 243 * header files to avoid cluttering this file. They define tsd_boot0, 244 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set. 245 */ 246 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP 247 #include "jemalloc/internal/tsd_malloc_thread_cleanup.h" 248 #elif (defined(JEMALLOC_TLS)) 249 #include "jemalloc/internal/tsd_tls.h" 250 #elif (defined(_WIN32)) 251 #include "jemalloc/internal/tsd_win.h" 252 #else 253 #include "jemalloc/internal/tsd_generic.h" 254 #endif 255 256 /* 257 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of 258 * foo. This omits some safety checks, and so can be used during tsd 259 * initialization and cleanup. 260 */ 261 #define O(n, t, nt) \ 262 JEMALLOC_ALWAYS_INLINE t * \ 263 tsd_##n##p_get_unsafe(tsd_t *tsd) { \ 264 return &tsd->TSD_MANGLE(n); \ 265 } 266 MALLOC_TSD 267 #undef O 268 269 /* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */ 270 #define O(n, t, nt) \ 271 JEMALLOC_ALWAYS_INLINE t * \ 272 tsd_##n##p_get(tsd_t *tsd) { \ 273 /* \ 274 * Because the state might change asynchronously if it's \ 275 * nominal, we need to make sure that we only read it once. \ 276 */ \ 277 uint8_t state = tsd_state_get(tsd); \ 278 assert(state == tsd_state_nominal || \ 279 state == tsd_state_nominal_slow || \ 280 state == tsd_state_nominal_recompute || \ 281 state == tsd_state_reincarnated || \ 282 state == tsd_state_minimal_initialized); \ 283 return tsd_##n##p_get_unsafe(tsd); \ 284 } 285 MALLOC_TSD 286 #undef O 287 288 /* 289 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn 290 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type. 291 */ 292 #define O(n, t, nt) \ 293 JEMALLOC_ALWAYS_INLINE nt * \ 294 tsdn_##n##p_get(tsdn_t *tsdn) { \ 295 if (tsdn_null(tsdn)) { \ 296 return NULL; \ 297 } \ 298 tsd_t *tsd = tsdn_tsd(tsdn); \ 299 return (nt *)tsd_##n##p_get(tsd); \ 300 } 301 MALLOC_TSD 302 #undef O 303 304 /* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */ 305 #define O(n, t, nt) \ 306 JEMALLOC_ALWAYS_INLINE t \ 307 tsd_##n##_get(tsd_t *tsd) { \ 308 return *tsd_##n##p_get(tsd); \ 309 } 310 MALLOC_TSD 311 #undef O 312 313 /* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */ 314 #define O(n, t, nt) \ 315 JEMALLOC_ALWAYS_INLINE void \ 316 tsd_##n##_set(tsd_t *tsd, t val) { \ 317 assert(tsd_state_get(tsd) != tsd_state_reincarnated && \ 318 tsd_state_get(tsd) != tsd_state_minimal_initialized); \ 319 *tsd_##n##p_get(tsd) = val; \ 320 } 321 MALLOC_TSD 322 #undef O 323 324 JEMALLOC_ALWAYS_INLINE void 325 tsd_assert_fast(tsd_t *tsd) { 326 /* 327 * Note that our fastness assertion does *not* include global slowness 328 * counters; it's not in general possible to ensure that they won't 329 * change asynchronously from underneath us. 330 */ 331 assert(!malloc_slow && tsd_tcache_enabled_get(tsd) && 332 tsd_reentrancy_level_get(tsd) == 0); 333 } 334 335 JEMALLOC_ALWAYS_INLINE bool 336 tsd_fast(tsd_t *tsd) { 337 bool fast = (tsd_state_get(tsd) == tsd_state_nominal); 338 if (fast) { 339 tsd_assert_fast(tsd); 340 } 341 342 return fast; 343 } 344 345 JEMALLOC_ALWAYS_INLINE tsd_t * 346 tsd_fetch_impl(bool init, bool minimal) { 347 tsd_t *tsd = tsd_get(init); 348 349 if (!init && tsd_get_allocates() && tsd == NULL) { 350 return NULL; 351 } 352 assert(tsd != NULL); 353 354 if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) { 355 return tsd_fetch_slow(tsd, minimal); 356 } 357 assert(tsd_fast(tsd)); 358 tsd_assert_fast(tsd); 359 360 return tsd; 361 } 362 363 /* Get a minimal TSD that requires no cleanup. See comments in free(). */ 364 JEMALLOC_ALWAYS_INLINE tsd_t * 365 tsd_fetch_min(void) { 366 return tsd_fetch_impl(true, true); 367 } 368 369 /* For internal background threads use only. */ 370 JEMALLOC_ALWAYS_INLINE tsd_t * 371 tsd_internal_fetch(void) { 372 tsd_t *tsd = tsd_fetch_min(); 373 /* Use reincarnated state to prevent full initialization. */ 374 tsd_state_set(tsd, tsd_state_reincarnated); 375 376 return tsd; 377 } 378 379 JEMALLOC_ALWAYS_INLINE tsd_t * 380 tsd_fetch(void) { 381 return tsd_fetch_impl(true, false); 382 } 383 384 static inline bool 385 tsd_nominal(tsd_t *tsd) { 386 return (tsd_state_get(tsd) <= tsd_state_nominal_max); 387 } 388 389 JEMALLOC_ALWAYS_INLINE tsdn_t * 390 tsdn_fetch(void) { 391 if (!tsd_booted_get()) { 392 return NULL; 393 } 394 395 return tsd_tsdn(tsd_fetch_impl(false, false)); 396 } 397 398 JEMALLOC_ALWAYS_INLINE rtree_ctx_t * 399 tsd_rtree_ctx(tsd_t *tsd) { 400 return tsd_rtree_ctxp_get(tsd); 401 } 402 403 JEMALLOC_ALWAYS_INLINE rtree_ctx_t * 404 tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) { 405 /* 406 * If tsd cannot be accessed, initialize the fallback rtree_ctx and 407 * return a pointer to it. 408 */ 409 if (unlikely(tsdn_null(tsdn))) { 410 rtree_ctx_data_init(fallback); 411 return fallback; 412 } 413 return tsd_rtree_ctx(tsdn_tsd(tsdn)); 414 } 415 416 #endif /* JEMALLOC_INTERNAL_TSD_H */ 417