#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
 * s: state
 * e: tcache_enabled
 * m: thread_allocated (config_stats)
 * f: thread_deallocated (config_stats)
 * p: prof_tdata (config_prof)
 * c: rtree_ctx (rtree cache accessed on deallocation)
 * t: tcache
 * --- data not accessed on tcache fast path: arena-related fields ---
 * d: arenas_tdata_bypass
 * r: reentrancy_level
 * x: narenas_tdata
 * i: iarena
 * a: arena
 * o: arenas_tdata
 * Loading TSD data is on the critical path of basically all malloc operations.
 * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
 * Use a compact layout to reduce cache footprint.
 * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
 * |---------------------------- 1st cacheline ----------------------------|
 * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
 * |---------------------------- 2nd cacheline ----------------------------|
 * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
 * |---------------------------- 3rd cacheline ----------------------------|
 * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
 * +-------------------------------------------------------------------------+
 * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
 *
 * The last 3 members (i, a and o) before tcache aren't really needed on tcache
 * fast path.  However we have a number of unused tcache bins and witnesses
 * (never touched unless config_debug) at the end of tcache, so we place them
 * there to avoid breaking the cachelines and possibly paging in an extra page.
 */
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
#  define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
/* Extra TSD fields used only by the test infrastructure (JEMALLOC_JET). */
#  define MALLOC_TEST_TSD \
	O(test_data,		int,			int)		\
	O(test_callback,	test_callback_t,	int)
#  define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
#  define MALLOC_TEST_TSD
#  define MALLOC_TEST_TSD_INITIALIZER
#endif

/*
 * X-macro describing every TSD field.  Each entry is expanded via a
 * locally-defined O(name, type, nullable type) macro; "nullable type" is the
 * type returned by the tsdn_*p_get() accessors (it differs from "type" only
 * for witness_tsd, whose nullable counterpart is witness_tsdn_t).
 */
/* O(name, type, nullable type) */
#define MALLOC_TSD							\
	O(tcache_enabled,	bool,			bool)		\
	O(arenas_tdata_bypass,	bool,			bool)		\
	O(reentrancy_level,	int8_t,			int8_t)		\
	O(narenas_tdata,	uint32_t,		uint32_t)	\
	O(offset_state,		uint64_t,		uint64_t)	\
	O(thread_allocated,	uint64_t,		uint64_t)	\
	O(thread_deallocated,	uint64_t,		uint64_t)	\
	O(prof_tdata,		prof_tdata_t *,		prof_tdata_t *)	\
	O(rtree_ctx,		rtree_ctx_t,		rtree_ctx_t)	\
	O(iarena,		arena_t *,		arena_t *)	\
	O(arena,		arena_t *,		arena_t *)	\
	O(arenas_tdata,		arena_tdata_t *,	arena_tdata_t *)\
	O(tcache,		tcache_t,		tcache_t)	\
	O(witness_tsd,		witness_tsd_t,		witness_tsdn_t)	\
	MALLOC_TEST_TSD

/*
 * Static initializer for struct tsd_s.  The first entry initializes the state
 * member; the remaining entries must stay in exactly the same order as the
 * fields listed in MALLOC_TSD above.
 */
#define TSD_INITIALIZER {						\
	tsd_state_uninitialized,					\
	TCACHE_ENABLED_ZERO_INITIALIZER,				\
	false,								\
	0,								\
	0,								\
	0,								\
	0,								\
	0,								\
	NULL,								\
	RTREE_CTX_ZERO_INITIALIZER,					\
	NULL,								\
	NULL,								\
	NULL,								\
	TCACHE_ZERO_INITIALIZER,					\
	WITNESS_TSD_INITIALIZER						\
	MALLOC_TEST_TSD_INITIALIZER					\
}

/* TSD lifecycle states; the numeric values are part of fast-path checks. */
enum {
	tsd_state_nominal = 0, /* Common case --> jnz. */
	tsd_state_nominal_slow = 1, /* Initialized but on slow path. */
	/* the above 2 nominal states should be lower values. */
	tsd_state_nominal_max = 1, /* used for comparison only. */
	tsd_state_minimal_initialized = 2,
	tsd_state_purgatory = 3,
	tsd_state_reincarnated = 4,
	tsd_state_uninitialized = 5
};

/* Manually limit tsd_state_t to a single byte. */
typedef uint8_t tsd_state_t;

/* The actual tsd. */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module.  Access any thread-local state through the getters and
	 * setters below.
	 */
	tsd_state_t	state;
	/* Expand MALLOC_TSD into one deliberately awkwardly-named field each. */
#define O(n, t, nt)							\
	t use_a_getter_or_setter_instead_##n;
MALLOC_TSD
#undef O
/* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
} JEMALLOC_ALIGNED(16);

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)

/* Convert a (non-nullable) tsd_t * to its nullable tsdn_t * counterpart. */
JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
	return (tsdn_t *)tsd;
}

/* True iff the nullable handle is NULL. */
JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
	return tsdn == NULL;
}

/* Convert a tsdn_t * back to tsd_t *; asserts that it is not NULL. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
	assert(!tsdn_null(tsdn));

	return &tsdn->tsd;
}

/* Out-of-line tsd module entry points (defined in tsd.c). */
void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
void malloc_tsd_cleanup_register(bool (*f)(void));
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
void tsd_slow_update(tsd_t *tsd);

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file.  They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo.  This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t *						\
tsd_##n##p_get_unsafe(tsd_t *tsd) {					\
	return &tsd->use_a_getter_or_setter_instead_##n;		\
}
MALLOC_TSD
#undef O

/*
 * tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo.
 * Unlike the _unsafe variant, it asserts that tsd is in a state where field
 * access is legitimate (nominal, nominal_slow, reincarnated, or
 * minimal_initialized).
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t *						\
tsd_##n##p_get(tsd_t *tsd) {						\
	assert(tsd->state == tsd_state_nominal ||			\
	    tsd->state == tsd_state_nominal_slow ||			\
	    tsd->state == tsd_state_reincarnated ||			\
	    tsd->state == tsd_state_minimal_initialized);		\
	return tsd_##n##p_get_unsafe(tsd);				\
}
MALLOC_TSD
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE nt *						\
tsdn_##n##p_get(tsdn_t *tsdn) {						\
	if (tsdn_null(tsdn)) {						\
		return NULL;						\
	}								\
	tsd_t *tsd = tsdn_tsd(tsdn);					\
	return (nt *)tsd_##n##p_get(tsd);				\
}
MALLOC_TSD
#undef O

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t						\
tsd_##n##_get(tsd_t *tsd) {						\
	return *tsd_##n##p_get(tsd);					\
}
MALLOC_TSD
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val.
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE void						\
tsd_##n##_set(tsd_t *tsd, t val) {					\
	assert(tsd->state != tsd_state_reincarnated &&			\
	    tsd->state != tsd_state_minimal_initialized);		\
	*tsd_##n##p_get(tsd) = val;					\
}
MALLOC_TSD
#undef O

/*
 * Assert the invariants of the tcache fast path: global slow-path mode is
 * off, the thread's tcache is enabled, and there is no reentrancy.
 */
JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}

/* Return whether this thread is currently eligible for the fast path. */
JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
	bool fast = (tsd->state == tsd_state_nominal);
	if (fast) {
		tsd_assert_fast(tsd);
	}

	return fast;
}

/*
 * Common fetch logic.  init is forwarded to tsd_get(); minimal is forwarded
 * to tsd_fetch_slow() when the state is not nominal.  Returns NULL only when
 * init is false and the platform's tsd_get() would have to allocate.
 */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	/* Any non-nominal state (including uninitialized) takes the slow path. */
	if (unlikely(tsd->state != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}

/* Get a minimal TSD that requires no cleanup.  See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
	return tsd_fetch_impl(true, true);
}

/* For internal background threads use only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
	tsd_t *tsd = tsd_fetch_min();
	/* Use reincarnated state to prevent full initialization. */
	tsd->state = tsd_state_reincarnated;

	return tsd;
}

/* Fetch (and if necessary fully initialize) this thread's TSD. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
	return tsd_fetch_impl(true, false);
}

/* True iff tsd is in one of the two nominal states (nominal/nominal_slow). */
static inline bool
tsd_nominal(tsd_t *tsd) {
	return (tsd->state <= tsd_state_nominal_max);
}

/*
 * Nullable fetch: returns NULL before the tsd module has booted, and
 * otherwise fetches without forcing initialization (init == false).
 */
JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
	if (!tsd_booted_get()) {
		return NULL;
	}

	return tsd_tsdn(tsd_fetch_impl(false, false));
}

/* Convenience accessor for the thread-local rtree lookup cache. */
JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
	return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
	/*
	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
	 * return a pointer to it.
	 */
	if (unlikely(tsdn_null(tsdn))) {
		rtree_ctx_data_init(fallback);
		return fallback;
	}
	return tsd_rtree_ctx(tsdn_tsd(tsdn));
}

#endif /* JEMALLOC_INTERNAL_TSD_H */