/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team 1998-2008
 *
 * The block allocator and free list manager.
 *
 * This is the architecture independent part of the block allocator.
 * It requires only the following support from the operating system:
 *
 *    void *getMBlocks(uint32_t n);
 *
 * returns the address of an n*MBLOCK_SIZE region of memory, aligned on
 * an MBLOCK_SIZE boundary.  There are no other restrictions on the
 * addresses of memory returned by getMBlocks().
 *
 * ---------------------------------------------------------------------------*/

#include "PosixSource.h"
#include "Rts.h"

#include "Storage.h"
#include "RtsUtils.h"
#include "BlockAlloc.h"
#include "OSMem.h"

#include <string.h>

static void initMBlock(void *mblock, uint32_t node);

/* -----------------------------------------------------------------------------

  Implementation notes
  ~~~~~~~~~~~~~~~~~~~~

  Terminology:
    - bdescr = block descriptor
    - bgroup = block group (1 or more adjacent blocks)
    - mblock = mega block
    - mgroup = mega group (1 or more adjacent mblocks)

  Invariants on block descriptors
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  bd->start always points to the start of the block.

  bd->free is either:
     - zero for a non-group-head; bd->link points to the head
     - (-1) for the head of a free block group
     - or it points within the block (group)

  bd->blocks is either:
     - zero for a non-group-head; bd->link points to the head
     - number of blocks in this group otherwise

  bd->link either points to a block descriptor or is NULL

  The following fields are not used by the allocator:
    bd->flags
    bd->gen_no
    bd->gen
    bd->dest

  Exceptions: we don't maintain invariants for all the blocks within a
  group on the free list, because it is expensive to modify every
  bdescr in a group when coalescing.  Just the head and last bdescrs
  will be correct for a group on the free list.


  Free lists
  ~~~~~~~~~~

  Preliminaries:
    - most allocations are for a small number of blocks
    - sometimes the OS gives us new memory backwards in the address
      space, sometimes forwards, so we should not be biased towards
      any particular layout in the address space
    - We want to avoid fragmentation
    - We want allocation and freeing to be O(1) or close.

  Coalescing trick: when a bgroup is freed (freeGroup()), we can check
  whether it can be coalesced with other free bgroups by checking the
  bdescrs for the blocks on either side of it.  This means that we can
  coalesce in O(1) time.  Every free bgroup must have its head and tail
  bdescrs initialised, the rest don't matter.
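
  For example, suppose the group starting at bdescr p is being freed.
  The head bdescr of the group that follows it in memory is p + p->blocks,
  and the bdescr immediately before it, p - 1, is either the tail of the
  preceding group (blocks == 0, with link pointing to that group's head)
  or the head of a single-block group; both can be inspected in O(1) to
  see whether the neighbour is free (bd->free == -1).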

  We keep the free list in buckets, using a heap-sort strategy.
  Bucket N contains blocks with sizes 2^N to 2^(N+1)-1.  The list of
  blocks in each bucket is doubly-linked, so that if a block is
  coalesced we can easily remove it from its current free list.

  To allocate a new block of size S, grab a block from bucket
  log2ceiling(S) (i.e. log2() rounded up), in which all blocks are at
  least as big as S, and split it if necessary.  If there are no
  blocks in that bucket, look at bigger buckets until a block is found.
  Allocation is therefore O(logN) time.
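
  For example (illustrative numbers): a request for S = 5 blocks is
  served from bucket log2ceiling(5) = 3, which holds groups of 8-15
  blocks, so any group found there is large enough; the surplus (e.g.
  3 blocks when splitting a group of 8) goes back onto the free list
  in bucket log2(surplus) = 1.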

  To free a block:
    - coalesce it with neighbours.
    - remove coalesced neighbour(s) from free list(s)
    - add the new (coalesced) block to the front of the appropriate
      bucket, given by log2(S) where S is the size of the block.

  Free is O(1).

  Megablocks
  ~~~~~~~~~~
  Separately from the free list of block groups, which are smaller than
  an mblock, we maintain a free list of mblock groups.  This is the unit
  of memory the operating system gives us, and we may either split mblocks
  into blocks or allocate them directly (for very large contiguous regions
  of memory).  mblocks have a different set of invariants than blocks:

  bd->start points to the start of the block IF the block is in the first mblock
  bd->blocks and bd->link are only valid IF this block is the first block
    of the first mblock
  No other fields are used (in particular, free is not used, meaning that
    space that is not used by the (single) object is wasted).

  This has implications for the free list as well:
  We cannot play the coalescing trick with mblocks, because there is
  no requirement that the bdescrs in the second and subsequent mblock
  of an mgroup are initialised (the mgroup might be filled with a
  large array, overwriting the bdescrs for example).

  The separate free list for megablocks is thus sorted in *address*
  order, so that we can coalesce.  Allocation in this list is best-fit
  by traversing the whole list: we don't expect this list to be long,
  and allocation/freeing of large blocks is rare; avoiding
  fragmentation is more important than performance here.

  freeGroup() might end up moving a block from free_list to
  free_mblock_list, if after coalescing we end up with a full mblock.

  checkFreeListSanity() checks all the invariants on the free lists.

  --------------------------------------------------------------------------- */

/* ---------------------------------------------------------------------------
   WATCH OUT FOR OVERFLOW

   Be very careful with integer overflow here.  If you have an
   expression like (n_blocks * BLOCK_SIZE), and n_blocks is an int or
   a uint32_t, then it will very likely overflow on a 64-bit platform.
   Always cast to StgWord (or W_ for short) first: ((W_)n_blocks * BLOCK_SIZE).

   --------------------------------------------------------------------------- */
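
/* A concrete illustration of the overflow hazard above (assuming the
 * default 4096-byte BLOCK_SIZE): if n_blocks is a uint32_t holding 2^20,
 * then
 *
 *     n_blocks * BLOCK_SIZE        // multiplied at 32 bits: 2^32 wraps to 0
 *     (W_)n_blocks * BLOCK_SIZE    // widened first: 2^32, as intended
 *
 * The cast must be applied to an operand before the multiplication, not
 * to the already-overflowed result.
 */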

// free_list[i] contains blocks that are at least size 2^i, and at
// most size 2^(i+1) - 1.
//
// To find the free list in which to place a block, use log_2(size).
// To find a free block of the right size, use log_2_ceil(size).
//
// The largest free list (free_list[NUM_FREE_LISTS-1]) needs to contain sizes
// from half a megablock up to (but not including) a full megablock.

#define NUM_FREE_LISTS (MBLOCK_SHIFT-BLOCK_SHIFT)
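
// For example (with the default 4 KB blocks, BLOCK_SHIFT = 12, and 1 MB
// megablocks, MBLOCK_SHIFT = 20): NUM_FREE_LISTS = 8, and the largest
// bucket, free_list[7], holds groups of 128-255 blocks, i.e. half a
// megablock up to just under a full one.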

// In THREADED_RTS mode, the free list is protected by sm_mutex.

static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS];
static bdescr *free_mblock_list[MAX_NUMA_NODES];

W_ n_alloc_blocks;    // currently allocated blocks
W_ hw_alloc_blocks;   // high-water allocated blocks

W_ n_alloc_blocks_by_node[MAX_NUMA_NODES];

/* -----------------------------------------------------------------------------
   Initialisation
   -------------------------------------------------------------------------- */

void initBlockAllocator(void)
{
    uint32_t i, node;
    for (node = 0; node < MAX_NUMA_NODES; node++) {
        for (i=0; i < NUM_FREE_LISTS; i++) {
            free_list[node][i] = NULL;
        }
        free_mblock_list[node] = NULL;
        n_alloc_blocks_by_node[node] = 0;
    }
    n_alloc_blocks = 0;
    hw_alloc_blocks = 0;
}

/* -----------------------------------------------------------------------------
   Accounting
   -------------------------------------------------------------------------- */

STATIC_INLINE
void recordAllocatedBlocks(uint32_t node, uint32_t n)
{
    n_alloc_blocks += n;
    n_alloc_blocks_by_node[node] += n;
    if (n > 0 && n_alloc_blocks > hw_alloc_blocks) {
        hw_alloc_blocks = n_alloc_blocks;
    }
}

STATIC_INLINE
void recordFreedBlocks(uint32_t node, uint32_t n)
{
    ASSERT(n_alloc_blocks >= n);
    n_alloc_blocks -= n;
    n_alloc_blocks_by_node[node] -= n;
}

/* -----------------------------------------------------------------------------
   Allocation
   -------------------------------------------------------------------------- */

STATIC_INLINE bdescr *
tail_of (bdescr *bd)
{
    return bd + bd->blocks - 1;
}

STATIC_INLINE void
initGroup(bdescr *head)
{
    head->free = head->start;
    head->link = NULL;

    // If this is a block group (but not a megablock group), we
    // make the last block of the group point to the head.  This is used
    // when coalescing blocks in freeGroup().  We don't do this for
    // megablock groups because blocks in the second and subsequent
    // mblocks don't have bdescrs; freeing these is handled in a
    // different way by free_mblock_group().
    if (head->blocks > 1 && head->blocks <= BLOCKS_PER_MBLOCK) {
        bdescr *last = tail_of(head);
        last->blocks = 0;
        last->link = head;
    }

#if defined(DEBUG)
    for (uint32_t i=0; i < head->blocks; i++) {
        head[i].flags = 0;
    }
#endif
}

#if SIZEOF_VOID_P == SIZEOF_LONG
#define CLZW(n) (__builtin_clzl(n))
#else
#define CLZW(n) (__builtin_clzll(n))
#endif

// log base 2 (floor), needs to support up to (2^NUM_FREE_LISTS)-1
STATIC_INLINE uint32_t
log_2(W_ n)
{
    ASSERT(n > 0 && n < (1<<NUM_FREE_LISTS));
#if defined(__GNUC__)
    return CLZW(n) ^ (sizeof(StgWord)*8 - 1);
    // generates good code on x86.  __builtin_clz() compiles to bsr+xor, but
    // we want just bsr, so the xor here cancels out gcc's xor.
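    // For example, on a 64-bit platform n = 5 (0b101) gives CLZW(5) = 61,
    // and 61 ^ 63 = 2 = floor(log2(5)).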
#else
    W_ i, x;
    x = n;
    for (i=0; i < NUM_FREE_LISTS; i++) {
        x = x >> 1;
        if (x == 0) return i;
    }
    return NUM_FREE_LISTS;
#endif
}

// log base 2 (ceiling), needs to support up to (2^NUM_FREE_LISTS)-1
STATIC_INLINE uint32_t
log_2_ceil(W_ n)
{
    ASSERT(n > 0 && n < (1<<NUM_FREE_LISTS));
#if defined(__GNUC__)
    uint32_t r = log_2(n);
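    // n & (n-1) clears the lowest set bit, so it is zero exactly when n is
    // a power of two; only non-powers-of-two need rounding up.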
    return (n & (n-1)) ? r+1 : r;
#else
    // Note: this fallback must use NUM_FREE_LISTS (the macro defined in
    // this file); the old MAX_FREE_LIST name no longer exists.
    W_ i, x;
    x = 1;
    for (i=0; i < NUM_FREE_LISTS; i++) {
        if (x >= n) return i;
        x = x << 1;
    }
    return NUM_FREE_LISTS;
#endif
}

STATIC_INLINE void
free_list_insert (uint32_t node, bdescr *bd)
{
    uint32_t ln;

    ASSERT(bd->blocks < BLOCKS_PER_MBLOCK);
    ln = log_2(bd->blocks);

    dbl_link_onto(bd, &free_list[node][ln]);
}

// After splitting a group, the last block of each group must have a
// tail that points to the head block, to keep our invariants for
// coalescing.
STATIC_INLINE void
setup_tail (bdescr *bd)
{
    bdescr *tail;
    tail = tail_of(bd);
    if (tail != bd) {
        tail->blocks = 0;
        tail->free = 0;
        tail->link = bd;
    }
}


// Take a free block group bd, and split off a group of size n from
// it.  Adjust the free list as necessary, and return the new group.
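// For example, splitting n = 3 blocks off a free group of 10 returns the
// last 3 blocks as the new group and re-inserts the remaining 7-block
// group into bucket log_2(7) = 2.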
static bdescr *
split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln /* log_2_ceil(n) */)
{
    bdescr *fg; // free group

    ASSERT(bd->blocks > n);
    dbl_link_remove(bd, &free_list[node][ln]);
    fg = bd + bd->blocks - n; // take n blocks off the end
    fg->blocks = n;
    bd->blocks -= n;
    setup_tail(bd);
    ln = log_2(bd->blocks);
    dbl_link_onto(bd, &free_list[node][ln]);
    return fg;
}

// Take N blocks off the end, free the rest.
static bdescr *
split_block_high (bdescr *bd, W_ n)
{
    ASSERT(bd->blocks > n);

    bdescr* ret = bd + bd->blocks - n; // take n blocks off the end
    ret->blocks = n;
    ret->start = ret->free = bd->start + (bd->blocks - n)*BLOCK_SIZE_W;
    ret->link = NULL;

    bd->blocks -= n;

    setup_tail(ret);
    setup_tail(bd);
    freeGroup(bd);

    return ret;
}

// Like `split_block_high`, but takes n blocks off the beginning rather
// than the end.
static bdescr *
split_block_low (bdescr *bd, W_ n)
{
    ASSERT(bd->blocks > n);

    bdescr* bd_ = bd + n;
    bd_->blocks = bd->blocks - n;
    bd_->start = bd_->free = bd->start + n*BLOCK_SIZE_W;

    bd->blocks = n;

    setup_tail(bd_);
    setup_tail(bd);
    freeGroup(bd_);

    return bd;
}

/* Only initializes the start pointers on the first megablock and the
 * blocks field of the first bdescr; callers are responsible for calling
 * initGroup afterwards.
 */
static bdescr *
alloc_mega_group (uint32_t node, StgWord mblocks)
{
    bdescr *best, *bd, *prev;
    StgWord n;

    n = MBLOCK_GROUP_BLOCKS(mblocks);

    best = NULL;
    prev = NULL;
    for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
    {
        if (bd->blocks == n)
        {
            if (prev) {
                prev->link = bd->link;
            } else {
                free_mblock_list[node] = bd->link;
            }
            return bd;
        }
        else if (bd->blocks > n)
        {
            if (!best || bd->blocks < best->blocks)
            {
                best = bd;
            }
        }
    }

    if (best)
    {
        // we take our chunk off the end here.
        StgWord best_mblocks = BLOCKS_TO_MBLOCKS(best->blocks);
        bd = FIRST_BDESCR((StgWord8*)MBLOCK_ROUND_DOWN(best) +
                          (best_mblocks-mblocks)*MBLOCK_SIZE);

        best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks);
        initMBlock(MBLOCK_ROUND_DOWN(bd), node);
    }
    else
    {
        void *mblock;
        if (RtsFlags.GcFlags.numa) {
            mblock = getMBlocksOnNode(node, mblocks);
        } else {
            mblock = getMBlocks(mblocks);
        }
        initMBlock(mblock, node); // only need to init the 1st one
        bd = FIRST_BDESCR(mblock);
    }
    bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks);
    return bd;
}

bdescr *
allocGroupOnNode (uint32_t node, W_ n)
{
    bdescr *bd, *rem;
    StgWord ln;

    if (n == 0) barf("allocGroup: requested zero blocks");

    if (n >= BLOCKS_PER_MBLOCK)
    {
        StgWord mblocks;

        mblocks = BLOCKS_TO_MBLOCKS(n);

        // n_alloc_blocks doesn't count the extra blocks we get in a
        // megablock group.
        recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK);

        bd = alloc_mega_group(node, mblocks);
        // only the bdescrs of the first MB are required to be initialised
        initGroup(bd);
        goto finish;
    }

    recordAllocatedBlocks(node, n);

    ln = log_2_ceil(n);

    while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) {
        ln++;
    }

    if (ln == NUM_FREE_LISTS) {
#if 0  /* useful for debugging fragmentation */
        if ((W_)mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W
             - (W_)((n_alloc_blocks - n) * BLOCK_SIZE_W) > (2*1024*1024)/sizeof(W_)) {
            debugBelch("Fragmentation, wanted %d blocks, %ld MB free\n", n, ((mblocks_allocated * BLOCKS_PER_MBLOCK) - n_alloc_blocks) / BLOCKS_PER_MBLOCK);
            RtsFlags.DebugFlags.block_alloc = 1;
            checkFreeListSanity();
        }
#endif

        bd = alloc_mega_group(node,1);
        bd->blocks = n;
        initGroup(bd);                   // we know the group will fit
        rem = bd + n;
        rem->blocks = BLOCKS_PER_MBLOCK-n;
        initGroup(rem);                  // init the slop
        recordAllocatedBlocks(node,rem->blocks);
        freeGroup(rem);                  // add the slop on to the free list
        goto finish;
    }

    bd = free_list[node][ln];

    if (bd->blocks == n)            // exactly the right size!
    {
        dbl_link_remove(bd, &free_list[node][ln]);
        initGroup(bd);
    }
    else if (bd->blocks > n)        // block too big...
    {
        bd = split_free_block(bd, node, n, ln);
        ASSERT(bd->blocks == n);
        initGroup(bd);
    }
    else
    {
        barf("allocGroup: free list corrupted");
    }

finish:
    IF_DEBUG(zero_on_gc, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE));
    IF_DEBUG(sanity, checkFreeListSanity());
    return bd;
}

// Allocate `n` blocks aligned to `n` blocks, e.g. when n = 8, the blocks will
// be aligned at `8 * BLOCK_SIZE`. For a group with `n` blocks this can be used
// for easily computing the beginning of the group from a location p in the
// group:
//
//     group_start = p - (p % (BLOCK_SIZE*n))
//
// Used by the non-moving collector for allocating segments.
//
// Because the storage manager does not support aligned allocations, we have to
// allocate `2*n - 1` blocks here to make sure we'll be able to find an aligned
// region in the allocated blocks. After finding the aligned area we want to
// free slop on the low and high sides, and the block allocator doesn't support
// freeing only some portion of a megablock (we can only free whole megablocks).
// So we disallow allocating megablocks here, and allow allocating at most
// `BLOCKS_PER_MBLOCK / 2` blocks.
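//
// For example (illustrative numbers): for n = 8 we allocate at least
// 2*8 - 1 = 15 blocks; any 15 contiguous blocks must contain a run of 8
// blocks starting on an 8-block boundary. The blocks below that run
// (slop_low) and above it (slop_high) are split off and freed again below.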
bdescr *
allocAlignedGroupOnNode (uint32_t node, W_ n)
{
    // allocate enough blocks to have enough space aligned at n-block boundary
    // free any slops on the low and high side of this space

    // number of blocks to allocate to make sure we have enough aligned space
    W_ num_blocks = 2*n - 1;

    if (num_blocks >= BLOCKS_PER_MBLOCK) {
        barf("allocAlignedGroupOnNode: allocating megablocks is not supported\n"
             "    requested blocks: %" FMT_Word "\n"
             "    required for alignment: %" FMT_Word "\n"
             "    megablock size (in blocks): %" FMT_Word,
             n, num_blocks, (W_) BLOCKS_PER_MBLOCK);
    }

    W_ group_size = n * BLOCK_SIZE;

    // To reduce splitting and fragmentation we use allocLargeChunkOnNode here.
    // Tweak the max allocation to avoid allocating megablocks. Splitting slop
    // below doesn't work with megablocks (freeGroup can't free only a portion
    // of a megablock so we can't allocate megablocks and free some parts of
    // them).
    W_ max_blocks = stg_min(num_blocks * 3, BLOCKS_PER_MBLOCK - 1);
    bdescr *bd = allocLargeChunkOnNode(node, num_blocks, max_blocks);
    // We may allocate more than num_blocks, so update it
    num_blocks = bd->blocks;

    // slop on the low side
    W_ slop_low = 0;
    if ((uintptr_t)bd->start % group_size != 0) {
        slop_low = group_size - ((uintptr_t)bd->start % group_size);
    }

    W_ slop_high = (num_blocks * BLOCK_SIZE) - group_size - slop_low;

    ASSERT((slop_low % BLOCK_SIZE) == 0);
    ASSERT((slop_high % BLOCK_SIZE) == 0);

    W_ slop_low_blocks = slop_low / BLOCK_SIZE;
    W_ slop_high_blocks = slop_high / BLOCK_SIZE;

    ASSERT(slop_low_blocks + slop_high_blocks + n == num_blocks);

#if defined(DEBUG)
    checkFreeListSanity();
    W_ free_before = countFreeList();
#endif

    if (slop_low_blocks != 0) {
        bd = split_block_high(bd, num_blocks - slop_low_blocks);
        ASSERT(countBlocks(bd) == num_blocks - slop_low_blocks);
    }

#if defined(DEBUG)
    ASSERT(countFreeList() == free_before + slop_low_blocks);
    checkFreeListSanity();
#endif

    // At this point the bd should be aligned, but we may have slop on the high side
    ASSERT((uintptr_t)bd->start % group_size == 0);

#if defined(DEBUG)
    free_before = countFreeList();
#endif

    if (slop_high_blocks != 0) {
        bd = split_block_low(bd, n);
        ASSERT(bd->blocks == n);
    }

#if defined(DEBUG)
    ASSERT(countFreeList() == free_before + slop_high_blocks);
    checkFreeListSanity();
#endif

    // Should still be aligned
    ASSERT((uintptr_t)bd->start % group_size == 0);

    // Just to make sure I get this right
    ASSERT(Bdescr(bd->start) == bd);

    return bd;
}

STATIC_INLINE
uint32_t nodeWithLeastBlocks (void)
{
    uint32_t node = 0, i;
    uint32_t min_blocks = n_alloc_blocks_by_node[0];
    for (i = 1; i < n_numa_nodes; i++) {
        if (n_alloc_blocks_by_node[i] < min_blocks) {
            min_blocks = n_alloc_blocks_by_node[i];
            node = i;
        }
    }
    return node;
}

bdescr* allocGroup (W_ n)
{
    return allocGroupOnNode(nodeWithLeastBlocks(),n);
}


//
// Allocate a chunk of blocks that is at least min and at most max
// blocks in size. This API is used by the nursery allocator that
// wants contiguous memory preferably, but doesn't require it.  When
// memory is fragmented we might have lots of chunks that are
// less than a full megablock, so allowing the nursery allocator to
// use these reduces fragmentation considerably.  e.g. on a GHC build
// with +RTS -H, I saw fragmentation go from 17MB down to 3MB on a
// single compile.
//
// Further to this: in #7257 there is a program that creates serious
// fragmentation such that the heap is full of tiny <4 block chains.
// The nursery allocator therefore has to use single blocks to avoid
// fragmentation, but we make sure that we allocate large blocks
// preferably if there are any.
//
bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
{
    bdescr *bd;
    StgWord ln, lnmax;

    if (min >= BLOCKS_PER_MBLOCK) {
        return allocGroupOnNode(node,max);
    }

    ln = log_2_ceil(min);
    lnmax = log_2_ceil(max);

    while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) {
        ln++;
    }
    if (ln == NUM_FREE_LISTS || ln == lnmax) {
        return allocGroupOnNode(node,max);
    }
    bd = free_list[node][ln];

    if (bd->blocks <= max)          // it fits within our limit: take it whole
    {
        dbl_link_remove(bd, &free_list[node][ln]);
        initGroup(bd);
    }
    else                            // block too big...
    {
        bd = split_free_block(bd, node, max, ln);
        ASSERT(bd->blocks == max);
        initGroup(bd);
    }

    recordAllocatedBlocks(node, bd->blocks);

    IF_DEBUG(zero_on_gc, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE));
    IF_DEBUG(sanity, checkFreeListSanity());
    return bd;
}

bdescr* allocLargeChunk (W_ min, W_ max)
{
    return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max);
}

bdescr *
allocGroup_lock(W_ n)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocGroup(n);
    RELEASE_SM_LOCK;
    return bd;
}

bdescr *
allocBlock_lock(void)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocBlock();
    RELEASE_SM_LOCK;
    return bd;
}

bdescr *
allocGroupOnNode_lock(uint32_t node, W_ n)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocGroupOnNode(node,n);
    RELEASE_SM_LOCK;
    return bd;
}

bdescr *
allocBlockOnNode_lock(uint32_t node)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocBlockOnNode(node);
    RELEASE_SM_LOCK;
    return bd;
}

/* -----------------------------------------------------------------------------
   De-Allocation
   -------------------------------------------------------------------------- */

STATIC_INLINE bdescr *
coalesce_mblocks (bdescr *p)
{
    bdescr *q;

    q = p->link;
    if (q != NULL &&
        MBLOCK_ROUND_DOWN(q) ==
        (StgWord8*)MBLOCK_ROUND_DOWN(p) +
        BLOCKS_TO_MBLOCKS(p->blocks) * MBLOCK_SIZE) {
        // can coalesce
        p->blocks = MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(p->blocks) +
                                        BLOCKS_TO_MBLOCKS(q->blocks));
        p->link = q->link;
        return p;
    }
    return q;
}

static void
free_mega_group (bdescr *mg)
{
    bdescr *bd, *prev;
    uint32_t node;

    // Find the right place in the free list.  free_mblock_list is
    // sorted by *address*, not by size as the free_list is.
    prev = NULL;
    node = mg->node;
    bd = free_mblock_list[node];
    while (bd && bd->start < mg->start) {
        prev = bd;
        bd = bd->link;
    }

    // coalesce backwards
    if (prev)
    {
        mg->link = prev->link;
        prev->link = mg;
        mg = coalesce_mblocks(prev);
    }
    else
    {
        mg->link = free_mblock_list[node];
        free_mblock_list[node] = mg;
    }
    // coalesce forwards
    coalesce_mblocks(mg);

    IF_DEBUG(sanity, checkFreeListSanity());
}


/* Note [Data races in freeGroup]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * freeGroup commits a rather serious concurrency sin in its block coalescence
 * logic: When freeing a block it looks at bd->free of the previous/next block
 * to see whether it is allocated. However, the free'ing thread likely does not
 * own the previous/next block, nor do we make any attempt to synchronize with
 * the thread that *does* own it; this makes this access a data race.
 *
 * The original design argued that this was correct because `bd->free` will
 * only take a value of -1 when the block is free and thereby owned by the
 * storage manager. However, this is nevertheless unsafe under the C11 data
 * model, which guarantees no particular semantics for data races.
 *
 * We currently assume (and hope) we won't see torn values and consequently
 * we will never see `bd->free == -1` for an allocated block which we do not
 * own. However, this is all extremely dodgy.
 *
 * This is tracked as #18913.
 */

void
freeGroup(bdescr *p)
{
    StgWord ln;
    uint32_t node;

    // not true in multithreaded GC:
    // ASSERT_SM_LOCK();

    ASSERT(RELAXED_LOAD(&p->free) != (P_)-1);

#if defined(DEBUG)
    for (uint32_t i=0; i < p->blocks; i++) {
        p[i].flags = 0;
    }
#endif

    node = p->node;

    RELAXED_STORE(&p->free, (void *) -1);  /* indicates that this block is free */
    RELAXED_STORE(&p->gen, NULL);
    RELAXED_STORE(&p->gen_no, 0);
    /* fill the block group with garbage if sanity checking is on */
    IF_DEBUG(zero_on_gc, memset(p->start, 0xaa, (W_)p->blocks * BLOCK_SIZE));

    if (p->blocks == 0) barf("freeGroup: block size is zero");

    if (p->blocks >= BLOCKS_PER_MBLOCK)
    {
        StgWord mblocks;

        mblocks = BLOCKS_TO_MBLOCKS(p->blocks);
        // If this is an mgroup, make sure it has the right number of blocks
        ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks));

        recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK);

        free_mega_group(p);
        return;
    }

    recordFreedBlocks(node, p->blocks);

    // coalesce forwards
    {
        bdescr *next;
        next = p + p->blocks;

        // See Note [Data races in freeGroup].
        TSAN_ANNOTATE_BENIGN_RACE(&next->free, "freeGroup");
        if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(p))
            && RELAXED_LOAD(&next->free) == (P_)-1)
        {
            p->blocks += next->blocks;
            ln = log_2(next->blocks);
            dbl_link_remove(next, &free_list[node][ln]);
            if (p->blocks == BLOCKS_PER_MBLOCK)
            {
                free_mega_group(p);
                return;
            }
            setup_tail(p);
        }
    }

    // coalesce backwards
    if (p != FIRST_BDESCR(MBLOCK_ROUND_DOWN(p)))
    {
        bdescr *prev;
        prev = p - 1;
        if (prev->blocks == 0) prev = prev->link; // find the head

        // See Note [Data races in freeGroup].
        TSAN_ANNOTATE_BENIGN_RACE(&prev->free, "freeGroup");
        if (RELAXED_LOAD(&prev->free) == (P_)-1)
        {
            ln = log_2(prev->blocks);
            dbl_link_remove(prev, &free_list[node][ln]);
            prev->blocks += p->blocks;
            if (prev->blocks >= BLOCKS_PER_MBLOCK)
            {
                free_mega_group(prev);
                return;
            }
            p = prev;
        }
    }

    setup_tail(p);
    free_list_insert(node,p);

    IF_DEBUG(sanity, checkFreeListSanity());
}

void
freeGroup_lock(bdescr *p)
{
    ACQUIRE_SM_LOCK;
    freeGroup(p);
    RELEASE_SM_LOCK;
}

void
freeChain(bdescr *bd)
{
    bdescr *next_bd;
    while (bd != NULL) {
        next_bd = bd->link;
        freeGroup(bd);
        bd = next_bd;
    }
}

void
freeChain_lock(bdescr *bd)
{
    ACQUIRE_SM_LOCK;
    freeChain(bd);
    RELEASE_SM_LOCK;
}

static void
initMBlock(void *mblock, uint32_t node)
{
    bdescr *bd;
    StgWord8 *block;

    /* the first few bdescrs in an mblock are unused, so we don't want to
     * put them all on the free list.
     */
    block = FIRST_BLOCK(mblock);
    bd    = FIRST_BDESCR(mblock);

    /* Initialise the start field of each block descriptor
     */
    for (; block <= (StgWord8*)LAST_BLOCK(mblock); bd += 1,
             block += BLOCK_SIZE) {
        bd->start = (void*)block;
        bd->node = node;
    }
}

/* -----------------------------------------------------------------------------
   Stats / metrics
   -------------------------------------------------------------------------- */

W_
countBlocks(bdescr *bd)
{
    W_ n;
    for (n=0; bd != NULL; bd=bd->link) {
        n += bd->blocks;
    }
    return n;
}

// (*1) Just like countBlocks, except that we adjust the count for a
// megablock group so that it doesn't include the extra few blocks
// that would be taken up by block descriptors in the second and
// subsequent megablock.  This is so we can tally the count with the
// number of blocks allocated in the system, for memInventory().
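//
// For example (assuming 256 blocks per megablock of which 252 are usable,
// i.e. BLOCKS_PER_MBLOCK = 252): a 2-megablock group has bd->blocks =
// 252 + 256 = 508, and we subtract (256 - 252) * (508/256 = 1) = 4,
// giving 504 = 2 * BLOCKS_PER_MBLOCK, which tallies with the accounting
// done in recordAllocatedBlocks().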
W_
countAllocdBlocks(bdescr *bd)
{
    W_ n;
    for (n=0; bd != NULL; bd=bd->link) {
        n += bd->blocks;

        // hack for megablock groups: see (*1) above
        if (bd->blocks > BLOCKS_PER_MBLOCK) {
            n -= (MBLOCK_SIZE / BLOCK_SIZE - BLOCKS_PER_MBLOCK)
                * (bd->blocks/(MBLOCK_SIZE/BLOCK_SIZE));
        }
    }
    return n;
}

void returnMemoryToOS(uint32_t n /* megablocks */)
{
    bdescr *bd;
    uint32_t node;
    StgWord size;

    // ToDo: not fair, we free all the memory starting with node 0.
    for (node = 0; n > 0 && node < n_numa_nodes; node++) {
        bd = free_mblock_list[node];
        while ((n > 0) && (bd != NULL)) {
            size = BLOCKS_TO_MBLOCKS(bd->blocks);
            if (size > n) {
                StgWord newSize = size - n;
                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
                freeAddr += newSize * MBLOCK_SIZE;
                bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
                freeMBlocks(freeAddr, n);
                n = 0;
            }
            else {
                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
                n -= size;
                bd = bd->link;
                freeMBlocks(freeAddr, size);
            }
        }
        free_mblock_list[node] = bd;
    }

    // Ask the OS to release any address space portion
    // that was associated with the just released MBlocks
    //
    // Historically, we used to ask the OS directly (via
    // osReleaseFreeMemory()) - now the MBlock layer might
    // have a reason to preserve the address space range,
    // so we keep it
    releaseFreeMemory();

    IF_DEBUG(gc,
        if (n != 0) {
            debugBelch("Wanted to free %d more MBlocks than are freeable\n",
                       n);
        }
    );
}

/* -----------------------------------------------------------------------------
   Debugging
   -------------------------------------------------------------------------- */

#if defined(DEBUG)
static void
check_tail (bdescr *bd)
{
    bdescr *tail = tail_of(bd);

    if (tail != bd)
    {
        ASSERT(tail->blocks == 0);
        ASSERT(tail->free == 0);
        ASSERT(tail->link == bd);
    }
}

void
checkFreeListSanity(void)
{
    bdescr *bd, *prev;
    StgWord ln, min;
    uint32_t node;

    for (node = 0; node < n_numa_nodes; node++) {
        min = 1;
        for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
            IF_DEBUG(block_alloc,
                     debugBelch("free block list [%" FMT_Word "]:\n", ln));

            prev = NULL;
            for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link)
            {
                IF_DEBUG(block_alloc,
                         debugBelch("group at %p, length %ld blocks\n",
                                    bd->start, (long)bd->blocks));
                ASSERT(bd->free == (P_)-1);
                ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK);
                ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1));
                ASSERT(bd->link != bd); // catch easy loops
                ASSERT(bd->node == node);

                check_tail(bd);

                if (prev)
                    ASSERT(bd->u.back == prev);
                else
                    ASSERT(bd->u.back == NULL);

                {
                    bdescr *next;
                    next = bd + bd->blocks;
                    if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd)))
                    {
                        ASSERT(next->free != (P_)-1);
                    }
                }
            }
            min = min << 1;
        }

        prev = NULL;
        for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
        {
            IF_DEBUG(block_alloc,
                     debugBelch("mega group at %p, length %ld blocks\n",
                                bd->start, (long)bd->blocks));

            ASSERT(bd->link != bd); // catch easy loops

            if (bd->link != NULL)
            {
                // make sure the list is sorted
                ASSERT(bd->start < bd->link->start);
            }

            ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK);
            ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks))
                   == bd->blocks);

            // make sure we're fully coalesced
            if (bd->link != NULL)
            {
                ASSERT(MBLOCK_ROUND_DOWN(bd->link) !=
                       (StgWord8*)MBLOCK_ROUND_DOWN(bd) +
                       BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE);
            }
        }
    }
}

W_ /* BLOCKS */
countFreeList(void)
{
    bdescr *bd;
    W_ total_blocks = 0;
    StgWord ln;
    uint32_t node;

    for (node = 0; node < n_numa_nodes; node++) {
        for (ln=0; ln < NUM_FREE_LISTS; ln++) {
            for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) {
                total_blocks += bd->blocks;
            }
        }
        for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) {
            total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks);
            // The caller of this function, memInventory(), expects to match
            // the total number of blocks in the system against mblocks *
            // BLOCKS_PER_MBLOCK, so we must subtract the space for the
            // block descriptors from *every* mblock.
        }
    }
    return total_blocks;
}

void
markBlocks (bdescr *bd)
{
    for (; bd != NULL; bd = bd->link) {
        bd->flags |= BF_KNOWN;
    }
}

void
reportUnmarkedBlocks (void)
{
    void *mblock;
    void *state;
    bdescr *bd;

    debugBelch("Unreachable blocks:\n");
    for (mblock = getFirstMBlock(&state); mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        for (bd = FIRST_BDESCR(mblock); bd <= LAST_BDESCR(mblock); ) {
            if (!(bd->flags & BF_KNOWN) && bd->free != (P_)-1) {
                debugBelch("  %p\n",bd);
            }
            if (bd->blocks >= BLOCKS_PER_MBLOCK) {
                mblock = (StgWord8*)mblock +
                    (BLOCKS_TO_MBLOCKS(bd->blocks) - 1) * MBLOCK_SIZE;
                break;
            } else {
                bd += bd->blocks;
            }
        }
    }
}

#endif