/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team 1998-2008
 *
 * The block allocator and free list manager.
 *
 * This is the architecture independent part of the block allocator.
 * It requires only the following support from the operating system:
 *
 *    void *getMBlocks(uint32_t n);
 *
 * returns the address of an n*MBLOCK_SIZE region of memory, aligned on
 * an MBLOCK_SIZE boundary.  There are no other restrictions on the
 * addresses of memory returned by getMBlocks().
 *
 * ---------------------------------------------------------------------------*/

#include "PosixSource.h"
#include "Rts.h"

#include "Storage.h"
#include "RtsUtils.h"
#include "BlockAlloc.h"
#include "OSMem.h"

#include <string.h>

static void  initMBlock(void *mblock, uint32_t node);

/* -----------------------------------------------------------------------------

  Implementation notes
  ~~~~~~~~~~~~~~~~~~~~

  Terminology:
    - bdescr = block descriptor
    - bgroup = block group (1 or more adjacent blocks)
    - mblock = mega block
    - mgroup = mega group (1 or more adjacent mblocks)

   Invariants on block descriptors
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   bd->start always points to the start of the block.

   bd->free is either:
      - zero for a non-group-head; bd->link points to the head
      - (-1) for the head of a free block group
      - or it points within the block (group)

   bd->blocks is either:
      - zero for a non-group-head; bd->link points to the head
      - number of blocks in this group otherwise

   bd->link either points to a block descriptor or is NULL

   The following fields are not used by the allocator:
     bd->flags
     bd->gen_no
     bd->gen
     bd->dest

  Exceptions: we don't maintain invariants for all the blocks within a
  group on the free list, because it is expensive to modify every
  bdescr in a group when coalescing.  Just the head and last bdescrs
  will be correct for a group on the free list.


  Free lists
  ~~~~~~~~~~

  Preliminaries:
    - most allocations are for a small number of blocks
    - sometimes the OS gives us new memory backwards in the address
      space, sometimes forwards, so we should not be biased towards
      any particular layout in the address space
    - We want to avoid fragmentation
    - We want allocation and freeing to be O(1) or close.

  Coalescing trick: when a bgroup is freed (freeGroup()), we can check
  whether it can be coalesced with other free bgroups by checking the
  bdescrs for the blocks on either side of it.  This means that we can
  coalesce in O(1) time.  Every free bgroup must have its head and tail
  bdescrs initialised, the rest don't matter.

  We keep the free list in buckets, using a heap-sort strategy.
  Bucket N contains block groups with sizes from 2^N up to 2^(N+1)-1.
  The list of blocks in each bucket is doubly-linked, so that if a
  block is coalesced we can easily remove it from its current free
  list.

  To allocate a new block of size S, grab a block from bucket
  log2ceiling(S) (i.e. log2() rounded up), in which all blocks are at
  least as big as S, and split it if necessary.  If there are no
  blocks in that bucket, look at bigger buckets until a block is
  found.  Allocation is therefore O(log N) time.

  To free a block:
    - coalesce it with neighbours.
    - remove coalesced neighbour(s) from free list(s)
    - add the new (coalesced) block to the front of the appropriate
      bucket, given by log2(S) where S is the size of the block.

  Free is O(1).

  Megablocks
  ~~~~~~~~~~

  Separately from the free list of block groups, which are smaller than
  an mblock, we maintain a free list of mblock groups.  This is the unit
  of memory the operating system gives us, and we may either split mblocks
  into blocks or allocate them directly (for very large contiguous regions
  of memory).  mblocks have a different set of invariants than blocks:

  bd->start points to the start of the block IF the block is in the first mblock
  bd->blocks and bd->link are only valid IF this block is the first block
    of the first mblock
  No other fields are used (in particular, free is not used, meaning that
    space that is not used by the (single) object is wasted).

  This has implications for the free list as well:
  We cannot play the coalescing trick with mblocks, because there is
  no requirement that the bdescrs in the second and subsequent mblock
  of an mgroup are initialised (the mgroup might be filled with a
  large array, overwriting the bdescrs for example).

  The separate free list for megablocks is thus sorted in *address*
  order, so that we can coalesce.  Allocation in this list is best-fit
  by traversing the whole list: we don't expect this list to be long,
  and allocation/freeing of large blocks is rare; avoiding
  fragmentation is more important than performance here.

  freeGroup() might end up moving a block from free_list to
  free_mblock_list, if after coalescing we end up with a full mblock.

  checkFreeListSanity() checks all the invariants on the free lists.

  --------------------------------------------------------------------------- */
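
/* A quick reference to the bdescr fields this file relies on (a summary of
 * the invariants above, not an authoritative definition -- see the bdescr
 * struct in Block.h for that):
 *
 *    bd->start    first word of the block
 *    bd->free     first free word; (P_)-1 marks the head of a free group
 *    bd->link     next group on a free list, or pointer back to the head
 *    bd->u.back   previous group on a (doubly-linked) block free list
 *    bd->blocks   number of blocks in the group (0 for non-heads)
 *    bd->node     NUMA node this memory belongs to
 */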

/* ---------------------------------------------------------------------------
   WATCH OUT FOR OVERFLOW

   Be very careful with integer overflow here.  If you have an
   expression like (n_blocks * BLOCK_SIZE), and n_blocks is an int or
   a uint32_t, then it will very likely overflow on a 64-bit platform.
   Always cast to StgWord (or W_ for short) first: ((W_)n_blocks * BLOCK_SIZE).

  --------------------------------------------------------------------------- */
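
/* For example (illustrative numbers only, assuming the usual 4Kbyte
 * BLOCK_SIZE): if n_blocks is a uint32_t holding 2^21, then
 * (n_blocks * BLOCK_SIZE) is evaluated at 32 bits and wraps, whereas
 * ((W_)n_blocks * BLOCK_SIZE) is evaluated at word width and yields the
 * intended 2^33.
 */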

// free_list[i] contains blocks that are at least size 2^i, and at
// most size 2^(i+1) - 1.
//
// To find the free list in which to place a block, use log_2(size).
// To find a free block of the right size, use log_2_ceil(size).
//
// The largest free list (free_list[NUM_FREE_LISTS-1]) needs to contain sizes
// from half a megablock up to (but not including) a full megablock.

#define NUM_FREE_LISTS (MBLOCK_SHIFT-BLOCK_SHIFT)

// In THREADED_RTS mode, the free list is protected by sm_mutex.

static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS];
static bdescr *free_mblock_list[MAX_NUMA_NODES];

W_ n_alloc_blocks;   // currently allocated blocks
W_ hw_alloc_blocks;  // high-water allocated blocks

W_ n_alloc_blocks_by_node[MAX_NUMA_NODES];

/* -----------------------------------------------------------------------------
   Initialisation
   -------------------------------------------------------------------------- */

void initBlockAllocator(void)
{
    uint32_t i, node;
    for (node = 0; node < MAX_NUMA_NODES; node++) {
        for (i=0; i < NUM_FREE_LISTS; i++) {
            free_list[node][i] = NULL;
        }
        free_mblock_list[node] = NULL;
        n_alloc_blocks_by_node[node] = 0;
    }
    n_alloc_blocks = 0;
    hw_alloc_blocks = 0;
}

/* -----------------------------------------------------------------------------
   Accounting
   -------------------------------------------------------------------------- */

STATIC_INLINE
void recordAllocatedBlocks(uint32_t node, uint32_t n)
{
    n_alloc_blocks += n;
    n_alloc_blocks_by_node[node] += n;
    if (n > 0 && n_alloc_blocks > hw_alloc_blocks) {
        hw_alloc_blocks = n_alloc_blocks;
    }
}

STATIC_INLINE
void recordFreedBlocks(uint32_t node, uint32_t n)
{
    ASSERT(n_alloc_blocks >= n);
    n_alloc_blocks -= n;
    n_alloc_blocks_by_node[node] -= n;
}

/* -----------------------------------------------------------------------------
   Allocation
   -------------------------------------------------------------------------- */

STATIC_INLINE bdescr *
tail_of (bdescr *bd)
{
    return bd + bd->blocks - 1;
}

STATIC_INLINE void
initGroup(bdescr *head)
{
  head->free   = head->start;
  head->link   = NULL;

  // If this is a block group (but not a megablock group), we
  // make the last block of the group point to the head.  This is used
  // when coalescing blocks in freeGroup().  We don't do this for
  // megablock groups because blocks in the second and subsequent
  // mblocks don't have bdescrs; freeing these is handled in a
  // different way by free_mblock_group().
  if (head->blocks > 1 && head->blocks <= BLOCKS_PER_MBLOCK) {
      bdescr *last = tail_of(head);
      last->blocks = 0;
      last->link = head;
  }

#if defined(DEBUG)
  for (uint32_t i=0; i < head->blocks; i++) {
      head[i].flags = 0;
  }
#endif
}

#if SIZEOF_VOID_P == SIZEOF_LONG
#define CLZW(n) (__builtin_clzl(n))
#else
#define CLZW(n) (__builtin_clzll(n))
#endif

// log base 2 (floor), needs to support up to (2^NUM_FREE_LISTS)-1
STATIC_INLINE uint32_t
log_2(W_ n)
{
    ASSERT(n > 0 && n < (1<<NUM_FREE_LISTS));
#if defined(__GNUC__)
    return CLZW(n) ^ (sizeof(StgWord)*8 - 1);
    // generates good code on x86.  __builtin_clz() compiles to bsr+xor, but
    // we want just bsr, so the xor here cancels out gcc's xor.
#else
    W_ i, x;
    x = n;
    for (i=0; i < NUM_FREE_LISTS; i++) {
        x = x >> 1;
        if (x == 0) return i;
    }
    return NUM_FREE_LISTS;
#endif
}

// log base 2 (ceiling), needs to support up to (2^NUM_FREE_LISTS)-1
STATIC_INLINE uint32_t
log_2_ceil(W_ n)
{
    ASSERT(n > 0 && n < (1<<NUM_FREE_LISTS));
#if defined(__GNUC__)
    uint32_t r = log_2(n);
    return (n & (n-1)) ? r+1 : r;
#else
    W_ i, x;
    x = 1;
    for (i=0; i < NUM_FREE_LISTS; i++) {
        if (x >= n) return i;
        x = x << 1;
    }
    return NUM_FREE_LISTS;
#endif
}
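
// Worked examples (for illustration): log_2(1) == 0, log_2(5) == 2 and
// log_2_ceil(5) == 3.  So a free group of 5 blocks is kept on
// free_list[..][2] (sizes 4-7), while a request for 5 blocks starts
// searching at free_list[..][3] (sizes 8-15), where every group is
// guaranteed to be large enough.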

STATIC_INLINE void
free_list_insert (uint32_t node, bdescr *bd)
{
    uint32_t ln;

    ASSERT(bd->blocks < BLOCKS_PER_MBLOCK);
    ln = log_2(bd->blocks);

    dbl_link_onto(bd, &free_list[node][ln]);
}

// After splitting a group, the tail bdescr of each resulting group must
// point back to the head block, to keep our invariants for coalescing.
STATIC_INLINE void
setup_tail (bdescr *bd)
{
    bdescr *tail;
    tail = tail_of(bd);
    if (tail != bd) {
        tail->blocks = 0;
        tail->free = 0;
        tail->link = bd;
    }
}

// Take a free block group bd, and split off a group of size n from
// it.  Adjust the free list as necessary, and return the new group.
static bdescr *
split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln /* log_2_ceil(n) */)
{
    bdescr *fg; // free group

    ASSERT(bd->blocks > n);
    dbl_link_remove(bd, &free_list[node][ln]);
    fg = bd + bd->blocks - n; // take n blocks off the end
    fg->blocks = n;
    bd->blocks -= n;
    setup_tail(bd);
    ln = log_2(bd->blocks);
    dbl_link_onto(bd, &free_list[node][ln]);
    return fg;
}
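
// For example (illustrative): splitting a 13-block free group to satisfy a
// request for n = 4 removes it from its bucket, hands back the top 4 blocks
// as the new group, and re-inserts the remaining 9-block group on
// free_list[..][log_2(9)] == free_list[..][3].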

// Take N blocks off the end, free the rest.
static bdescr *
split_block_high (bdescr *bd, W_ n)
{
    ASSERT(bd->blocks > n);

    bdescr* ret = bd + bd->blocks - n; // take n blocks off the end
    ret->blocks = n;
    ret->start = ret->free = bd->start + (bd->blocks - n)*BLOCK_SIZE_W;
    ret->link = NULL;

    bd->blocks -= n;

    setup_tail(ret);
    setup_tail(bd);
    freeGroup(bd);

    return ret;
}

// Like `split_block_high`, but takes n blocks off the beginning rather
// than the end.
static bdescr *
split_block_low (bdescr *bd, W_ n)
{
    ASSERT(bd->blocks > n);

    bdescr* bd_ = bd + n;
    bd_->blocks = bd->blocks - n;
    bd_->start = bd_->free = bd->start + n*BLOCK_SIZE_W;

    bd->blocks = n;

    setup_tail(bd_);
    setup_tail(bd);
    freeGroup(bd_);

    return bd;
}
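
// To illustrate how the two splits combine (hypothetical numbers): given a
// 15-block group covering blocks 0..14, split_block_high(bd, 12) keeps
// blocks 3..14 and frees 0..2, and a subsequent split_block_low(bd, 8) on
// the result keeps blocks 3..10 and frees 11..14.  This is exactly the
// pattern allocAlignedGroupOnNode uses below to trim slop on both sides.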

/* Only initializes the start pointers on the first megablock and the
 * blocks field of the first bdescr; callers are responsible for calling
 * initGroup afterwards.
 */
static bdescr *
alloc_mega_group (uint32_t node, StgWord mblocks)
{
    bdescr *best, *bd, *prev;
    StgWord n;

    n = MBLOCK_GROUP_BLOCKS(mblocks);

    best = NULL;
    prev = NULL;
    for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
    {
        if (bd->blocks == n)
        {
            if (prev) {
                prev->link = bd->link;
            } else {
                free_mblock_list[node] = bd->link;
            }
            return bd;
        }
        else if (bd->blocks > n)
        {
            if (!best || bd->blocks < best->blocks)
            {
                best = bd;
            }
        }
    }

    if (best)
    {
        // we take our chunk off the end here.
        StgWord best_mblocks  = BLOCKS_TO_MBLOCKS(best->blocks);
        bd = FIRST_BDESCR((StgWord8*)MBLOCK_ROUND_DOWN(best) +
                          (best_mblocks-mblocks)*MBLOCK_SIZE);

        best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks);
        initMBlock(MBLOCK_ROUND_DOWN(bd), node);
    }
    else
    {
        void *mblock;
        if (RtsFlags.GcFlags.numa) {
            mblock = getMBlocksOnNode(node, mblocks);
        } else {
            mblock = getMBlocks(mblocks);
        }
        initMBlock(mblock, node); // only need to init the 1st one
        bd = FIRST_BDESCR(mblock);
    }
    bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks);
    return bd;
}

bdescr *
allocGroupOnNode (uint32_t node, W_ n)
{
    bdescr *bd, *rem;
    StgWord ln;

    if (n == 0) barf("allocGroup: requested zero blocks");

    if (n >= BLOCKS_PER_MBLOCK)
    {
        StgWord mblocks;

        mblocks = BLOCKS_TO_MBLOCKS(n);

        // n_alloc_blocks doesn't count the extra blocks we get in a
        // megablock group.
        recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK);

        bd = alloc_mega_group(node, mblocks);
        // only the bdescrs of the first MB are required to be initialised
        initGroup(bd);
        goto finish;
    }

    recordAllocatedBlocks(node, n);

    ln = log_2_ceil(n);

    while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) {
        ln++;
    }

    if (ln == NUM_FREE_LISTS) {
#if 0  /* useful for debugging fragmentation */
        if ((W_)mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W
             - (W_)((n_alloc_blocks - n) * BLOCK_SIZE_W) > (2*1024*1024)/sizeof(W_)) {
            debugBelch("Fragmentation, wanted %d blocks, %ld MB free\n", n, ((mblocks_allocated * BLOCKS_PER_MBLOCK) - n_alloc_blocks) / BLOCKS_PER_MBLOCK);
            RtsFlags.DebugFlags.block_alloc = 1;
            checkFreeListSanity();
        }
#endif

        bd = alloc_mega_group(node,1);
        bd->blocks = n;
        initGroup(bd);                   // we know the group will fit
        rem = bd + n;
        rem->blocks = BLOCKS_PER_MBLOCK-n;
        initGroup(rem);                  // init the slop
        recordAllocatedBlocks(node,rem->blocks);
        freeGroup(rem);                  // add the slop on to the free list
        goto finish;
    }

    bd = free_list[node][ln];

    if (bd->blocks == n)                // exactly the right size!
    {
        dbl_link_remove(bd, &free_list[node][ln]);
        initGroup(bd);
    }
    else if (bd->blocks >  n)            // block too big...
    {
        bd = split_free_block(bd, node, n, ln);
        ASSERT(bd->blocks == n);
        initGroup(bd);
    }
    else
    {
        barf("allocGroup: free list corrupted");
    }

finish:
    IF_DEBUG(zero_on_gc, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE));
    IF_DEBUG(sanity, checkFreeListSanity());
    return bd;
}
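
// A minimal usage sketch (illustrative; in the threaded RTS the caller must
// hold sm_mutex, or use the _lock variants below):
//
//     bdescr *bd = allocGroupOnNode(0, 4);    // 4 contiguous blocks on node 0
//     // ... bd->start points at 4 * BLOCK_SIZE_W words of usable memory ...
//     freeGroup(bd);                          // return them to the free list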

// Allocate `n` blocks aligned to `n` blocks, e.g. when n = 8, the blocks will
// be aligned at `8 * BLOCK_SIZE`. For a group with `n` blocks this makes it
// easy to find the beginning of the group from a location p in the group:
//
//     p - (p % (BLOCK_SIZE*n))
//
// Used by the non-moving collector for allocating segments.
//
// Because the storage manager does not support aligned allocations, we have to
// allocate `2*n - 1` blocks here to make sure we'll be able to find an aligned
// region in the allocated blocks. After finding the aligned area we want to
// free slop on the low and high sides, and the block allocator doesn't support
// freeing only some portion of a megablock (we can only free whole megablocks).
// So we disallow allocating megablocks here, and allow allocating at most
// `BLOCKS_PER_MBLOCK / 2` blocks.
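//
// For example (illustrative): with n = 8 we allocate at least 2*8 - 1 = 15
// blocks; whatever the start address of that group, it must contain a run of
// 8 consecutive blocks starting on an 8*BLOCK_SIZE boundary, and the blocks
// on either side of that run are freed back as slop below.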
bdescr *
allocAlignedGroupOnNode (uint32_t node, W_ n)
{
    // allocate enough blocks to have enough space aligned at n-block boundary
    // free any slops on the low and high side of this space

    // number of blocks to allocate to make sure we have enough aligned space
    W_ num_blocks = 2*n - 1;

    if (num_blocks >= BLOCKS_PER_MBLOCK) {
        barf("allocAlignedGroupOnNode: allocating megablocks is not supported\n"
             "    requested blocks: %" FMT_Word "\n"
             "    required for alignment: %" FMT_Word "\n"
             "    megablock size (in blocks): %" FMT_Word,
             n, num_blocks, (W_) BLOCKS_PER_MBLOCK);
    }

    W_ group_size = n * BLOCK_SIZE;

    // To reduce splitting and fragmentation we use allocLargeChunkOnNode here.
    // Tweak the max allocation to avoid allocating megablocks. Splitting slop
    // below doesn't work with megablocks (freeGroup can't free only a portion
    // of a megablock so we can't allocate megablocks and free some parts of
    // them).
    W_ max_blocks = stg_min(num_blocks * 3, BLOCKS_PER_MBLOCK - 1);
    bdescr *bd = allocLargeChunkOnNode(node, num_blocks, max_blocks);
    // We may allocate more than num_blocks, so update it
    num_blocks = bd->blocks;

    // slop on the low side
    W_ slop_low = 0;
    if ((uintptr_t)bd->start % group_size != 0) {
        slop_low = group_size - ((uintptr_t)bd->start % group_size);
    }

    W_ slop_high = (num_blocks * BLOCK_SIZE) - group_size - slop_low;

    ASSERT((slop_low % BLOCK_SIZE) == 0);
    ASSERT((slop_high % BLOCK_SIZE) == 0);

    W_ slop_low_blocks = slop_low / BLOCK_SIZE;
    W_ slop_high_blocks = slop_high / BLOCK_SIZE;

    ASSERT(slop_low_blocks + slop_high_blocks + n == num_blocks);

#if defined(DEBUG)
    checkFreeListSanity();
    W_ free_before = countFreeList();
#endif

    if (slop_low_blocks != 0) {
        bd = split_block_high(bd, num_blocks - slop_low_blocks);
        ASSERT(countBlocks(bd) == num_blocks - slop_low_blocks);
    }

#if defined(DEBUG)
    ASSERT(countFreeList() == free_before + slop_low_blocks);
    checkFreeListSanity();
#endif

    // At this point the bd should be aligned, but we may have slop on the high side
    ASSERT((uintptr_t)bd->start % group_size == 0);

#if defined(DEBUG)
    free_before = countFreeList();
#endif

    if (slop_high_blocks != 0) {
        bd = split_block_low(bd, n);
        ASSERT(bd->blocks == n);
    }

#if defined(DEBUG)
    ASSERT(countFreeList() == free_before + slop_high_blocks);
    checkFreeListSanity();
#endif

    // Should still be aligned
    ASSERT((uintptr_t)bd->start % group_size == 0);

    // Just to make sure I get this right
    ASSERT(Bdescr(bd->start) == bd);

    return bd;
}

STATIC_INLINE
uint32_t nodeWithLeastBlocks (void)
{
    uint32_t node = 0, i;
    uint32_t min_blocks = n_alloc_blocks_by_node[0];
    for (i = 1; i < n_numa_nodes; i++) {
        if (n_alloc_blocks_by_node[i] < min_blocks) {
            min_blocks = n_alloc_blocks_by_node[i];
            node = i;
        }
    }
    return node;
}

bdescr* allocGroup (W_ n)
{
    return allocGroupOnNode(nodeWithLeastBlocks(),n);
}


//
// Allocate a chunk of blocks that is at least min and at most max
// blocks in size. This API is used by the nursery allocator that
// wants contiguous memory preferably, but doesn't require it.  When
// memory is fragmented we might have lots of chunks that are
// less than a full megablock, so allowing the nursery allocator to
// use these reduces fragmentation considerably.  e.g. on a GHC build
// with +RTS -H, I saw fragmentation go from 17MB down to 3MB on a
// single compile.
//
// Further to this: in #7257 there is a program that creates serious
// fragmentation such that the heap is full of tiny <4 block chains.
// The nursery allocator therefore has to use single blocks to avoid
// fragmentation, but we make sure that we allocate large blocks
// preferably if there are any.
//
bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
{
    bdescr *bd;
    StgWord ln, lnmax;

    if (min >= BLOCKS_PER_MBLOCK) {
        return allocGroupOnNode(node,max);
    }

    ln = log_2_ceil(min);
    lnmax = log_2_ceil(max);

    while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) {
        ln++;
    }
    if (ln == NUM_FREE_LISTS || ln == lnmax) {
        return allocGroupOnNode(node,max);
    }
    bd = free_list[node][ln];

    if (bd->blocks <= max)              // small enough: take the whole group
    {
        dbl_link_remove(bd, &free_list[node][ln]);
        initGroup(bd);
    }
    else   // block too big...
    {
        bd = split_free_block(bd, node, max, ln);
        ASSERT(bd->blocks == max);
        initGroup(bd);
    }

    recordAllocatedBlocks(node, bd->blocks);

    IF_DEBUG(zero_on_gc, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE));
    IF_DEBUG(sanity, checkFreeListSanity());
    return bd;
}

bdescr* allocLargeChunk (W_ min, W_ max)
{
    return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max);
}
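
// A minimal usage sketch (illustrative): a nursery-style caller that would
// like a sizeable contiguous chunk but will settle for whatever single free
// group is available:
//
//     bdescr *chunk = allocLargeChunk(1, 128);   // between 1 and 128 blocks
//     // chunk->blocks tells us how many blocks we actually received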

bdescr *
allocGroup_lock(W_ n)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocGroup(n);
    RELEASE_SM_LOCK;
    return bd;
}

bdescr *
allocBlock_lock(void)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocBlock();
    RELEASE_SM_LOCK;
    return bd;
}

bdescr *
allocGroupOnNode_lock(uint32_t node, W_ n)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocGroupOnNode(node,n);
    RELEASE_SM_LOCK;
    return bd;
}

bdescr *
allocBlockOnNode_lock(uint32_t node)
{
    bdescr *bd;
    ACQUIRE_SM_LOCK;
    bd = allocBlockOnNode(node);
    RELEASE_SM_LOCK;
    return bd;
}

/* -----------------------------------------------------------------------------
   De-Allocation
   -------------------------------------------------------------------------- */

STATIC_INLINE bdescr *
coalesce_mblocks (bdescr *p)
{
    bdescr *q;

    q = p->link;
    if (q != NULL &&
        MBLOCK_ROUND_DOWN(q) ==
        (StgWord8*)MBLOCK_ROUND_DOWN(p) +
        BLOCKS_TO_MBLOCKS(p->blocks) * MBLOCK_SIZE) {
        // can coalesce
        p->blocks  = MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(p->blocks) +
                                         BLOCKS_TO_MBLOCKS(q->blocks));
        p->link = q->link;
        return p;
    }
    return q;
}

static void
free_mega_group (bdescr *mg)
{
    bdescr *bd, *prev;
    uint32_t node;

    // Find the right place in the free list.  free_mblock_list is
    // sorted by *address*, not by size as the free_list is.
    prev = NULL;
    node = mg->node;
    bd = free_mblock_list[node];
    while (bd && bd->start < mg->start) {
        prev = bd;
        bd = bd->link;
    }

    // coalesce backwards
    if (prev)
    {
        mg->link = prev->link;
        prev->link = mg;
        mg = coalesce_mblocks(prev);
    }
    else
    {
        mg->link = free_mblock_list[node];
        free_mblock_list[node] = mg;
    }
    // coalesce forwards
    coalesce_mblocks(mg);

    IF_DEBUG(sanity, checkFreeListSanity());
}


/* Note [Data races in freeGroup]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * freeGroup commits a rather serious concurrency sin in its block coalescence
 * logic: When freeing a block it looks at bd->free of the previous/next block
 * to see whether it is allocated. However, the free'ing thread likely does not
 * own the previous/next block, nor do we make any attempt to synchronize with
 * the thread that *does* own it; this makes this access a data race.
 *
 * The original design argued that this was correct because `bd->free` will
 * only take a value of -1 when the block is free and thereby owned by the
 * storage manager. However, this is nevertheless unsafe under the C11 data
 * model, which guarantees no particular semantics for data races.
 *
 * We currently assume (and hope) we won't see torn values and consequently
 * we will never see `bd->free == -1` for an allocated block which we do not
 * own. However, this is all extremely dodgy.
 *
 * This is tracked as #18913.
 */

void
freeGroup(bdescr *p)
{
  StgWord ln;
  uint32_t node;

  // not true in multithreaded GC:
  // ASSERT_SM_LOCK();

  ASSERT(RELAXED_LOAD(&p->free) != (P_)-1);

#if defined(DEBUG)
  for (uint32_t i=0; i < p->blocks; i++) {
      p[i].flags = 0;
  }
#endif

  node = p->node;

  RELAXED_STORE(&p->free, (void *) -1);  /* indicates that this block is free */
  RELAXED_STORE(&p->gen, NULL);
  RELAXED_STORE(&p->gen_no, 0);
  /* fill the block group with garbage if sanity checking is on */
  IF_DEBUG(zero_on_gc, memset(p->start, 0xaa, (W_)p->blocks * BLOCK_SIZE));

  if (p->blocks == 0) barf("freeGroup: block size is zero");

  if (p->blocks >= BLOCKS_PER_MBLOCK)
  {
      StgWord mblocks;

      mblocks = BLOCKS_TO_MBLOCKS(p->blocks);
      // If this is an mgroup, make sure it has the right number of blocks
      ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks));

      recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK);

      free_mega_group(p);
      return;
  }

  recordFreedBlocks(node, p->blocks);

  // coalesce forwards
  {
      bdescr *next;
      next = p + p->blocks;

      // See Note [Data races in freeGroup].
      TSAN_ANNOTATE_BENIGN_RACE(&next->free, "freeGroup");
      if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(p))
          && RELAXED_LOAD(&next->free) == (P_)-1)
      {
          p->blocks += next->blocks;
          ln = log_2(next->blocks);
          dbl_link_remove(next, &free_list[node][ln]);
          if (p->blocks == BLOCKS_PER_MBLOCK)
          {
              free_mega_group(p);
              return;
          }
          setup_tail(p);
      }
  }

  // coalesce backwards
  if (p != FIRST_BDESCR(MBLOCK_ROUND_DOWN(p)))
  {
      bdescr *prev;
      prev = p - 1;
      if (prev->blocks == 0) prev = prev->link; // find the head

      // See Note [Data races in freeGroup].
      TSAN_ANNOTATE_BENIGN_RACE(&prev->free, "freeGroup");
      if (RELAXED_LOAD(&prev->free) == (P_)-1)
      {
          ln = log_2(prev->blocks);
          dbl_link_remove(prev, &free_list[node][ln]);
          prev->blocks += p->blocks;
          if (prev->blocks >= BLOCKS_PER_MBLOCK)
          {
              free_mega_group(prev);
              return;
          }
          p = prev;
      }
  }

  setup_tail(p);
  free_list_insert(node,p);

  IF_DEBUG(sanity, checkFreeListSanity());
}

void
freeGroup_lock(bdescr *p)
{
    ACQUIRE_SM_LOCK;
    freeGroup(p);
    RELEASE_SM_LOCK;
}

void
freeChain(bdescr *bd)
{
  bdescr *next_bd;
  while (bd != NULL) {
    next_bd = bd->link;
    freeGroup(bd);
    bd = next_bd;
  }
}

void
freeChain_lock(bdescr *bd)
{
    ACQUIRE_SM_LOCK;
    freeChain(bd);
    RELEASE_SM_LOCK;
}

static void
initMBlock(void *mblock, uint32_t node)
{
    bdescr *bd;
    StgWord8 *block;

    /* the first few Bdescr's in a megablock are unused, so we don't want to
     * put them all on the free list.
     */
    block = FIRST_BLOCK(mblock);
    bd    = FIRST_BDESCR(mblock);

    /* Initialise the start field of each block descriptor
     */
    for (; block <= (StgWord8*)LAST_BLOCK(mblock); bd += 1,
             block += BLOCK_SIZE) {
        bd->start = (void*)block;
        bd->node = node;
    }
}

/* -----------------------------------------------------------------------------
   Stats / metrics
   -------------------------------------------------------------------------- */

W_
countBlocks(bdescr *bd)
{
    W_ n;
    for (n=0; bd != NULL; bd=bd->link) {
        n += bd->blocks;
    }
    return n;
}

// (*1) Just like countBlocks, except that we adjust the count for a
// megablock group so that it doesn't include the extra few blocks
// that would be taken up by block descriptors in the second and
// subsequent megablock.  This is so we can tally the count with the
// number of blocks allocated in the system, for memInventory().
W_
countAllocdBlocks(bdescr *bd)
{
    W_ n;
    for (n=0; bd != NULL; bd=bd->link) {
        n += bd->blocks;

        // hack for megablock groups: see (*1) above
        if (bd->blocks > BLOCKS_PER_MBLOCK) {
            n -= (MBLOCK_SIZE / BLOCK_SIZE - BLOCKS_PER_MBLOCK)
                * (bd->blocks/(MBLOCK_SIZE/BLOCK_SIZE));
        }
    }
    return n;
}
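
// A worked example of the adjustment above (with typical 64-bit sizes, where
// MBLOCK_SIZE/BLOCK_SIZE == 256 and BLOCKS_PER_MBLOCK == 252): a 2-mblock
// group has bd->blocks == MBLOCK_GROUP_BLOCKS(2) == 252 + 256 == 508, and we
// subtract (256 - 252) * (508 / 256) == 4, giving 504 == 2 * BLOCKS_PER_MBLOCK
// as memInventory() expects.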

void returnMemoryToOS(uint32_t n /* megablocks */)
{
    bdescr *bd;
    uint32_t node;
    StgWord size;

    // ToDo: not fair, we free all the memory starting with node 0.
    for (node = 0; n > 0 && node < n_numa_nodes; node++) {
        bd = free_mblock_list[node];
        while ((n > 0) && (bd != NULL)) {
            size = BLOCKS_TO_MBLOCKS(bd->blocks);
            if (size > n) {
                StgWord newSize = size - n;
                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
                freeAddr += newSize * MBLOCK_SIZE;
                bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
                freeMBlocks(freeAddr, n);
                n = 0;
            }
            else {
                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
                n -= size;
                bd = bd->link;
                freeMBlocks(freeAddr, size);
            }
        }
        free_mblock_list[node] = bd;
    }

    // Ask the OS to release any address space portion
    // that was associated with the just released MBlocks
    //
    // Historically, we used to ask the OS directly (via
    // osReleaseFreeMemory()) - now the MBlock layer might
    // have a reason to preserve the address space range,
    // so we keep it
    releaseFreeMemory();

    IF_DEBUG(gc,
        if (n != 0) {
            debugBelch("Wanted to free %d more MBlocks than are freeable\n",
                       n);
        }
    );
}

/* -----------------------------------------------------------------------------
   Debugging
   -------------------------------------------------------------------------- */

#if defined(DEBUG)
static void
check_tail (bdescr *bd)
{
    bdescr *tail = tail_of(bd);

    if (tail != bd)
    {
        ASSERT(tail->blocks == 0);
        ASSERT(tail->free == 0);
        ASSERT(tail->link == bd);
    }
}

void
checkFreeListSanity(void)
{
    bdescr *bd, *prev;
    StgWord ln, min;
    uint32_t node;

    for (node = 0; node < n_numa_nodes; node++) {
        min = 1;
        for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
            IF_DEBUG(block_alloc,
                     debugBelch("free block list [%" FMT_Word "]:\n", ln));

            prev = NULL;
            for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link)
            {
                IF_DEBUG(block_alloc,
                         debugBelch("group at %p, length %ld blocks\n",
                                    bd->start, (long)bd->blocks));
                ASSERT(bd->free == (P_)-1);
                ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK);
                ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1));
                ASSERT(bd->link != bd); // catch easy loops
                ASSERT(bd->node == node);

                check_tail(bd);

                if (prev)
                    ASSERT(bd->u.back == prev);
                else
                    ASSERT(bd->u.back == NULL);

                {
                    bdescr *next;
                    next = bd + bd->blocks;
                    if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd)))
                    {
                        ASSERT(next->free != (P_)-1);
                    }
                }
            }
            min = min << 1;
        }

        prev = NULL;
        for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
        {
            IF_DEBUG(block_alloc,
                     debugBelch("mega group at %p, length %ld blocks\n",
                                bd->start, (long)bd->blocks));

            ASSERT(bd->link != bd); // catch easy loops

            if (bd->link != NULL)
            {
                // make sure the list is sorted
                ASSERT(bd->start < bd->link->start);
            }

            ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK);
            ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks))
                   == bd->blocks);

            // make sure we're fully coalesced
            if (bd->link != NULL)
            {
                ASSERT(MBLOCK_ROUND_DOWN(bd->link) !=
                       (StgWord8*)MBLOCK_ROUND_DOWN(bd) +
                       BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE);
            }
        }
    }
}

W_ /* BLOCKS */
countFreeList(void)
{
  bdescr *bd;
  W_ total_blocks = 0;
  StgWord ln;
  uint32_t node;

  for (node = 0; node < n_numa_nodes; node++) {
      for (ln=0; ln < NUM_FREE_LISTS; ln++) {
          for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) {
              total_blocks += bd->blocks;
          }
      }
      for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) {
          total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks);
          // The caller of this function, memInventory(), expects to match
          // the total number of blocks in the system against mblocks *
          // BLOCKS_PER_MBLOCK, so we must subtract the space for the
          // block descriptors from *every* mblock.
      }
  }
  return total_blocks;
}

void
markBlocks (bdescr *bd)
{
    for (; bd != NULL; bd = bd->link) {
        bd->flags |= BF_KNOWN;
    }
}

void
reportUnmarkedBlocks (void)
{
    void *mblock;
    void *state;
    bdescr *bd;

    debugBelch("Unreachable blocks:\n");
    for (mblock = getFirstMBlock(&state); mblock != NULL;
         mblock = getNextMBlock(&state, mblock)) {
        for (bd = FIRST_BDESCR(mblock); bd <= LAST_BDESCR(mblock); ) {
            if (!(bd->flags & BF_KNOWN) && bd->free != (P_)-1) {
                debugBelch("  %p\n",bd);
            }
            if (bd->blocks >= BLOCKS_PER_MBLOCK) {
                mblock = (StgWord8*)mblock +
                    (BLOCKS_TO_MBLOCKS(bd->blocks) - 1) * MBLOCK_SIZE;
                break;
            } else {
                bd += bd->blocks;
            }
        }
    }
}

#endif