/* frame_malloc.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2009-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/

#include "frame_malloc.h"
#include "bug.h"
#include "local_state.h"
#include "cilk_malloc.h"

#ifndef __VXWORKS__
#include <memory.h>
#endif

/* #define USE_MMAP 1 */
#if USE_MMAP
#define __USE_MISC 1
#include <sys/mman.h>
#include <errno.h>
#endif

// Define to fill the stack frame header with the fill character when pushing
// it on a free list.  Note that this should be #ifdef'd out when checked in!

#ifdef _DEBUG
#define HEADER_FILL_CHAR 0xbf
#endif

// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning
// message if this is a release build

#if defined(NDEBUG) && defined (HEADER_FILL_CHAR)
#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build")
#endif

static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size);

#ifndef _WIN32

const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] =
{
    64, 128, 256, 512, 1024, 2048
};

#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket]

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE 2048

#else // _WIN32

/* Note that this must match the implementation of framesz_to_bucket in
 * asmilator/layout.ml! */
#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket)))

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE                                   \
    FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1)

#endif // _WIN32

/* utility procedures */
static void push(struct free_list **b, struct free_list *p)
{
#ifdef HEADER_FILL_CHAR
    memset (p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0));
#endif
    /* cons! onto free list */
    p->cdr = *b;
    *b = p;
}

static struct free_list *pop(struct free_list **b)
{
    struct free_list *p = *b;
    if (p)
        *b = p->cdr;
    return p;
}
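
/* Illustrative note (added comment, not in the original source): push/pop
 * treat the free list as a Lisp-style cons list.  If *b currently holds
 * [A, B], then push(b, P) leaves [P, A, B], and a subsequent pop(b)
 * returns P and restores [A, B].  In debug builds, only the first
 * FRAME_MALLOC_BUCKET_TO_SIZE(0) == 64 bytes are scribbled with
 * HEADER_FILL_CHAR, regardless of which bucket the frame belongs to. */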

/*************************************************************
  global allocator:
*************************************************************/
/* request slightly less than 2^K from the OS, which after malloc
   overhead and alignment should end up filling each VM page almost
   completely.  128 is a guess at the total malloc overhead and cache
   line alignment */
#define FRAME_MALLOC_CHUNK (32 * 1024 - 128)
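
/* Illustrative arithmetic (added comment, not in the original source):
 * FRAME_MALLOC_CHUNK = 32 * 1024 - 128 = 32640 bytes.  One chunk can
 * therefore supply 510 frames from the 64-byte bucket, or 15 frames from
 * the 2048-byte bucket before the remainder is counted as waste. */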

/** Implements linked list of frames */
struct pool_cons {
    char *p;                /**< This element of the list */
    struct pool_cons *cdr;  /**< Remainder of the list */
};

static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    g->frame_malloc.pool_begin =
        (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);
    g->frame_malloc.pool_end =
        g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;
    c->p = g->frame_malloc.pool_begin;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}

/* the size is already canonicalized at this point */
static struct free_list *global_alloc(global_state_t *g, int bucket)
{
    struct free_list *mem;
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool += size;

    if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) {

        CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end);
        if (g->frame_malloc.pool_begin + size > g->frame_malloc.pool_end) {
            /* We waste the remaining fragment of the pool. */
            g->frame_malloc.wasted +=
                g->frame_malloc.pool_end - g->frame_malloc.pool_begin;
            extend_global_pool(g);
        }
        mem = (struct free_list *)g->frame_malloc.pool_begin;
        g->frame_malloc.pool_begin += size;
    }

    return mem;
}
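
/* Illustrative note (added comment, not in the original source):
 * global_alloc is a bump-pointer allocator over the current chunk.  For
 * example, if only 100 bytes remain between pool_begin and pool_end and a
 * 256-byte frame is requested, those 100 bytes are added to the wasted
 * counter, a fresh 32640-byte chunk is obtained, and pool_begin is then
 * bumped by 256. */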

static void global_free(global_state_t *g, void *mem, int bucket)
{
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool -= size;

    push(&g->frame_malloc.global_free_list[bucket], mem);
}

void __cilkrts_frame_malloc_global_init(global_state_t *g)
{
    int i;

    __cilkrts_mutex_init(&g->frame_malloc.lock);
    g->frame_malloc.check_for_leaks = 1;
    g->frame_malloc.pool_list = 0;
    g->frame_malloc.pool_begin = 0;
    g->frame_malloc.pool_end = 0;
    g->frame_malloc.batch_size = 8000;
    g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size;
    g->frame_malloc.allocated_from_os = 0;
    g->frame_malloc.allocated_from_global_pool = 0;
    g->frame_malloc.wasted = 0;
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        g->frame_malloc.global_free_list[i] = 0;
}

// Counts how many bytes are in the global free list.
static size_t count_memory_in_global_list(global_state_t *g)
{
    // Count the memory remaining in the global free list.
    size_t size_remaining_in_global_list = 0;
    int i;
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        struct free_list *p;
        size_t size_in_bucket = 0;
        p = g->frame_malloc.global_free_list[i];

        while (p) {
            size_in_bucket += FRAME_MALLOC_BUCKET_TO_SIZE(i);
            p = p->cdr;
        }
        size_remaining_in_global_list += size_in_bucket;
    }
    return size_remaining_in_global_list;
}


void __cilkrts_frame_malloc_global_cleanup(global_state_t *g)
{
    struct pool_cons *c;

    if (g->frame_malloc.check_for_leaks) {
        size_t memory_in_global_list = count_memory_in_global_list(g);
        // TBD: This check is weak.  Short of memory corruption,
        // I don't see how we could have more memory in the free list
        // than was allocated from the OS.
        // Ideally, we should count the memory in the global free list
        // and check that we have it all.  But I believe the runtime
        // itself also uses some memory, which is not being tracked.
        if (memory_in_global_list > g->frame_malloc.allocated_from_os) {
            __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n");
        }
    }

    while ((c = g->frame_malloc.pool_list)) {
        g->frame_malloc.pool_list = c->cdr;
        __cilkrts_free(c->p);
        __cilkrts_free(c);
    }

    __cilkrts_mutex_destroy(0, &g->frame_malloc.lock);

    // Check that all the memory moved from the global pool into
    // workers has been returned to the global pool.
    if (g->frame_malloc.check_for_leaks
        && (g->frame_malloc.allocated_from_global_pool != 0))
    {
        __cilkrts_bug("\n"
                      "---------------------------" "\n"
                      "  MEMORY LEAK DETECTED!!!  " "\n"
                      "---------------------------" "\n"
                      "\n"
            );
    }
}

/*************************************************************
  per-worker allocator
*************************************************************/
/* allocate a batch of frames of size SIZE from the global pool and
   store them in the worker's free list */
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size)
{
    global_state_t *g = w->g;

    __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
#if USE_MMAP
        /* Debugging path: allocate a single frame in its own mapping,
           flanked by PROT_NONE guard pages so that overruns fault fast. */
        char *p = mmap(0, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            __cilkrts_bug("mmap failed %d", errno);
        assert(size < 4096);
        assert(p != MAP_FAILED);
        mprotect(p, 4096, PROT_NONE);
        mprotect(p + 8192, 4096, PROT_NONE);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size));
#else
        size_t bytes_allocated = 0;
        do {
            w->l->bucket_potential[bucket] += size;
            bytes_allocated += size;
            push(&w->l->free_list[bucket], global_alloc(g, bucket));
        } while (bytes_allocated < g->frame_malloc.batch_size);
#endif
    } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
}
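
/* Illustrative arithmetic (added comment, not in the original source):
 * with the default batch_size of 8000 bytes, one call pulls 125 frames
 * from the 64-byte bucket but only 4 frames from the 2048-byte bucket,
 * so the global lock is taken roughly once per 8 KB of frame traffic
 * either way. */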

static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;
    size_t pot = w->l->bucket_potential[bucket];
    size_t newpot;

    /* Keep up to POT/2 elements in the free list.  The cost of
       counting up to POT/2 is amortized against POT. */
    for (newpot = 0, p = w->l->free_list[bucket]; p && 2 * newpot < pot;
         p = p->cdr, newpot += size)
        ;
    w->l->bucket_potential[bucket] = newpot;

    if (p) {
        /* free the rest of the list.  The cost of grabbing the lock
           is amortized against POT/2; the cost of traversing the rest
           of the list is amortized against the free operation that
           puts the element on the list. */
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p->cdr)))
#if USE_MMAP
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }
}
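
/* Illustrative note (added comment, not in the original source): if a
 * bucket's potential has grown to 32000 bytes (the default limit of
 * 4 * 8000), gc_bucket walks past roughly 16000 bytes' worth of frames,
 * keeps that prefix on the worker's list, and returns everything beyond
 * it to the global free list under the lock. */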

// Free all the memory in this bucket for the specified worker,
// returning it to the global pool's free list.
static void move_bucket_to_global_free_list(__cilkrts_worker *w,
                                            int bucket)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;
    p = w->l->free_list[bucket];

    if (p) {
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p))) {
#if USE_MMAP
                size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
            }
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }

    // I'm not sure this does anything useful now, since
    // the worker is about to be destroyed. But why not?
    w->l->bucket_potential[bucket] = 0;
}

static int bucket_of_size(size_t size)
{
    int i;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(i))
            return i;

    CILK_ASSERT(0 /* can't happen */);
    return -1;
}
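
/* Illustrative mapping (added comment, not in the original source), for
 * the non-Windows bucket table {64, 128, 256, 512, 1024, 2048}:
 *   bucket_of_size(1)    == 0  (64-byte frame)
 *   bucket_of_size(64)   == 0  (64-byte frame)
 *   bucket_of_size(65)   == 1  (128-byte frame)
 *   bucket_of_size(2048) == 5  (2048-byte frame)
 * Callers guarantee size <= FRAME_MALLOC_MAX_SIZE, so the assert is
 * unreachable. */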

size_t __cilkrts_frame_malloc_roundup(size_t size)
{
    if (size > FRAME_MALLOC_MAX_SIZE) {
        /* nothing, leave it alone */
    } else {
        int bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    }
    return size;
}

size_t __cilkrts_size_of_bucket(int bucket)
{
    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);
    return FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
}

void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
    int bucket;
    void *mem;

    /* if too large, or if no worker, fall back to __cilkrts_malloc() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
        return __cilkrts_malloc(size);
    }

    START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);

        while (!(mem = pop(&w->l->free_list[bucket]))) {
            /* get a batch of frames from the global pool */
            START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
                allocate_batch(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);

    return mem;
}
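
/* Illustrative usage (added comment, not in the original source): the
 * matching free must pass a size that maps to the same bucket, because
 * the bucket is recomputed from the size on both sides:
 *
 *     void *f = __cilkrts_frame_malloc(w, 100);  // rounds up to bucket 1 (128 bytes)
 *     ...
 *     __cilkrts_frame_free(w, f, 100);           // returns the frame to bucket 1
 *
 * Passing a size from a different bucket to the free would corrupt the
 * free-list accounting. */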

void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size)
{
    int bucket;
    struct free_list *p = (struct free_list *)p0;

    /* if too large, or if no worker, fall back to __cilkrts_free() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE);
        __cilkrts_free(p);
        return;
    }

#if CILK_LIB_DEBUG
    /* Touch the worker structure so a stale w pointer faults here. */
    *(volatile long *)w;
#endif

    START_INTERVAL(w, INTERVAL_FRAME_FREE); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], p);
        if (w->l->bucket_potential[bucket] >
            w->g->frame_malloc.potential_limit) {
            START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) {
                gc_bucket(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_FREE);
}

void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w)
{
    int i;
    local_state *l = w->l;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        l->free_list[i] = 0;
        l->bucket_potential[i] = 0;
    }
}

void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w)
{
    int i;
    // Move memory to the global pool.  This operation
    // ensures the memory does not become unreachable / leak
    // when the worker is destroyed.
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        move_bucket_to_global_free_list(w, i);
    }
}

/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/