/* os-unix.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2009-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/

#ifdef __linux__
    // Define _GNU_SOURCE before *any* #include.
    // Even <stdint.h> will break later #includes if this macro is not
    // already defined when it is #included.
#   define _GNU_SOURCE
#endif

#include "os.h"
#include "bug.h"
#include "cilk_malloc.h"
#include <internal/abi.h>

#if defined __linux__
#   include <sys/sysinfo.h>
#   include <sys/syscall.h>
#elif defined __APPLE__
#   include <sys/sysctl.h>
    // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
#elif defined __DragonFly__
// No additional include files
#elif defined __FreeBSD__
// No additional include files
#elif defined __CYGWIN__
// Cygwin on Windows - no additional include files
#elif defined __VXWORKS__
#   include <vxWorks.h>
#   include <vxCpuLib.h>
#   include <taskLib.h>
// Solaris
#elif defined __sun__ && defined __svr4__
#   include <sched.h>
#else
#   error "Unsupported OS"
#endif

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>

// /* Thread-local storage */
// #ifdef _WIN32
// typedef unsigned cilkos_tls_key_t;
// #else
// typedef pthread_key_t cilkos_tls_key_t;
// #endif
// cilkos_tls_key_t cilkos_allocate_tls_key();
// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
// void* cilkos_get_tls_pointer(cilkos_tls_key_t key);

#if !defined CILK_WORKER_TLS
static int cilk_keys_defined;
static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

#if SUPPORT_GET_CURRENT_FIBER > 0
static pthread_key_t fiber_key;
#endif

static void *serial_worker;


// This destructor is called when a pthread dies to deallocate the
// pedigree node.
static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
{
    __cilkrts_pedigree* pedigree_tls
        = (__cilkrts_pedigree*)pedigree_tls_ptr;
    if (pedigree_tls) {
        // Assert that we have either one or two nodes
        // left in the pedigree chain.
        // If we have more, then something is going wrong...
        CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
        __cilkrts_free(pedigree_tls);
    }
}

void __cilkrts_init_tls_variables(void)
{
    int status;
    /* This will be called once in serial execution before any
       Cilk parallelism, so we do not need to worry about races
       on cilk_keys_defined. */
    if (cilk_keys_defined)
        return;
    status = pthread_key_create(&worker_key, NULL);
    CILK_ASSERT(status == 0);
    status = pthread_key_create(&pedigree_leaf_key,
                                __cilkrts_pedigree_leaf_destructor);
    CILK_ASSERT(status == 0);
    status = pthread_key_create(&tbb_interop_key, NULL);
    CILK_ASSERT(status == 0);

#if SUPPORT_GET_CURRENT_FIBER > 0
    status = pthread_key_create(&fiber_key, NULL);
    CILK_ASSERT(status == 0);
#endif
    cilk_keys_defined = 1;
    return;
}
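
/*
 * For reference, a minimal, self-contained sketch of the pthread TLS
 * pattern used above (illustrative only, not part of the runtime):
 *
 *     #include <pthread.h>
 *     #include <stdlib.h>
 *
 *     static pthread_key_t key;
 *
 *     // Runs automatically when a thread exits with a non-NULL value set.
 *     static void destructor(void *p) { free(p); }
 *
 *     // Called once, before any threads use the key.
 *     static void init(void) { pthread_key_create(&key, destructor); }
 *
 *     static void *get_or_create(void)
 *     {
 *         void *p = pthread_getspecific(key);    // NULL until first set
 *         if (!p) {
 *             p = malloc(16);
 *             pthread_setspecific(key, p);
 *         }
 *         return p;
 *     }
 */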

COMMON_SYSDEP
void* cilkos_get_current_thread_id(void)
{
    return (void*)pthread_self();
}

CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilkrts_worker *)pthread_getspecific(worker_key);
    else
        return serial_worker;
}

CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
{
    return (__cilkrts_worker *)pthread_getspecific(worker_key);
}

COMMON_SYSDEP
__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilk_tbb_stack_op_thunk *)
            pthread_getspecific(tbb_interop_key);
    else
        return 0;
}

// This counter should be updated atomically.
static int __cilkrts_global_pedigree_tls_counter = -1;

COMMON_SYSDEP
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;
    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        return 0;
    }

    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread w.
        pedigree_tls = (__cilkrts_pedigree*)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
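
/*
 * Example (illustrative, assuming the public pedigree API declared in
 * <cilk/cilk_api.h>): user code can walk the chain whose leaf this
 * function maintains.  The walk ends at the constant per-thread root
 * node Y described above.
 *
 *     #include <stdio.h>
 *     #include <cilk/cilk_api.h>
 *
 *     static void print_pedigree(void)
 *     {
 *         __cilkrts_pedigree leaf = __cilkrts_get_pedigree();
 *         const __cilkrts_pedigree *node = &leaf;
 *         while (node) {                       // leaf -> ... -> root (Y)
 *             printf("%llu ", (unsigned long long)node->rank);
 *             node = node->parent;
 *         }
 *         printf("\n");
 *     }
 */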

#if SUPPORT_GET_CURRENT_FIBER > 0
COMMON_SYSDEP
cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
    else
        return NULL;
}
#endif

COMMON_SYSDEP
void __cilkrts_set_tls_worker(__cilkrts_worker *w)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(worker_key, w);
        CILK_ASSERT(status == 0);
        return;
    }
    else
    {
        serial_worker = w;
    }
}

COMMON_SYSDEP
void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(tbb_interop_key, t);
        CILK_ASSERT(status == 0);
        return;
    }
    abort();
}

COMMON_SYSDEP
void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
        CILK_ASSERT(status == 0);
        return;
    }
    abort();
}

#if SUPPORT_GET_CURRENT_FIBER > 0
COMMON_SYSDEP
void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(fiber_key, fiber);
        CILK_ASSERT(status == 0);
        return;
    }
    abort();
}
#endif

#else
void __cilkrts_init_tls_variables(void)
{
}
#endif

#if defined (__linux__) && ! defined(__ANDROID__)
/*
 * Get the thread id, rather than the pid. In the case of MIC offload, it's
 * possible that we have multiple threads entering Cilk, and each has a
 * different affinity.
 */
static pid_t linux_gettid(void)
{
    return syscall(SYS_gettid);
}

/*
 * On Linux we look at the thread affinity mask and restrict ourselves to one
 * thread for each of the hardware contexts to which we are bound.
 * Therefore if the user runs
 * % taskset -c 0-1 cilkProgram
 *       # restrict execution to hardware contexts zero and one
 * the Cilk program will only use two threads even if it is running on a
 * machine that has 32 hardware contexts.
 * This is the right thing to do, because the threads are restricted to two
 * hardware contexts by the affinity mask set by taskset, and if we were to
 * create extra threads they would simply oversubscribe the hardware resources
 * we can use.
 * This is particularly important on MIC in offload mode, where the affinity
 * mask is set by the offload library to force the offload code away from
 * cores that have offload support threads running on them.
 */
static int linux_get_affinity_count(int tid)
{
#if !defined HAVE_PTHREAD_AFFINITY_NP
    return 0;
#else

    cpu_set_t process_mask;

    // Extract the thread affinity mask.
    int err = sched_getaffinity(tid, sizeof(process_mask), &process_mask);

    if (0 != err)
    {
        return 0;
    }

    // We have extracted the mask OK, so now we can count the number of
    // threads in it.  This is linear in the maximum number of CPUs
    // available.  We could do a logarithmic version if we assumed the
    // format of the mask, but it's not really worth it.  We only call
    // this at thread startup anyway.
    int available_procs = 0;
    int i;
    for (i = 0; i < CPU_SETSIZE; i++)
    {
        if (CPU_ISSET(i, &process_mask))
        {
            available_procs++;
        }
    }

    return available_procs;
#endif
}
#endif  //  defined (__linux__) && ! defined(__ANDROID__)
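
/*
 * Aside (illustrative, assuming glibc): the counting loop above is
 * equivalent to the CPU_COUNT() macro that glibc provides for cpu_set_t,
 * which may be handy when adapting this code:
 *
 *     #define _GNU_SOURCE
 *     #include <sched.h>
 *
 *     static int count_affinity_self(void)
 *     {
 *         cpu_set_t mask;
 *         // pid 0 == the calling thread; return 0 on failure, as above.
 *         if (sched_getaffinity(0, sizeof(mask), &mask) != 0)
 *             return 0;
 *         return CPU_COUNT(&mask);   // population count of the set
 *     }
 */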

/*
 * __cilkrts_hardware_cpu_count
 *
 * Returns the number of available CPUs on this hardware.  This is
 * architecture-specific.
 */

COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
{
#if defined __ANDROID__ || (defined(__sun__) && defined(__svr4__))
    return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __MIC__
    /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
    /// on KNC.  Also, ignore the last core.
    int P = sysconf(_SC_NPROCESSORS_ONLN);
    return P/2 - 2;
#elif defined __linux__
    int affinity_count = linux_get_affinity_count(linux_gettid());

    return (0 != affinity_count) ? affinity_count
                                 : sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __APPLE__
    int count = 0;
    int cmd[2] = { CTL_HW, HW_NCPU };
    size_t len = sizeof count;
    int status = sysctl(cmd, 2, &count, &len, 0, 0);
    assert(status >= 0);
    assert((unsigned)count == count);

    return count;
#elif defined __FreeBSD__ || defined __CYGWIN__ || defined __DragonFly__
    // Just get the number of processors.
    return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __VXWORKS__
    return __builtin_popcount( vxCpuEnabledGet() );
#else
#error "Unknown architecture"
#endif
}

COMMON_SYSDEP void __cilkrts_sleep(void)
{
#ifdef __VXWORKS__
    taskDelay(1);
#else
    usleep(1);
#endif
}

COMMON_SYSDEP void __cilkrts_yield(void)
{
#if __APPLE__ || __FreeBSD__ || __VXWORKS__
    // On MacOS, FreeBSD, and VxWorks, call sched_yield to yield the
    // quantum.  I'm not sure why we don't do this on Linux also.
    sched_yield();
#elif defined(__DragonFly__)
    // On DragonFly BSD, call sched_yield to yield the quantum.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things.  Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option.  Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available.
    _mm_delay_32(1024);
#elif defined(__ANDROID__) || (defined(__sun__) && defined(__svr4__))
    // On Android and Solaris, call sched_yield to yield the quantum.  I'm
    // not sure why we don't do this on Linux also.
    sched_yield();
#else
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield the quantum.
    pthread_yield();
#endif
}

COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
                                           const char* varname)
{
    CILK_ASSERT(value);
    CILK_ASSERT(varname);

    const char* envstr = getenv(varname);
    if (envstr)
    {
        size_t len = strlen(envstr);
        // Report the required size if the buffer is too small.  Compare
        // len + 1 > vallen rather than len > vallen - 1 so that
        // vallen == 0 does not wrap around.
        if (len + 1 > vallen)
            return len + 1;

        strcpy(value, envstr);
        return len;
    }
    else
    {
        value[0] = '\0';
        return 0;
    }
}
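
/*
 * Example usage (illustrative): a return value greater than vallen means
 * the buffer was too small and the value was not copied.
 *
 *     char buf[64];
 *     size_t n = cilkos_getenv(buf, sizeof buf, "CILK_NWORKERS");
 *     if (n > sizeof buf) {
 *         // Too small: n - 1 is the strlen of the value; grow and retry.
 *     } else if (n > 0) {
 *         // buf holds the NUL-terminated value, of length n.
 *     } else {
 *         // Variable unset (or empty); buf is the empty string.
 *     }
 */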
450 
451 /*
452  * Unrecoverable error: Print an error message and abort execution.
453  */
cilkos_error(const char * fmt,...)454 COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
455 {
456     va_list l;
457     fflush(NULL);
458     fprintf(stderr, "Cilk error: ");
459     va_start(l, fmt);
460     vfprintf(stderr, fmt, l);
461     va_end(l);
462     fprintf(stderr, "Exiting.\n");
463     fflush(stderr);
464 
465     abort();
466 }
467 
468 /*
469  * Print a warning message and return.
470  */
cilkos_warning(const char * fmt,...)471 COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
472 {
473     va_list l;
474     fflush(NULL);
475     fprintf(stderr, "Cilk warning: ");
476     va_start(l, fmt);
477     vfprintf(stderr, fmt, l);
478     va_end(l);
479     fflush(stderr);
480 }

static void __attribute__((constructor)) init_once()
{
    /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
    __cilkrts_init_tls_variables();
}

#define PAGE 4096
#define CILK_MIN_STACK_SIZE (4*PAGE)
// Default size for the stacks that we create in Cilk for Unix.
#define CILK_DEFAULT_STACK_SIZE 0x100000

/*
 * Convert the user's specified stack size into a "reasonable" value
 * for this OS.
 */
size_t cilkos_validate_stack_size(size_t specified_stack_size) {
    // Convert an unspecified (zero) value to the default.  Note that
    // specified_stack_size is unsigned, so a negative value passed by a
    // caller arrives here as a very large positive one.
    if (specified_stack_size == 0) {
        CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0);
        return CILK_DEFAULT_STACK_SIZE;
    }
    // Round values between 0 and CILK_MIN_STACK_SIZE up to
    // CILK_MIN_STACK_SIZE.
    if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
        return CILK_MIN_STACK_SIZE;
    }
    if ((specified_stack_size % PAGE) > 0) {
        // Round the user's stack size value up to the nearest page boundary.
        return (PAGE * (1 + specified_stack_size / PAGE));
    }
    return specified_stack_size;
}
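
/*
 * Worked examples (illustrative; PAGE == 4096):
 *
 *     cilkos_validate_stack_size(0)        -> 0x100000 (default, 1 MiB)
 *     cilkos_validate_stack_size(1000)     -> 16384    (minimum, 4 pages)
 *     cilkos_validate_stack_size(20000)    -> 20480    (rounded up to a page)
 *     cilkos_validate_stack_size(0x200000) -> 0x200000 (already page-aligned)
 */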

long cilkos_atomic_add(volatile long* p, long x)
{
    return __sync_add_and_fetch(p, x);
}
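
/*
 * Example (illustrative): the addition is atomic and the *new* value is
 * returned, so concurrent callers each receive a distinct ticket.
 *
 *     static volatile long next_id = 0;
 *     long id = cilkos_atomic_add(&next_id, 1);  // 1, 2, 3, ... across threads
 */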

/* End os-unix.c */