1 /* scheduler.c                  -*-C-*-
2  *
3  *************************************************************************
4  *
5  *  @copyright
6  *  Copyright (C) 2007-2013, Intel Corporation
7  *  All rights reserved.
8  *
9  *  @copyright
10  *  Redistribution and use in source and binary forms, with or without
11  *  modification, are permitted provided that the following conditions
12  *  are met:
13  *
14  *    * Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *    * Redistributions in binary form must reproduce the above copyright
17  *      notice, this list of conditions and the following disclaimer in
18  *      the documentation and/or other materials provided with the
19  *      distribution.
20  *    * Neither the name of Intel Corporation nor the names of its
21  *      contributors may be used to endorse or promote products derived
22  *      from this software without specific prior written permission.
23  *
24  *  @copyright
25  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  *  POSSIBILITY OF SUCH DAMAGE.
37  *
38  **************************************************************************/
39 
40 /*
41  * Cilk scheduler
42  */
43 
44 #include "scheduler.h"
45 #include "bug.h"
46 #include "os.h"
47 #include "os_mutex.h"
48 #include "local_state.h"
49 #include "signal_node.h"
50 #include "full_frame.h"
51 #include "sysdep.h"
52 #include "except.h"
53 #include "cilk_malloc.h"
54 #include "pedigrees.h"
55 #include "record-replay.h"
56 
57 #include <limits.h>
58 #include <string.h> /* memcpy */
59 #include <stdio.h>  // sprintf
60 #include <stdlib.h> // malloc, free, abort
61 
62 #ifdef _WIN32
63 #   pragma warning(disable:1786)   // disable warning: sprintf is deprecated
64 #   include "sysdep-win.h"
65 #   include "except-win32.h"
66 #endif  // _WIN32
67 
68 // ICL: Don't complain about conversion from pointer to same-sized integral
69 // type in __cilkrts_put_stack.  That's why we're using ptrdiff_t
70 #ifdef _WIN32
71 #   pragma warning(disable: 1684)
72 #endif
73 
74 #include "cilk/cilk_api.h"
75 #include "frame_malloc.h"
76 #include "metacall_impl.h"
77 #include "reducer_impl.h"
78 #include "cilk-tbb-interop.h"
79 #include "cilk-ittnotify.h"
80 #include "stats.h"
81 
82 // ICL: Don't complain about loss of precision in myrand
83 // I tried restoring the warning after the function, but it didn't
84 // suppress it
85 #ifdef _WIN32
86 #   pragma warning(disable: 2259)
87 #endif
88 
89 #ifndef _WIN32
90 #   include <unistd.h>
91 #endif
92 
93 #ifdef __VXWORKS__
94 // redeclare longjmp() with noreturn to stop warnings
95 extern __attribute__((noreturn))
96 		void longjmp(jmp_buf, int);
97 #endif
98 
99 //#define DEBUG_LOCKS 1
100 #ifdef DEBUG_LOCKS
101 // The currently executing worker must own this worker's lock
102 #   define ASSERT_WORKER_LOCK_OWNED(w) \
103         { \
104             __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \
105             CILK_ASSERT((w)->l->lock.owner == tls_worker); \
106         }
107 #else
108 #   define ASSERT_WORKER_LOCK_OWNED(w)
109 #endif // DEBUG_LOCKS
110 
111 // Options for the scheduler.
112 enum schedule_t { SCHEDULE_RUN,
113                   SCHEDULE_WAIT,
114                   SCHEDULE_EXIT };
115 
116 // Return values for provably_good_steal()
117 enum provably_good_steal_t
118 {
119     ABANDON_EXECUTION,  // Not the last child to the sync - attempt to steal work
120     CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
121     WAIT_FOR_CONTINUE   // The replay log indicates that this was the worker
122                         // which continued.  Loop until we are the last worker
123                         // to the sync.
124 };
125 
126 
127 // Verify that "w" is the worker we are currently executing on.
128 // Because this check is expensive, this method is usually a no-op.
129 static inline void verify_current_wkr(__cilkrts_worker *w)
130 {
131 #if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
132     // Lookup the worker from TLS and compare to w.
133     __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
134     if (w != tmp) {
135         fprintf(stderr, "Error.  W=%d, actual worker =%d...\n",
136                 w->self,
137                 tmp->self);
138     }
139     CILK_ASSERT(w == tmp);
140 #endif
141 }
142 
143 static enum schedule_t worker_runnable(__cilkrts_worker *w);
144 
145 // Scheduling-fiber functions:
146 static void do_return_from_spawn (__cilkrts_worker *w,
147                                   full_frame *ff,
148                                   __cilkrts_stack_frame *sf);
149 static void do_sync (__cilkrts_worker *w,
150                      full_frame *ff,
151                      __cilkrts_stack_frame *sf);
152 
153 // max is defined on Windows and VxWorks
154 #if (! defined(_WIN32)) && (! defined(__VXWORKS__))
155     // TBD: definition of max() for Linux.
156 #   define max(a, b) ((a) < (b) ? (b) : (a))
157 #endif
158 
159 void __cilkrts_dump_stats_to_stderr(global_state_t *g)
160 {
161 #ifdef CILK_PROFILE
162     int i;
163     for (i = 0; i < g->total_workers; ++i) {
164         // Print out statistics for each worker.  We collected them,
165         // so why not print them out?
166         fprintf(stderr, "Stats for worker %d\n", i);
167         dump_stats_to_file(stderr, g->workers[i]->l->stats);
168         __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats);
169     }
170 
171     // Also print out aggregate statistics.
172     dump_stats_to_file(stderr, &g->stats);
173 #endif
174     fprintf(stderr,
175             "CILK PLUS Thread Info: P=%d, Q=%d\n",
176             g->P,
177             g->Q);
178     fprintf(stderr,
179             "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
180             (long long)g->frame_malloc.allocated_from_os);
181 #ifdef CILK_PROFILE
182     if (g->stats.stack_hwm)
183         fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
184 #endif
185     fputc('\n', stderr);
186 }
187 
188 static void validate_worker(__cilkrts_worker *w)
189 {
190     /* check the magic numbers, for debugging purposes */
191     if (w->l->worker_magic_0 != WORKER_MAGIC_0 ||
192         w->l->worker_magic_1 != WORKER_MAGIC_1)
193         abort_because_rts_is_corrupted();
194 }
195 
196 static void double_link(full_frame *left_ff, full_frame *right_ff)
197 {
198     if (left_ff)
199         left_ff->right_sibling = right_ff;
200     if (right_ff)
201         right_ff->left_sibling = left_ff;
202 }
203 
204 /* add CHILD to the right of all children of PARENT */
205 static void push_child(full_frame *parent_ff, full_frame *child_ff)
206 {
207     double_link(parent_ff->rightmost_child, child_ff);
208     double_link(child_ff, 0);
209     parent_ff->rightmost_child = child_ff;
210 }
211 
212 /* unlink CHILD from the list of all children of PARENT */
213 static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
214 {
215     double_link(child_ff->left_sibling, child_ff->right_sibling);
216 
217     if (!child_ff->right_sibling) {
218         /* this is the rightmost child -- update parent link */
219         CILK_ASSERT(parent_ff->rightmost_child == child_ff);
220         parent_ff->rightmost_child = child_ff->left_sibling;
221     }
222     child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */
223 }
224 
225 static void incjoin(full_frame *ff)
226 {
227     ++ff->join_counter;
228 }
229 
230 static int decjoin(full_frame *ff)
231 {
232     CILK_ASSERT(ff->join_counter > 0);
233     return (--ff->join_counter);
234 }
235 
236 static int simulate_decjoin(full_frame *ff)
237 {
238   CILK_ASSERT(ff->join_counter > 0);
239   return (ff->join_counter - 1);
240 }
241 
242 /*
243  * Pseudo-random generator defined by the congruence S' = 69070 * S
244  * mod (2^32 - 5).  Marsaglia (CACM July 1993) says on page 107 that
245  * this is a ``good one''.  There you go.
246  *
247  * The literature makes a big fuss about avoiding the division, but
248  * for us it is not worth the hassle.
249  */
250 static const unsigned RNGMOD = ((1ULL << 32) - 5);
251 static const unsigned RNGMUL = 69070U;
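/* A worked step of the recurrence, for reference only: starting from seed 1
   the sequence runs 1, 69070, 475697609, ..., since
   69070 * 69070 = 4770664900 = 4294967291 + 475697609, i.e.
   RNGMUL * RNGMUL mod RNGMOD == 475697609. */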
252 
253 static unsigned myrand(__cilkrts_worker *w)
254 {
255     unsigned state = w->l->rand_seed;
256     state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD);
257     w->l->rand_seed = state;
258     return state;
259 }
260 
261 static void mysrand(__cilkrts_worker *w, unsigned seed)
262 {
263     seed %= RNGMOD;
264     seed += (seed == 0); /* 0 does not belong to the multiplicative
265                             group.  Use 1 instead */
266     w->l->rand_seed = seed;
267 }
268 
269 /* W grabs its own lock */
270 void __cilkrts_worker_lock(__cilkrts_worker *w)
271 {
272     validate_worker(w);
273     CILK_ASSERT(w->l->do_not_steal == 0);
274 
275     /* tell thieves to stay out of the way */
276     w->l->do_not_steal = 1;
277     __cilkrts_fence(); /* probably redundant */
278 
279     __cilkrts_mutex_lock(w, &w->l->lock);
280 }
281 
282 void __cilkrts_worker_unlock(__cilkrts_worker *w)
283 {
284     __cilkrts_mutex_unlock(w, &w->l->lock);
285     CILK_ASSERT(w->l->do_not_steal == 1);
286     /* The fence is probably redundant.  Use a release
287        operation when supported (gcc and compatible);
288        that is faster on x86 which serializes normal stores. */
289 #if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
290     __sync_lock_release(&w->l->do_not_steal);
291 #else
292     w->l->do_not_steal = 0;
293     __cilkrts_fence(); /* store-store barrier, redundant on x86 */
294 #endif
295 }
296 
297 /* try to acquire the lock of some *other* worker */
298 static int worker_trylock_other(__cilkrts_worker *w,
299                                 __cilkrts_worker *other)
300 {
301     int status = 0;
302 
303     validate_worker(other);
304 
305     /* This protocol guarantees that, after setting the DO_NOT_STEAL
306        flag, worker W can enter its critical section after waiting for
307        the thief currently in the critical section (if any) and at
308        most one other thief.
309 
310        This requirement is overly paranoid, but it should protect us
311        against future nonsense from OS implementors.
312     */
313 
314     /* compete for the right to disturb OTHER */
315     if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
316         if (other->l->do_not_steal) {
317             /* leave it alone */
318         } else {
319             status = __cilkrts_mutex_trylock(w, &other->l->lock);
320         }
321         __cilkrts_mutex_unlock(w, &other->l->steal_lock);
322     }
323 
324 
325     return status;
326 }
327 
328 static void worker_unlock_other(__cilkrts_worker *w,
329                                 __cilkrts_worker *other)
330 {
331     __cilkrts_mutex_unlock(w, &other->l->lock);
332 }
333 
334 
335 /* Lock macro Usage:
336     BEGIN_WITH_WORKER_LOCK(w) {
337         statement;
338         statement;
339         BEGIN_WITH_FRAME_LOCK(w, ff) {
340             statement;
341             statement;
342         } END_WITH_FRAME_LOCK(w, ff);
343     } END_WITH_WORKER_LOCK(w);
344  */
345 #define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
346 #define END_WITH_WORKER_LOCK(w)   while (__cilkrts_worker_unlock(w), 0)
347 
348 // TBD(jsukha): These are worker lock acquisitions on
349 // a worker whose deque is empty.  My conjecture is that we
350 // do not need to hold the worker lock at these points.
351 // I have left them in for now, however.
352 //
353 // #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
354 #ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
355     #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
356     #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (0)
357 #else
358     #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
359     #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (__cilkrts_worker_unlock(w), 0)
360 #endif
361 
362 
363 #define BEGIN_WITH_FRAME_LOCK(w, ff)                                     \
364     do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do
365 
366 #define END_WITH_FRAME_LOCK(w, ff)                       \
367     while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)
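/* For reference, a sketch of how the frame-lock pair expands (whitespace
   aside):

       BEGIN_WITH_FRAME_LOCK(w, ff) { body; } END_WITH_FRAME_LOCK(w, ff);

   becomes

       do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff);
            do { body; } while (__cilkrts_frame_unlock(w, _locked_ff), 0);
       } while (0);

   so the unlock runs when control falls off the end of the block, and the
   whole construct still parses as a single statement after an if/else. */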
368 
369 /* W becomes the owner of F and F can be stolen from W */
370 static void make_runnable(__cilkrts_worker *w, full_frame *ff)
371 {
372     w->l->frame_ff = ff;
373 
374     /* CALL_STACK is invalid (the information is stored implicitly in W) */
375     ff->call_stack = 0;
376 }
377 
378 /*
379  * The worker parameter is unused, except for print-debugging purposes.
380  */
381 static void make_unrunnable(__cilkrts_worker *w,
382                             full_frame *ff,
383                             __cilkrts_stack_frame *sf,
384                             int is_loot,
385                             const char *why)
386 {
387     /* CALL_STACK becomes valid again */
388     ff->call_stack = sf;
389 
390     if (sf) {
391 #if CILK_LIB_DEBUG
392         if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
393             __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
394 #endif
395         sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
396         sf->worker = 0;
397 
398         if (is_loot)
399             __cilkrts_put_stack(ff, sf);
400 
401         /* perform any system-dependent action, such as saving the
402            state of the stack */
403         __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
404     }
405 }
406 
407 
408 /* Push the next full frame to be made active in this worker and increment its
409  * join counter.  __cilkrts_push_next_frame and pop_next_frame work on a
410  * one-element queue.  This queue is used to communicate across the runtime
411  * from the code that wants to activate a frame to the code that can actually
412  * begin execution on that frame.  They are asymmetrical in that push
413  * increments the join counter but pop does not decrement it.  Rather, a
414  * single push/pop combination makes a frame active and increments its join
415  * counter once. */
416 void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
417 {
418     CILK_ASSERT(ff);
419     CILK_ASSERT(!w->l->next_frame_ff);
420     incjoin(ff);
421     w->l->next_frame_ff = ff;
422 }
423 
424 /* Get the next full-frame to be made active in this worker.  The join count
425  * of the full frame will have been incremented by the corresponding push
426  * event.  See __cilkrts_push_next_frame, above.
427  */
428 static full_frame *pop_next_frame(__cilkrts_worker *w)
429 {
430     full_frame *ff;
431     ff = w->l->next_frame_ff;
432     // Remove the frame from the next_frame field.
433     //
434     // If this is a user worker, then there is a chance that another worker
435     // from our team could push work into our next_frame (if it is the last
436     // worker doing work for this team).  The other worker's setting of the
437     // next_frame could race with our setting of next_frame to NULL.  This is
438     // the only possible race condition on next_frame.  However, if next_frame
439     // has a non-NULL value, then it means the team still has work to do, and
440     // there is no chance of another team member populating next_frame.  Thus,
441     // it is safe to set next_frame to NULL, if it was populated.  There is no
442     // need for an atomic op.
443     if (NULL != ff) {
444         w->l->next_frame_ff = NULL;
445     }
446     return ff;
447 }
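/* Illustrative pairing only (assuming a worker w that has just decided that
   full frame ff should run next on this worker):

       __cilkrts_push_next_frame(w, ff);      // incjoin(ff), publish ff
       ...
       full_frame *next = pop_next_frame(w);  // returns ff; join count untouched

   Roughly speaking, the incjoin() above is balanced by a later decjoin() on
   the same frame (e.g. in provably_good_steal(), unconditional_steal(), or
   detach_for_steal()) when the frame is given up again. */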
448 
449 /*
450  * Identify the single worker that is allowed to cross a sync in this frame.  A
451  * thief should call this function when it is the first to steal work from a
452  * user worker.  "First to steal work" may mean that there has been parallelism
453  * in the user worker before, but the whole team sync'd, and this is the first
454  * steal after that.
455  *
456  * This should happen while holding the worker and frame lock.
457  */
458 static void set_sync_master(__cilkrts_worker *w, full_frame *ff)
459 {
460     w->l->last_full_frame = ff;
461     ff->sync_master = w;
462 }
463 
464 /*
465  * The sync that ends all parallelism for a particular user worker is about to
466  * be crossed.  Decouple the worker and frame.
467  *
468  * No locks need to be held since the user worker isn't doing anything, and none
469  * of the system workers can steal from it.  But unset_sync_master() should be
470  * called before the user worker knows about this work (i.e., before
471  * w->l->next_frame_ff is set).
472  */
473 static void unset_sync_master(__cilkrts_worker *w, full_frame *ff)
474 {
475     CILK_ASSERT(WORKER_USER == w->l->type);
476     CILK_ASSERT(ff->sync_master == w);
477     ff->sync_master = NULL;
478     w->l->last_full_frame = NULL;
479 }
480 
481 /********************************************************************
482  * THE protocol:
483  ********************************************************************/
484 /*
485  * This is a protocol for work stealing that minimizes the overhead on
486  * the victim.
487  *
488  * The protocol uses three shared pointers into the worker's deque:
489  * - T - the "tail"
490  * - H - the "head"
491  * - E - the "exception"  NB: In this case, "exception" has nothing to do
492  * with C++ throw-catch exceptions -- it refers only to a non-normal return,
493  * i.e., a steal or similar scheduling exception.
494  *
495  * with H <= E, H <= T.
496  *
497  * Stack frames SF, where H <= E < T, are available for stealing.
498  *
499  * The worker operates on the T end of the stack.  The frame being
500  * worked on is not on the stack.  To make a continuation available for
501  * stealing, the worker pushes a frame onto the stack: stores *T++ = SF.
502  * To return, it pops the frame off the stack: obtains SF = *--T.
503  *
504  * After decrementing T, the condition E > T signals to the victim that
505  * it should invoke the runtime system's "THE" exception handler.  The
506  * pointer E can become INFINITY, in which case the victim must invoke
507  * the THE exception handler as soon as possible.
508  *
509  * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998,
510  * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information
511  * on the THE protocol.
512  */
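/* A minimal sketch of the two halves of the protocol (illustration only: the
   victim half lives in the compiler ABI and the THE exception path, the thief
   half in dekker_protocol() and detach_for_steal() below; take_THE_exception()
   is a made-up name for the slow path):

       // Victim: expose a continuation, later try to pop it back.
       *T++ = sf;                  // push: continuation is now stealable
       ...
       --T;                        // pop
       __cilkrts_fence();
       if (E > T) {                // a thief may have claimed this frame
           ++T;                    // restore T, resolve under the lock
           take_THE_exception();
       }

       // Thief: holding the victim's lock.
       ++E;                        // increment_E()
       __cilkrts_fence();
       if (H < T)                  // can_steal_from(), simplified
           steal(*H++);            // detach_for_steal() advances H
       else
           --E;                    // decrement_E(): nothing to steal
*/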
513 
514 /* the infinity value of E */
515 #define EXC_INFINITY  ((__cilkrts_stack_frame **) (-1))
516 
517 static void increment_E(__cilkrts_worker *victim)
518 {
519     __cilkrts_stack_frame *volatile *tmp;
520 
521     // The currently executing worker must own the worker lock to touch
522     // victim->exc
523     ASSERT_WORKER_LOCK_OWNED(victim);
524 
525     tmp = victim->exc;
526     if (tmp != EXC_INFINITY) {
527         /* On most x86 this pair of operations would be slightly faster
528            as an atomic exchange due to the implicit memory barrier in
529            an atomic instruction. */
530         victim->exc = tmp + 1;
531         __cilkrts_fence();
532     }
533 }
534 
535 static void decrement_E(__cilkrts_worker *victim)
536 {
537     __cilkrts_stack_frame *volatile *tmp;
538 
539     // The currently executing worker must own the worker lock to touch
540     // victim->exc
541     ASSERT_WORKER_LOCK_OWNED(victim);
542 
543     tmp = victim->exc;
544     if (tmp != EXC_INFINITY) {
545         /* On most x86 this pair of operations would be slightly faster
546            as an atomic exchange due to the implicit memory barrier in
547            an atomic instruction. */
548         victim->exc = tmp - 1;
549         __cilkrts_fence(); /* memory fence not really necessary */
550     }
551 }
552 
553 #if 0
554 /* for now unused, will be necessary if we implement abort */
555 static void signal_THE_exception(__cilkrts_worker *wparent)
556 {
557     wparent->exc = EXC_INFINITY;
558     __cilkrts_fence();
559 }
560 #endif
561 
562 static void reset_THE_exception(__cilkrts_worker *w)
563 {
564     // The currently executing worker must own the worker lock to touch
565     // w->exc
566     ASSERT_WORKER_LOCK_OWNED(w);
567 
568     w->exc = w->head;
569     __cilkrts_fence();
570 }
571 
572 /* conditions under which victim->head can be stolen: */
573 static int can_steal_from(__cilkrts_worker *victim)
574 {
575     return ((victim->head < victim->tail) &&
576             (victim->head < victim->protected_tail));
577 }
578 
579 /* Return TRUE if the frame can be stolen, false otherwise */
580 static int dekker_protocol(__cilkrts_worker *victim)
581 {
582     // increment_E and decrement_E are going to touch victim->exc.  The
583     // currently executing worker must own victim's lock before they can
584     // modify it
585     ASSERT_WORKER_LOCK_OWNED(victim);
586 
587     /* ASSERT(E >= H); */
588 
589     increment_E(victim);
590 
591     /* ASSERT(E >= H + 1); */
592     if (can_steal_from(victim)) {
593         /* success, we can steal victim->head and set H <- H + 1
594            in detach() */
595         return 1;
596     } else {
597         /* failure, restore previous state */
598         decrement_E(victim);
599         return 0;
600     }
601 }
602 
603 
604 /* Link PARENT and CHILD in the spawn tree */
605 static full_frame *make_child(__cilkrts_worker *w,
606                               full_frame *parent_ff,
607                               __cilkrts_stack_frame *child_sf,
608                               cilk_fiber *fiber)
609 {
610     full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);
611 
612     child_ff->parent = parent_ff;
613     push_child(parent_ff, child_ff);
614 
615     //DBGPRINTF("%d-          make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
616     //    "            parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
617     //    "            child  - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
618     //          w->self, child, parent, child_sf,
619     //          parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
620     //          child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);
621     CILK_ASSERT(parent_ff->call_stack);
622     child_ff->is_call_child = (fiber == NULL);
623 
624     /* PLACEHOLDER_FIBER is used as non-null marker indicating that
625        child should be treated as a spawn child even though we have not
626        yet assigned a real fiber to its parent. */
627     if (fiber == PLACEHOLDER_FIBER)
628         fiber = NULL; /* Parent actually gets a null fiber, for now */
629 
630     /* perform any system-dependent actions, such as capturing
631        parameter passing information */
632     /*__cilkrts_make_child_sysdep(child, parent);*/
633 
634     /* Child gets reducer map and stack of parent.
635        Parent gets a new map and new stack. */
636     child_ff->fiber_self = parent_ff->fiber_self;
637     child_ff->sync_master = NULL;
638 
639     if (child_ff->is_call_child) {
640         /* Cause segfault on any attempted access.  The parent gets
641            the child map and stack when the child completes. */
642         parent_ff->fiber_self = 0;
643     } else {
644         parent_ff->fiber_self = fiber;
645     }
646 
647     incjoin(parent_ff);
648     return child_ff;
649 }
650 
651 static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
652 {
653     __cilkrts_stack_frame *p = sf->call_parent;
654     sf->call_parent = 0;
655     return p;
656 }
657 
658 /* w should be the currently executing worker.
659  * loot_sf is the youngest stack frame in the call stack being
660  *   unrolled (i.e., the most deeply nested stack frame.)
661  *
662  * When this method is called for a steal, loot_sf should be on a
663  * victim worker which is different from w.
664  * For CILK_FORCE_REDUCE, the victim worker will equal w.
665  *
666  * Before execution, the __cilkrts_stack_frame's have pointers from
667  * older to younger, i.e., a __cilkrts_stack_frame points to parent.
668  *
669  * This method creates a full frame for each __cilkrts_stack_frame in
670  * the call stack, with each full frame also pointing to its parent.
671  *
672  * The method returns the full frame created for loot_sf, i.e., the
673  * youngest full frame.
674  */
675 static full_frame *unroll_call_stack(__cilkrts_worker *w,
676                                      full_frame *ff,
677                                      __cilkrts_stack_frame *const loot_sf)
678 {
679     __cilkrts_stack_frame *sf = loot_sf;
680     __cilkrts_stack_frame *rev_sf = 0;
681     __cilkrts_stack_frame *t_sf;
682 
683     CILK_ASSERT(sf);
684     /*CILK_ASSERT(sf->call_parent != sf);*/
685 
686     /* The leafmost frame is unsynched. */
687     if (sf->worker != w)
688         sf->flags |= CILK_FRAME_UNSYNCHED;
689 
690     /* Reverse the call stack to make a linked list ordered from parent
691        to child.  sf->call_parent points to the child of SF instead of
692        the parent.  */
693     do {
694         t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
695         sf->call_parent = rev_sf;
696         rev_sf = sf;
697         sf = t_sf;
698     } while (sf);
699     sf = rev_sf;
700 
701     /* Promote each stack frame to a full frame in order from parent
702        to child, following the reversed list we just built. */
703     make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
704     /* T is the *child* of SF, because we have reversed the list */
705     for (t_sf = __cilkrts_advance_frame(sf); t_sf;
706          sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
707         ff = make_child(w, ff, t_sf, NULL);
708         make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
709     }
710 
711     /* XXX What if the leafmost frame does not contain a sync
712        and this steal is from promote own deque? */
713     /*sf->flags |= CILK_FRAME_UNSYNCHED;*/
714 
715     CILK_ASSERT(!sf->call_parent);
716     return ff;
717 }
718 
719 /* detach the top of the deque frame from the VICTIM and install a new
720    CHILD frame in its place */
721 static void detach_for_steal(__cilkrts_worker *w,
722                              __cilkrts_worker *victim,
723                              cilk_fiber* fiber)
724 {
725     /* ASSERT: we own victim->lock */
726 
727     full_frame *parent_ff, *child_ff, *loot_ff;
728     __cilkrts_stack_frame *volatile *h;
729     __cilkrts_stack_frame *sf;
730 
731     w->l->team = victim->l->team;
732 
733     CILK_ASSERT(w->l->frame_ff == 0 || w == victim);
734 
735     h = victim->head;
736 
737     CILK_ASSERT(*h);
738 
739     victim->head = h + 1;
740 
741     parent_ff = victim->l->frame_ff;
742     BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
743         /* parent no longer referenced by victim */
744         decjoin(parent_ff);
745 
746         /* obtain the victim call stack */
747         sf = *h;
748 
749         /* perform system-dependent normalizations */
750         /*__cilkrts_normalize_call_stack_on_steal(sf);*/
751 
752         /* unroll PARENT_FF with call stack SF, adopt the youngest
753            frame LOOT.  If loot_ff == parent_ff, then we hold loot_ff->lock,
754            otherwise, loot_ff is newly created and we can modify it without
755            holding its lock. */
756         loot_ff = unroll_call_stack(w, parent_ff, sf);
757 
758         #if REDPAR_DEBUG >= 3
759         fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
760                 w->self, victim->self,
761                 parent_ff, loot_ff);
762         #endif
763 
764         if (WORKER_USER == victim->l->type &&
765             NULL == victim->l->last_full_frame) {
766             // Mark this looted frame as special: only the original user worker
767             // may cross the sync.
768             //
769             // This call is a shared access to
770             // victim->l->last_full_frame.
771             set_sync_master(victim, loot_ff);
772         }
773 
774         /* LOOT is the next frame that the thief W is supposed to
775            run, unless the thief is stealing from itself, in which
776            case the thief W == VICTIM executes CHILD and nobody
777            executes LOOT. */
778         if (w == victim) {
779             /* Pretend that frame has been stolen */
780             loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
781             loot_ff->simulated_stolen = 1;
782         }
783         else
784             __cilkrts_push_next_frame(w, loot_ff);
785 
786         // After this "push_next_frame" call, w now owns loot_ff.
787         child_ff = make_child(w, loot_ff, 0, fiber);
788 
789         BEGIN_WITH_FRAME_LOCK(w, child_ff) {
790             /* install child in the victim's work queue, taking
791                the parent_ff's place */
792             /* child is referenced by victim */
793             incjoin(child_ff);
794 
795             // With this call, w is bestowing ownership of the newly
796             // created frame child_ff to the victim, and victim is
797             // giving up ownership of parent_ff.
798             //
799             // Worker w will either take ownership of parent_ff
800             // if parent_ff == loot_ff, or parent_ff will be
801             // suspended.
802             //
803             // Note that this call changes the victim->frame_ff
804             // while the victim may be executing.
805             make_runnable(victim, child_ff);
806         } END_WITH_FRAME_LOCK(w, child_ff);
807     } END_WITH_FRAME_LOCK(w, parent_ff);
808 }
809 
810 /**
811  * @brief cilk_fiber_proc that resumes user code after a successful
812  * random steal.
813  *
814  * This function longjmps back into the user code whose state is
815  * stored in cilk_fiber_get_data(fiber)->resume_sf.  The stack pointer
816  * is adjusted so that the code resumes on the specified fiber stack
817  * instead of its original stack.
818  *
819  * This method gets executed only on a fiber freshly allocated from a
820  * pool.
821  *
822  * @param fiber   The fiber being used to resume user code.
823  * @param arg     Unused.
824  */
825 static
826 void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
827 {
828     cilk_fiber_data *data = cilk_fiber_get_data(fiber);
829     __cilkrts_stack_frame* sf = data->resume_sf;
830     full_frame *ff;
831 
832     CILK_ASSERT(sf);
833 
834     // When we pull the resume_sf out of the fiber to resume it, clear
835     // the old value.
836     data->resume_sf = NULL;
837     CILK_ASSERT(sf->worker == data->owner);
838     ff = sf->worker->l->frame_ff;
839 
840     // For Win32, we need to overwrite the default exception handler
841     // in this function, so that when the OS exception handling code
842     // walks off the top of the current Cilk stack, it reaches our stub
843     // handler.
844 
845     // Also, this function needs to be wrapped into a try-catch block
846     // so the compiler generates the appropriate exception information
847     // in this frame.
848 
849     // TBD: IS THIS HANDLER IN THE WRONG PLACE?  Can we longjmp out of
850     // this function (and does it matter?)
851 #if defined(_WIN32) && !defined(_WIN64)
852     install_exception_stub_handler();
853     __try
854 #endif
855     {
856         char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);
857 
858         // Notify the Intel tools that we're stealing code
859         ITT_SYNC_ACQUIRED(sf->worker);
860         NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
861 
862         // TBD: We'd like to move TBB-interop methods into the fiber
863         // eventually.
864         cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
865 
866         sf->flags &= ~CILK_FRAME_SUSPENDED;
867 
868         // longjmp to user code.  Don't process exceptions here,
869         // because we are resuming a stolen frame.
870         sysdep_longjmp_to_sf(new_sp, sf, NULL);
871         /*NOTREACHED*/
872         // Intel's C compiler respects the preceding lint pragma
873     }
874 #if defined(_WIN32) && !defined(_WIN64)
875     __except (CILK_ASSERT(!"should not execute the stub filter"),
876               EXCEPTION_EXECUTE_HANDLER)
877     {
878         // If we are here, that means something very wrong
879         // has happened in our exception processing...
880         CILK_ASSERT(! "should not be here!");
881     }
882 #endif
883 }
884 
885 static void random_steal(__cilkrts_worker *w)
886 {
887     __cilkrts_worker *victim = NULL;
888     cilk_fiber *fiber = NULL;
889     int n;
890     int success = 0;
891     int32_t victim_id;
892 
893     // Nothing's been stolen yet. When true, this will flag
894     // setup_for_execution_pedigree to increment the pedigree
895     w->l->work_stolen = 0;
896 
897     /* If the user has disabled stealing (using the debugger) we fail */
898     if (__builtin_expect(w->g->stealing_disabled, 0))
899         return;
900 
901     CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);
902 
903     /* If there is only one processor, work can still be stolen.
904        There must be only one worker to prevent stealing. */
905     CILK_ASSERT(w->g->total_workers > 1);
906 
907     /* pick random *other* victim */
908     n = myrand(w) % (w->g->total_workers - 1);
909     if (n >= w->self)
910         ++n;
911 
912     // If we're replaying a log, override the victim.  -1 indicates that
913     // we've exhausted the list of things this worker stole when we recorded
914     // the log so just return.  If we're not replaying a log,
915     // replay_get_next_recorded_victim() just returns the victim ID passed in.
916     n = replay_get_next_recorded_victim(w, n);
917     if (-1 == n)
918         return;
919 
920     victim = w->g->workers[n];
921 
922     START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
923         /* Verify that we can get a stack.  If not, no need to continue. */
924         fiber = cilk_fiber_allocate(&w->l->fiber_pool);
925     } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);
926 
927 
928     if (NULL == fiber) {
929 #if FIBER_DEBUG >= 2
930         fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
931                 w->self);
932 #endif
933         return;
934     }
935 
936     /* do not steal from self */
937     CILK_ASSERT (victim != w);
938 
939     /* Execute a quick check before engaging in the THE protocol.
940        Avoid grabbing locks if there is nothing to steal. */
941     if (!can_steal_from(victim)) {
942         NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
943         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
944             int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
945             // Fibers we use when trying to steal should not be active,
946             // and thus should not have any other references.
947             CILK_ASSERT(0 == ref_count);
948         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
949         return;
950     }
951 
952     /* Attempt to steal work from the victim */
953     if (worker_trylock_other(w, victim)) {
954         if (w->l->type == WORKER_USER && victim->l->team != w) {
955 
956             // Fail to steal if this is a user worker and the victim is not
957             // on this team.  If a user worker were allowed to steal work
958             // descended from another user worker, the former might not be
959             // done with its work by the time it was needed to resume and
960             // unbind.  Therefore, user workers are not permitted to change
961             // teams.
962 
963             // There is no race on the victim's team because the victim cannot
964             // change its team until it runs out of work to do, at which point
965             // it will try to take out its own lock, and this worker already
966             // holds it.
967             NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);
968 
969         } else if (victim->l->frame_ff) {
970             // A successful steal will change victim->frame_ff, even
971             // though the victim may be executing.  Thus, the lock on
972             // the victim's deque is also protecting victim->frame_ff.
973             if (dekker_protocol(victim)) {
974                 int proceed_with_steal = 1; // optimistic
975 
976                 // If we're replaying a log, verify that this is the correct frame
977                 // to steal from the victim
978                 if (! replay_match_victim_pedigree(w, victim))
979                 {
980                     // Abort the steal attempt. decrement_E(victim) to
981                     // counter the increment_E(victim) done by the
982                     // dekker protocol
983                     decrement_E(victim);
984                     proceed_with_steal = 0;
985                 }
986 
987                 if (proceed_with_steal)
988                 {
989                     START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
990                         success = 1;
991                         detach_for_steal(w, victim, fiber);
992                         victim_id = victim->self;
993 
994                         #if REDPAR_DEBUG >= 1
995                         fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
996                                 w->self, victim->self, fiber);
997                         #endif
998 
999                         // The use of victim->self contradicts our
1000                         // classification of the "self" field as
1001                         // local.  But since this code is only for
1002                         // debugging, it is ok.
1003                         DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
1004                             "            sf: %p, call parent: %p\n",
1005                             w->self, GetCurrentFiber(), victim->self,
1006                             w->l->next_frame_ff->call_stack,
1007                             w->l->next_frame_ff->call_stack->call_parent);
1008                     } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
1009                 }  // end if(proceed_with_steal)
1010             } else {
1011                 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
1012             }
1013         } else {
1014             NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
1015         }
1016         worker_unlock_other(w, victim);
1017     } else {
1018         NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
1019     }
1020 
1021     // Record whether work was stolen.  When true, this will flag
1022     // setup_for_execution_pedigree to increment the pedigree
1023     w->l->work_stolen = success;
1024 
1025     if (0 == success) {
1026         // failed to steal work.  Return the fiber to the pool.
1027         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
1028             int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
1029             // Fibers we use when trying to steal should not be active,
1030             // and thus should not have any other references.
1031             CILK_ASSERT(0 == ref_count);
1032         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
1033     }
1034     else
1035     {
1036         // Since our steal was successful, finish initialization of
1037         // the fiber.
1038         cilk_fiber_reset_state(fiber,
1039                                fiber_proc_to_resume_user_code_for_random_steal);
1040         // Record the pedigree of the frame that w has stolen.
1041         // record only if CILK_RECORD_LOG is set
1042         replay_record_steal(w, victim_id);
1043     }
1044 }
1045 
1046 
1047 
1048 /**
1049  * At a provably good steal, we need to transfer the child reducer map
1050  * from ff->children_reducer_map into v->reducer_map, where v is the
1051  * worker that resumes execution of ff.
1052  *
1053  * Normally, we have v == w, where w is the currently executing
1054  * worker.  In the case where we are resuming a team leader on a user
1055  * worker, however, v might differ from w.
1056  *
1057  * Thus, this operation is a no-op, since we can't really move
1058  * ff->children_reducer_map into w here.
1059  *
1060  * Instead, this work is done in setup_for_execution_reducers().
1061  */
1062 static inline void provably_good_steal_reducers(__cilkrts_worker *w,
1063                                                 full_frame       *ff)
1064 {
1065     // No-op.
1066 }
1067 
1068 /* at a provably good steal, incorporate the accumulated exceptions of
1069    children into the parent's exception */
1070 static void provably_good_steal_exceptions(__cilkrts_worker *w,
1071                                            full_frame       *ff)
1072 {
1073     // ASSERT: we own ff->lock
1074     ff->pending_exception =
1075         __cilkrts_merge_pending_exceptions(w,
1076                                            ff->child_pending_exception,
1077                                            ff->pending_exception);
1078     ff->child_pending_exception = NULL;
1079 }
1080 
1081 /* At sync discard the frame's old stack and take the leftmost child's. */
1082 static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
1083 {
1084     CILK_ASSERT(NULL == ff->fiber_self);
1085     ff->fiber_self = ff->fiber_child;
1086     ff->fiber_child = NULL;
1087 }
1088 
1089 static void __cilkrts_mark_synched(full_frame *ff)
1090 {
1091     ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
1092     ff->simulated_stolen = 0;
1093 }
1094 
1095 static
1096 enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
1097                                                full_frame       *ff)
1098 {
1099     // ASSERT: we hold w->lock and ff->lock
1100 
1101     enum provably_good_steal_t result = ABANDON_EXECUTION;
1102 
1103     // If the current replay entry is a sync record matching the worker's
1104     // pedigree, AND this isn't the last child to the sync, return
1105     // WAIT_FOR_CONTINUE to indicate that the caller should loop until
1106     // we find the right frame to steal and CONTINUE_EXECUTION is returned.
1107     int match_found = replay_match_sync_pedigree(w);
1108     if (match_found && (0 != simulate_decjoin(ff)))
1109         return WAIT_FOR_CONTINUE;
1110 
1111     START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
1112         if (decjoin(ff) == 0) {
1113             provably_good_steal_reducers(w, ff);
1114             provably_good_steal_exceptions(w, ff);
1115             provably_good_steal_stacks(w, ff);
1116             __cilkrts_mark_synched(ff);
1117 
1118             // If the original owner wants this frame back (to resume
1119             // it on its original thread) pass it back now.
1120             if (NULL != ff->sync_master) {
1121                 // The frame wants to go back and be executed by the original
1122                 // user thread.  We can throw caution to the wind and push the
1123                 // frame straight onto its queue because the only way we have
1124                 // gotten to this point of being able to continue execution of
1125                 // the frame is if the original user worker is spinning without
1126                 // work.
1127 
1128                 unset_sync_master(w->l->team, ff);
1129                 __cilkrts_push_next_frame(w->l->team, ff);
1130 
1131                 // If this is the team leader we're not abandoning the work
1132                 if (w == w->l->team)
1133                     result = CONTINUE_EXECUTION;
1134             } else {
1135                 __cilkrts_push_next_frame(w, ff);
1136                 result = CONTINUE_EXECUTION;  // Continue working on this thread
1137             }
1138 
1139             // The __cilkrts_push_next_frame() call changes ownership
1140             // of ff to the specified worker.
1141         }
1142     } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);
1143 
1144     // Only write a SYNC record if:
1145     // - We're recording a log *AND*
1146     // - We're the worker continuing from this sync
1147     replay_record_sync(w, result == CONTINUE_EXECUTION);
1148 
1149     // If we're replaying a log, and matched a sync from the log, mark the
1150     // sync record seen if the sync isn't going to be abandoned.
1151     replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);
1152 
1153     return result;
1154 }
1155 
1156 static void unconditional_steal(__cilkrts_worker *w,
1157                                 full_frame *ff)
1158 {
1159     // ASSERT: we hold ff->lock
1160 
1161     START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
1162         decjoin(ff);
1163         __cilkrts_push_next_frame(w, ff);
1164     } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
1165 }
1166 
1167 
1168 /* CHILD is about to die.  Give its exceptions to a sibling or to the
1169    parent.  */
1170 static inline void splice_exceptions_for_call(__cilkrts_worker *w,
1171                                               full_frame *parent_ff,
1172                                               full_frame *child_ff)
1173 {
1174     // ASSERT: We own parent_ff->lock
1175     CILK_ASSERT(child_ff->is_call_child);
1176     CILK_ASSERT(NULL == child_ff->right_pending_exception);
1177     CILK_ASSERT(NULL == parent_ff->pending_exception);
1178 
1179     parent_ff->pending_exception = child_ff->pending_exception;
1180     child_ff->pending_exception = NULL;
1181 }
1182 
1183 /**
1184  * Merge exceptions for a dying child.
1185  *
1186  * @param w                   The currently executing worker.
1187  * @param ff                  The child frame that is dying.
1188  * @param left_exception_ptr  Pointer to the exception that is to our left.
1189  */
1190 static inline
1191 void splice_exceptions_for_spawn(__cilkrts_worker *w,
1192                                  full_frame *ff,
1193                                  struct pending_exception_info **left_exception_ptr)
1194 {
1195     // ASSERT: parent_ff == child_ff->parent.
1196     // ASSERT: We own parent_ff->lock
1197 
1198     // Merge current exception into the slot where the left
1199     // exception should go.
1200     *left_exception_ptr =
1201         __cilkrts_merge_pending_exceptions(w,
1202                                            *left_exception_ptr,
1203                                            ff->pending_exception);
1204     ff->pending_exception = NULL;
1205 
1206 
1207     // Merge right exception into the slot where the left exception
1208     // should go.
1209     *left_exception_ptr =
1210         __cilkrts_merge_pending_exceptions(w,
1211                                            *left_exception_ptr,
1212                                            ff->right_pending_exception);
1213     ff->right_pending_exception = NULL;
1214 }
1215 
1216 
1217 static inline void splice_stacks_for_call(__cilkrts_worker *w,
1218                                           full_frame *parent_ff,
1219                                           full_frame *child_ff)
1220 {
1221 #if CILK_LIB_DEBUG
1222     if (parent_ff->call_stack)
1223         CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ));
1224 #endif
1225 
1226     /* A synched frame does not have accumulated child reducers. */
1227     CILK_ASSERT(!child_ff->fiber_child);
1228     CILK_ASSERT(child_ff->is_call_child);
1229 
1230     /* An attached parent has no self fiber.  It may have
1231        accumulated child fibers or child owners, which should be
1232        ignored until sync. */
1233     CILK_ASSERT(!parent_ff->fiber_self);
1234     parent_ff->fiber_self = child_ff->fiber_self;
1235     child_ff->fiber_self = NULL;
1236 }
1237 
1238 static void finalize_child_for_call(__cilkrts_worker *w,
1239                                     full_frame *parent_ff,
1240                                     full_frame *child_ff)
1241 {
1242     // ASSERT: we hold w->lock and parent_ff->lock
1243 
1244     START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) {
1245         CILK_ASSERT(child_ff->is_call_child);
1246         CILK_ASSERT(child_ff->join_counter == 0);
1247         CILK_ASSERT(!child_ff->rightmost_child);
1248         CILK_ASSERT(child_ff == parent_ff->rightmost_child);
1249 
1250         // CHILD is about to die.
1251         // Splicing out reducers is a no-op for a call since
1252         // w->reducer_map should already store the correct
1253         // reducer map.
1254 
1255         // ASSERT there are no maps left to reduce.
1256         CILK_ASSERT(NULL == child_ff->children_reducer_map);
1257         CILK_ASSERT(NULL == child_ff->right_reducer_map);
1258 
1259         splice_exceptions_for_call(w, parent_ff, child_ff);
1260 
1261         splice_stacks_for_call(w, parent_ff, child_ff);
1262 
1263         /* remove CHILD from list of children of PARENT */
1264         unlink_child(parent_ff, child_ff);
1265 
1266         /* continue with the parent. */
1267         unconditional_steal(w, parent_ff);
1268         __cilkrts_destroy_full_frame(w, child_ff);
1269     } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD);
1270 }
1271 
1272 
1273 /**
1274  * The invariant on ff->children_reducer_map is that when ff is
1275  * synched and when we are about to resume execution of ff, at least
1276  * one of ff->children_reducer_map and w->reducer_map must be NULL.
1277  *
1278  * Consider the two possibilities before resuming execution of ff:
1279  *
1280  * 1.  Suppose ff is synched and suspended.  Then either
1281  *
1282  *     (a) ff->children_reducer_map stores the reducer map that w
1283  *         should use, where w is the worker resuming execution of ff,
1284  *         OR
1285  *     (b) w already has a user map, and ff->children_reducer_map is NULL.
1286  *
1287  *     Case (a) happens when we are resuming execution of ff as a
1288  *     provably good steal.  In this case, w->reducer_map should be
1289  *     NULL and ff->children_reducer_map is valid.  To resume
1290  *     execution of ff on w, set w->reducer_map to
1291  *     ff->children_reducer_map.
1292  *
1293  *     Case (b) occurs when we resume execution of ff because ff is a
1294  *     called child.  Then, ff->children_reducer_map should be NULL,
1295  *     and w should already have a valid reducer map when resuming
1296  *     execution of ff.  We resume execution of ff without changing
1297  *     w->reducer_map.
1298  *
1299  * 2. Suppose frame ff is not synched (i.e., it is active and might have
1300  *    active children).   Then ff->children_reducer_map is the slot for
1301  *    storing the reducer map from ff's leftmost child, as in the reducer
1302  *    protocol.   The runtime may resume execution of ff while it is not
1303  *    synched only because of a steal.
1304  *    In this case, while we are resuming ff, ff->children_reducer_map
1305  *    may be non-NULL (because one of ff's children has completed).
1306  *    We resume execution of ff without changing w->reducer_map.
1307  */
1308 static void setup_for_execution_reducers(__cilkrts_worker *w,
1309                                          full_frame *ff)
1310 {
1311     // We only need to move ff->children_reducer_map into
1312     // w->reducer_map in case 1(a).
1313     //
1314     // First check whether ff is synched.
1315     __cilkrts_stack_frame *sf = ff->call_stack;
1316     if (!(sf->flags & CILK_FRAME_UNSYNCHED)) {
1317         // In this case, ff is synched. (Case 1).
1318         CILK_ASSERT(!ff->rightmost_child);
1319 
1320         // Test whether we are in case 1(a) and have
1321         // something to do.  Note that if both
1322         // ff->children_reducer_map and w->reducer_map are NULL, we
1323         // can't distinguish between cases 1(a) and 1(b) here.
1324         if (ff->children_reducer_map) {
1325             // We are in Case 1(a).
1326             CILK_ASSERT(!w->reducer_map);
1327             w->reducer_map = ff->children_reducer_map;
1328             ff->children_reducer_map = NULL;
1329         }
1330     }
1331 }
1332 
1333 static void setup_for_execution_exceptions(__cilkrts_worker *w,
1334                                            full_frame *ff)
1335 {
1336     CILK_ASSERT(NULL == w->l->pending_exception);
1337     w->l->pending_exception = ff->pending_exception;
1338     ff->pending_exception = NULL;
1339 }
1340 
1341 #if 0 /* unused */
1342 static void setup_for_execution_stack(__cilkrts_worker *w,
1343                                       full_frame *ff)
1344 {
1345 }
1346 #endif
1347 
1348 /*
1349  * setup_for_execution_pedigree
1350  *
1351  * Copies the pedigree information from the frame we're resuming to the
1352  * worker.  Increments the pedigree if this is work that has been stolen
1353  * to match the increment on a return from a spawn helper.
1354  */
1355 static void setup_for_execution_pedigree(__cilkrts_worker *w)
1356 {
1357     int pedigree_unsynched;
1358     __cilkrts_stack_frame *sf = w->current_stack_frame;
1359 
1360     CILK_ASSERT(NULL != sf);
1361 
1362     // If this isn't an ABI 1 or later frame, there's no pedigree information
1363     if (0 == CILK_FRAME_VERSION_VALUE(sf->flags))
1364         return;
1365 
1366     // Note whether the pedigree is unsynched and clear the flag before
1367     // we forget
1368     pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1369     sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1370 
1371     // If we're just marshalling onto this worker, do not increment
1372     // the rank since that wouldn't happen in a sequential execution
1373     if (w->l->work_stolen || pedigree_unsynched)
1374     {
1375         if (w->l->work_stolen)
1376             w->pedigree.rank = sf->parent_pedigree.rank + 1;
1377         else
1378             w->pedigree.rank = sf->parent_pedigree.rank;
1379     }
1380 
1381     w->pedigree.parent = sf->parent_pedigree.parent;
1382     w->l->work_stolen = 0;
1383 }
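/* Disabled sketch of the rank rule implemented above, written as a
 * standalone helper purely for illustration.  The helper name and signature
 * are hypothetical; the real logic is the in-place update performed by
 * setup_for_execution_pedigree(). */
#if 0
static uint64_t resumed_rank_sketch(uint64_t current_rank,
                                    uint64_t parent_rank,
                                    int work_stolen,
                                    int pedigree_unsynched)
{
    if (work_stolen)
        return parent_rank + 1;  /* mimic the bump a sequential return from
                                    a spawn helper would have performed */
    if (pedigree_unsynched)
        return parent_rank;      /* resume at the recorded rank */
    return current_rank;         /* otherwise leave the worker's rank alone */
}
#endif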
1384 
1385 static void setup_for_execution(__cilkrts_worker *w,
1386                                 full_frame *ff,
1387                                 int is_return_from_call)
1388 {
1389     // ASSERT: We own w->lock and ff->lock || P == 1
1390 
1391     setup_for_execution_reducers(w, ff);
1392     setup_for_execution_exceptions(w, ff);
1393     /*setup_for_execution_stack(w, ff);*/
1394 
1395     ff->call_stack->worker = w;
1396     w->current_stack_frame = ff->call_stack;
1397 
1398     // If this is a return from a call, leave the pedigree alone
1399     if (! is_return_from_call)
1400         setup_for_execution_pedigree(w);
1401 
1402     __cilkrts_setup_for_execution_sysdep(w, ff);
1403 
1404     w->head = w->tail = w->l->ltq;
1405     reset_THE_exception(w);
1406 
1407     make_runnable(w, ff);
1408 }
1409 
1410 
1411 /*
1412  * Called by the scheduling fiber, right before
1413  * resuming a sf/ff for user code.
1414  *
1415  * This method associates the specified sf with the worker.
1416  *
1417  * It also asserts that w, ff, and sf all have the expected properties
1418  * for resuming user code.
1419  */
1420 void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
1421                                                   full_frame *ff,
1422                                                   __cilkrts_stack_frame *sf)
1423 {
1424     w->current_stack_frame = sf;
1425     sf->worker = w;
1426 
1427     // Lots of debugging checks on the state of the fiber we might be
1428     // resuming.
1429 #if FIBER_DEBUG >= 1
1430 #   if FIBER_DEBUG >= 3
1431     {
1432         fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
1433                 w->self, ff, sf);
1434     }
1435 #   endif
1436 
1437     const int flags = sf->flags;
1438     CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
1439     CILK_ASSERT(!sf->call_parent);
1440     CILK_ASSERT(w->head == w->tail);
1441 
1442     /* A frame can not be resumed unless it was suspended. */
1443     CILK_ASSERT(ff->sync_sp != NULL);
1444 
1445     /* The leftmost frame has no allocated stack */
1446     if (ff->simulated_stolen)
1447         CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
1448     else if (flags & CILK_FRAME_UNSYNCHED)
1449         /* XXX By coincidence sync_sp could be null. */
1450         CILK_ASSERT(ff->fiber_self != NULL);
1451     else
1452         /* XXX This frame could be resumed unsynched on the leftmost stack */
1453         CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
1454     CILK_ASSERT(w->l->frame_ff == ff);
1455 #endif
1456 }
1457 
1458 
1459 /**
1460  * This method is the first method that should execute after we've
1461  * switched to a scheduling fiber from user code.
1462  *
1463  * @param fiber The scheduling fiber for the current worker.  The worker
1464  *              itself is recovered via cilk_fiber_get_owner(fiber).
1465  */
1466 static void enter_runtime_transition_proc(cilk_fiber *fiber)
1467 {
1468     // We can execute this method for one of three reasons:
1469     // 1. Undo-detach finds parent stolen.
1470     // 2. Sync suspends frame.
1471     // 3. Return from Cilk entry point.
1472     //
1473     //
1474     // In cases 1 and 2, the frame may be truly suspended or
1475     // may be immediately executed by this worker after provably_good_steal.
1476     //
1477     //
1478     // There is a fourth case, which can, but does not need to execute
1479     // this function:
1480     //   4. Starting up the scheduling loop on a user or
1481     //      system worker.  In this case, we won't have
1482     //      a scheduling stack function to run.
1483     __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1484     if (w->l->post_suspend) {
1485         // Run the continuation function passed to longjmp_into_runtime
1486         run_scheduling_stack_fcn(w);
1487 
1488         // After we have jumped into the runtime and run the
1489         // scheduling function, any reducer map the worker had
1490         // before entering the runtime should have already been
1491         // saved into the appropriate full frame.
1492         CILK_ASSERT(NULL == w->reducer_map);
1493 
1494         // There shouldn't be any uncaught exceptions.
1495         //
1496         // In Windows, the OS catches any exceptions not caught by the
1497         // user code.  Thus, we are omitting the check on Windows.
1498         //
1499         // On Android, calling std::uncaught_exception with the stlport
1500         // library causes a seg fault.  Since we're not supporting
1501         // exceptions there at this point, just don't do the check
1502         //
1503         // TBD: Is this check also safe to do on Windows?
1504         CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1505     }
1506 }
1507 
1508 
1509 /**
1510  * Method called to jump back to executing user code.
1511  *
1512  * A normal return from the runtime back to resuming user code calls
1513  * this method.  A computation executed using force_reduce also calls
1514  * this method to return to user code.
1515  *
1516  * This function should not contain any code that depends on a fiber.
1517  * In a force-reduce case, the user worker may not have a fiber.  In
1518  * the force-reduce case, we call this method directly instead of
1519  * calling @c user_code_resume_after_switch_into_runtime.
1520  */
1521 static inline NORETURN
1522 cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
1523 {
1524     // Save the sync stack pointer, and do the bookkeeping
1525     char* sync_sp = ff->sync_sp;
1526     __cilkrts_take_stack(ff, sync_sp);  // leaves ff->sync_sp null
1527 
1528     sf->flags &= ~CILK_FRAME_SUSPENDED;
1529     // Actually longjmp to the user code.
1530     // We may have exceptions to deal with, since we are resuming
1531     // a previously suspended frame.
1532     sysdep_longjmp_to_sf(sync_sp, sf, ff);
1533 }
1534 
1535 
1536 /**
1537  * Called by the user-code fiber right before resuming a full frame
1538  * (sf/ff).
1539  *
1540  * This method pulls sf/ff out of the worker, and then calls
1541  * cilkrts_resume to jump to user code.
1542  */
1543 static NORETURN
1544 user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
1545 {
1546     __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
1547     __cilkrts_stack_frame *sf;
1548     full_frame *ff;
1549     sf = w->current_stack_frame;
1550     ff = sf->worker->l->frame_ff;
1551 
1552 #if FIBER_DEBUG >= 1
1553     CILK_ASSERT(ff->fiber_self == fiber);
1554     cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
1555     DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
1556                w->self, w, fiber);
1557     CILK_ASSERT(sf == fdata->resume_sf);
1558 #endif
1559 
1560     // Notify the Intel tools that we're stealing code
1561     ITT_SYNC_ACQUIRED(sf->worker);
1562     NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
1563     cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
1564 
1565     // Actually jump to user code.
1566     cilkrts_resume(sf, ff);
1567  }
1568 
1569 
1570 /* The current stack is about to either be suspended or destroyed.  This
1571  * function will switch to the stack on which the scheduler is suspended and
1572  * resume running the scheduler within function do_work().  Upon waking up,
1573  * the scheduler will run the 'cont' function, using the supplied worker and
1574  * frame.
1575  */
1576 static NORETURN
1577 longjmp_into_runtime(__cilkrts_worker *w,
1578                      scheduling_stack_fcn_t fcn,
1579                      __cilkrts_stack_frame *sf)
1580 {
1581     full_frame *ff, *ff2;
1582 
1583     CILK_ASSERT(!w->l->post_suspend);
1584     ff = w->l->frame_ff;
1585 
1586     // If we've got only one worker, stealing shouldn't be possible.
1587     // Assume that this is a steal or return from spawn in a force-reduce case.
1588     // We don't have a scheduling stack to switch to, so call the continuation
1589     // function directly.
1590     if (1 == w->g->P) {
1591         fcn(w, ff, sf);
1592 
1593         /* The call to fcn() will have pushed ff as the next frame.  If
1594          * this were a normal (non-forced-reduce) execution, there would have
1595          * been a pop_next_frame call in a separate part of the runtime.  We
1596          * must call pop_next_frame here to complete the push/pop cycle. */
1597         ff2 = pop_next_frame(w);
1598 
1599         setup_for_execution(w, ff2, 0);
1600         scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
1601         cilkrts_resume(w->current_stack_frame, ff2);
1602 
1603 // Suppress clang warning that the expression result is unused
1604 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1605 #   pragma clang diagnostic push
1606 #   pragma clang diagnostic ignored "-Wunused-value"
1607 #endif // __clang__
1608         /* no return */
1609         CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
1610 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1611 #   pragma clang diagnostic pop
1612 #endif // __clang__
1613     }
1614 
1615     w->l->post_suspend = fcn;
1616     w->l->suspended_stack = sf;
1617 
1618     ITT_SYNC_RELEASING(w);
1619     ITT_SYNC_PREPARE(w);
1620 
1621 #if FIBER_DEBUG >= 2
1622     fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
1623             cilkos_get_current_thread_id(),
1624             w->self, w->l->frame_ff,
1625             sf);
1626 #endif
1627 
1628     // Current fiber is either the (1) one we are about to free,
1629     // or (2) it has been passed up to the parent.
1630     cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
1631                                   w->l->fiber_to_free :
1632                                   w->l->frame_ff->parent->fiber_child );
1633     cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
1634     CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);
1635 
1636     // Clear the sf in the current fiber for cleanliness, to prevent
1637     // us from accidentally resuming a bad sf.
1638     // Technically, resume_sf gets overwritten for a fiber when
1639     // we are about to resume it anyway.
1640     fdata->resume_sf = NULL;
1641     CILK_ASSERT(fdata->owner == w);
1642 
1643     // Set the function to execute immediately after switching to the
1644     // scheduling fiber, but before freeing any fibers.
1645     cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
1646                                     enter_runtime_transition_proc);
1647     cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);
1648 
1649     if (w->l->fiber_to_free) {
1650         // Case 1: we are freeing this fiber.  We never
1651         // resume this fiber again after jumping into the runtime.
1652         w->l->fiber_to_free = NULL;
1653 
1654         // Extra check. Normally, the fiber we are about to switch to
1655         // should have a NULL owner.
1656         CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
1657 #if FIBER_DEBUG >= 4
1658         fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
1659                 cilkos_get_current_thread_id(),
1660                 w->self,
1661                 current_fiber, w->l->scheduling_fiber);
1662 #endif
1663         cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
1664         NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
1665         cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
1666                                                                &w->l->fiber_pool,
1667                                                                w->l->scheduling_fiber);
1668         // We should never come back here!
1669         CILK_ASSERT(0);
1670     }
1671     else {
1672         // Case 2: We are passing the fiber to our parent because we
1673         // are leftmost.  We should come back later to
1674         // resume execution of user code.
1675         //
1676         // If we are not freeing a fiber, then we must be
1677         // returning from a spawn or processing an exception.  The
1678         // "sync" path always frees a fiber.
1679         //
1680         // We must be the leftmost child, and by left holder logic, we
1681         // have already moved the current fiber into our parent full
1682         // frame.
1683 #if FIBER_DEBUG >= 2
1684         fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
1685                 cilkos_get_current_thread_id(),
1686                 w->self,
1687                 current_fiber, w->l->scheduling_fiber);
1688 #endif
1689 
1690         NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
1691 
1692         cilk_fiber_suspend_self_and_resume_other(current_fiber,
1693                                                  w->l->scheduling_fiber);
1694         // Resuming this fiber returns control back to
1695         // this function because our implementation uses OS fibers.
1696         //
1697         // On Unix, we could instead pass
1698         // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
1699         // that resumes execution of user code directly, rather than
1700         // jumping back here and then jumping back to user code.
1701 #if FIBER_DEBUG >= 2
1702         CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
1703 #endif
1704         user_code_resume_after_switch_into_runtime(current_fiber);
1705     }
1706 }
1707 
1708 /*
1709  * Send a message to the children of the specified worker: run or wait.
1710  */
1711 static void notify_children(__cilkrts_worker *w, unsigned int msg)
1712 {
1713     int child_num;
1714     __cilkrts_worker *child;
1715     int num_sys_workers = w->g->P - 1;
1716 
1717     // If worker is "n", then its children are 2n + 1, and 2n + 2.
1718     child_num = (w->self << 1) + 1;
1719     if (child_num < num_sys_workers) {
1720         child = w->g->workers[child_num];
1721         CILK_ASSERT(child->l->signal_node);
1722         signal_node_msg(child->l->signal_node, msg);
1723         child_num++;
1724         if (child_num < num_sys_workers) {
1725             child = w->g->workers[child_num];
1726             CILK_ASSERT(child->l->signal_node);
1727             signal_node_msg(child->l->signal_node, msg);
1728         }
1729     }
1730 }
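// Worked example of the implicit binary tree above, for illustration only
// (assuming the system workers occupy slots 0..P-2 of g->workers): with
// P == 8 there are 7 system workers, 0..6.  Worker 0 signals 1 and 2,
// worker 1 signals 3 and 4, and worker 2 signals 5 and 6, so a run/wait
// message injected at worker 0 reaches every system worker in about
// log2(P) rounds.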
1731 
1732 /*
1733  * Notify this worker's children that they need to wait.
1734  */
1735 static void notify_children_wait(__cilkrts_worker *w)
1736 {
1737     notify_children(w, 0);
1738 }
1739 
1740 /*
1741  * Notify this worker's children to run and start trying to steal.
1742  */
1743 static void notify_children_run(__cilkrts_worker *w)
1744 {
1745     notify_children(w, 1);
1746 }
1747 
1748 /**
1749  * A single "check" to find work, either on our queue or through a
1750  * steal attempt.  This method checks our local queue once, and
1751  * performs one steal attempt.
1752  */
1753 static full_frame* check_for_work(__cilkrts_worker *w)
1754 {
1755     full_frame *ff = NULL;
1756     ff = pop_next_frame(w);
1757     // If there is no work on the queue, try to steal some.
1758     if (NULL == ff) {
1759         START_INTERVAL(w, INTERVAL_STEALING) {
1760             if (w->l->type != WORKER_USER && w->l->team != NULL) {
1761                 // At this point, the worker knows for certain that it has run
1762                 // out of work.  Therefore, it loses its team affiliation.  User
1763                 // workers never change teams, of course.
1764                 __cilkrts_worker_lock(w);
1765                 w->l->team = NULL;
1766                 __cilkrts_worker_unlock(w);
1767             }
1768 
1769             // If we are about to do a random steal, we should have no
1770             // full frame...
1771             CILK_ASSERT(NULL == w->l->frame_ff);
1772             random_steal(w);
1773         } STOP_INTERVAL(w, INTERVAL_STEALING);
1774 
1775         // If the steal was successful, then the worker has populated its next
1776         // frame with the work to resume.
1777         ff = pop_next_frame(w);
1778         if (NULL == ff) {
1779             // Punish the worker for failing to steal.
1780             // No quantum for you!
1781             __cilkrts_yield();
1782             w->l->steal_failure_count++;
1783         } else {
1784             // Reset steal_failure_count since there is obviously still work to
1785             // be done.
1786             w->l->steal_failure_count = 0;
1787         }
1788     }
1789     return ff;
1790 }
1791 
1792 /**
1793  * Keep stealing or looking on our queue.
1794  *
1795  * Returns a full frame when one is found, or NULL if the
1796  * computation is done.
1797  */
1798 static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
1799 {
1800     full_frame *ff = NULL;
1801     // Find a full frame to execute (either through random stealing,
1802     // or because we pull it off w's 1-element queue).
1803     while (!ff) {
1804         // Check worker state to figure out our next action.
1805         switch (worker_runnable(w))
1806         {
1807         case SCHEDULE_RUN:             // One attempt at checking for work.
1808             ff = check_for_work(w);
1809             break;
1810         case SCHEDULE_WAIT:            // go into wait-mode.
1811             CILK_ASSERT(WORKER_SYSTEM == w->l->type);
1812             // If we are about to wait, then we better not have
1813             // a frame that we should execute...
1814             CILK_ASSERT(NULL == w->l->next_frame_ff);
1815             notify_children_wait(w);
1816             signal_node_wait(w->l->signal_node);
1817             // ...
1818             // Runtime is waking up.
1819             notify_children_run(w);
1820             w->l->steal_failure_count = 0;
1821             break;
1822         case SCHEDULE_EXIT:            // exit the scheduler.
1823             CILK_ASSERT(WORKER_USER != w->l->type);
1824             return NULL;
1825         default:
1826             CILK_ASSERT(0);
1827             abort();
1828         }
1829     }
1830     return ff;
1831 }
1832 
1833 /**
1834  * The proc method for a scheduling fiber on a user worker.
1835  *
1836  * When a user worker jumps into the runtime, it jumps into this
1837  * method by either starting it if the scheduling fiber has never run
1838  * before, or resuming the fiber if it was previously suspended.
1839  */
1840 COMMON_PORTABLE
1841 void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
1842 {
1843     __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1844     CILK_ASSERT(w);
1845 
1846     // This must be a user worker
1847     CILK_ASSERT(WORKER_USER == w->l->type);
1848 
1849     // If we aren't the current worker, then something is very wrong
1850     // here..
1851     verify_current_wkr(w);
1852 
1853     __cilkrts_run_scheduler_with_exceptions(w);
1854 }
1855 
1856 
1857 /**
1858  * The body of the runtime scheduling loop.  This function executes in
1859  * 4 stages:
1860  *
1861  * 1. Transitions from the user code into the runtime by
1862  *    executing any scheduling-stack functions.
1863  * 2. Looks for a full frame enqueued from a successful provably
1864  *    good steal.
1865  * 3. If no full frame is found in step 2, steal until
1866  *    a frame is found or we are done.  If we are done, finish
1867  *    the scheduling loop.
1868  * 4. When a frame is found, setup to resume user code.
1869  *    In particular, suspend the current fiber and resume the
1870  *    user fiber to execute the frame.
1871  *
1872  * Returns a fiber object that we should switch to after completing
1873  * the body of the loop, or NULL if we should continue executing on
1874  * this fiber.
1875  *
1876  * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
1877  *
1878  * @param current_fiber   The currently executing (scheduling) fiber.
1879  * @param wptr            The currently executing worker.
1880  * @return                The next fiber we should switch to.
1881  */
1882 static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
1883                                                void* wptr)
1884 {
1885     __cilkrts_worker *w = (__cilkrts_worker*) wptr;
1886     CILK_ASSERT(current_fiber == w->l->scheduling_fiber);
1887 
1888     // Stage 1: Transition from executing user code to the runtime code.
1889     // We don't need to do this call here any more, because
1890     // every switch to the scheduling fiber should make this call
1891     // using a post_switch_proc on the fiber.
1892     //
1893     //  enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);
1894 
1895     // After Stage 1 is complete, w should no longer have
1896     // an associated full frame.
1897     CILK_ASSERT(NULL == w->l->frame_ff);
1898 
1899     // Stage 2.  First do a quick check of our 1-element queue.
1900     full_frame *ff = pop_next_frame(w);
1901 
1902     if (!ff) {
1903         // Stage 3.  We didn't find anything from our 1-element
1904         // queue.  Now go through the steal loop to find work.
1905         ff = search_until_work_found_or_done(w);
1906         if (!ff) {
1907             CILK_ASSERT(w->g->work_done);
1908             return NULL;
1909         }
1910     }
1911 
1912     // Stage 4.  Now that we have found a full frame to work on,
1913     // actually execute it.
1914     __cilkrts_stack_frame *sf;
1915 
1916     // There shouldn't be any uncaught exceptions.
1917     //
1918     // In Windows, the OS catches any exceptions not caught by the
1919     // user code.  Thus, we are omitting the check on Windows.
1920     //
1921     // On Android, calling std::uncaught_exception with the stlport
1922     // library causes a seg fault.  Since we're not supporting
1923     // exceptions there at this point, just don't do the check
1924     CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1925 
1926     BEGIN_WITH_WORKER_LOCK(w) {
1927         CILK_ASSERT(!w->l->frame_ff);
1928         BEGIN_WITH_FRAME_LOCK(w, ff) {
1929             sf = ff->call_stack;
1930             CILK_ASSERT(sf && !sf->call_parent);
1931             setup_for_execution(w, ff, 0);
1932         } END_WITH_FRAME_LOCK(w, ff);
1933     } END_WITH_WORKER_LOCK(w);
1934 
1935     /* run it */
1936     //
1937     // Prepare to run the full frame.  To do so, we need to:
1938     //   (a) Execute some code on this fiber (the scheduling
1939     //       fiber) to set up data structures, and
1940     //   (b) Suspend the scheduling fiber, and resume the
1941     //       user-code fiber.
1942 
1943     // Part (a). Set up data structures.
1944     scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);
1945 
1946     cilk_fiber *other = w->l->frame_ff->fiber_self;
1947     cilk_fiber_data* other_data = cilk_fiber_get_data(other);
1948     cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);
1949 
1950     // I believe two cases are possible here, both of which
1951     // should have other_data->resume_sf as NULL.
1952     //
1953     // 1. Resuming a fiber that was previously executing
1954     //    user code (i.e., a provably-good-steal).
1955     //    In this case, resume_sf should have been
1956     //    set to NULL when it was suspended.
1957     //
1958     // 2. Resuming code on a steal.  In this case, since we
1959     //    grabbed a new fiber, resume_sf should be NULL.
1960     CILK_ASSERT(NULL == other_data->resume_sf);
1961 
1962 #if FIBER_DEBUG >= 2
1963     fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
1964             w->self, other, other_data->resume_sf);
1965 #endif
1966     // Update our own fiber's data.
1967     current_fiber_data->resume_sf = NULL;
1968     // The scheduling fiber should have the right owner from before.
1969     CILK_ASSERT(current_fiber_data->owner == w);
1970     other_data->resume_sf = sf;
1971 
1972 
1973 #if FIBER_DEBUG >= 3
1974     fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
1975             cilkos_get_current_thread_id(),
1976             w->self,
1977             current_fiber, other,
1978             current_fiber_data->resume_sf,
1979             other_data->resume_sf);
1980 #endif
1981     return other;
1982 }
1983 
1984 
1985 /**
1986  * This function is executed once by each worker, to initialize its
1987  * scheduling loop.
1988  */
1989 static void worker_scheduler_init_function(__cilkrts_worker *w)
1990 {
1991     // First, execute the startup tasks that must happen for all
1992     // worker types.
1993     ITT_SYNC_PREPARE(w);
1994     /* Notify tools about the new worker. Inspector needs this, but we
1995        don't want to confuse Cilkscreen with system threads.  User threads
1996        do this notification in bind_thread */
1997     if (! w->g->under_ptool)
1998         __cilkrts_cilkscreen_establish_worker(w);
1999 
2000     // Seed the initial random number generator.
2001     // If we forget to do this, then the worker always steals from 0.
2002     // Programs will still execute correctly, but
2003     // you may see a subtle performance bug...
2004     mysrand(w, (w->self + 1));
2005 
2006     // The startup work varies, depending on the worker type.
2007     switch (w->l->type) {
2008     case WORKER_USER:
2009         // Stop working once we've entered the scheduler.
2010         // For user workers, INTERVAL_IN_SCHEDULER counts the time
2011         // since we called bind_thread.
2012         break;
2013 
2014     case WORKER_SYSTEM:
2015         // If a system worker is starting, we must also be starting
2016         // the runtime.
2017 
2018         // Runtime begins in a wait-state and is woken up by the first user
2019         // worker when the runtime is ready.
2020         signal_node_wait(w->l->signal_node);
2021         // ...
2022         // Runtime is waking up.
2023         notify_children_run(w);
2024         w->l->steal_failure_count = 0;
2025 
2026         // For system threads, count all the time this thread is
2027         // alive in the scheduling loop.
2028         START_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2029         START_INTERVAL(w, INTERVAL_WORKING);
2030         break;
2031     default:
2032         __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
2033                       w, w->l->type);
2034     }
2035 }
2036 
2037 /**
2038  * This function is executed once by each worker, to finish its
2039  * scheduling loop.
2040  *
2041  * @note Currently, only system workers finish their loops.  User
2042  * workers will jump away to user code without exiting their
2043  * scheduling loop.
2044  */
2045 static void worker_scheduler_terminate_function(__cilkrts_worker *w)
2046 {
2047     // A user worker should never finish by falling through the
2048     // scheduling loop.
2049     CILK_ASSERT(WORKER_USER != w->l->type);
2050     STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2051     STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2052 }
2053 
2054 /**
2055  * The main scheduler function executed by a worker's scheduling
2056  * fiber.
2057  *
2058  * This method is started by either a new system worker, or a user
2059  * worker that has stalled and just been imported into the runtime.
2060  */
2061 static void worker_scheduler_function(__cilkrts_worker *w)
2062 {
2063     worker_scheduler_init_function(w);
2064 
2065     // The main scheduling loop body.
2066 
2067     while (!w->g->work_done) {
2068         // Set intervals.  Now we are in the runtime instead of working.
2069         START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2070         STOP_INTERVAL(w, INTERVAL_WORKING);
2071 
2072         // Execute the "body" of the scheduling loop, and figure
2073         // out the fiber to jump to next.
2074         cilk_fiber* fiber_to_resume
2075             = worker_scheduling_loop_body(w->l->scheduling_fiber, w);
2076 
2077         if (fiber_to_resume) {
2078             // Suspend the current fiber and resume next one.
2079             NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
2080             STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2081             START_INTERVAL(w, INTERVAL_WORKING);
2082             cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
2083                                                      fiber_to_resume);
2084 
2085             // Return here only when this (scheduling) fiber is
2086             // resumed (i.e., this worker wants to reenter the runtime).
2087         }
2088     }
2089 
2090     // Finish the scheduling loop.
2091     worker_scheduler_terminate_function(w);
2092 }
2093 
2094 
2095 /*************************************************************
2096   Forward declarations for reduction protocol.
2097 *************************************************************/
2098 
2099 static __cilkrts_worker*
2100 execute_reductions_for_sync(__cilkrts_worker *w,
2101                             full_frame *ff,
2102                             __cilkrts_stack_frame *sf_at_sync);
2103 
2104 static __cilkrts_worker*
2105 execute_reductions_for_spawn_return(__cilkrts_worker *w,
2106                                     full_frame *ff,
2107                                     __cilkrts_stack_frame *returning_sf);
2108 
2109 
2110 
2111 /*************************************************************
2112   Scheduler functions that are callable by client code
2113 *************************************************************/
2114 static full_frame *disown(__cilkrts_worker *w,
2115                           full_frame *ff,
2116                           __cilkrts_stack_frame *sf,
2117                           const char *why)
2118 {
2119     CILK_ASSERT(ff);
2120     make_unrunnable(w, ff, sf, sf != 0, why);
2121     w->l->frame_ff = 0;
2122     return ff->parent;
2123 }
2124 
2125 /**
2126  * Called when ff is returning from a spawn, and we need to execute a
2127  * reduction.
2128  *
2129  * @param w             The currently executing worker.
2130  * @param ff            The full frame for w.
2131  * @param returning_sf  The stack frame for the spawn helper that is returning.
2132  *
2133  * Normally, by the time we gain control in the runtime, the worker
2134  * has already popped off the __cilkrts_stack_frame "returning_sf"
2135  * from its call chain.
2136  *
2137  * When we have only serial reductions, w->current_stack_frame is not
2138  * needed any more, because w is about to enter the runtime scheduling
2139  * loop anyway.  Similarly, the frame "ff" is slated to be destroyed
2140  * after the runtime finishes the return from spawn and splices ff out
2141  * of the tree of full frames.
2142  *
2143  * To execute a parallel reduction, however, we still want
2144  * w->current_stack_frame == returning_sf, and we are going to use the
2145  * frame ff for a little bit longer.
2146  *
2147  * This method:
2148  *
2149  *   1. Puts returning_sf back as w's current stack frame.
2150  *   2. Makes "ff" runnable again on w.
2151  */
2152 static inline
2153 void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
2154                                               full_frame *ff,
2155                                               __cilkrts_stack_frame *returning_sf) {
2156 #if REDPAR_DEBUG >= 2
2157     CILK_ASSERT(returning_sf);
2158     CILK_ASSERT(returning_sf->worker == w);
2159 #endif
2160     // Change w's current stack frame back to "returning_sf".
2161     //
2162     // Intuitively, w->current_stack_frame should be
2163     // returning_sf->call_parent at this point.
2164     //
2165     // We can not assert this, however, because the pop of
2166     // returning_sf from the call chain has already cleared
2167     // returning_sf->call_parent.  We don't want to restore the call
2168     // parent of returning_sf, because its parent has been stolen, and
2169     // the runtime assumes that steals break this link.
2170 
2171     // We cannot assert call_parent is NULL either, since that's not true for
2172     // Win64 exception handling
2173 //    CILK_ASSERT(returning_sf->call_parent == NULL);
2174     w->current_stack_frame = returning_sf;
2175 
2176     // Make the full frame "ff" runnable again, in preparation for
2177     // executing the reduction.
2178     make_runnable(w, ff);
2179 }
2180 
2181 
2182 NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
2183                           __cilkrts_stack_frame *sf_at_sync)
2184 {
2185     full_frame *ff;
2186 
2187     // Claim: This read of w->l->frame_ff can occur without
2188     // holding the worker lock because when w has reached a sync
2189     // and entered the runtime (because it stalls), w's deque is empty
2190     // and no one else can steal and change w->l->frame_ff.
2191 
2192     ff = w->l->frame_ff;
2193 #ifdef _WIN32
2194     __cilkrts_save_exception_state(w, ff);
2195 #else
2196     // Move any pending exceptions into the full frame
2197     CILK_ASSERT(NULL == ff->pending_exception);
2198     ff->pending_exception = w->l->pending_exception;
2199     w->l->pending_exception = NULL;
2200 #endif
2201 
2202     w = execute_reductions_for_sync(w, ff, sf_at_sync);
2203 
2204 #if FIBER_DEBUG >= 3
2205     fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtim[c_sync] with ff=%p\n",
2206             cilkos_get_current_thread_id(), w->self, ff);
2207 #endif
2208 
2209     longjmp_into_runtime(w, do_sync, sf_at_sync);
2210 }
2211 
2212 static void do_sync(__cilkrts_worker *w, full_frame *ff,
2213                     __cilkrts_stack_frame *sf)
2214 {
2215     //int abandoned = 1;
2216     enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2217 
2218     START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
2219         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2220 
2221             CILK_ASSERT(ff);
2222             BEGIN_WITH_FRAME_LOCK(w, ff) {
2223                 CILK_ASSERT(sf->call_parent == 0);
2224                 CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);
2225 
2226                 // Before switching into the scheduling fiber, we should have
2227                 // already taken care of deallocating the current
2228                 // fiber.
2229                 CILK_ASSERT(NULL == ff->fiber_self);
2230 
2231                 // Update the frame's pedigree information if this is an ABI 1
2232                 // or later frame
2233                 if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
2234                 {
2235                     sf->parent_pedigree.rank = w->pedigree.rank;
2236                     sf->parent_pedigree.parent = w->pedigree.parent;
2237 
2238                     // Note that the pedigree rank needs to be updated
2239                     // when setup_for_execution_pedigree runs
2240                     sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
2241                 }
2242 
2243                 /* the decjoin() occurs in provably_good_steal() */
2244                 steal_result = provably_good_steal(w, ff);
2245 
2246             } END_WITH_FRAME_LOCK(w, ff);
2247             // set w->l->frame_ff = NULL after checking abandoned
2248             if (WAIT_FOR_CONTINUE != steal_result) {
2249                 w->l->frame_ff = NULL;
2250             }
2251         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2252     } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);
2253 
2254     // Now, if we are in a replay situation and provably_good_steal() returned
2255     // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
2256     // provably_good_steal(), and release locks until we get a value other
2257     // than WAIT_FOR_CONTINUE from the function.
2258 #ifdef CILK_RECORD_REPLAY
2259     // We don't have to explicitly check for REPLAY_LOG below because
2260     // steal_result can only be set to WAIT_FOR_CONTINUE during replay
2261     while(WAIT_FOR_CONTINUE == steal_result)
2262     {
2263         __cilkrts_sleep();
2264         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2265         {
2266             ff = w->l->frame_ff;
2267             BEGIN_WITH_FRAME_LOCK(w, ff)
2268             {
2269                 steal_result = provably_good_steal(w, ff);
2270             } END_WITH_FRAME_LOCK(w, ff);
2271             if (WAIT_FOR_CONTINUE != steal_result)
2272                 w->l->frame_ff = NULL;
2273         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2274     }
2275 #endif  // CILK_RECORD_REPLAY
2276 
2277 #ifdef ENABLE_NOTIFY_ZC_INTRINSIC
2278     // If we can't make any further progress on this thread, tell Inspector
2279     // that we're abandoning the work and will go find something else to do.
2280     if (ABANDON_EXECUTION == steal_result)
2281     {
2282         NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
2283     }
2284 #endif // defined ENABLE_NOTIFY_ZC_INTRINSIC
2285 
2286     return; /* back to scheduler loop */
2287 }
2288 
2289 /* worker W completely promotes its own deque, simulating the case
2290    where the whole deque is stolen.  We use this mechanism to force
2291    the allocation of new storage for reducers for race-detection
2292    purposes. */
2293 void __cilkrts_promote_own_deque(__cilkrts_worker *w)
2294 {
2295     // Remember the fiber we start this method on.
2296     CILK_ASSERT(w->l->frame_ff);
2297     cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;
2298 
2299     BEGIN_WITH_WORKER_LOCK(w) {
2300         while (dekker_protocol(w)) {
2301             /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
2302                and make_child() that this frame should be treated as a spawn
2303                parent, even though we have not assigned it a stack. */
2304             detach_for_steal(w, w, PLACEHOLDER_FIBER);
2305         }
2306     } END_WITH_WORKER_LOCK(w);
2307 
2308 
2309     // TBD: The management of full frames and fibers is a bit
2310     // sketchy here.  We are promoting stack frames into full frames,
2311     // and pretending they are stolen away, but no other worker is
2312     // actually working on them.  Some runtime invariants
2313     // may be broken here.
2314     //
2315     // Technically, if we are simulating a steal from w,
2316     // then w should get a new full frame but
2317     // keep the same fiber.  A real thief would take the
2318     // loot frame away, get a new fiber, and start executing the
2319     // loot frame.
2320     //
2321     // What should a fake thief do?  Where does the frame go?
2322 
2323     // In any case, we should finish the promotion process on
2324     // the same fiber we started with.
2325     CILK_ASSERT(w->l->frame_ff);
2326     CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
2327 }
2328 
2329 
2330 
2331 /* the client code calls this function after a spawn when the dekker
2332    protocol fails.  The function may either return or longjmp
2333    into the rts
2334 
2335    This function takes in a "returning_sf" argument which corresponds
2336    to the __cilkrts_stack_frame that we are finishing (i.e., the
2337    argument to __cilkrts_leave_frame).
2338    */
2339 void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
2340                                      __cilkrts_stack_frame *returning_sf)
2341 {
2342     full_frame *ff;
2343     int stolen_p;
2344     __cilkrts_stack_frame *saved_sf = NULL;
2345 
2346     START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2347 
2348     BEGIN_WITH_WORKER_LOCK(w) {
2349         ff = w->l->frame_ff;
2350         CILK_ASSERT(ff);
2351         /* This code is called only upon a normal return and never
2352            upon an exceptional return.  Assert that this is the
2353            case. */
2354         CILK_ASSERT(!w->l->pending_exception);
2355 
2356         reset_THE_exception(w);
2357         stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
2358                                                   speculatively
2359                                                   decremented by the
2360                                                   compiled code */
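        // Equivalently: after the compiled code's speculative --tail, the
        // frame was stolen iff head >= tail + 1, which is exactly the
        // !(head < tail + 1) computed above.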
2361 
2362         if (stolen_p) {
2363             /* XXX This will be charged to THE for accounting purposes */
2364             __cilkrts_save_exception_state(w, ff);
2365 
2366             // Save the value of the current stack frame.
2367             saved_sf = w->current_stack_frame;
2368 
2369             // Reverse the decrement from undo_detach.
2370             // This update effectively resets the deque to be
2371             // empty (i.e., changes w->tail back to equal w->head).
2372             // We need to reset the deque to execute parallel
2373             // reductions.  When we have only serial reductions, it
2374             // does not matter, since serial reductions do not
2375             // change the deque.
2376             w->tail++;
2377 #if REDPAR_DEBUG > 1
2378             // ASSERT our deque is empty.
2379             CILK_ASSERT(w->head == w->tail);
2380 #endif
2381         }
2382     } END_WITH_WORKER_LOCK(w);
2383 
2384     STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2385 
2386     if (stolen_p)
2387     {
2388         w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2389 
2390         // "Mr. Policeman?  My parent always told me that if I was in trouble
2391         // I should ask a nice policeman for help.  I can't find my parent
2392         // anywhere..."
2393         //
2394         // Write a record to the replay log for an attempt to return to a stolen parent
2395         replay_record_orphaned(w);
2396 
2397         // Update the pedigree only after we've finished the
2398         // reductions.
2399         update_pedigree_on_leave_frame(w, returning_sf);
2400 
2401         // Notify Inspector that the parent has been stolen and we're
2402         // going to abandon this work and go do something else.  This
2403         // will match the cilk_leave_begin in the compiled code
2404         NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);
2405 
2406         DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
2407         longjmp_into_runtime(w, do_return_from_spawn, 0);
2408         DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
2409     }
2410     else
2411     {
2412         NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
2413         return;
2414     }
2415 }
2416 
2417 /* Return an exception to a stolen parent. */
2418 NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
2419                                         __cilkrts_stack_frame *returning_sf)
2420 {
2421     full_frame *ff = w->l->frame_ff;
2422     // This is almost the same as THE_exception_check, except
2423     // that the detach didn't happen, so we don't need to undo the
2424     // tail update.
2425     CILK_ASSERT(w->head == w->tail);
2426     w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2427 
2428     longjmp_into_runtime(w, do_return_from_spawn, 0);
2429     CILK_ASSERT(0);
2430 }
2431 
2432 static void do_return_from_spawn(__cilkrts_worker *w,
2433                                  full_frame *ff,
2434                                  __cilkrts_stack_frame *sf)
2435 {
2436     full_frame *parent_ff;
2437     enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2438 
2439     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2440         CILK_ASSERT(ff);
2441         CILK_ASSERT(!ff->is_call_child);
2442         CILK_ASSERT(sf == NULL);
2443         parent_ff = ff->parent;
2444 
2445         BEGIN_WITH_FRAME_LOCK(w, ff) {
2446             decjoin(ff);
2447         } END_WITH_FRAME_LOCK(w, ff);
2448 
2449         BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2450             if (parent_ff->simulated_stolen)
2451                 unconditional_steal(w, parent_ff);
2452             else
2453                 steal_result = provably_good_steal(w, parent_ff);
2454         } END_WITH_FRAME_LOCK(w, parent_ff);
2455 
2456     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2457 
2458     // Loop here in replay mode
2459 #ifdef CILK_RECORD_REPLAY
2460     // We don't have to explicitly check for REPLAY_LOG below because
2461     // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
2462     // We also don't have to worry about the simulated_stolen flag
2463     // because steal_result can only be set to WAIT_FOR_CONTINUE by
2464     // provably_good_steal().
2465     while(WAIT_FOR_CONTINUE == steal_result)
2466     {
2467         __cilkrts_sleep();
2468         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2469         {
2470             BEGIN_WITH_FRAME_LOCK(w, parent_ff)
2471             {
2472                 steal_result = provably_good_steal(w, parent_ff);
2473             } END_WITH_FRAME_LOCK(w, parent_ff);
2474         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2475     }
2476 #endif  // CILK_RECORD_REPLAY
2477 
2478     // Cleanup the child frame.
2479     __cilkrts_destroy_full_frame(w, ff);
2480     return;
2481 }
2482 
2483 #ifdef _WIN32
2484 /* migrate an exception across fibers.  Call this function when an exception has
2485  * been thrown and has to traverse across a steal.  The exception has already
2486  * been wrapped up, so all that remains is to longjmp() into the continuation,
2487  * sync, and re-raise it.
2488  */
2489 void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {
2490 
2491     __cilkrts_worker *w = sf->worker;
2492     full_frame *ff;
2493 
2494     BEGIN_WITH_WORKER_LOCK(w) {
2495         ff = w->l->frame_ff;
2496         reset_THE_exception(w);
2497         /* there is no need to check for a steal because we wouldn't be here if
2498            there weren't a steal. */
2499         __cilkrts_save_exception_state(w, ff);
2500 
2501         CILK_ASSERT(w->head == w->tail);
2502     } END_WITH_WORKER_LOCK(w);
2503 
2504     {
2505         // TBD(jsukha): This function emulates
2506         // the "do_return_from_spawn" path.
2507         w = execute_reductions_for_spawn_return(w, ff, sf);
2508     }
2509 
2510     longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
2511     CILK_ASSERT(! "Shouldn't be here...");
2512 }
2513 #endif
2514 
2515 
2516 /* Pop a call stack from TAIL.  Return the call stack, or NULL if the
2517    queue is empty */
2518 __cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
2519 {
2520     __cilkrts_stack_frame *sf;
2521     BEGIN_WITH_WORKER_LOCK(w) {
2522         __cilkrts_stack_frame *volatile *tail = w->tail;
2523         if (w->head < tail) {
2524             --tail;
2525             sf = *tail;
2526             w->tail = tail;
2527         } else {
2528             sf = 0;
2529         }
2530     } END_WITH_WORKER_LOCK(w);
2531     return sf;
2532 }
2533 
2534 #ifdef CILK_RECORD_REPLAY
2535 __cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
2536 {
2537     __cilkrts_stack_frame *sf;
2538     BEGIN_WITH_WORKER_LOCK(w) {
2539         if (w->head < w->tail) {
2540             sf = *(w->tail-1);
2541         } else {
2542             sf = 0;
2543         }
2544     } END_WITH_WORKER_LOCK(w);
2545     return sf;
2546 }
2547 #endif
2548 
2549 
2550 /* Return from a call, not a spawn. */
2551 void __cilkrts_return(__cilkrts_worker *w)
2552 {
2553     full_frame *ff, *parent_ff;
2554     START_INTERVAL(w, INTERVAL_RETURNING);
2555 
2556     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2557         ff = w->l->frame_ff;
2558         CILK_ASSERT(ff);
2559         CILK_ASSERT(ff->join_counter == 1);
2560         /* This path is not used to return from spawn. */
2561         CILK_ASSERT(ff->is_call_child);
2562 
2563         BEGIN_WITH_FRAME_LOCK(w, ff) {
2564             // After this call, w->l->frame_ff != ff.
2565             // Technically, w will "own" ff until ff is freed,
2566             // however, because ff is a dying leaf full frame.
2567             parent_ff = disown(w, ff, 0, "return");
2568             decjoin(ff);
2569 
2570 #ifdef _WIN32
2571             __cilkrts_save_exception_state(w, ff);
2572 #else
2573             // Move the pending exceptions into the full frame
2574             // This should always be NULL if this isn't a
2575             // return with an exception
2576             CILK_ASSERT(NULL == ff->pending_exception);
2577             ff->pending_exception = w->l->pending_exception;
2578             w->l->pending_exception = NULL;
2579 #endif  // _WIN32
2580 
2581         } END_WITH_FRAME_LOCK(w, ff);
2582 
2583         __cilkrts_fence(); /* redundant */
2584 
2585         CILK_ASSERT(parent_ff);
2586 
2587         BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2588             finalize_child_for_call(w, parent_ff, ff);
2589         } END_WITH_FRAME_LOCK(w, parent_ff);
2590 
2591         ff = pop_next_frame(w);
2592         /* ff will be non-null except when the parent frame is owned
2593            by another worker.
2594            CILK_ASSERT(ff)
2595         */
2596         CILK_ASSERT(!w->l->frame_ff);
2597         if (ff) {
2598             BEGIN_WITH_FRAME_LOCK(w, ff) {
2599                 __cilkrts_stack_frame *sf = ff->call_stack;
2600                 CILK_ASSERT(sf && !sf->call_parent);
2601                 setup_for_execution(w, ff, 1);
2602             } END_WITH_FRAME_LOCK(w, ff);
2603         }
2604     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2605 
2606     STOP_INTERVAL(w, INTERVAL_RETURNING);
2607 }
2608 
2609 static void __cilkrts_unbind_thread()
2610 {
2611     int stop_cilkscreen = 0;
2612     global_state_t *g;
2613 
2614     // Take out the global OS mutex to protect accesses to the table of workers
2615     global_os_mutex_lock();
2616 
2617     if (cilkg_is_published()) {
2618         __cilkrts_worker *w = __cilkrts_get_tls_worker();
2619         if (w) {
2620             g = w->g;
2621 
2622             // If there's only 1 worker, the counts will be stopped in
2623             // __cilkrts_scheduler
2624             if (g->P > 1)
2625             {
2626                 STOP_INTERVAL(w, INTERVAL_WORKING);
2627                 STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2628             }
2629 
2630             __cilkrts_set_tls_worker(0);
2631 
2632             if (w->self == -1) {
2633                 // This worker is an overflow worker.  I.e., it was created on-
2634                 // demand when the global pool ran out of workers.
2635                 destroy_worker(w);
2636                 __cilkrts_free(w);
2637             } else {
2638                 // This is a normal user worker and needs to be counted by the
2639                 // global state for the purposes of throttling system workers.
2640                 w->l->type = WORKER_FREE;
2641                 __cilkrts_leave_cilk(g);
2642             }
2643 
2644             stop_cilkscreen = (0 == g->Q);
2645         }
2646     }
2647     global_os_mutex_unlock();
2648 
2649     /* Turn off Cilkscreen.  This needs to be done when we are NOT holding the
2650      * os mutex. */
2651     if (stop_cilkscreen)
2652         __cilkrts_cilkscreen_disable_instrumentation();
2653 }
2654 
2655 /* special return from the initial frame */
2656 
2657 void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
2658 {
2659     struct cilkred_map *rm;
2660 
2661     /* This is only called on a user thread worker. */
2662     CILK_ASSERT(w->l->type == WORKER_USER);
2663 
2664     #if REDPAR_DEBUG >= 3
2665     fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
2666             w->self, w->l->frame_ff);
2667     #endif
2668 
2669     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2670         full_frame *ff = w->l->frame_ff;
2671         CILK_ASSERT(ff);
2672         CILK_ASSERT(ff->join_counter == 1);
2673         w->l->frame_ff = 0;
2674 
2675         CILK_ASSERT(ff->fiber_self);
2676         // Save any TBB interop data for the next time this thread enters Cilk
2677         cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);
2678 
2679         // Deallocate cilk_fiber that mapped to the user stack.  The stack
2680         // itself does not get deallocated (of course) but our data
2681         // structure becomes divorced from it.
2682 
2683 #if FIBER_DEBUG >= 1
2684         fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self  = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
2685                 cilkos_get_current_thread_id(),
2686                 w->self,
2687                 ff->fiber_self,
2688                 w->l->scheduling_fiber,
2689                 w->l->type);
2690 #endif
2691         // The fiber in ff is a user-code fiber.  The fiber in
2692         // w->l->scheduling_fiber is a scheduling fiber.  These fibers should
2693         // never be equal.  When a user worker returns (and will unbind), we
2694         // should destroy only the fiber in ff.  The scheduling fiber will be
2695         // re-used.
2696 
2697         CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);
2698 
2699         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
2700             // This fiber might not be deallocated here if there
2701             // is a pending exception on Windows that refers
2702             // to this fiber.
2703             //
2704             // First "suspend" the fiber, and then try to delete it.
2705             cilk_fiber_deallocate_from_thread(ff->fiber_self);
2706         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
2707         ff->fiber_self = NULL;
2708 
2709         /* Save reducer map into global_state object */
2710         rm = w->reducer_map;
2711         w->reducer_map = NULL;
2712 
2713 #if REDPAR_DEBUG >= 3
2714         fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
2715                 w->self,
2716                 rm,
2717                 ff);
2718 #endif
2719         __cilkrts_destroy_full_frame(w, ff);
2720 
2721 
2722         /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
2723     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2724 
2725 
2726     save_pedigree_leaf_from_user_worker(w);
2727 
2728     // Workers can have NULL reducer maps now.
2729     if (rm) {
2730         __cilkrts_destroy_reducer_map(w, rm);
2731     }
2732 
2733 
2734 #if FIBER_DEBUG >= 1
2735     __cilkrts_worker* tmp = w;
2736     int tmp_id = w->self;
2737     fprintf(stderr, "w=%d: We are about unbind thread (w= %p)\n",
2738             w->self,
2739             w);
2740 #endif
2741 
2742     w = NULL;
2743 
2744     __cilkrts_unbind_thread();
2745 
2746 #if FIBER_DEBUG >= 1
2747 
2748     fprintf(stderr, "w=%p, %d: Finished unbind\n",
2749             tmp, tmp_id);
2750 #endif
2751 
2752     /* Other workers will stop trying to steal if this was the last worker. */
2753 
2754     return;
2755 }


/*
 * __cilkrts_restore_stealing
 *
 * Restore the protected_tail to a previous state, possibly allowing frames
 * to be stolen.  The dekker_protocol has been extended to steal only if
 * head+1 is < protected_tail.
 */

void __cilkrts_restore_stealing(
    __cilkrts_worker *w,
    __cilkrts_stack_frame *volatile *saved_protected_tail)
{
    /* On most x86 this pair of operations would be slightly faster
       as an atomic exchange due to the implicit memory barrier in
       an atomic instruction. */
    w->protected_tail = saved_protected_tail;
    __cilkrts_fence();
}
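
/* A minimal sketch only (not the runtime's actual code): on compilers that
 * provide the GCC __atomic builtins, the plain store plus __cilkrts_fence()
 * above could be fused into a single atomic exchange, whose implicit full
 * barrier serves the same purpose on x86.  The helper name below is
 * hypothetical. */
#if 0
static void restore_stealing_with_xchg(
    __cilkrts_worker *w,
    __cilkrts_stack_frame *volatile *saved_protected_tail)
{
    /* Atomically publish the restored protected_tail with a full barrier. */
    (void) __atomic_exchange_n(&w->protected_tail, saved_protected_tail,
                               __ATOMIC_SEQ_CST);
}
#endif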

/*
 * __cilkrts_disallow_stealing
 *
 * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any
 * frames from being stolen.  If NEW_PROTECTED_TAIL is NULL, prevent
 * stealing from the whole queue.  The dekker_protocol has been
 * extended to steal only if head+1 is also < protected_tail.
 */

__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing(
    __cilkrts_worker *w,
    __cilkrts_stack_frame *volatile *new_protected_tail)
{
    __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail;

    if (!new_protected_tail)
        new_protected_tail = w->l->ltq;

    if (w->protected_tail > new_protected_tail) {
        w->protected_tail = new_protected_tail;
        /* Issue a store-store barrier.  The update to protected_tail
           here must precede the update to tail in the next spawn.
           On x86 this is probably not needed. */
#if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ || __MIC2__)
        _mm_sfence();
#else
        __cilkrts_fence();
#endif
    }

    return saved_protected_tail;
}
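
/* A minimal illustrative sketch (not code from this runtime): how a thief
 * might apply the extended Dekker/THE check described above before
 * committing to a steal.  The helper name can_steal() is hypothetical; the
 * field names follow __cilkrts_worker. */
#if 0
static int can_steal(__cilkrts_worker *victim)
{
    /* The victim's deque runs from head to tail.  protected_tail caps how
       far a thief may reach: a steal is permitted only while
       head + 1 < protected_tail, so lowering protected_tail to the base of
       the deque (see __cilkrts_disallow_stealing) blocks all steals. */
    return (victim->head + 1 < victim->protected_tail) &&
           (victim->head < victim->tail);
}
#endif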

/*************************************************************
  Initialization and startup
*************************************************************/

__cilkrts_worker *make_worker(global_state_t *g,
                              int self, __cilkrts_worker *w)
{
    w->self = self;
    w->g = g;

    w->pedigree.rank = 0;    // Initial rank is 0
    w->pedigree.parent = NULL;

    w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l));

    __cilkrts_frame_malloc_per_worker_init(w);

    w->reducer_map = NULL;
    w->current_stack_frame = NULL;
    w->reserved = NULL;

    w->l->worker_magic_0 = WORKER_MAGIC_0;
    w->l->team = NULL;
    w->l->type = WORKER_FREE;

    __cilkrts_mutex_init(&w->l->lock);
    __cilkrts_mutex_init(&w->l->steal_lock);
    w->l->do_not_steal = 0;
    w->l->frame_ff = 0;
    w->l->next_frame_ff = 0;
    w->l->last_full_frame = NULL;

    w->l->ltq = (__cilkrts_stack_frame **)
        __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq));
    w->ltq_limit = w->l->ltq + g->ltqsize;
    w->head = w->tail = w->l->ltq;

    cilk_fiber_pool_init(&w->l->fiber_pool,
                         &g->fiber_pool,
                         g->stack_size,
                         g->fiber_pool_size,
                         0,   // alloc_max is 0.  We don't allocate from the heap directly without checking the parent pool.
                         0);
#if FIBER_DEBUG >= 2
    fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n",
            cilkos_get_current_thread_id(),
            w->self, w,
            &w->l->fiber_pool);
#endif
    w->l->scheduling_fiber = NULL;
    w->l->original_pedigree_leaf = NULL;
    w->l->rand_seed = 0; /* the scheduler will overwrite this field */

    w->l->post_suspend = 0;
    w->l->suspended_stack = 0;
    w->l->fiber_to_free = NULL;
    w->l->pending_exception = NULL;

#if CILK_PROFILE
    w->l->stats = __cilkrts_malloc(sizeof(statistics));
    __cilkrts_init_stats(w->l->stats);
#else
    w->l->stats = NULL;
#endif
    w->l->steal_failure_count = 0;

    w->l->work_stolen = 0;

    // Initialize record/replay assuming we're doing neither
    w->l->record_replay_fptr = NULL;
    w->l->replay_list_root = NULL;
    w->l->replay_list_entry = NULL;
    w->l->signal_node = NULL;
    // Nothing's been stolen yet
    w->l->worker_magic_1 = WORKER_MAGIC_1;

    /*w->parallelism_disabled = 0;*/

    // Allow stealing of all frames: restore protected_tail to the end of
    // the deque.
    __cilkrts_restore_stealing(w, w->ltq_limit);

    __cilkrts_init_worker_sysdep(w);

    reset_THE_exception(w);

    return w;
}

void destroy_worker(__cilkrts_worker *w)
{
    CILK_ASSERT(NULL == w->l->pending_exception);

    // Deallocate the scheduling fiber
    if (NULL != w->l->scheduling_fiber)
    {
        // The scheduling fiber is the main fiber for system workers and must
        // be deallocated by the thread that created it.  Thus, we can
        // deallocate only free workers' (formerly user workers) scheduling
        // fibers here.
        CILK_ASSERT(WORKER_FREE == w->l->type);

#if FIBER_DEBUG >= 1
        fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n",
                cilkos_get_current_thread_id(),
                w,
                w->self,
                w->l->scheduling_fiber);
#endif
        int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL);
        // Scheduling fiber should never have extra references because of exceptions.
        CILK_ASSERT(0 == ref_count);
        w->l->scheduling_fiber = NULL;
    }

#if CILK_PROFILE
    if (w->l->stats) {
        __cilkrts_free(w->l->stats);
    }
#else
    CILK_ASSERT(NULL == w->l->stats);
#endif

    /* Free any cached fibers. */
    cilk_fiber_pool_destroy(&w->l->fiber_pool);

    __cilkrts_destroy_worker_sysdep(w);

    if (w->l->signal_node) {
        CILK_ASSERT(WORKER_SYSTEM == w->l->type);
        signal_node_destroy(w->l->signal_node);
    }

    __cilkrts_free(w->l->ltq);
    __cilkrts_mutex_destroy(0, &w->l->lock);
    __cilkrts_mutex_destroy(0, &w->l->steal_lock);
    __cilkrts_frame_malloc_per_worker_cleanup(w);

    __cilkrts_free(w->l);

    // The caller is responsible for freeing the worker memory
}

/*
 * Make a worker into a system worker.
 */
static void make_worker_system(__cilkrts_worker *w) {
    CILK_ASSERT(WORKER_FREE == w->l->type);
    w->l->type = WORKER_SYSTEM;
    w->l->signal_node = signal_node_create();
}

void __cilkrts_deinit_internal(global_state_t *g)
{
    int i;
    __cilkrts_worker *w;

    // If there's no global state then we're done
    if (NULL == g)
        return;

#ifdef CILK_PROFILE
    __cilkrts_dump_stats_to_stderr(g);
#endif

    w = g->workers[0];
    if (w->l->frame_ff) {
        __cilkrts_destroy_full_frame(w, w->l->frame_ff);
        w->l->frame_ff = 0;
    }

    // Release any resources used for record/replay
    replay_term(g);

    // Destroy any system dependent global state
    __cilkrts_destroy_global_sysdep(g);

    for (i = 0; i < g->total_workers; ++i)
        destroy_worker(g->workers[i]);

    // Free memory for all worker blocks which were allocated contiguously
    __cilkrts_free(g->workers[0]);

    __cilkrts_free(g->workers);

    cilk_fiber_pool_destroy(&g->fiber_pool);
    __cilkrts_frame_malloc_global_cleanup(g);

    cilkg_deinit_global_state();
}

/*
 * Wake the runtime by notifying the system workers that they can steal.  The
 * first user worker into the runtime should call this.
 */
static void wake_runtime(global_state_t *g)
{
    __cilkrts_worker *root;
    if (g->P > 1) {
        // Send a message to the root node.  The message will propagate.
        root = g->workers[0];
        CILK_ASSERT(root->l->signal_node);
        signal_node_msg(root->l->signal_node, 1);
    }
}

/*
 * Put the runtime to sleep.  The last user worker out of the runtime should
 * call this.  Like Dad always said, turn out the lights when nobody's in the
 * room.
 */
static void sleep_runtime(global_state_t *g)
{
    __cilkrts_worker *root;
    if (g->P > 1) {
        // Send a message to the root node.  The message will propagate.
        root = g->workers[0];
        CILK_ASSERT(root->l->signal_node);
        signal_node_msg(root->l->signal_node, 0);
    }
}

/* Called when a user thread joins Cilk.
   Global lock must be held. */
void __cilkrts_enter_cilk(global_state_t *g)
{
    if (g->Q++ == 0) {
        // If this is the first user thread to enter Cilk, wake
        // up all the workers.
        wake_runtime(g);
    }
}

/* Called when a user thread leaves Cilk.
   Global lock must be held. */
void __cilkrts_leave_cilk(global_state_t *g)
{
    if (--g->Q == 0) {
        // Put the runtime to sleep.
        sleep_runtime(g);
    }
}

/*
 * worker_runnable
 *
 * Return SCHEDULE_RUN if the worker should keep trying to steal,
 * SCHEDULE_WAIT if it should block waiting for work, or SCHEDULE_EXIT
 * if the runtime is shutting down.
 */

NOINLINE
static enum schedule_t worker_runnable(__cilkrts_worker *w)
{
    global_state_t *g = w->g;

    /* If this worker has something to do, do it.
       Otherwise the work would be lost. */
    if (w->l->next_frame_ff)
        return SCHEDULE_RUN;

    // If Cilk has explicitly (by the user) been told to exit (i.e., by
    // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return
    // SCHEDULE_EXIT.
    if (g->work_done)
        return SCHEDULE_EXIT;

    if (0 == w->self) {
        // This worker is the root node and is the only one that may query the
        // global state to see if there are still any user workers in Cilk.
        if (w->l->steal_failure_count > g->max_steal_failures) {
            if (signal_node_should_wait(w->l->signal_node)) {
                return SCHEDULE_WAIT;
            } else {
                // Reset the steal_failure_count since we have verified that
                // user workers are still in Cilk.
                w->l->steal_failure_count = 0;
            }
        }
    } else if (WORKER_SYSTEM == w->l->type &&
               signal_node_should_wait(w->l->signal_node)) {
        // This worker has been notified by its parent that it should stop
        // trying to steal.
        return SCHEDULE_WAIT;
    }

    return SCHEDULE_RUN;
}


// Initialize the worker structs, but don't start the workers themselves.
static void init_workers(global_state_t *g)
{
    int total_workers = g->total_workers;
    int i;
    struct CILK_ALIGNAS(256) buffered_worker {
        __cilkrts_worker w;
        char buf[64];
    } *workers_memory;

    /* not needed if only one worker */
    cilk_fiber_pool_init(&g->fiber_pool,
                         NULL,
                         g->stack_size,
                         g->global_fiber_pool_size,           // buffer_size
                         g->max_stacks,                       // maximum # to allocate
                         1);

    cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
                                    (g->max_stacks ? g->max_stacks : INT_MAX));

    g->workers = (__cilkrts_worker **)
        __cilkrts_malloc(total_workers * sizeof(*g->workers));

    // Allocate 1 block of memory for workers to make life easier for tools
    // like Inspector which run multithreaded and need to know the memory
    // range for all the workers that will be accessed in a user's program
    workers_memory = (struct buffered_worker*)
        __cilkrts_malloc(sizeof(*workers_memory) * total_workers);

    // Notify any tools that care (Cilkscreen and Inspector) that they should
    // ignore memory allocated for the workers
    __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
                                      &workers_memory[total_workers]);

    // Initialize worker structs, including unused worker slots.
    for (i = 0; i < total_workers; ++i) {
        g->workers[i] = make_worker(g, i, &workers_memory[i].w);
    }

    // Set the workers in the first P - 1 slots to be system workers.
    // Remaining worker structs already have type == 0.
    for (i = 0; i < g->system_workers; ++i) {
        make_worker_system(g->workers[i]);
    }
}

void __cilkrts_init_internal(int start)
{
    global_state_t *g = NULL;

    if (cilkg_is_published()) {
        g = cilkg_init_global_state();
    }
    else {

        // We think the state has not been published yet.
        // Grab the lock and try to initialize/publish.
        global_os_mutex_lock();

        if (cilkg_is_published()) {
            // Some other thread must have snuck in and published.
            g = cilkg_init_global_state();
        }
        else {
            // Initialize and retrieve global state
            g = cilkg_init_global_state();

            // Set the scheduler pointer
            g->scheduler = worker_scheduler_function;

            // If we're running under a sequential P-Tool (Cilkscreen or
            // Cilkview) then there's only one worker and we need to tell
            // the tool about the extent of the stack
            if (g->under_ptool)
                __cilkrts_establish_c_stack();
            init_workers(g);

            // Initialize per-worker record/replay logging
            replay_init_workers(g);

            // Initialize any system dependent global state
            __cilkrts_init_global_sysdep(g);


            cilkg_publish_global_state(g);
        }

        global_os_mutex_unlock();
    }

    CILK_ASSERT(g);

    if (start && !g->workers_running)
    {
        // Acquire the global OS mutex while we're starting the workers
        global_os_mutex_lock();
        if (!g->workers_running)
            // Start P - 1 system workers since P includes the first user
            // worker.
            __cilkrts_start_workers(g, g->P - 1);
        global_os_mutex_unlock();
    }
}


/************************************************************************
  Methods for reducer protocol.

  Reductions occur in two places:
    A. A full frame "ff" is returning from a spawn with a stolen parent.
    B. A full frame "ff" is stalling at a sync.

  To support parallel reductions, reduction functions need to be
  executed while control is on a user stack, before jumping into the
  runtime.  These reductions cannot occur while holding a worker or
  frame lock.

  Before a worker w executes a reduction in either Case A or B, w's
  deque is empty.

  Since parallel reductions push work onto the deque, we must do extra
  work to set up runtime data structures properly before reductions
  begin to allow stealing.  (Normally, when we have only serial
  reductions, once a worker w starts a reduction, its deque remains
  empty until w either steals another frame or resumes a suspended
  frame.  Thus, we don't care about the state of the deque, since w
  will reset its deque when setting up execution of a frame.)

  To allow for parallel reductions, we coerce the runtime data
  structures so that, from their perspective, it looks as though we
  have spliced in an "execute_reductions()" function.  Consider the
  two cases for reductions:

    Case A: Return from a spawn with a stolen parent.
      Consider a spawned function g that is returning on a worker w.
      Assume:
          -   g was spawned from a parent function f,
          -   ff is the full frame for g's spawn helper, and
          -   sf is the __cilkrts_stack_frame for g's spawn helper.

      We are conceptually splicing "execute_reductions()" so that it
      occurs immediately before the spawn helper of g returns to f.

      We do so by creating two different world views --- one for the
      runtime data structures, and one for the actual control flow.

        - Before reductions begin, the runtime data structures should
          look as though the spawn helper of g is calling
          "execute_reductions()", in terms of both the user stack and
          the worker deque.  More precisely, w should satisfy the
          following properties:

              (a) w has ff as its full frame,
              (b) w has sf as its __cilkrts_stack_frame, and
              (c) w has an empty deque.

          If the runtime satisfies these properties, then if w
          encounters a spawn in a parallel reduction, it can push onto
          a valid deque.  Also, when w is stolen from, the thief will
          build the correct tree of full frames.

        - In actual control flow, however, once the
          "execute_reductions()" function returns, it is actually
          returning to runtime code instead of g's spawn helper.

          At the point a worker w began executing reductions, the
          control flow / compiled code had already finished g's spawn
          helper, and w was about to enter the runtime.  With parallel
          reductions, some worker v (which might be different from w)
          is the one returning to the runtime.


      The reduction logic consists of 4 steps:

       A1. Restore runtime data structures to make it look as though
           the spawn helper of g() is still the currently executing
           frame for w.

       A2. Execute reductions on the user stack.  Reductions also
           include the logic for exceptions and stacks.  Note that
           reductions start on w, but may finish on a different
           worker if there is parallelism in the reduce.

       A3. Splice out ff from the tree of full frames.

       A4. Jump into the runtime/scheduling stack and execute
           "do_return_from_spawn".  This method

           (a) frees the user stack we were just on if it is no longer needed,
           (b) decrements the join counter on ff->parent and tries to do a
               provably good steal, and
           (c) cleans up the full frame ff.


   Case B: Stalling at a sync.

     Consider a function g(), with full frame ff and
     __cilkrts_stack_frame sf.  Suppose g() stalls at a sync, and we
     are executing reductions.

     Conceptually, we are splicing an "execute_reductions()" call
     into g() as the last action that g() takes immediately
     before it executes the cilk_sync.

     The reduction logic for this case is similar to Case A.

       B1. Restore the runtime data structures.

           The main difference from Case A is that ff/sf is still a
           frame that needs to be executed later (since it is stalling
           at a cilk_sync).  Thus, we also need to save the current
           stack information into "ff" so that we can correctly resume
           execution of "ff" after the sync.

       B2. Execute reductions on the user stack.

       B3. No frame to splice out of the tree.

       B4. Jump into the runtime/scheduling stack and execute "do_sync".
           This method:
           (a) frees the user stack we were just on if it is no longer needed, and
           (b) tries to execute a provably good steal.

  Finally, for the reducer protocol, we consider two reduction paths,
  namely a "fast" and a "slow" path.  On the fast path, only trivial
  merges of reducer maps happen (i.e., one or both of the maps are
  NULL).  Otherwise, on the slow path, a reduction actually needs to
  happen.

*****************************************************************/
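
/* A user-level illustration only -- a minimal sketch, not part of this
 * runtime file.  It assumes <cilk/cilk.h> and <cilk/reducer_opadd.h>, and
 * the spawned function g() is hypothetical.  Case A above corresponds to
 * the spawned call of g() returning while f()'s continuation has been
 * stolen: each worker then holds its own view of "sum", and the views are
 * merged by execute_reductions_for_spawn_return() before the spawn helper
 * conceptually returns to f().  Case B corresponds to the cilk_sync, where
 * execute_reductions_for_sync() may have to merge views while f() stalls. */
#if 0
#include <cilk/cilk.h>
#include <cilk/reducer_opadd.h>

extern void g(long *partial_sum);       /* hypothetical spawned function */

CILK_C_REDUCER_OPADD(sum, long, 0);

void f(void)
{
    CILK_C_REGISTER_REDUCER(sum);
    cilk_spawn g(&REDUCER_VIEW(sum));   /* f's continuation may be stolen */
    REDUCER_VIEW(sum) += 1;             /* continuation may run on a thief */
    cilk_sync;                          /* Case B: stall and reduce here   */
    CILK_C_UNREGISTER_REDUCER(sum);
}
#endif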

/**
 * @brief Locations to store the result of a reduction.
 *
 * Struct storing pointers to the fields in our "left" sibling that we
 * should update when splicing out a full frame or stalling at a sync.
 */
typedef struct {
    /** A pointer to the location of our left reducer map. */
    struct cilkred_map **map_ptr;

    /** A pointer to the location of our left exception. */
    struct pending_exception_info **exception_ptr;
} splice_left_ptrs;

/**
 * For a full frame returning from a spawn, calculate the pointers to
 * the maps and exceptions to my left.
 *
 * @param w   The currently executing worker.
 * @param ff  Full frame that is dying.
 * @return    Pointers to our "left" for reducers and exceptions.
 */
static inline
splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w,
                                                    full_frame *ff)
{
    // ASSERT: we hold the lock on ff->parent

    splice_left_ptrs left_ptrs;
    if (ff->left_sibling) {
        left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map;
        left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception;
    }
    else {
        full_frame *parent_ff = ff->parent;
        left_ptrs.map_ptr = &parent_ff->children_reducer_map;
        left_ptrs.exception_ptr = &parent_ff->child_pending_exception;
    }
    return left_ptrs;
}

/**
 * For a full frame at a sync, calculate the pointers to the maps and
 * exceptions to my left.
 *
 * @param w   The currently executing worker.
 * @param ff  Full frame that is stalling at a sync.
 * @return    Pointers to our "left" for reducers and exceptions.
 */
static inline
splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w,
                                            full_frame *ff)
{
    // ASSERT: we hold the lock on ff
    splice_left_ptrs left_ptrs;

    // Figure out which map to the left we should merge into.
    if (ff->rightmost_child) {
        CILK_ASSERT(ff->rightmost_child->parent == ff);
        left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map);
        left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception);
    }
    else {
        // We have no children.  Then, we should be the last
        // worker at the sync... "left" is our child map.
        left_ptrs.map_ptr = &(ff->children_reducer_map);
        left_ptrs.exception_ptr = &(ff->child_pending_exception);
    }
    return left_ptrs;
}

/**
 * After we have completed all reductions on a spawn return, call this
 * method to finish up before jumping into the runtime.
 *
 *   1. Perform the "reduction" on stacks, i.e., execute the left
 *      holder logic to pass the leftmost stack up.
 *
 *      w->l->fiber_to_free holds any stack that needs to be freed
 *      when control switches into the runtime fiber.
 *
 *   2. Unlink and remove child_ff from the tree of full frames.
 *
 * @param   w          The currently executing worker.
 * @param   parent_ff  The parent of child_ff.
 * @param   child_ff   The full frame returning from a spawn.
 */
static inline
void finish_spawn_return_on_user_stack(__cilkrts_worker *w,
                                       full_frame *parent_ff,
                                       full_frame *child_ff)
{
    CILK_ASSERT(w->l->fiber_to_free == NULL);

    // Execute left-holder logic for stacks.
    if (child_ff->left_sibling || parent_ff->fiber_child) {
        // Case where we are not the leftmost stack.
        CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self);

        // Remember any fiber we need to free in the worker.
        // After we jump into the runtime, we will actually do the
        // free.
        w->l->fiber_to_free = child_ff->fiber_self;
    }
    else {
        // We are leftmost, pass stack/fiber up to parent.
        // Thus, no stack/fiber to free.
        parent_ff->fiber_child = child_ff->fiber_self;
        w->l->fiber_to_free = NULL;
    }

    child_ff->fiber_self = NULL;

    unlink_child(parent_ff, child_ff);
}


/**
 * Executes any fast reductions necessary to splice ff out of the tree
 * of full frames.
 *
 * This "fast" path performs only trivial merges of reducer maps,
 * i.e., when one of them is NULL.
 * (See slow_path_reductions_for_spawn_return() for the slow path.)
 *
 * Returns NULL if we finished all reductions.
 * Returns the address of the left map if there are still reductions
 *   to execute, in which case we should execute the slow path.
 *
 * This method assumes w holds the frame lock on ff->parent.
 * After this method completes:
 *    1. We have spliced ff out of the tree of full frames.
 *    2. The reducer maps of ff have been deposited
 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the fiber
 *       that needs to be freed once we jump into the runtime.
 *
 * We have not, however, decremented the join counter on ff->parent.
 * This prevents any other workers from resuming execution of the parent.
 *
 * @param   w    The currently executing worker.
 * @param   ff   The full frame returning from a spawn.
 * @return  NULL if we finished all reductions.
 * @return  The address where the left map is stored (which should be passed to
 *          slow_path_reductions_for_spawn_return()) if there are
 *          still reductions to execute.
 */
struct cilkred_map**
fast_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff)
{
    // ASSERT: we hold ff->parent->lock.
    splice_left_ptrs left_ptrs;

    CILK_ASSERT(NULL == w->l->pending_exception);

    // Figure out the pointers to the left where I want
    // to put reducers and exceptions.
    left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

    // Go ahead and merge exceptions while holding the lock.
    splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);

    // Now check if we have any reductions to perform.
    //
    // Consider all the cases of left, middle and right maps.
    //  0. (-, -, -)  :  finish and return NULL.
    //  1. (L, -, -)  :  finish and return NULL.
    //  2. (-, M, -)  :  slide over to left, finish, and return NULL.
    //  3. (L, M, -)  :  return the left map pointer.
    //  4. (-, -, R)  :  slide over to left, finish, and return NULL.
    //  5. (L, -, R)  :  return the left map pointer.
    //  6. (-, M, R)  :  return the left map pointer.
    //  7. (L, M, R)  :  return the left map pointer.
    //
    // In terms of code:
    //  L == *left_ptrs.map_ptr
    //  M == w->reducer_map
    //  R == ff->right_reducer_map.
    //
    // The goal of the code below is to execute the fast path with
    // as few branches and writes as possible.

    int case_value = (*(left_ptrs.map_ptr) != NULL);
    case_value += ((w->reducer_map != NULL) << 1);
    case_value += ((ff->right_reducer_map != NULL) << 2);

    // Fastest path is case_value == 0 or 1.
    if (case_value >= 2) {
        switch (case_value) {
        case 2:
            *(left_ptrs.map_ptr) = w->reducer_map;
            w->reducer_map = NULL;
            return NULL;
        case 4:
            *(left_ptrs.map_ptr) = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
            return NULL;
        default:
            // If we have to execute the slow path, then
            // return the pointer to the place to deposit the left
            // map.
            return left_ptrs.map_ptr;
        }
    }

    // Do nothing
    return NULL;
}


/**
 * Executes any reductions necessary to splice the "ff" frame out of
 * the steal tree.
 *
 * This method executes the "slow" path for reductions on a spawn
 * return, i.e., there are non-NULL maps that need to be merged
 * together.
 *
 * This method should execute only if
 * fast_path_reductions_for_spawn_return() returns a non-NULL
 * left_map_ptr.
 *
 * Upon entry, left_map_ptr should be the location of the left map
 * at the start of the reduction, as calculated by
 * fast_path_reductions_for_spawn_return().
 *
 * After this method completes:
 *    1. We have spliced ff out of the tree of full frames.
 *    2. The reducer maps of ff have been deposited
 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the fiber
 *       that needs to be freed once we jump into the runtime.
 * We have not, however, decremented the join counter on ff->parent,
 * so no one can resume execution of the parent yet.
 *
 * WARNING:
 *   This method assumes the lock on ff->parent is held upon entry.
 *   Upon exit, the worker that returns still holds the lock on ff->parent.
 *   This method can, however, release and reacquire the lock on ff->parent.
 *
 * @param w             The currently executing worker.
 * @param ff            The full frame returning from a spawn.
 * @param left_map_ptr  Pointer to our initial left map.
 * @return              The worker that this method returns on.
 */
static __cilkrts_worker*
slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff,
                                      struct cilkred_map **left_map_ptr)
{

    // CILK_ASSERT: w is holding the frame lock on ff->parent.
#if REDPAR_DEBUG > 0
    CILK_ASSERT(!ff->rightmost_child);
    CILK_ASSERT(!ff->is_call_child);
#endif

    // Loop invariant:
    // When beginning this loop, we should
    //   1. Be holding the lock on ff->parent.
    //   2. left_map_ptr should be the address of the pointer to the left map.
    //   3. All maps should be slid over left by one, if possible.
    //   4. All exceptions should be merged so far.
    while (1) {

        // Slide middle map left if possible.
        if (!(*left_map_ptr)) {
            *left_map_ptr = w->reducer_map;
            w->reducer_map = NULL;
        }
        // Slide right map to middle if possible.
        if (!w->reducer_map) {
            w->reducer_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
        }

        // Since we slid everything left by one,
        // we are finished if there is no middle map.
        if (!w->reducer_map) {
            verify_current_wkr(w);
            return w;
        }
        else {
            struct cilkred_map* left_map;
            struct cilkred_map* middle_map;
            struct cilkred_map* right_map;

            // Take all the maps from their respective locations.
            // We can't leave them in place and execute a reduction because these fields
            // might change once we release the lock.
            left_map = *left_map_ptr;
            *left_map_ptr = NULL;
            middle_map = w->reducer_map;
            w->reducer_map = NULL;
            right_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;

            // WARNING!!! Lock release here.
            // We have reductions to execute (and we can't hold locks).
            __cilkrts_frame_unlock(w, ff->parent);

            // Merge all reducers into the left map.
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   middle_map);
            verify_current_wkr(w);
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   right_map);
            verify_current_wkr(w);
            CILK_ASSERT(NULL == w->reducer_map);
            // Put the final answer back into w->reducer_map.
            w->reducer_map = left_map;

            // Save any exceptions generated because of the reduction
            // process from the returning worker.  These get merged
            // the next time around the loop.
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;

            // Lock ff->parent for the next loop around.
            __cilkrts_frame_lock(w, ff->parent);

            // Once we have the lock again, recompute who is to our
            // left.
            splice_left_ptrs left_ptrs;
            left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

            // Update the pointer for the left map.
            left_map_ptr = left_ptrs.map_ptr;
            // Splice the exceptions for spawn.
            splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
        }
    }
    // We should never break out of this loop.

    CILK_ASSERT(0);
    return NULL;
}



/**
 * Execute reductions when returning from a spawn whose parent has
 * been stolen.
 *
 * Execution may start on w, but may finish on a different worker.
 * This method acquires/releases the lock on ff->parent.
 *
 * @param w            The currently executing worker.
 * @param ff           The full frame of the spawned function that is returning.
 * @param returning_sf The __cilkrts_stack_frame for this returning function.
 * @return             The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_spawn_return(__cilkrts_worker *w,
                                    full_frame *ff,
                                    __cilkrts_stack_frame *returning_sf)
{
    // Step A1 from reducer protocol described above.
    //
    // Coerce the runtime into thinking that
    // ff/returning_sf are still on the bottom of
    // w's deque.
    restore_frame_for_spawn_return_reduction(w, ff, returning_sf);

    // Step A2 and A3: Execute reductions on user stack.
    BEGIN_WITH_FRAME_LOCK(w, ff->parent) {
        struct cilkred_map **left_map_ptr;
        left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);

        // Pointer will be non-NULL if there are
        // still reductions to execute.
        if (left_map_ptr) {
            // WARNING: This method call may release the lock
            // on ff->parent and re-acquire it (possibly on a
            // different worker).
            // We can't hold locks while actually executing
            // reduce functions.
            w = slow_path_reductions_for_spawn_return(w,
                                                      ff,
                                                      left_map_ptr);
            verify_current_wkr(w);
        }

        finish_spawn_return_on_user_stack(w, ff->parent, ff);
        // WARNING: the use of this lock macro is deceptive.
        // The worker may have changed here.
    } END_WITH_FRAME_LOCK(w, ff->parent);
    return w;
}



/**
 * Execute fast "reductions" when ff stalls at a sync.
 *
 * @param   w  The currently executing worker.
 * @param   ff The full frame stalling at a sync.
 * @return  1 if we are finished with all reductions after calling this method.
 * @return  0 if we still need to execute the slow path reductions.
 */
static inline
int fast_path_reductions_for_sync(__cilkrts_worker *w,
                                  full_frame *ff) {
    // Return 0 if there is some reduction that needs to happen.
    return !(w->reducer_map || ff->pending_exception);
}

/**
 * Executes slow reductions when ff stalls at a sync.
 * This method should execute only if
 *   fast_path_reductions_for_sync(w, ff) returned 0.
 *
 * After this method completes:
 *   1. ff's current reducer map has been deposited into
 *      right_reducer_map of ff's rightmost child, or
 *      ff->children_reducer_map if ff has no children.
 *   2. Similarly for ff's current exception.
 *   3. Nothing to calculate for stacks --- if we are stalling
 *      we will always free a stack.
 *
 * This method may repeatedly acquire/release the lock on ff.
 *
 * @param   w  The currently executing worker.
 * @param   ff The full frame stalling at a sync.
 * @return  The worker returning from this method.
 */
static __cilkrts_worker*
slow_path_reductions_for_sync(__cilkrts_worker *w,
                              full_frame *ff)
{
    struct cilkred_map *left_map;
    struct cilkred_map *middle_map;

#if (REDPAR_DEBUG > 0)
    CILK_ASSERT(ff);
    CILK_ASSERT(w->head == w->tail);
#endif

    middle_map = w->reducer_map;
    w->reducer_map = NULL;

    // Loop invariant: middle_map should be valid (the current map to reduce).
    //                 left_map is junk.
    //                 w->reducer_map == NULL.
    while (1) {
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff);

            // Grab the "left" map and store pointers to those locations.
            left_map = *(left_ptrs.map_ptr);
            *(left_ptrs.map_ptr) = NULL;

            // Slide the maps in our struct left as far as possible.
            if (!left_map) {
                left_map = middle_map;
                middle_map = NULL;
            }

            *(left_ptrs.exception_ptr) =
                __cilkrts_merge_pending_exceptions(w,
                                                   *left_ptrs.exception_ptr,
                                                   ff->pending_exception);
            ff->pending_exception = NULL;

            // If there is no middle map, then we are done.
            // Deposit left and return.
            if (!middle_map) {
                *(left_ptrs.map_ptr) = left_map;
                #if (REDPAR_DEBUG > 0)
                CILK_ASSERT(NULL == w->reducer_map);
                #endif
                // Sanity check upon leaving the loop.
                verify_current_wkr(w);
                // Make sure to unlock before we return!
                __cilkrts_frame_unlock(w, ff);
                return w;
            }
        } END_WITH_FRAME_LOCK(w, ff);

        // If we get here, we have a nontrivial reduction to execute.
        middle_map = repeated_merge_reducer_maps(&w,
                                                 left_map,
                                                 middle_map);
        verify_current_wkr(w);

        // Save any exceptions generated because of the reduction
        // process.  These get merged the next time around the
        // loop.
        CILK_ASSERT(NULL == ff->pending_exception);
        ff->pending_exception = w->l->pending_exception;
        w->l->pending_exception = NULL;
    }

    // We should never break out of the loop above.
    CILK_ASSERT(0);
    return NULL;
}


/**
 * Execute reductions when ff stalls at a sync.
 *
 * Execution starts on w, but may finish on a different worker.
 * This method may acquire/release the lock on ff.
 *
 * @param w          The currently executing worker.
 * @param ff         The full frame of the spawned function at the sync.
 * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync.
 * @return           The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync)
{
    int finished_reductions;
    // Step B1 from reducer protocol above:
    // Restore runtime invariants.
    //
    // The following code for this step is almost equivalent to
    // the following sequence:
    //   1. disown(w, ff, sf_at_sync, "sync") (which itself
    //        calls make_unrunnable(w, ff, sf_at_sync))
    //   2. make_runnable(w, ff, sf_at_sync).
    //
    // The "disown" will mark the frame "sf_at_sync"
    // as stolen and suspended, and save its place on the stack,
    // so it can be resumed after the sync.
    //
    // The difference is that we don't want the disown to
    // break the following connections yet, since we are
    // about to immediately make sf/ff runnable again anyway.
    //   sf_at_sync->worker == w
    //   w->l->frame_ff == ff.
    //
    // These connections are needed for parallel reductions, since
    // we will use sf / ff as the stack frame / full frame for
    // executing any potential reductions.
    //
    // TBD: Can we refactor the disown / make_unrunnable code
    // to avoid the code duplication here?

    ff->call_stack = NULL;

    // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and
    // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of
    // the stack so that a worker can resume the frame in the correct
    // place.
    //
    // But on this path, CILK_FRAME_STOLEN should already be set.
    // Also, we technically don't want to suspend the frame until
    // the reduction finishes.
    // We do, however, need to save the stack before
    // we start any reductions, since the reductions might push more
    // data onto the stack.
    CILK_ASSERT(sf_at_sync->flags & CILK_FRAME_STOLEN);

    __cilkrts_put_stack(ff, sf_at_sync);
    __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1,
                                     "execute_reductions_for_sync");
    CILK_ASSERT(w->l->frame_ff == ff);

    // Step B2: Execute reductions on user stack.
    // Check if we have any "real" reductions to do.
    finished_reductions = fast_path_reductions_for_sync(w, ff);

    if (!finished_reductions) {
        // Still have some real reductions to execute.
        // Run them here.

        // This method may acquire/release the lock on ff.
        w = slow_path_reductions_for_sync(w, ff);

        // The previous call may return on a different worker
        // than the one we started on.
        verify_current_wkr(w);
    }

#if REDPAR_DEBUG >= 0
    CILK_ASSERT(w->l->frame_ff == ff);
    CILK_ASSERT(ff->call_stack == NULL);
#endif

    // Now we suspend the frame ff (since we've
    // finished the reductions).  Roughly, we've split apart the
    // "make_unrunnable" call here --- we've already saved the
    // stack info earlier before the reductions execute.
    // All that remains is to restore the call stack back into the
    // full frame, and mark the frame as suspended.
    ff->call_stack = sf_at_sync;
    sf_at_sync->flags |= CILK_FRAME_SUSPENDED;

    // At a nontrivial sync, we should always free the current fiber,
    // because it cannot be leftmost.
    w->l->fiber_to_free = ff->fiber_self;
    ff->fiber_self = NULL;
    return w;
}


/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/