1 /*
2  * kmp_wait_release.h -- Wait/Release implementation
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_WAIT_RELEASE_H
14 #define KMP_WAIT_RELEASE_H
15 
16 #include "kmp.h"
17 #include "kmp_itt.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 /*!
24 @defgroup WAIT_RELEASE Wait/Release operations
25 
The definitions and functions here implement the lowest-level thread
synchronization: suspending a thread and waking it. They are used to build
higher-level operations such as barriers and fork/join.
29 */
30 
31 /*!
32 @ingroup WAIT_RELEASE
33 @{
34 */
35 
36 /*!
37  * The flag_type describes the storage used for the flag.
38  */
39 enum flag_type {
  flag32, /**< 32-bit flags */
  flag64, /**< 64-bit flags */
42   flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */
43 };
44 
45 /*!
46  * Base class for wait/release volatile flag
47  */
48 template <typename P> class kmp_flag_native {
49   volatile P *loc;
50   flag_type t;
51 
52 public:
53   typedef P flag_t;
54   kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {}
55   volatile P *get() { return loc; }
56   void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
57   void set(volatile P *new_loc) { loc = new_loc; }
58   flag_type get_type() { return t; }
59   P load() { return *loc; }
60   void store(P val) { *loc = val; }
61 };
62 
63 /*!
64  * Base class for wait/release atomic flag
65  */
66 template <typename P> class kmp_flag {
67   std::atomic<P>
68       *loc; /**< Pointer to the flag storage that is modified by another thread
69              */
70   flag_type t; /**< "Type" of the flag in loc */
71 public:
72   typedef P flag_t;
73   kmp_flag(std::atomic<P> *p, flag_type ft) : loc(p), t(ft) {}
74   /*!
75    * @result the pointer to the actual flag
76    */
77   std::atomic<P> *get() { return loc; }
78   /*!
79    * @result void* pointer to the actual flag
80    */
81   void *get_void_p() { return RCAST(void *, loc); }
82   /*!
83    * @param new_loc in   set loc to point at new_loc
84    */
85   void set(std::atomic<P> *new_loc) { loc = new_loc; }
86   /*!
87    * @result the flag_type
88    */
89   flag_type get_type() { return t; }
90   /*!
91    * @result flag value
92    */
93   P load() { return loc->load(std::memory_order_acquire); }
94   /*!
95    * @param val the new flag value to be stored
96    */
97   void store(P val) { loc->store(val, std::memory_order_release); }
98   // Derived classes must provide the following:
99   /*
100   kmp_info_t * get_waiter(kmp_uint32 i);
101   kmp_uint32 get_num_waiters();
102   bool done_check();
103   bool done_check_val(P old_loc);
104   bool notdone_check();
  void internal_release();
106   void suspend(int th_gtid);
107   void resume(int th_gtid);
108   P set_sleeping();
109   P unset_sleeping();
110   bool is_sleeping();
111   bool is_any_sleeping();
112   bool is_sleeping_val(P old_loc);
113   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
114                     int *thread_finished
115                     USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32
116                     is_constrained);
117   */
118 };
119 
120 #if OMPT_SUPPORT
121 OMPT_NOINLINE
122 static void __ompt_implicit_task_end(kmp_info_t *this_thr,
123                                      ompt_state_t ompt_state,
124                                      ompt_data_t *tId) {
125   int ds_tid = this_thr->th.th_info.ds.ds_tid;
126   if (ompt_state == ompt_state_wait_barrier_implicit) {
127     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
128 #if OMPT_OPTIONAL
129     void *codeptr = NULL;
130     if (ompt_enabled.ompt_callback_sync_region_wait) {
131       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
132           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
133           codeptr);
134     }
135     if (ompt_enabled.ompt_callback_sync_region) {
136       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
137           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
138           codeptr);
139     }
140 #endif
141     if (!KMP_MASTER_TID(ds_tid)) {
142       if (ompt_enabled.ompt_callback_implicit_task) {
143         int flags = this_thr->th.ompt_thread_info.parallel_flags;
144         flags = (flags & ompt_parallel_league) ? ompt_task_initial
145                                                : ompt_task_implicit;
146         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
147             ompt_scope_end, NULL, tId, 0, ds_tid, flags);
148       }
149       // return to idle state
150       this_thr->th.ompt_thread_info.state = ompt_state_idle;
151     } else {
152       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
153     }
154   }
155 }
156 #endif
157 
158 /* Spin wait loop that first does pause/yield, then sleep. A thread that calls
159    __kmp_wait_*  must make certain that another thread calls __kmp_release
160    to wake it back up to prevent deadlocks!
161 
162    NOTE: We may not belong to a team at this point.  */
163 template <class C, int final_spin, bool cancellable = false,
164           bool sleepable = true>
165 static inline bool
166 __kmp_wait_template(kmp_info_t *this_thr,
167                     C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
168 #if USE_ITT_BUILD && USE_ITT_NOTIFY
169   volatile void *spin = flag->get();
170 #endif
171   kmp_uint32 spins;
172   int th_gtid;
173   int tasks_completed = FALSE;
174   int oversubscribed;
175 #if !KMP_USE_MONITOR
176   kmp_uint64 poll_count;
177   kmp_uint64 hibernate_goal;
178 #else
179   kmp_uint32 hibernate;
180 #endif
181 
182   KMP_FSYNC_SPIN_INIT(spin, NULL);
183   if (flag->done_check()) {
184     KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
185     return false;
186   }
187   th_gtid = this_thr->th.th_info.ds.ds_gtid;
188   if (cancellable) {
189     kmp_team_t *team = this_thr->th.th_team;
190     if (team && team->t.t_cancel_request == cancel_parallel)
191       return true;
192   }
193 #if KMP_OS_UNIX
194   if (final_spin)
195     KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
196 #endif
197   KA_TRACE(20,
198            ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
199 #if KMP_STATS_ENABLED
200   stats_state_e thread_state = KMP_GET_THREAD_STATE();
201 #endif
202 
203 /* OMPT Behavior:
204 THIS function is called from
205   __kmp_barrier (2 times)  (implicit or explicit barrier in parallel regions)
206             these have join / fork behavior
207 
       In these cases, we don't change the state or trigger events in THIS
       function. Events are triggered in the calling code (__kmp_barrier):
211 
212                 state := ompt_state_overhead
213             barrier-begin
214             barrier-wait-begin
215                 state := ompt_state_wait_barrier
216           call join-barrier-implementation (finally arrive here)
217           {}
218           call fork-barrier-implementation (finally arrive here)
219           {}
220                 state := ompt_state_overhead
221             barrier-wait-end
222             barrier-end
223                 state := ompt_state_work_parallel
224 
225 
226   __kmp_fork_barrier  (after thread creation, before executing implicit task)
227           call fork-barrier-implementation (finally arrive here)
          {} // worker arrives here with state = ompt_state_idle
229 
230 
231   __kmp_join_barrier  (implicit barrier at end of parallel region)
232                 state := ompt_state_barrier_implicit
233             barrier-begin
234             barrier-wait-begin
          call join-barrier-implementation (finally arrive here,
                                            final_spin=FALSE)
          {}
239   __kmp_fork_barrier  (implicit barrier at end of parallel region)
240           call fork-barrier-implementation (finally arrive here final_spin=TRUE)
241 
242        Worker after task-team is finished:
243             barrier-wait-end
244             barrier-end
245             implicit-task-end
246             idle-begin
247                 state := ompt_state_idle
248 
249        Before leaving, if state = ompt_state_idle
250             idle-end
251                 state := ompt_state_overhead
252 */
253 #if OMPT_SUPPORT
254   ompt_state_t ompt_entry_state;
255   ompt_data_t *tId;
256   if (ompt_enabled.enabled) {
257     ompt_entry_state = this_thr->th.ompt_thread_info.state;
258     if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
259         KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
260       ompt_lw_taskteam_t *team =
261           this_thr->th.th_team->t.ompt_serialized_team_info;
262       if (team) {
263         tId = &(team->ompt_task_info.task_data);
264       } else {
265         tId = OMPT_CUR_TASK_DATA(this_thr);
266       }
267     } else {
268       tId = &(this_thr->th.ompt_thread_info.task_data);
269     }
270     if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec ||
271                        this_thr->th.th_task_team == NULL)) {
272       // implicit task is done. Either no taskqueue, or task-team finished
273       __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
274     }
275   }
276 #endif
277 
278   KMP_INIT_YIELD(spins); // Setup for waiting
279 
280   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
281       __kmp_pause_status == kmp_soft_paused) {
282 #if KMP_USE_MONITOR
283 // The worker threads cannot rely on the team struct existing at this point.
284 // Use the bt values cached in the thread struct instead.
285 #ifdef KMP_ADJUST_BLOCKTIME
286     if (__kmp_pause_status == kmp_soft_paused ||
287         (__kmp_zero_bt && !this_thr->th.th_team_bt_set))
288       // Force immediate suspend if not set by user and more threads than
289       // available procs
290       hibernate = 0;
291     else
292       hibernate = this_thr->th.th_team_bt_intervals;
293 #else
294     hibernate = this_thr->th.th_team_bt_intervals;
295 #endif /* KMP_ADJUST_BLOCKTIME */
296 
297     /* If the blocktime is nonzero, we want to make sure that we spin wait for
298        the entirety of the specified #intervals, plus up to one interval more.
       This increment makes certain that this thread doesn't go to sleep too
300        soon.  */
301     if (hibernate != 0)
302       hibernate++;
303 
304     // Add in the current time value.
305     hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value);
306     KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
307                   th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
308                   hibernate - __kmp_global.g.g_time.dt.t_value));
309 #else
310     if (__kmp_pause_status == kmp_soft_paused) {
311       // Force immediate suspend
312       hibernate_goal = KMP_NOW();
313     } else
314       hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals;
315     poll_count = 0;
316 #endif // KMP_USE_MONITOR
317   }
318 
319   oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
320   KMP_MB();
321 
322   // Main wait spin loop
323   while (flag->notdone_check()) {
324     kmp_task_team_t *task_team = NULL;
325     if (__kmp_tasking_mode != tskm_immediate_exec) {
326       task_team = this_thr->th.th_task_team;
327       /* If the thread's task team pointer is NULL, it means one of 3 things:
328          1) A newly-created thread is first being released by
329          __kmp_fork_barrier(), and its task team has not been set up yet.
330          2) All tasks have been executed to completion.
331          3) Tasking is off for this region.  This could be because we are in a
332          serialized region (perhaps the outer one), or else tasking was manually
333          disabled (KMP_TASKING=0).  */
334       if (task_team != NULL) {
335         if (TCR_SYNC_4(task_team->tt.tt_active)) {
336           if (KMP_TASKING_ENABLED(task_team))
337             flag->execute_tasks(
338                 this_thr, th_gtid, final_spin,
339                 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
340           else
341             this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
342         } else {
343           KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
344 #if OMPT_SUPPORT
          // task-team is done now; other cases should have been caught above
346           if (final_spin && ompt_enabled.enabled)
347             __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
348 #endif
349           this_thr->th.th_task_team = NULL;
350           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
351         }
352       } else {
353         this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
354       } // if
355     } // if
356 
357     KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin));
358     if (TCR_4(__kmp_global.g.g_done)) {
359       if (__kmp_global.g.g_abort)
360         __kmp_abort_thread();
361       break;
362     }
363 
364     // If we are oversubscribed, or have waited a bit (and
365     // KMP_LIBRARY=throughput), then yield
366     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
367 
368 #if KMP_STATS_ENABLED
369     // Check if thread has been signalled to idle state
370     // This indicates that the logical "join-barrier" has finished
371     if (this_thr->th.th_stats->isIdle() &&
372         KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
373       KMP_SET_THREAD_STATE(IDLE);
374       KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
375     }
376 #endif
377     // Check if the barrier surrounding this wait loop has been cancelled
378     if (cancellable) {
379       kmp_team_t *team = this_thr->th.th_team;
380       if (team && team->t.t_cancel_request == cancel_parallel)
381         break;
382     }
383 
384     // Don't suspend if KMP_BLOCKTIME is set to "infinite"
385     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
386         __kmp_pause_status != kmp_soft_paused)
387       continue;
388 
389     // Don't suspend if there is a likelihood of new tasks being spawned.
390     if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))
391       continue;
392 
393 #if KMP_USE_MONITOR
394     // If we have waited a bit more, fall asleep
395     if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate)
396       continue;
397 #else
398     if (KMP_BLOCKING(hibernate_goal, poll_count++))
399       continue;
400 #endif
401     // Don't suspend if wait loop designated non-sleepable
402     // in template parameters
403     if (!sleepable)
404       continue;
405 
406     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
407         __kmp_pause_status != kmp_soft_paused)
408       continue;
409 
410     KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
411 
412 #if KMP_OS_UNIX
413     if (final_spin)
414       KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
415 #endif
416     flag->suspend(th_gtid);
417 #if KMP_OS_UNIX
418     if (final_spin)
419       KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
420 #endif
421 
422     if (TCR_4(__kmp_global.g.g_done)) {
423       if (__kmp_global.g.g_abort)
424         __kmp_abort_thread();
425       break;
426     } else if (__kmp_tasking_mode != tskm_immediate_exec &&
427                this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
428       this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
429     }
430     // TODO: If thread is done with work and times out, disband/free
431   }
432 
433 #if OMPT_SUPPORT
434   ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state;
435   if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) {
436 #if OMPT_OPTIONAL
437     if (final_spin) {
438       __ompt_implicit_task_end(this_thr, ompt_exit_state, tId);
439       ompt_exit_state = this_thr->th.ompt_thread_info.state;
440     }
441 #endif
442     if (ompt_exit_state == ompt_state_idle) {
443       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
444     }
445   }
446 #endif
447 #if KMP_STATS_ENABLED
448   // If we were put into idle state, pop that off the state stack
449   if (KMP_GET_THREAD_STATE() == IDLE) {
450     KMP_POP_PARTITIONED_TIMER();
451     KMP_SET_THREAD_STATE(thread_state);
452     this_thr->th.th_stats->resetIdleFlag();
453   }
454 #endif
455 
456 #if KMP_OS_UNIX
457   if (final_spin)
458     KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
459 #endif
460   KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
461   if (cancellable) {
462     kmp_team_t *team = this_thr->th.th_team;
463     if (team && team->t.t_cancel_request == cancel_parallel) {
464       if (tasks_completed) {
465         // undo the previous decrement of unfinished_threads so that the
466         // thread can decrement at the join barrier with no problem
467         kmp_task_team_t *task_team = this_thr->th.th_task_team;
468         std::atomic<kmp_int32> *unfinished_threads =
469             &(task_team->tt.tt_unfinished_threads);
470         KMP_ATOMIC_INC(unfinished_threads);
471       }
472       return true;
473     }
474   }
475   return false;
476 }
477 
/* Release any threads specified as waiting on the flag by releasing the flag
   and resuming the waiting thread(s) if indicated by the sleep bit(s). Any
   thread blocked in __kmp_wait_template relies on another thread calling this
   function to wake it up and prevent deadlocks!  */
482 template <class C> static inline void __kmp_release_template(C *flag) {
483 #ifdef KMP_DEBUG
484   int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
485 #endif
  KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%p)\n", gtid, flag->get()));
487   KMP_DEBUG_ASSERT(flag->get());
488   KMP_FSYNC_RELEASING(flag->get_void_p());
489 
490   flag->internal_release();
491 
  KF_TRACE(100, ("__kmp_release: T#%d set new spin(%p)=%d\n", gtid,
                 flag->get(), flag->load()));
494 
495   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
496     // Only need to check sleep stuff if infinite block time not set.
497     // Are *any* threads waiting on flag sleeping?
498     if (flag->is_any_sleeping()) {
499       for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) {
        // Get the i-th waiter registered on the flag, if any
501         kmp_info_t *waiter = flag->get_waiter(i);
502         if (waiter) {
503           int wait_gtid = waiter->th.th_info.ds.ds_gtid;
504           // Wake up thread if needed
505           KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep "
506                         "flag(%p) set\n",
507                         gtid, wait_gtid, flag->get()));
508           flag->resume(wait_gtid); // unsets flag's current_waiter when done
509         }
510       }
511     }
512   }
513 }
514 
515 template <typename FlagType> struct flag_traits {};
516 
517 template <> struct flag_traits<kmp_uint32> {
518   typedef kmp_uint32 flag_t;
519   static const flag_type t = flag32;
520   static inline flag_t tcr(flag_t f) { return TCR_4(f); }
521   static inline flag_t test_then_add4(volatile flag_t *f) {
522     return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f));
523   }
524   static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
525     return KMP_TEST_THEN_OR32(f, v);
526   }
527   static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
528     return KMP_TEST_THEN_AND32(f, v);
529   }
530 };
531 
532 template <> struct flag_traits<kmp_uint64> {
533   typedef kmp_uint64 flag_t;
534   static const flag_type t = flag64;
535   static inline flag_t tcr(flag_t f) { return TCR_8(f); }
536   static inline flag_t test_then_add4(volatile flag_t *f) {
537     return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
538   }
539   static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
540     return KMP_TEST_THEN_OR64(f, v);
541   }
542   static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
543     return KMP_TEST_THEN_AND64(f, v);
544   }
545 };
546 
// Basic flag that does not use C++11 std::atomic
548 template <typename FlagType>
549 class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
550   typedef flag_traits<FlagType> traits_type;
  FlagType checker; /**< Value to compare the flag against to check whether
                       the flag has been released. */
  kmp_info_t
      *waiting_threads[1]; /**< Array of threads sleeping on this flag. */
  kmp_uint32
      num_waiting_threads; /**< Number of threads sleeping on this flag. */
557 public:
558   kmp_basic_flag_native(volatile FlagType *p)
559       : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
560   kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
561       : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) {
562     waiting_threads[0] = thr;
563   }
564   kmp_basic_flag_native(volatile FlagType *p, FlagType c)
565       : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
566         num_waiting_threads(0) {}
567   /*!
   * @param i in   index into waiting_threads
569    * @result the thread that is waiting at index i
570    */
571   kmp_info_t *get_waiter(kmp_uint32 i) {
572     KMP_DEBUG_ASSERT(i < num_waiting_threads);
573     return waiting_threads[i];
574   }
575   /*!
576    * @result num_waiting_threads
577    */
578   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
579   /*!
580    * @param thr in   the thread which is now waiting
581    *
582    * Insert a waiting thread at index 0.
583    */
584   void set_waiter(kmp_info_t *thr) {
585     waiting_threads[0] = thr;
586     num_waiting_threads = 1;
587   }
588   /*!
589    * @result true if the flag object has been released.
590    */
591   bool done_check() { return traits_type::tcr(*(this->get())) == checker; }
592   /*!
593    * @param old_loc in   old value of flag
594    * @result true if the flag's old value indicates it was released.
595    */
596   bool done_check_val(FlagType old_loc) { return old_loc == checker; }
597   /*!
598    * @result true if the flag object is not yet released.
599    * Used in __kmp_wait_template like:
600    * @code
601    * while (flag.notdone_check()) { pause(); }
602    * @endcode
603    */
604   bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
605   /*!
606    * @result Actual flag value before release was applied.
607    * Trigger all waiting threads to run by modifying flag to release state.
608    */
609   void internal_release() {
610     (void)traits_type::test_then_add4((volatile FlagType *)this->get());
611   }
612   /*!
613    * @result Actual flag value before sleep bit(s) set.
614    * Notes that there is at least one thread sleeping on the flag by setting
615    * sleep bit(s).
616    */
617   FlagType set_sleeping() {
618     return traits_type::test_then_or((volatile FlagType *)this->get(),
619                                      KMP_BARRIER_SLEEP_STATE);
620   }
621   /*!
622    * @result Actual flag value before sleep bit(s) cleared.
623    * Notes that there are no longer threads sleeping on the flag by clearing
624    * sleep bit(s).
625    */
626   FlagType unset_sleeping() {
627     return traits_type::test_then_and((volatile FlagType *)this->get(),
628                                       ~KMP_BARRIER_SLEEP_STATE);
629   }
630   /*!
631    * @param old_loc in   old value of flag
632    * Test whether there are threads sleeping on the flag's old value in old_loc.
633    */
634   bool is_sleeping_val(FlagType old_loc) {
635     return old_loc & KMP_BARRIER_SLEEP_STATE;
636   }
637   /*!
638    * Test whether there are threads sleeping on the flag.
639    */
640   bool is_sleeping() { return is_sleeping_val(*(this->get())); }
641   bool is_any_sleeping() { return is_sleeping_val(*(this->get())); }
642   kmp_uint8 *get_stolen() { return NULL; }
643   enum barrier_type get_bt() { return bs_last_barrier; }
644 };
645 
646 template <typename FlagType> class kmp_basic_flag : public kmp_flag<FlagType> {
647   typedef flag_traits<FlagType> traits_type;
  FlagType checker; /**< Value to compare the flag against to check whether
                       the flag has been released. */
  kmp_info_t
      *waiting_threads[1]; /**< Array of threads sleeping on this flag. */
  kmp_uint32
      num_waiting_threads; /**< Number of threads sleeping on this flag. */
654 public:
655   kmp_basic_flag(std::atomic<FlagType> *p)
656       : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
657   kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
658       : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
659     waiting_threads[0] = thr;
660   }
661   kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
662       : kmp_flag<FlagType>(p, traits_type::t), checker(c),
663         num_waiting_threads(0) {}
664   /*!
   * @param i in   index into waiting_threads
666    * @result the thread that is waiting at index i
667    */
668   kmp_info_t *get_waiter(kmp_uint32 i) {
669     KMP_DEBUG_ASSERT(i < num_waiting_threads);
670     return waiting_threads[i];
671   }
672   /*!
673    * @result num_waiting_threads
674    */
675   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
676   /*!
677    * @param thr in   the thread which is now waiting
678    *
679    * Insert a waiting thread at index 0.
680    */
681   void set_waiter(kmp_info_t *thr) {
682     waiting_threads[0] = thr;
683     num_waiting_threads = 1;
684   }
685   /*!
686    * @result true if the flag object has been released.
687    */
688   bool done_check() { return this->load() == checker; }
689   /*!
690    * @param old_loc in   old value of flag
691    * @result true if the flag's old value indicates it was released.
692    */
693   bool done_check_val(FlagType old_loc) { return old_loc == checker; }
694   /*!
695    * @result true if the flag object is not yet released.
696    * Used in __kmp_wait_template like:
697    * @code
698    * while (flag.notdone_check()) { pause(); }
699    * @endcode
700    */
701   bool notdone_check() { return this->load() != checker; }
702   /*!
703    * @result Actual flag value before release was applied.
704    * Trigger all waiting threads to run by modifying flag to release state.
705    */
706   void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
707   /*!
708    * @result Actual flag value before sleep bit(s) set.
709    * Notes that there is at least one thread sleeping on the flag by setting
710    * sleep bit(s).
711    */
712   FlagType set_sleeping() {
713     return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
714   }
715   /*!
716    * @result Actual flag value before sleep bit(s) cleared.
717    * Notes that there are no longer threads sleeping on the flag by clearing
718    * sleep bit(s).
719    */
720   FlagType unset_sleeping() {
721     return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
722   }
723   /*!
724    * @param old_loc in   old value of flag
725    * Test whether there are threads sleeping on the flag's old value in old_loc.
726    */
727   bool is_sleeping_val(FlagType old_loc) {
728     return old_loc & KMP_BARRIER_SLEEP_STATE;
729   }
730   /*!
731    * Test whether there are threads sleeping on the flag.
732    */
733   bool is_sleeping() { return is_sleeping_val(this->load()); }
734   bool is_any_sleeping() { return is_sleeping_val(this->load()); }
735   kmp_uint8 *get_stolen() { return NULL; }
736   enum barrier_type get_bt() { return bs_last_barrier; }
737 };
738 
739 class kmp_flag_32 : public kmp_basic_flag<kmp_uint32> {
740 public:
741   kmp_flag_32(std::atomic<kmp_uint32> *p) : kmp_basic_flag<kmp_uint32>(p) {}
742   kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr)
743       : kmp_basic_flag<kmp_uint32>(p, thr) {}
744   kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c)
745       : kmp_basic_flag<kmp_uint32>(p, c) {}
746   void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
747   void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); }
748   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
749                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
750                     kmp_int32 is_constrained) {
751     return __kmp_execute_tasks_32(
752         this_thr, gtid, this, final_spin,
753         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
754   }
755   void wait(kmp_info_t *this_thr,
756             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
757     if (final_spin)
758       __kmp_wait_template<kmp_flag_32, TRUE>(
759           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
760     else
761       __kmp_wait_template<kmp_flag_32, FALSE>(
762           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
763   }
764   void release() { __kmp_release_template(this); }
765   flag_type get_ptr_type() { return flag32; }
766 };
767 
768 class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64> {
769 public:
770   kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag_native<kmp_uint64>(p) {}
771   kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr)
772       : kmp_basic_flag_native<kmp_uint64>(p, thr) {}
773   kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c)
774       : kmp_basic_flag_native<kmp_uint64>(p, c) {}
775   void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
776   void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); }
777   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
778                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
779                     kmp_int32 is_constrained) {
780     return __kmp_execute_tasks_64(
781         this_thr, gtid, this, final_spin,
782         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
783   }
784   void wait(kmp_info_t *this_thr,
785             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
786     if (final_spin)
787       __kmp_wait_template<kmp_flag_64, TRUE>(
788           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
789     else
790       __kmp_wait_template<kmp_flag_64, FALSE>(
791           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
792   }
793   bool wait_cancellable_nosleep(kmp_info_t *this_thr,
794                                 int final_spin
795                                     USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
796     bool retval = false;
797     if (final_spin)
798       retval = __kmp_wait_template<kmp_flag_64, TRUE, true, false>(
799           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
800     else
801       retval = __kmp_wait_template<kmp_flag_64, FALSE, true, false>(
802           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
803     return retval;
804   }
805   void release() { __kmp_release_template(this); }
806   flag_type get_ptr_type() { return flag64; }
807 };
808 
809 // Hierarchical 64-bit on-core barrier instantiation
810 class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
811   kmp_uint64 checker;
812   kmp_info_t *waiting_threads[1];
813   kmp_uint32 num_waiting_threads;
814   kmp_uint32
815       offset; /**< Portion of flag that is of interest for an operation. */
816   bool flag_switch; /**< Indicates a switch in flag location. */
817   enum barrier_type bt; /**< Barrier type. */
  kmp_info_t *this_thr; /**< Thread that may be redirected to a different flag
                           location. */
820 #if USE_ITT_BUILD
821   void *
822       itt_sync_obj; /**< ITT object that must be passed to new flag location. */
823 #endif
824   unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) {
825     return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset];
826   }
827 
828 public:
829   kmp_flag_oncore(volatile kmp_uint64 *p)
830       : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
831         flag_switch(false) {}
832   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
833       : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
834         offset(idx), flag_switch(false) {}
835   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
836                   enum barrier_type bar_t,
837                   kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
838       : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
839         num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
840         this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {}
841   kmp_info_t *get_waiter(kmp_uint32 i) {
842     KMP_DEBUG_ASSERT(i < num_waiting_threads);
843     return waiting_threads[i];
844   }
845   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
846   void set_waiter(kmp_info_t *thr) {
847     waiting_threads[0] = thr;
848     num_waiting_threads = 1;
849   }
850   bool done_check_val(kmp_uint64 old_loc) {
851     return byteref(&old_loc, offset) == checker;
852   }
853   bool done_check() { return done_check_val(*get()); }
854   bool notdone_check() {
855     // Calculate flag_switch
856     if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
857       flag_switch = true;
858     if (byteref(get(), offset) != 1 && !flag_switch)
859       return true;
860     else if (flag_switch) {
861       this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
862       kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go,
863                        (kmp_uint64)KMP_BARRIER_STATE_BUMP);
864       __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
865     }
866     return false;
867   }
868   void internal_release() {
869     // Other threads can write their own bytes simultaneously.
870     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
871       byteref(get(), offset) = 1;
872     } else {
873       kmp_uint64 mask = 0;
874       byteref(&mask, offset) = 1;
875       KMP_TEST_THEN_OR64(get(), mask);
876     }
877   }
878   kmp_uint64 set_sleeping() {
879     return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE);
880   }
881   kmp_uint64 unset_sleeping() {
882     return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE);
883   }
884   bool is_sleeping_val(kmp_uint64 old_loc) {
885     return old_loc & KMP_BARRIER_SLEEP_STATE;
886   }
887   bool is_sleeping() { return is_sleeping_val(*get()); }
888   bool is_any_sleeping() { return is_sleeping_val(*get()); }
889   void wait(kmp_info_t *this_thr, int final_spin) {
890     if (final_spin)
891       __kmp_wait_template<kmp_flag_oncore, TRUE>(
892           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
893     else
894       __kmp_wait_template<kmp_flag_oncore, FALSE>(
895           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
896   }
897   void release() { __kmp_release_template(this); }
898   void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
899   void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); }
900   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
901                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
902                     kmp_int32 is_constrained) {
903     return __kmp_execute_tasks_oncore(
904         this_thr, gtid, this, final_spin,
905         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
906   }
907   kmp_uint8 *get_stolen() { return NULL; }
908   enum barrier_type get_bt() { return bt; }
909   flag_type get_ptr_type() { return flag_oncore; }
910 };
911 
// Used to wake up threads; volatile void* flag is usually the th_sleep_loc
// associated with int gtid.
914 static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
915   if (!flag)
916     return;
917 
918   switch (RCAST(kmp_flag_64 *, CCAST(void *, flag))->get_type()) {
919   case flag32:
920     __kmp_resume_32(gtid, NULL);
921     break;
922   case flag64:
923     __kmp_resume_64(gtid, NULL);
924     break;
925   case flag_oncore:
926     __kmp_resume_oncore(gtid, NULL);
927     break;
928   }
929 }
930 
931 /*!
932 @}
933 */
934 
935 #endif // KMP_WAIT_RELEASE_H
936