1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #if KMP_MIC && USE_NGO_STORES
26 // ICV copying
27 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
28 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
29 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
31 #else
32 #define ngo_load(src) ((void)0)
33 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
34 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
35 #define ngo_sync() ((void)0)
36 #endif /* KMP_MIC && USE_NGO_STORES */
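
/* Usage pattern for the ngo_* macros (an illustrative condensation of the ICV
   push loops later in this file, not additional runtime code): load the
   primary thread's ICV cache line once, broadcast it to each destination,
   then fence before anything that depends on the stores being visible.

     ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
     for (int i = 1; i < nproc; ++i)
       ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                      &team->t.t_implicit_task_taskdata[0].td_icvs);
     ngo_sync();

   On KMP_MIC the stores are non-globally-ordered 512-bit stores, so the final
   ngo_sync() is what makes the whole broadcast visible to other threads;
   elsewhere the macros degrade to plain copies and no-ops. */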
37 
38 void __kmp_print_structure(void); // Forward declaration
39 
40 // ---------------------------- Barrier Algorithms ----------------------------
41 
42 // Linear Barrier
43 template <bool cancellable = false>
44 static bool __kmp_linear_barrier_gather_template(
45     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
47   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
48   kmp_team_t *team = this_thr->th.th_team;
49   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
50   kmp_info_t **other_threads = team->t.t_threads;
51 
52   KA_TRACE(
53       20,
54       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55        gtid, team->t.t_id, tid, bt));
56   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57 
58 #if USE_ITT_BUILD && USE_ITT_NOTIFY
59   // Barrier imbalance - save arrive time to the thread
60   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
62         __itt_get_timestamp();
63   }
64 #endif
65   // We now perform a linear reduction to signal that all of the threads have
66   // arrived.
67   if (!KMP_MASTER_TID(tid)) {
68     KA_TRACE(20,
69              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
70               " arrived(%p): %llu => %llu\n",
71               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
72               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
73               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
74     // Mark arrival to primary thread
75     /* After performing this write, a worker thread may not assume that the team
76        is valid any more - it could be deallocated by the primary thread at any
77        time. */
78     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
79     flag.release();
80   } else {
81     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
82     int nproc = this_thr->th.th_team_nproc;
83     int i;
84     // No atomics or sleep-bit handling needed here: this is team-level state.
85     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
86 
87     // Collect all the worker team member threads.
88     for (i = 1; i < nproc; ++i) {
89 #if KMP_CACHE_MANAGE
90       // Prefetch next thread's arrived count
91       if (i + 1 < nproc)
92         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
93 #endif /* KMP_CACHE_MANAGE */
94       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
95                     "arrived(%p) == %llu\n",
96                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
97                     team->t.t_id, i,
98                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
99 
100       // Wait for worker thread to arrive
101       if (cancellable) {
102         kmp_flag_64<true, false> flag(
103             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
104         if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
105           return true;
106       } else {
107         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
108                            new_state);
109         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
110       }
111 #if USE_ITT_BUILD && USE_ITT_NOTIFY
112       // Barrier imbalance - write min of the thread time and the other thread
113       // time to the thread.
114       if (__kmp_forkjoin_frames_mode == 2) {
115         this_thr->th.th_bar_min_time = KMP_MIN(
116             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
117       }
118 #endif
119       if (reduce) {
120         KA_TRACE(100,
121                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
122                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
123                   team->t.t_id, i));
124         OMPT_REDUCTION_DECL(this_thr, gtid);
125         OMPT_REDUCTION_BEGIN;
126         (*reduce)(this_thr->th.th_local.reduce_data,
127                   other_threads[i]->th.th_local.reduce_data);
128         OMPT_REDUCTION_END;
129       }
130     }
131     // No atomics or sleep-bit handling needed here: this is team-level state.
132     team_bar->b_arrived = new_state;
133     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
134                   "arrived(%p) = %llu\n",
135                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
136                   new_state));
137   }
138   KA_TRACE(
139       20,
140       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
141        gtid, team->t.t_id, tid, bt));
142   return false;
143 }
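
/* The control flow above, condensed (an illustrative sketch with tracing, ITT
   notification, and the cancellable variant stripped out):

     if (!KMP_MASTER_TID(tid)) {
       // Worker: bump my b_arrived so the primary sees the arrival.  After
       // this release the team may be deallocated at any time.
       kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
       flag.release();
     } else {
       // Primary: wait for each worker's b_arrived to reach new_state,
       // folding in its reduce_data if a reduction callback was supplied.
       kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
       for (int i = 1; i < nproc; ++i) {
         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                            new_state);
         flag.wait(this_thr, FALSE);
         if (reduce)
           (*reduce)(this_thr->th.th_local.reduce_data,
                     other_threads[i]->th.th_local.reduce_data);
       }
       team_bar->b_arrived = new_state; // publish the team-level arrived state
     }
*/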
144 
145 template <bool cancellable = false>
146 static bool __kmp_linear_barrier_release_template(
147     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
148     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
149   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
150   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
151   kmp_team_t *team;
152 
153   if (KMP_MASTER_TID(tid)) {
154     unsigned int i;
155     kmp_uint32 nproc = this_thr->th.th_team_nproc;
156     kmp_info_t **other_threads;
157 
158     team = __kmp_threads[gtid]->th.th_team;
159     KMP_DEBUG_ASSERT(team != NULL);
160     other_threads = team->t.t_threads;
161 
162     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
163                   "barrier type %d\n",
164                   gtid, team->t.t_id, tid, bt));
165 
166     if (nproc > 1) {
167 #if KMP_BARRIER_ICV_PUSH
168       {
169         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
170         if (propagate_icvs) {
171           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
172           for (i = 1; i < nproc; ++i) {
173             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
174                                      team, i, FALSE);
175             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
176                            &team->t.t_implicit_task_taskdata[0].td_icvs);
177           }
178           ngo_sync();
179         }
180       }
181 #endif // KMP_BARRIER_ICV_PUSH
182 
183       // Now, release all of the worker threads
184       for (i = 1; i < nproc; ++i) {
185 #if KMP_CACHE_MANAGE
186         // Prefetch next thread's go flag
187         if (i + 1 < nproc)
188           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
189 #endif /* KMP_CACHE_MANAGE */
190         KA_TRACE(
191             20,
192             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
193              "go(%p): %u => %u\n",
194              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
195              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
196              other_threads[i]->th.th_bar[bt].bb.b_go,
197              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
198         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
199                            other_threads[i]);
200         flag.release();
201       }
202     }
203   } else { // Wait for the PRIMARY thread to release us
204     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
205                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
206     if (cancellable) {
207       kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
208       if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
209         return true;
210     } else {
211       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
212       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
213     }
214 #if USE_ITT_BUILD && USE_ITT_NOTIFY
215     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
216       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
217       // disabled)
218       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
219       // Cancel wait on previous parallel region...
220       __kmp_itt_task_starting(itt_sync_obj);
221 
222       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
223         return false;
224 
225       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
226       if (itt_sync_obj != NULL)
227         // Call prepare as early as possible for "new" barrier
228         __kmp_itt_task_finished(itt_sync_obj);
229     } else
230 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
231         // Early exit for reaping threads releasing forkjoin barrier
232         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
233       return false;
234 // The worker thread may now assume that the team is valid.
235 #ifdef KMP_DEBUG
236     tid = __kmp_tid_from_gtid(gtid);
237     team = __kmp_threads[gtid]->th.th_team;
238 #endif
239     KMP_DEBUG_ASSERT(team != NULL);
240     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
241     KA_TRACE(20,
242              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
243               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
244     KMP_MB(); // Flush all pending memory write invalidates.
245   }
246   KA_TRACE(
247       20,
248       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
249        gtid, team->t.t_id, tid, bt));
250   return false;
251 }
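
/* The release side mirrors the gather (illustrative sketch, again with
   tracing, ITT, ICV propagation, and cancellation stripped): the primary
   bumps every worker's b_go, and each worker waits on its own b_go and then
   re-arms it for the next barrier.

     if (KMP_MASTER_TID(tid)) {
       for (int i = 1; i < nproc; ++i) {
         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                            other_threads[i]);
         flag.release(); // b_go += KMP_BARRIER_STATE_BUMP; wake if sleeping
       }
     } else {
       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
       flag.wait(this_thr, TRUE);
       TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // reset for next barrier
     }
*/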
252 
253 static void __kmp_linear_barrier_gather(
254     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
255     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
256   __kmp_linear_barrier_gather_template<false>(
257       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
258 }
259 
260 static bool __kmp_linear_barrier_gather_cancellable(
261     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
262     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
263   return __kmp_linear_barrier_gather_template<true>(
264       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
265 }
266 
267 static void __kmp_linear_barrier_release(
268     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
269     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
270   __kmp_linear_barrier_release_template<false>(
271       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
272 }
273 
274 static bool __kmp_linear_barrier_release_cancellable(
275     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
276     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
277   return __kmp_linear_barrier_release_template<true>(
278       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
279 }
280 
281 // Tree barrier
282 static void __kmp_tree_barrier_gather(
283     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
284     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
285   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
286   kmp_team_t *team = this_thr->th.th_team;
287   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
288   kmp_info_t **other_threads = team->t.t_threads;
289   kmp_uint32 nproc = this_thr->th.th_team_nproc;
290   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
291   kmp_uint32 branch_factor = 1 << branch_bits;
292   kmp_uint32 child;
293   kmp_uint32 child_tid;
294   kmp_uint64 new_state = 0;
295 
296   KA_TRACE(
297       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
298            gtid, team->t.t_id, tid, bt));
299   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
300 
301 #if USE_ITT_BUILD && USE_ITT_NOTIFY
302   // Barrier imbalance - save arrive time to the thread
303   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
304     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
305         __itt_get_timestamp();
306   }
307 #endif
308   // Perform tree gather to wait until all threads have arrived; reduce any
309   // required data as we go
310   child_tid = (tid << branch_bits) + 1;
311   if (child_tid < nproc) {
312     // Parent threads wait for all their children to arrive
313     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
314     child = 1;
315     do {
316       kmp_info_t *child_thr = other_threads[child_tid];
317       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
318 #if KMP_CACHE_MANAGE
319       // Prefetch next thread's arrived count
320       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
321         KMP_CACHE_PREFETCH(
322             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
323 #endif /* KMP_CACHE_MANAGE */
324       KA_TRACE(20,
325                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
326                 "arrived(%p) == %llu\n",
327                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
328                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
329       // Wait for child to arrive
330       kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
331       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
332 #if USE_ITT_BUILD && USE_ITT_NOTIFY
333       // Barrier imbalance - write min of the thread time and a child time to
334       // the thread.
335       if (__kmp_forkjoin_frames_mode == 2) {
336         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
337                                                child_thr->th.th_bar_min_time);
338       }
339 #endif
340       if (reduce) {
341         KA_TRACE(100,
342                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
343                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
344                   team->t.t_id, child_tid));
345         OMPT_REDUCTION_DECL(this_thr, gtid);
346         OMPT_REDUCTION_BEGIN;
347         (*reduce)(this_thr->th.th_local.reduce_data,
348                   child_thr->th.th_local.reduce_data);
349         OMPT_REDUCTION_END;
350       }
351       child++;
352       child_tid++;
353     } while (child <= branch_factor && child_tid < nproc);
354   }
355 
356   if (!KMP_MASTER_TID(tid)) { // Worker threads
357     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
358 
359     KA_TRACE(20,
360              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
361               "arrived(%p): %llu => %llu\n",
362               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
363               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
364               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
365 
366     // Mark arrival to parent thread
367     /* After performing this write, a worker thread may not assume that the team
368        is valid any more - it could be deallocated by the primary thread at any
369        time.  */
370     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
371     flag.release();
372   } else {
373     // Need to update the team arrived pointer if we are the primary thread
374     if (nproc > 1) // New value was already computed above
375       team->t.t_bar[bt].b_arrived = new_state;
376     else
377       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
378     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
379                   "arrived(%p) = %llu\n",
380                   gtid, team->t.t_id, tid, team->t.t_id,
381                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
382   }
383   KA_TRACE(20,
384            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
385             gtid, team->t.t_id, tid, bt));
386 }
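
/* Index arithmetic used by the tree gather (and by the matching release
   below), shown on a small example: with branch_bits = 2 (branch_factor = 4)
   the children of tid are (tid << 2) + 1 .. (tid << 2) + 4 and the parent of
   a worker is (tid - 1) >> 2.  For nproc = 10:

     tid 0: children 1,2,3,4     tid 1: children 5,6,7,8
     tid 2: child 9              tids 3..9: leaves

   Each parent waits on its children's b_arrived (reducing as it goes), then
   signals its own parent; the primary finally publishes
   team->t.t_bar[bt].b_arrived. */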
387 
388 static void __kmp_tree_barrier_release(
389     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
390     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
391   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
392   kmp_team_t *team;
393   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
394   kmp_uint32 nproc;
395   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
396   kmp_uint32 branch_factor = 1 << branch_bits;
397   kmp_uint32 child;
398   kmp_uint32 child_tid;
399 
400   // Perform a tree release for all of the threads that have been gathered
401   if (!KMP_MASTER_TID(
402           tid)) { // Handle fork barrier workers who aren't part of a team yet
403     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
404                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
405     // Wait for parent thread to release us
406     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
407     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
408 #if USE_ITT_BUILD && USE_ITT_NOTIFY
409     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
410       // In fork barrier where we could not get the object reliably (or
411       // ITTNOTIFY is disabled)
412       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
413       // Cancel wait on previous parallel region...
414       __kmp_itt_task_starting(itt_sync_obj);
415 
416       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
417         return;
418 
419       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
420       if (itt_sync_obj != NULL)
421         // Call prepare as early as possible for "new" barrier
422         __kmp_itt_task_finished(itt_sync_obj);
423     } else
424 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
425         // Early exit for reaping threads releasing forkjoin barrier
426         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
427       return;
428 
429     // The worker thread may now assume that the team is valid.
430     team = __kmp_threads[gtid]->th.th_team;
431     KMP_DEBUG_ASSERT(team != NULL);
432     tid = __kmp_tid_from_gtid(gtid);
433 
434     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
435     KA_TRACE(20,
436              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
437               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
438     KMP_MB(); // Flush all pending memory write invalidates.
439   } else {
440     team = __kmp_threads[gtid]->th.th_team;
441     KMP_DEBUG_ASSERT(team != NULL);
442     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
443                   "barrier type %d\n",
444                   gtid, team->t.t_id, tid, bt));
445   }
446   nproc = this_thr->th.th_team_nproc;
447   child_tid = (tid << branch_bits) + 1;
448 
449   if (child_tid < nproc) {
450     kmp_info_t **other_threads = team->t.t_threads;
451     child = 1;
452     // Parent threads release all their children
453     do {
454       kmp_info_t *child_thr = other_threads[child_tid];
455       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
456 #if KMP_CACHE_MANAGE
457       // Prefetch next thread's go count
458       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
459         KMP_CACHE_PREFETCH(
460             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
461 #endif /* KMP_CACHE_MANAGE */
462 
463 #if KMP_BARRIER_ICV_PUSH
464       {
465         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
466         if (propagate_icvs) {
467           __kmp_init_implicit_task(team->t.t_ident,
468                                    team->t.t_threads[child_tid], team,
469                                    child_tid, FALSE);
470           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
471                     &team->t.t_implicit_task_taskdata[0].td_icvs);
472         }
473       }
474 #endif // KMP_BARRIER_ICV_PUSH
475       KA_TRACE(20,
476                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
477                 " go(%p): %u => %u\n",
478                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
479                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
480                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
481       // Release child from barrier
482       kmp_flag_64<> flag(&child_bar->b_go, child_thr);
483       flag.release();
484       child++;
485       child_tid++;
486     } while (child <= branch_factor && child_tid < nproc);
487   }
488   KA_TRACE(
489       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
490            gtid, team->t.t_id, tid, bt));
491 }
492 
493 // Hyper Barrier
494 static void __kmp_hyper_barrier_gather(
495     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
496     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
497   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
498   kmp_team_t *team = this_thr->th.th_team;
499   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
500   kmp_info_t **other_threads = team->t.t_threads;
501   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
502   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
503   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
504   kmp_uint32 branch_factor = 1 << branch_bits;
505   kmp_uint32 offset;
506   kmp_uint32 level;
507 
508   KA_TRACE(
509       20,
510       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
511        gtid, team->t.t_id, tid, bt));
512   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
513 
514 #if USE_ITT_BUILD && USE_ITT_NOTIFY
515   // Barrier imbalance - save arrive time to the thread
516   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
517     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
518         __itt_get_timestamp();
519   }
520 #endif
521   /* Perform a hypercube-embedded tree gather to wait until all of the threads
522      have arrived, and reduce any required data as we go.  */
523   kmp_flag_64<> p_flag(&thr_bar->b_arrived);
524   for (level = 0, offset = 1; offset < num_threads;
525        level += branch_bits, offset <<= branch_bits) {
526     kmp_uint32 child;
527     kmp_uint32 child_tid;
528 
529     if (((tid >> level) & (branch_factor - 1)) != 0) {
530       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
531 
532       KMP_MB(); // Synchronize parent and child threads.
533       KA_TRACE(20,
534                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
535                 "arrived(%p): %llu => %llu\n",
536                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
537                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
538                 thr_bar->b_arrived,
539                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
540       // Mark arrival to parent thread
541       /* After performing this write (in the last iteration of the enclosing for
542          loop), a worker thread may not assume that the team is valid any more
543          - it could be deallocated by the primary thread at any time.  */
544       p_flag.set_waiter(other_threads[parent_tid]);
545       p_flag.release();
546       break;
547     }
548 
549     // Parent threads wait for children to arrive
550     if (new_state == KMP_BARRIER_UNUSED_STATE)
551       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
552     for (child = 1, child_tid = tid + (1 << level);
553          child < branch_factor && child_tid < num_threads;
554          child++, child_tid += (1 << level)) {
555       kmp_info_t *child_thr = other_threads[child_tid];
556       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
557 #if KMP_CACHE_MANAGE
558       kmp_uint32 next_child_tid = child_tid + (1 << level);
559       // Prefetch next thread's arrived count
560       if (child + 1 < branch_factor && next_child_tid < num_threads)
561         KMP_CACHE_PREFETCH(
562             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
563 #endif /* KMP_CACHE_MANAGE */
564       KA_TRACE(20,
565                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
566                 "arrived(%p) == %llu\n",
567                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
568                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
569       // Wait for child to arrive
570       kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
571       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
572       KMP_MB(); // Synchronize parent and child threads.
573 #if USE_ITT_BUILD && USE_ITT_NOTIFY
574       // Barrier imbalance - write min of the thread time and a child time to
575       // the thread.
576       if (__kmp_forkjoin_frames_mode == 2) {
577         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
578                                                child_thr->th.th_bar_min_time);
579       }
580 #endif
581       if (reduce) {
582         KA_TRACE(100,
583                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
584                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
585                   team->t.t_id, child_tid));
586         OMPT_REDUCTION_DECL(this_thr, gtid);
587         OMPT_REDUCTION_BEGIN;
588         (*reduce)(this_thr->th.th_local.reduce_data,
589                   child_thr->th.th_local.reduce_data);
590         OMPT_REDUCTION_END;
591       }
592     }
593   }
594 
595   if (KMP_MASTER_TID(tid)) {
596     // Need to update the team arrived pointer if we are the primary thread
597     if (new_state == KMP_BARRIER_UNUSED_STATE)
598       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
599     else
600       team->t.t_bar[bt].b_arrived = new_state;
601     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
602                   "arrived(%p) = %llu\n",
603                   gtid, team->t.t_id, tid, team->t.t_id,
604                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
605   }
606   KA_TRACE(
607       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
608            gtid, team->t.t_id, tid, bt));
609 }
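
/* Pairing pattern of the hypercube gather above (illustrative): at level L, a
   thread whose digit ((tid >> L) & (branch_factor - 1)) is nonzero signals
   the subtree root tid & ~((1 << (L + branch_bits)) - 1) and stops; otherwise
   it waits for children tid + (1 << L), tid + 2 * (1 << L), ...  With
   branch_bits = 1 and nproc = 8 the arrival edges are:

     level 0: 1->0  3->2  5->4  7->6
     level 1: 2->0  6->4
     level 2: 4->0

   so the primary has seen every arrival after log2(nproc) rounds, and any
   reduction is folded in along the same edges. */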
610 
611 // The reverse versions seem to beat the forward versions overall
612 #define KMP_REVERSE_HYPER_BAR
613 static void __kmp_hyper_barrier_release(
614     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
615     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
616   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
617   kmp_team_t *team;
618   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
619   kmp_info_t **other_threads;
620   kmp_uint32 num_threads;
621   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
622   kmp_uint32 branch_factor = 1 << branch_bits;
623   kmp_uint32 child;
624   kmp_uint32 child_tid;
625   kmp_uint32 offset;
626   kmp_uint32 level;
627 
628   /* Perform a hypercube-embedded tree release for all of the threads that have
629      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
630      are released in the reverse order of the corresponding gather, otherwise
631      threads are released in the same order. */
632   if (KMP_MASTER_TID(tid)) { // primary thread
633     team = __kmp_threads[gtid]->th.th_team;
634     KMP_DEBUG_ASSERT(team != NULL);
635     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
636                   "barrier type %d\n",
637                   gtid, team->t.t_id, tid, bt));
638 #if KMP_BARRIER_ICV_PUSH
639     if (propagate_icvs) { // primary already has ICVs in final destination; copy
640       copy_icvs(&thr_bar->th_fixed_icvs,
641                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
642     }
643 #endif
644   } else { // Handle fork barrier workers who aren't part of a team yet
645     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
646                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
647     // Wait for parent thread to release us
648     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
649     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
650 #if USE_ITT_BUILD && USE_ITT_NOTIFY
651     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
652       // In fork barrier where we could not get the object reliably
653       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
654       // Cancel wait on previous parallel region...
655       __kmp_itt_task_starting(itt_sync_obj);
656 
657       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
658         return;
659 
660       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
661       if (itt_sync_obj != NULL)
662         // Call prepare as early as possible for "new" barrier
663         __kmp_itt_task_finished(itt_sync_obj);
664     } else
665 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
666         // Early exit for reaping threads releasing forkjoin barrier
667         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
668       return;
669 
670     // The worker thread may now assume that the team is valid.
671     team = __kmp_threads[gtid]->th.th_team;
672     KMP_DEBUG_ASSERT(team != NULL);
673     tid = __kmp_tid_from_gtid(gtid);
674 
675     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
676     KA_TRACE(20,
677              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
678               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
679     KMP_MB(); // Flush all pending memory write invalidates.
680   }
681   num_threads = this_thr->th.th_team_nproc;
682   other_threads = team->t.t_threads;
683 
684 #ifdef KMP_REVERSE_HYPER_BAR
685   // Count up to correct level for parent
686   for (level = 0, offset = 1;
687        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
688        level += branch_bits, offset <<= branch_bits)
689     ;
690 
691   // Now go down from there
692   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
693        level -= branch_bits, offset >>= branch_bits)
694 #else
695   // Go down the tree, level by level
696   for (level = 0, offset = 1; offset < num_threads;
697        level += branch_bits, offset <<= branch_bits)
698 #endif // KMP_REVERSE_HYPER_BAR
699   {
700 #ifdef KMP_REVERSE_HYPER_BAR
701     /* Now go in reverse order through the children, highest to lowest.
702        Initial setting of child is conservative here. */
703     child = num_threads >> ((level == 0) ? level : level - 1);
704     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
705         child_tid = tid + (child << level);
706          child >= 1; child--, child_tid -= (1 << level))
707 #else
708     if (((tid >> level) & (branch_factor - 1)) != 0)
709       // No need to go lower than this, since this is the level at which the
710       // parent would be notified
711       break;
712     // Iterate through children on this level of the tree
713     for (child = 1, child_tid = tid + (1 << level);
714          child < branch_factor && child_tid < num_threads;
715          child++, child_tid += (1 << level))
716 #endif // KMP_REVERSE_HYPER_BAR
717     {
718       if (child_tid >= num_threads)
719         continue; // Child doesn't exist so keep going
720       else {
721         kmp_info_t *child_thr = other_threads[child_tid];
722         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
723 #if KMP_CACHE_MANAGE
724         kmp_uint32 next_child_tid = child_tid - (1 << level);
725 // Prefetch next thread's go count
726 #ifdef KMP_REVERSE_HYPER_BAR
727         if (child - 1 >= 1 && next_child_tid < num_threads)
728 #else
729         if (child + 1 < branch_factor && next_child_tid < num_threads)
730 #endif // KMP_REVERSE_HYPER_BAR
731           KMP_CACHE_PREFETCH(
732               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
733 #endif /* KMP_CACHE_MANAGE */
734 
735 #if KMP_BARRIER_ICV_PUSH
736         if (propagate_icvs) // push my fixed ICVs to my child
737           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
738 #endif // KMP_BARRIER_ICV_PUSH
739 
740         KA_TRACE(
741             20,
742             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
743              " go(%p): %u => %u\n",
744              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
745              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
746              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
747         // Release child from barrier
748         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
749         flag.release();
750       }
751     }
752   }
753 #if KMP_BARRIER_ICV_PUSH
754   if (propagate_icvs &&
755       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
756     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
757                              FALSE);
758     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
759               &thr_bar->th_fixed_icvs);
760   }
761 #endif
762   KA_TRACE(
763       20,
764       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
765        gtid, team->t.t_id, tid, bt));
766 }
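
/* Release order with KMP_REVERSE_HYPER_BAR (illustrative, using the same
   parameters as the gather example above: branch_bits = 1, nproc = 8): each
   thread first counts up to the level at which it was signalled, then walks
   back down, releasing its highest-level children first:

     tid 0: releases 4, then 2, then 1
     tid 4: releases 6, then 5
     tid 2: releases 3           tid 6: releases 7

   i.e. at every parent the children are released in the reverse of the order
   in which they were gathered.  Without KMP_REVERSE_HYPER_BAR the loop walks
   the levels bottom-up, matching the gather order instead. */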
767 
768 // Hierarchical Barrier
769 
770 // Initialize thread barrier data
771 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
772    Performs the minimum amount of initialization required based on how the team
773    has changed. Returns true if leaf children will require both on-core and
774    traditional wake-up mechanisms. For example, if the team size increases,
775    threads already in the team will respond to on-core wakeup on their parent
776    thread, but threads newly added to the team will only be listening on
777    their local b_go. */
778 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
779                                                    kmp_bstate_t *thr_bar,
780                                                    kmp_uint32 nproc, int gtid,
781                                                    int tid, kmp_team_t *team) {
782   // Checks to determine if (re-)initialization is needed
783   bool uninitialized = thr_bar->team == NULL;
784   bool team_changed = team != thr_bar->team;
785   bool team_sz_changed = nproc != thr_bar->nproc;
786   bool tid_changed = tid != thr_bar->old_tid;
787   bool retval = false;
788 
789   if (uninitialized || team_sz_changed) {
790     __kmp_get_hierarchy(nproc, thr_bar);
791   }
792 
793   if (uninitialized || team_sz_changed || tid_changed) {
794     thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
795     thr_bar->parent_tid = -1; // default for primary thread
796     if (!KMP_MASTER_TID(tid)) {
797       // if not primary thread, find parent thread in hierarchy
798       kmp_uint32 d = 0;
799       while (d < thr_bar->depth) { // find parent based on level of thread in
800         // hierarchy, and note level
801         kmp_uint32 rem;
802         if (d == thr_bar->depth - 2) { // reached level right below the primary
803           thr_bar->parent_tid = 0;
804           thr_bar->my_level = d;
805           break;
806         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
807           // TODO: can we make the above op faster?
808           // thread is not a subtree root at the next level, so this is its max level
809           thr_bar->parent_tid = tid - rem;
810           thr_bar->my_level = d;
811           break;
812         }
813         ++d;
814       }
815     }
816     __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
817                             (thr_bar->skip_per_level[thr_bar->my_level])),
818                        &(thr_bar->offset));
819     thr_bar->old_tid = tid;
820     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
821     thr_bar->team = team;
822     thr_bar->parent_bar =
823         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
824   }
825   if (uninitialized || team_changed || tid_changed) {
826     thr_bar->team = team;
827     thr_bar->parent_bar =
828         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
829     retval = true;
830   }
831   if (uninitialized || team_sz_changed || tid_changed) {
832     thr_bar->nproc = nproc;
833     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
834     if (thr_bar->my_level == 0)
835       thr_bar->leaf_kids = 0;
836     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
837       __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
838     thr_bar->leaf_state = 0;
839     for (int i = 0; i < thr_bar->leaf_kids; ++i)
840       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
841   }
842   return retval;
843 }
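
/* Worked example of the fields computed above (illustrative only; it assumes
   __kmp_get_hierarchy produced depth = 3 and skip_per_level = {1, 4, 16} for
   nproc = 16, which is a hypothetical machine layout):

     tid 0: my_level = 2 (root),  parent_tid = -1, leaf_kids = 3 (tids 1-3)
     tid 4: my_level = 1,         parent_tid = 0,  leaf_kids = 3 (tids 5-7)
     tid 5: my_level = 0 (leaf),  parent_tid = 4,  leaf_kids = 0

   offset = 7 - (tid - parent_tid) / skip_per_level[my_level] selects this
   thread's byte slot in the parent's 64-bit flags for the on-core protocol,
   and leaf_state is the parent-side mask of the bytes owned by its leaf
   children (see the oncore flag usage in the gather/release below). */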
844 
845 static void __kmp_hierarchical_barrier_gather(
846     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
847     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
848   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
849   kmp_team_t *team = this_thr->th.th_team;
850   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
851   kmp_uint32 nproc = this_thr->th.th_team_nproc;
852   kmp_info_t **other_threads = team->t.t_threads;
853   kmp_uint64 new_state = 0;
854 
855   int level = team->t.t_level;
856   if (other_threads[0]
857           ->th.th_teams_microtask) // are we inside the teams construct?
858     if (this_thr->th.th_teams_size.nteams > 1)
859       ++level; // level was not increased in teams construct for team_of_masters
860   if (level == 1)
861     thr_bar->use_oncore_barrier = 1;
862   else
863     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
864 
865   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
866                 "barrier type %d\n",
867                 gtid, team->t.t_id, tid, bt));
868   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
869 
870 #if USE_ITT_BUILD && USE_ITT_NOTIFY
871   // Barrier imbalance - save arrive time to the thread
872   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
873     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
874   }
875 #endif
876 
877   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
878                                                team);
879 
880   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
881     kmp_int32 child_tid;
882     new_state =
883         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
884     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
885         thr_bar->use_oncore_barrier) {
886       if (thr_bar->leaf_kids) {
887         // First, wait for leaf children to check-in on my b_arrived flag
888         kmp_uint64 leaf_state =
889             KMP_MASTER_TID(tid)
890                 ? thr_bar->b_arrived | thr_bar->leaf_state
891                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
892         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
893                       "for leaf kids\n",
894                       gtid, team->t.t_id, tid));
895         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
896         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
897         if (reduce) {
898           OMPT_REDUCTION_DECL(this_thr, gtid);
899           OMPT_REDUCTION_BEGIN;
900           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
901                ++child_tid) {
902             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
903                            "T#%d(%d:%d)\n",
904                            gtid, team->t.t_id, tid,
905                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
906                            child_tid));
907             (*reduce)(this_thr->th.th_local.reduce_data,
908                       other_threads[child_tid]->th.th_local.reduce_data);
909           }
910           OMPT_REDUCTION_END;
911         }
912         // clear leaf_state bits
913         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
914       }
915       // Next, wait for higher level children on each child's b_arrived flag
916       for (kmp_uint32 d = 1; d < thr_bar->my_level;
917            ++d) { // gather lowest level threads first, but skip 0
918         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
919                    skip = thr_bar->skip_per_level[d];
920         if (last > nproc)
921           last = nproc;
922         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
923           kmp_info_t *child_thr = other_threads[child_tid];
924           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
925           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
926                         "T#%d(%d:%d) "
927                         "arrived(%p) == %llu\n",
928                         gtid, team->t.t_id, tid,
929                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930                         child_tid, &child_bar->b_arrived, new_state));
931           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
932           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
933           if (reduce) {
934             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
935                            "T#%d(%d:%d)\n",
936                            gtid, team->t.t_id, tid,
937                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
938                            child_tid));
939             (*reduce)(this_thr->th.th_local.reduce_data,
940                       child_thr->th.th_local.reduce_data);
941           }
942         }
943       }
944     } else { // Blocktime is not infinite
945       for (kmp_uint32 d = 0; d < thr_bar->my_level;
946            ++d) { // Gather lowest level threads first
947         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948                    skip = thr_bar->skip_per_level[d];
949         if (last > nproc)
950           last = nproc;
951         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952           kmp_info_t *child_thr = other_threads[child_tid];
953           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955                         "T#%d(%d:%d) "
956                         "arrived(%p) == %llu\n",
957                         gtid, team->t.t_id, tid,
958                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959                         child_tid, &child_bar->b_arrived, new_state));
960           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
961           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962           if (reduce) {
963             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
964                            "T#%d(%d:%d)\n",
965                            gtid, team->t.t_id, tid,
966                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
967                            child_tid));
968             (*reduce)(this_thr->th.th_local.reduce_data,
969                       child_thr->th.th_local.reduce_data);
970           }
971         }
972       }
973     }
974   }
975   // All subordinates are gathered; now release parent if not primary thread
976 
977   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
978     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
979                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
980                   gtid, team->t.t_id, tid,
981                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
982                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
983                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
984     /* Mark arrival to parent: After performing this write, a worker thread may
985        not assume that the team is valid any more - it could be deallocated by
986        the primary thread at any time. */
987     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
988         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
989       // flag; release it
990       kmp_flag_64<> flag(&thr_bar->b_arrived,
991                          other_threads[thr_bar->parent_tid]);
992       flag.release();
993     } else {
994       // Leaf does special release on "offset" bits of parent's b_arrived flag
995       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
996       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
997                            thr_bar->offset + 1);
998       flag.set_waiter(other_threads[thr_bar->parent_tid]);
999       flag.release();
1000     }
1001   } else { // Primary thread needs to update the team's b_arrived value
1002     team->t.t_bar[bt].b_arrived = new_state;
1003     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1004                   "arrived(%p) = %llu\n",
1005                   gtid, team->t.t_id, tid, team->t.t_id,
1006                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1007   }
1008   // Is the team access below unsafe or just technically invalid?
1009   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1010                 "barrier type %d\n",
1011                 gtid, team->t.t_id, tid, bt));
1012 }
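
/* The two gather paths above, in brief (illustrative): with infinite
   blocktime and the on-core barrier enabled, each leaf checks in by writing
   its own byte of the parent's b_arrived,

     thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
     kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset + 1);
     flag.set_waiter(other_threads[thr_bar->parent_tid]);
     flag.release();

   so the parent can wait once for all of its leaf kids (b_arrived == expected
   value | leaf_state) and then clear the leaf_state bytes.  Non-leaf children
   are still waited for individually on their own b_arrived flags, and without
   the on-core optimization every child is handled that way. */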
1013 
1014 static void __kmp_hierarchical_barrier_release(
1015     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1016     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1017   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1018   kmp_team_t *team;
1019   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1020   kmp_uint32 nproc;
1021   bool team_change = false; // indicates on-core barrier shouldn't be used
1022 
1023   if (KMP_MASTER_TID(tid)) {
1024     team = __kmp_threads[gtid]->th.th_team;
1025     KMP_DEBUG_ASSERT(team != NULL);
1026     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1027                   "entered barrier type %d\n",
1028                   gtid, team->t.t_id, tid, bt));
1029   } else { // Worker threads
1030     // Wait for parent thread to release me
1031     if (!thr_bar->use_oncore_barrier ||
1032         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1033         thr_bar->team == NULL) {
1034       // Use traditional method of waiting on my own b_go flag
1035       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1036       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1037       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1038       TCW_8(thr_bar->b_go,
1039             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1040     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1041       // infinite, not nested
1042       // Wait on my "offset" bits on parent's b_go flag
1043       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1044       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1045                            thr_bar->offset + 1, bt,
1046                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1047       flag.wait(this_thr, TRUE);
1048       if (thr_bar->wait_flag ==
1049           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1050         TCW_8(thr_bar->b_go,
1051               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1052       } else { // Reset my bits on parent's b_go flag
1053         (RCAST(volatile char *,
1054                &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1055       }
1056     }
1057     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1058     // Early exit for reaping threads releasing forkjoin barrier
1059     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1060       return;
1061     // The worker thread may now assume that the team is valid.
1062     team = __kmp_threads[gtid]->th.th_team;
1063     KMP_DEBUG_ASSERT(team != NULL);
1064     tid = __kmp_tid_from_gtid(gtid);
1065 
1066     KA_TRACE(
1067         20,
1068         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1069          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1070     KMP_MB(); // Flush all pending memory write invalidates.
1071   }
1072 
1073   nproc = this_thr->th.th_team_nproc;
1074   int level = team->t.t_level;
1075   if (team->t.t_threads[0]
1076           ->th.th_teams_microtask) { // are we inside the teams construct?
1077     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1078         this_thr->th.th_teams_level == level)
1079       ++level; // level was not increased in teams construct for team_of_workers
1080     if (this_thr->th.th_teams_size.nteams > 1)
1081       ++level; // level was not increased in teams construct for team_of_masters
1082   }
1083   if (level == 1)
1084     thr_bar->use_oncore_barrier = 1;
1085   else
1086     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1087 
1088   // If the team size has increased, we still communicate with old leaves via
1089   // oncore barrier.
1090   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1091   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1092   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1093                                                        tid, team);
1094   // But if the entire team changes, we won't use oncore barrier at all
1095   if (team_change)
1096     old_leaf_kids = 0;
1097 
1098 #if KMP_BARRIER_ICV_PUSH
1099   if (propagate_icvs) {
1100     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1101                              FALSE);
1102     if (KMP_MASTER_TID(
1103             tid)) { // primary already has copy in final destination; copy
1104       copy_icvs(&thr_bar->th_fixed_icvs,
1105                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1106     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1107                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1108       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1109         // leaves (on-core children) pull parent's fixed ICVs directly to local
1110         // ICV store
1111         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1112                   &thr_bar->parent_bar->th_fixed_icvs);
1113       // non-leaves will get ICVs piggybacked with b_go via NGO store
1114     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1115       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1116         // access
1117         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1118       else // leaves copy parent's fixed ICVs directly to local ICV store
1119         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1120                   &thr_bar->parent_bar->th_fixed_icvs);
1121     }
1122   }
1123 #endif // KMP_BARRIER_ICV_PUSH
1124 
1125   // Now, release my children
1126   if (thr_bar->my_level) { // not a leaf
1127     kmp_int32 child_tid;
1128     kmp_uint32 last;
1129     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1130         thr_bar->use_oncore_barrier) {
1131       if (KMP_MASTER_TID(tid)) { // do a flat release
1132         // Set local b_go to bump children via NGO store of the cache line
1133         // containing ICVs and b_go.
1134         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1135         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1136         // the cache line
1137         ngo_load(&thr_bar->th_fixed_icvs);
1138         // This loops over all the threads skipping only the leaf nodes in the
1139         // hierarchy
1140         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1141              child_tid += thr_bar->skip_per_level[1]) {
1142           kmp_bstate_t *child_bar =
1143               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1144           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1145                         "releasing T#%d(%d:%d)"
1146                         " go(%p): %u => %u\n",
1147                         gtid, team->t.t_id, tid,
1148                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1149                         child_tid, &child_bar->b_go, child_bar->b_go,
1150                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1151           // Use ngo store (if available) to both store ICVs and release child
1152           // via child's b_go
1153           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1154         }
1155         ngo_sync();
1156       }
1157       TCW_8(thr_bar->b_go,
1158             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1159       // Now, release leaf children
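      // Each leaf child waits on its own portion of this thread's 64-bit b_go
      // word (the bits recorded in leaf_state), so in the steady state a
      // single OR of leaf_state releases every leaf at once; the per-child
      // kmp_flag_64 releases below are only needed for leaves added since the
      // previous barrier.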
1160       if (thr_bar->leaf_kids) { // if there are any
1161         // We test team_change on the off-chance that the level 1 team changed.
1162         if (team_change ||
1163             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1164           if (old_leaf_kids) { // release old leaf kids
1165             thr_bar->b_go |= old_leaf_state;
1166           }
1167           // Release new leaf kids
1168           last = tid + thr_bar->skip_per_level[1];
1169           if (last > nproc)
1170             last = nproc;
1171           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1172                ++child_tid) { // skip_per_level[0]=1
1173             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1174             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1175             KA_TRACE(
1176                 20,
1177                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1178                  " T#%d(%d:%d) go(%p): %u => %u\n",
1179                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1180                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1181                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1182             // Release child using child's b_go flag
1183             kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1184             flag.release();
1185           }
1186         } else { // Release all children at once with leaf_state bits on my own
1187           // b_go flag
1188           thr_bar->b_go |= thr_bar->leaf_state;
1189         }
1190       }
1191     } else { // Blocktime is not infinite; do a simple hierarchical release
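      // skip_per_level[d] is the tid stride between consecutive children at
      // depth d of this thread's subtree, so the nested loops walk the subtree
      // top-down, releasing this thread's direct children one level at a time.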
1192       for (int d = thr_bar->my_level - 1; d >= 0;
1193            --d) { // Release highest level threads first
1194         last = tid + thr_bar->skip_per_level[d + 1];
1195         kmp_uint32 skip = thr_bar->skip_per_level[d];
1196         if (last > nproc)
1197           last = nproc;
1198         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1199           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1200           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1201           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1202                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1203                         gtid, team->t.t_id, tid,
1204                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1205                         child_tid, &child_bar->b_go, child_bar->b_go,
1206                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1207           // Release child using child's b_go flag
1208           kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1209           flag.release();
1210         }
1211       }
1212     }
1213 #if KMP_BARRIER_ICV_PUSH
1214     if (propagate_icvs && !KMP_MASTER_TID(tid))
1215       // non-leaves copy ICVs from fixed ICVs to local dest
1216       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1217                 &thr_bar->th_fixed_icvs);
1218 #endif // KMP_BARRIER_ICV_PUSH
1219   }
1220   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1221                 "barrier type %d\n",
1222                 gtid, team->t.t_id, tid, bt));
1223 }
1224 
1225 // End of Barrier Algorithms
1226 
1227 // type traits for cancellable value
1228 // if cancellable is true, then is_cancellable is a normal boolean variable
1229 // if cancellable is false, then is_cancellable is a compile time constant
1230 template <bool cancellable> struct is_cancellable {};
1231 template <> struct is_cancellable<true> {
1232   bool value;
1233   is_cancellable() : value(false) {}
1234   is_cancellable(bool b) : value(b) {}
1235   is_cancellable &operator=(bool b) {
1236     value = b;
1237     return *this;
1238   }
1239   operator bool() const { return value; }
1240 };
1241 template <> struct is_cancellable<false> {
1242   is_cancellable &operator=(bool b) { return *this; }
1243   constexpr operator bool() const { return false; }
1244 };
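// For example, __kmp_barrier_template below declares
//   is_cancellable<cancellable> cancelled;
// and tests it with expressions such as '!cancelled'. When the template is
// instantiated with cancellable == false those tests fold to compile-time
// constants, so the compiler can drop the cancellation-only code paths; the
// cancellable instantiation pays for the flag only where it is actually used.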
1245 
1246 // Internal function to do a barrier.
1247 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1248    If reduce is non-NULL, do a split reduction barrier, otherwise, do a
1249    barrier without a reduction.
1250    When cancellable = false,
1251      Returns 0 if primary thread, 1 if worker thread.
1252    When cancellable = true
1253      Returns 0 if not cancelled, 1 if cancelled.  */
1254 template <bool cancellable = false>
1255 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1256                                   size_t reduce_size, void *reduce_data,
1257                                   void (*reduce)(void *, void *)) {
1258   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1259   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1260   int tid = __kmp_tid_from_gtid(gtid);
1261   kmp_info_t *this_thr = __kmp_threads[gtid];
1262   kmp_team_t *team = this_thr->th.th_team;
1263   int status = 0;
1264   is_cancellable<cancellable> cancelled;
1265 #if OMPT_SUPPORT && OMPT_OPTIONAL
1266   ompt_data_t *my_task_data;
1267   ompt_data_t *my_parallel_data;
1268   void *return_address;
1269   ompt_sync_region_t barrier_kind;
1270 #endif
1271 
1272   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1273                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1274 
1275 #if OMPT_SUPPORT
1276   if (ompt_enabled.enabled) {
1277 #if OMPT_OPTIONAL
1278     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1279     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1280     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1281     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1282     if (ompt_enabled.ompt_callback_sync_region) {
1283       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1284           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1285           return_address);
1286     }
1287     if (ompt_enabled.ompt_callback_sync_region_wait) {
1288       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1289           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1290           return_address);
1291     }
1292 #endif
1293     // It is OK to report the barrier state after the barrier begin callback.
1294     // According to the OMPT specification, a compliant implementation may
1295     // even delay reporting this state until the barrier begins to wait.
1296     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1297   }
1298 #endif
1299 
1300   if (!team->t.t_serialized) {
1301 #if USE_ITT_BUILD
1302     // This value will be used in itt notify events below.
1303     void *itt_sync_obj = NULL;
1304 #if USE_ITT_NOTIFY
1305     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1306       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1307 #endif
1308 #endif /* USE_ITT_BUILD */
1309     if (__kmp_tasking_mode == tskm_extra_barrier) {
1310       __kmp_tasking_barrier(team, this_thr, gtid);
1311       KA_TRACE(15,
1312                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1313                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1314     }
1315 
1316     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1317        access it when the team struct is not guaranteed to exist. */
1318     // See note about the corresponding code in __kmp_join_barrier() being
1319     // performance-critical.
1320     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1321 #if KMP_USE_MONITOR
1322       this_thr->th.th_team_bt_intervals =
1323           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1324       this_thr->th.th_team_bt_set =
1325           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1326 #else
1327       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1328 #endif
1329     }
1330 
1331 #if USE_ITT_BUILD
1332     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1333       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1334 #endif /* USE_ITT_BUILD */
1335 #if USE_DEBUGGER
1336     // Let the debugger know: the thread has arrived at the barrier and is waiting.
1337     if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1338       team->t.t_bar[bt].b_master_arrived += 1;
1339     } else {
1340       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1341     } // if
1342 #endif /* USE_DEBUGGER */
1343     if (reduce != NULL) {
1344       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1345       this_thr->th.th_local.reduce_data = reduce_data;
1346     }
1347 
1348     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1349       // use 0 to set up only the current task team if nthreads > 1
1350       __kmp_task_team_setup(this_thr, team, 0);
1351 
1352     if (cancellable) {
1353       cancelled = __kmp_linear_barrier_gather_cancellable(
1354           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1355     } else {
1356       switch (__kmp_barrier_gather_pattern[bt]) {
1357       case bp_hyper_bar: {
1358         // don't set branch bits to 0; use linear
1359         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1360         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1361                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1362         break;
1363       }
1364       case bp_hierarchical_bar: {
1365         __kmp_hierarchical_barrier_gather(
1366             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1367         break;
1368       }
1369       case bp_tree_bar: {
1370         // don't set branch bits to 0; use linear
1371         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1372         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1373                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1374         break;
1375       }
1376       default: {
1377         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1378                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1379       }
1380       }
1381     }
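    // __kmp_barrier_gather_pattern[bt] selects among the linear, tree, hyper
    // and hierarchical gathers, typically from the KMP_*_BARRIER_PATTERN
    // settings; the settings layer falls back to the linear pattern when the
    // corresponding branch bits would be zero, hence the asserts above.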
1382 
1383     KMP_MB();
1384 
1385     if (KMP_MASTER_TID(tid)) {
1386       status = 0;
1387       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1388         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1389       }
1390 #if USE_DEBUGGER
1391       // Let the debugger know: all threads have arrived and are starting to
1392       // leave the barrier.
1393       team->t.t_bar[bt].b_team_arrived += 1;
1394 #endif
1395 
1396       if (__kmp_omp_cancellation) {
1397         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1398         // Reset cancellation flag for worksharing constructs
1399         if (cancel_request == cancel_loop ||
1400             cancel_request == cancel_sections) {
1401           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1402         }
1403       }
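      // Only worksharing (loop/sections) cancellation requests are reset here:
      // once every thread has reached this barrier the cancelled construct is
      // over, so the request has served its purpose. Parallel and taskgroup
      // requests are left untouched at this point.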
1404 #if USE_ITT_BUILD
1405       /* TODO: In case of split reduction barrier, primary thread may send
1406          acquired event early, before the final summation into the shared
1407          variable is done (final summation can be a long operation for array
1408          reductions).  */
1409       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1410         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1411 #endif /* USE_ITT_BUILD */
1412 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1413       // Barrier - report frame end (only if active_level == 1)
1414       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1415           __kmp_forkjoin_frames_mode &&
1416           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1417            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1418           team->t.t_active_level == 1) {
1419         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1420         kmp_uint64 cur_time = __itt_get_timestamp();
1421         kmp_info_t **other_threads = team->t.t_threads;
1422         int nproc = this_thr->th.th_team_nproc;
1423         int i;
1424         switch (__kmp_forkjoin_frames_mode) {
1425         case 1:
1426           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1427                                  loc, nproc);
1428           this_thr->th.th_frame_time = cur_time;
1429           break;
1430         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1431           // be fixed)
1432           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1433                                  1, loc, nproc);
1434           break;
1435         case 3:
1436           if (__itt_metadata_add_ptr) {
1437             // Initialize with primary thread's wait time
1438             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1439             // Set arrive time to zero to be able to check it in
1440             // __kmp_invoke_task(); the same is done inside the loop below
1441             this_thr->th.th_bar_arrive_time = 0;
1442             for (i = 1; i < nproc; ++i) {
1443               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1444               other_threads[i]->th.th_bar_arrive_time = 0;
1445             }
1446             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1447                                          cur_time, delta,
1448                                          (kmp_uint64)(reduce != NULL));
1449           }
1450           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1451                                  loc, nproc);
1452           this_thr->th.th_frame_time = cur_time;
1453           break;
1454         }
1455       }
1456 #endif /* USE_ITT_BUILD */
1457     } else {
1458       status = 1;
1459 #if USE_ITT_BUILD
1460       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1461         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1462 #endif /* USE_ITT_BUILD */
1463     }
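    // Release phase: workers (status == 1) always wait to be released here.
    // For a split barrier the primary thread skips the release and returns
    // immediately; the release is issued later from __kmp_end_split_barrier().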
1464     if ((status == 1 || !is_split) && !cancelled) {
1465       if (cancellable) {
1466         cancelled = __kmp_linear_barrier_release_cancellable(
1467             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1468       } else {
1469         switch (__kmp_barrier_release_pattern[bt]) {
1470         case bp_hyper_bar: {
1471           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1472           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1473                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474           break;
1475         }
1476         case bp_hierarchical_bar: {
1477           __kmp_hierarchical_barrier_release(
1478               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1479           break;
1480         }
1481         case bp_tree_bar: {
1482           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1483           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1484                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1485           break;
1486         }
1487         default: {
1488           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1489                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1490         }
1491         }
1492       }
1493       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1494         __kmp_task_team_sync(this_thr, team);
1495       }
1496     }
1497 
1498 #if USE_ITT_BUILD
1499     /* GEH: TODO: Move this under if-condition above and also include in
1500        __kmp_end_split_barrier(). This will more accurately represent the actual
1501        release time of the threads for split barriers.  */
1502     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1503       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1504 #endif /* USE_ITT_BUILD */
1505   } else { // Team is serialized.
1506     status = 0;
1507     if (__kmp_tasking_mode != tskm_immediate_exec) {
1508       if (this_thr->th.th_task_team != NULL) {
1509 #if USE_ITT_NOTIFY
1510         void *itt_sync_obj = NULL;
1511         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1512           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1513           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1514         }
1515 #endif
1516 
1517         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1518                          TRUE);
1519         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1520         __kmp_task_team_setup(this_thr, team, 0);
1521 
1522 #if USE_ITT_BUILD
1523         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1524           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1525 #endif /* USE_ITT_BUILD */
1526       }
1527     }
1528   }
1529   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1530                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1531                 __kmp_tid_from_gtid(gtid), status));
1532 
1533 #if OMPT_SUPPORT
1534   if (ompt_enabled.enabled) {
1535 #if OMPT_OPTIONAL
1536     if (ompt_enabled.ompt_callback_sync_region_wait) {
1537       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1538           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1539           return_address);
1540     }
1541     if (ompt_enabled.ompt_callback_sync_region) {
1542       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1543           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1544           return_address);
1545     }
1546 #endif
1547     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1548   }
1549 #endif
1550 
1551   if (cancellable)
1552     return (int)cancelled;
1553   return status;
1554 }
1555 
1556 // Returns 0 if primary thread, 1 if worker thread.
1557 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1558                   size_t reduce_size, void *reduce_data,
1559                   void (*reduce)(void *, void *)) {
1560   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1561                                   reduce);
1562 }
1563 
1564 #if defined(KMP_GOMP_COMPAT)
1565 // Returns 1 if cancelled, 0 otherwise
1566 int __kmp_barrier_gomp_cancel(int gtid) {
1567   if (__kmp_omp_cancellation) {
1568     int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1569                                                  0, NULL, NULL);
1570     if (cancelled) {
1571       int tid = __kmp_tid_from_gtid(gtid);
1572       kmp_info_t *this_thr = __kmp_threads[gtid];
1573       if (KMP_MASTER_TID(tid)) {
1574         // Primary thread does not need to revert anything
1575       } else {
1576         // Workers need to revert their private b_arrived flag
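        // (the cancelled round never completes, so the arrival bump made
        // during the gather phase is undone to keep b_arrived consistent for
        // the next barrier)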
1577         this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1578             KMP_BARRIER_STATE_BUMP;
1579       }
1580     }
1581     return cancelled;
1582   }
1583   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1584   return FALSE;
1585 }
1586 #endif
1587 
1588 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1589   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1590   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1591   KMP_DEBUG_ASSERT(bt < bs_last_barrier);
1592   int tid = __kmp_tid_from_gtid(gtid);
1593   kmp_info_t *this_thr = __kmp_threads[gtid];
1594   kmp_team_t *team = this_thr->th.th_team;
1595 
1596   if (!team->t.t_serialized) {
1597     if (KMP_MASTER_GTID(gtid)) {
1598       switch (__kmp_barrier_release_pattern[bt]) {
1599       case bp_hyper_bar: {
1600         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1601         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1602                                     FALSE USE_ITT_BUILD_ARG(NULL));
1603         break;
1604       }
1605       case bp_hierarchical_bar: {
1606         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1607                                            FALSE USE_ITT_BUILD_ARG(NULL));
1608         break;
1609       }
1610       case bp_tree_bar: {
1611         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1612         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1613                                    FALSE USE_ITT_BUILD_ARG(NULL));
1614         break;
1615       }
1616       default: {
1617         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1618                                      FALSE USE_ITT_BUILD_ARG(NULL));
1619       }
1620       }
1621       if (__kmp_tasking_mode != tskm_immediate_exec) {
1622         __kmp_task_team_sync(this_thr, team);
1623       } // if
1624     }
1625   }
1626 }
1627 
1628 void __kmp_join_barrier(int gtid) {
1629   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1630   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1631 
1632   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1633 
1634   kmp_info_t *this_thr = __kmp_threads[gtid];
1635   kmp_team_t *team;
1636   kmp_uint nproc;
1637   kmp_info_t *master_thread;
1638   int tid;
1639 #ifdef KMP_DEBUG
1640   int team_id;
1641 #endif /* KMP_DEBUG */
1642 #if USE_ITT_BUILD
1643   void *itt_sync_obj = NULL;
1644 #if USE_ITT_NOTIFY
1645   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1646     // Get object created at fork_barrier
1647     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1648 #endif
1649 #endif /* USE_ITT_BUILD */
1650   KMP_MB();
1651 
1652   // Get current info
1653   team = this_thr->th.th_team;
1654   nproc = this_thr->th.th_team_nproc;
1655   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1656   tid = __kmp_tid_from_gtid(gtid);
1657 #ifdef KMP_DEBUG
1658   team_id = team->t.t_id;
1659 #endif /* KMP_DEBUG */
1660   master_thread = this_thr->th.th_team_master;
1661 #ifdef KMP_DEBUG
1662   if (master_thread != team->t.t_threads[0]) {
1663     __kmp_print_structure();
1664   }
1665 #endif /* KMP_DEBUG */
1666   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1667   KMP_MB();
1668 
1669   // Verify state
1670   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1671   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1672   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1673   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1674                 gtid, team_id, tid));
1675 
1676 #if OMPT_SUPPORT
1677   if (ompt_enabled.enabled) {
1678 #if OMPT_OPTIONAL
1679     ompt_data_t *my_task_data;
1680     ompt_data_t *my_parallel_data;
1681     void *codeptr = NULL;
1682     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1683     if (KMP_MASTER_TID(ds_tid) &&
1684         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1685          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1686       codeptr = team->t.ompt_team_info.master_return_address;
1687     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1688     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1689     if (ompt_enabled.ompt_callback_sync_region) {
1690       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1691           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1692           my_task_data, codeptr);
1693     }
1694     if (ompt_enabled.ompt_callback_sync_region_wait) {
1695       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1696           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1697           my_task_data, codeptr);
1698     }
1699     if (!KMP_MASTER_TID(ds_tid))
1700       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1701 #endif
1702     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1703   }
1704 #endif
1705 
1706   if (__kmp_tasking_mode == tskm_extra_barrier) {
1707     __kmp_tasking_barrier(team, this_thr, gtid);
1708     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1709                   team_id, tid));
1710   }
1711 #ifdef KMP_DEBUG
1712   if (__kmp_tasking_mode != tskm_immediate_exec) {
1713     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1714                   "%p, th_task_team = %p\n",
1715                   __kmp_gtid_from_thread(this_thr), team_id,
1716                   team->t.t_task_team[this_thr->th.th_task_state],
1717                   this_thr->th.th_task_team));
1718     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1719                      team->t.t_task_team[this_thr->th.th_task_state]);
1720   }
1721 #endif /* KMP_DEBUG */
1722 
1723   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1724      access it when the team struct is not guaranteed to exist. Doing these
1725      loads causes a cache miss that slows down EPCC parallel by 2x. As a
1726      workaround, we do not perform the copy if blocktime=infinite, since the
1727      values are not used by __kmp_wait_template() in that case. */
1728   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1729 #if KMP_USE_MONITOR
1730     this_thr->th.th_team_bt_intervals =
1731         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1732     this_thr->th.th_team_bt_set =
1733         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1734 #else
1735     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1736 #endif
1737   }
1738 
1739 #if USE_ITT_BUILD
1740   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1741     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1742 #endif /* USE_ITT_BUILD */
1743 
1744   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1745   case bp_hyper_bar: {
1746     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1747     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1748                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1749     break;
1750   }
1751   case bp_hierarchical_bar: {
1752     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1753                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1754     break;
1755   }
1756   case bp_tree_bar: {
1757     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1758     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1759                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1760     break;
1761   }
1762   default: {
1763     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1764                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1765   }
1766   }
1767 
1768   /* From this point on, the team data structure may be deallocated at any time
1769      by the primary thread - it is unsafe to reference it in any of the worker
1770      threads. Any per-team data items that need to be referenced before the
1771      end of the barrier should be moved to the kmp_task_team_t structs.  */
1772   if (KMP_MASTER_TID(tid)) {
1773     if (__kmp_tasking_mode != tskm_immediate_exec) {
1774       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1775     }
1776     if (__kmp_display_affinity) {
1777       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1778     }
1779 #if KMP_STATS_ENABLED
1780     // Have the primary thread flag the workers to indicate they are now
1781     // waiting for the next parallel region. Also wake them up so they switch
1782     // their timers to idle.
1783     for (int i = 0; i < team->t.t_nproc; ++i) {
1784       kmp_info_t *team_thread = team->t.t_threads[i];
1785       if (team_thread == this_thr)
1786         continue;
1787       team_thread->th.th_stats->setIdleFlag();
1788       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1789           team_thread->th.th_sleep_loc != NULL)
1790         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1791                                   team_thread->th.th_sleep_loc);
1792     }
1793 #endif
1794 #if USE_ITT_BUILD
1795     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1796       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1797 #endif /* USE_ITT_BUILD */
1798 
1799 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1800     // Join barrier - report frame end
1801     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1802         __kmp_forkjoin_frames_mode &&
1803         (this_thr->th.th_teams_microtask == NULL || // either not in teams
1804          this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1805         team->t.t_active_level == 1) {
1806       kmp_uint64 cur_time = __itt_get_timestamp();
1807       ident_t *loc = team->t.t_ident;
1808       kmp_info_t **other_threads = team->t.t_threads;
1809       int nproc = this_thr->th.th_team_nproc;
1810       int i;
1811       switch (__kmp_forkjoin_frames_mode) {
1812       case 1:
1813         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1814                                loc, nproc);
1815         break;
1816       case 2:
1817         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1818                                loc, nproc);
1819         break;
1820       case 3:
1821         if (__itt_metadata_add_ptr) {
1822           // Initialize with primary thread's wait time
1823           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1824           // Set arrive time to zero to be able to check it in
1825           // __kmp_invoke_task(); the same is done inside the loop below
1826           this_thr->th.th_bar_arrive_time = 0;
1827           for (i = 1; i < nproc; ++i) {
1828             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1829             other_threads[i]->th.th_bar_arrive_time = 0;
1830           }
1831           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1832                                        cur_time, delta, 0);
1833         }
1834         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1835                                loc, nproc);
1836         this_thr->th.th_frame_time = cur_time;
1837         break;
1838       }
1839     }
1840 #endif /* USE_ITT_BUILD */
1841   }
1842 #if USE_ITT_BUILD
1843   else {
1844     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1845       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1846   }
1847 #endif /* USE_ITT_BUILD */
1848 
1849 #if KMP_DEBUG
1850   if (KMP_MASTER_TID(tid)) {
1851     KA_TRACE(
1852         15,
1853         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1854          gtid, team_id, tid, nproc));
1855   }
1856 #endif /* KMP_DEBUG */
1857 
1858   // TODO now, mark worker threads as done so they may be disbanded
1859   KMP_MB(); // Flush all pending memory write invalidates.
1860   KA_TRACE(10,
1861            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1862 
1863 }
1864 
1865 // TODO release worker threads' fork barriers as we are ready instead of all at
1866 // once
1867 void __kmp_fork_barrier(int gtid, int tid) {
1868   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1869   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1870   kmp_info_t *this_thr = __kmp_threads[gtid];
1871   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1872 #if USE_ITT_BUILD
1873   void *itt_sync_obj = NULL;
1874 #endif /* USE_ITT_BUILD */
1875   if (team)
1876     KA_TRACE(10,
1877              ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1878               (team != NULL) ? team->t.t_id : -1, tid));
1879 
1880   // th_team pointer only valid for primary thread here
1881   if (KMP_MASTER_TID(tid)) {
1882 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1883     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1884       // Create itt barrier object
1885       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1886       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1887     }
1888 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1889 
1890 #ifdef KMP_DEBUG
1891     KMP_DEBUG_ASSERT(team);
1892     kmp_info_t **other_threads = team->t.t_threads;
1893     int i;
1894 
1895     // Verify state
1896     KMP_MB();
1897 
1898     for (i = 1; i < team->t.t_nproc; ++i) {
1899       KA_TRACE(500,
1900                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1901                 "== %u.\n",
1902                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1903                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1904                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1905       KMP_DEBUG_ASSERT(
1906           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1907            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1908       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1909     }
1910 #endif
1911 
1912     if (__kmp_tasking_mode != tskm_immediate_exec) {
1913       // 0 indicates setup current task team if nthreads > 1
1914       __kmp_task_team_setup(this_thr, team, 0);
1915     }
1916 
1917     /* The primary thread may have changed its blocktime between join barrier
1918        and fork barrier. Copy the blocktime info to the thread, where
1919        __kmp_wait_template() can access it when the team struct is not
1920        guaranteed to exist. */
1921     // See note about the corresponding code in __kmp_join_barrier() being
1922     // performance-critical
1923     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1924 #if KMP_USE_MONITOR
1925       this_thr->th.th_team_bt_intervals =
1926           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1927       this_thr->th.th_team_bt_set =
1928           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1929 #else
1930       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1931 #endif
1932     }
1933   } // primary thread
1934 
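  // All threads (including the primary) now run the release pattern for the
  // fork/join barrier; workers block here until the primary thread releases
  // them for the next parallel region. The TRUE argument enables ICV
  // propagation in the patterns that support it (cf. KMP_BARRIER_ICV_PUSH).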
1935   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1936   case bp_hyper_bar: {
1937     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1938     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1939                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1940     break;
1941   }
1942   case bp_hierarchical_bar: {
1943     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1944                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1945     break;
1946   }
1947   case bp_tree_bar: {
1948     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1949     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1950                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1951     break;
1952   }
1953   default: {
1954     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1955                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1956   }
1957   }
1958 
1959 #if OMPT_SUPPORT
1960   if (ompt_enabled.enabled &&
1961       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1962     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1963     ompt_data_t *task_data = (team)
1964                                  ? OMPT_CUR_TASK_DATA(this_thr)
1965                                  : &(this_thr->th.ompt_thread_info.task_data);
1966     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1967 #if OMPT_OPTIONAL
1968     void *codeptr = NULL;
1969     if (KMP_MASTER_TID(ds_tid) &&
1970         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1971          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1972       codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
1973     if (ompt_enabled.ompt_callback_sync_region_wait) {
1974       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1975           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
1976           codeptr);
1977     }
1978     if (ompt_enabled.ompt_callback_sync_region) {
1979       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1980           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
1981           codeptr);
1982     }
1983 #endif
1984     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1985       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1986           ompt_scope_end, NULL, task_data, 0, ds_tid,
1987           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1988     }
1989   }
1990 #endif
1991 
1992   // Early exit for reaping threads releasing forkjoin barrier
1993   if (TCR_4(__kmp_global.g.g_done)) {
1994     this_thr->th.th_task_team = NULL;
1995 
1996 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1997     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1998       if (!KMP_MASTER_TID(tid)) {
1999         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2000         if (itt_sync_obj)
2001           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2002       }
2003     }
2004 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2005     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2006     return;
2007   }
2008 
2009   /* We can now assume that a valid team structure has been allocated by the
2010      primary thread and propagated to all worker threads. The current thread,
2011      however, may not be part of the team, so we can't blindly assume that the
2012      team pointer is non-null.  */
2013   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2014   KMP_DEBUG_ASSERT(team != NULL);
2015   tid = __kmp_tid_from_gtid(gtid);
2016 
2017 #if KMP_BARRIER_ICV_PULL
2018   /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2019      __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2020      implicit task has this data before this function is called. We cannot
2021      modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2022      thread struct, because it is not always the case that the threads arrays
2023      have been allocated when __kmp_fork_call() is executed. */
2024   {
2025     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2026     if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2027       // Copy the initial ICVs from the primary thread's thread struct to the
2028       // implicit task for this tid.
2029       KA_TRACE(10,
2030                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2031       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2032                                tid, FALSE);
2033       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2034                 &team->t.t_threads[0]
2035                      ->th.th_bar[bs_forkjoin_barrier]
2036                      .bb.th_fixed_icvs);
2037     }
2038   }
2039 #endif // KMP_BARRIER_ICV_PULL
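  // Note the contrast with KMP_BARRIER_ICV_PUSH: with PULL each worker copies
  // the ICVs out of the primary thread's fork/join th_fixed_icvs after it has
  // been released, whereas with PUSH the ICVs are carried down the release
  // path itself (see __kmp_setup_icv_copy() below for how the source copy is
  // staged).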
2040 
2041   if (__kmp_tasking_mode != tskm_immediate_exec) {
2042     __kmp_task_team_sync(this_thr, team);
2043   }
2044 
2045 #if KMP_AFFINITY_SUPPORTED
2046   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2047   if (proc_bind == proc_bind_intel) {
2048     // Call dynamic affinity settings
2049     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2050       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2051     }
2052   } else if (proc_bind != proc_bind_false) {
2053     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2054       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2055                      __kmp_gtid_from_thread(this_thr),
2056                      this_thr->th.th_current_place));
2057     } else {
2058       __kmp_affinity_set_place(gtid);
2059     }
2060   }
2061 #endif // KMP_AFFINITY_SUPPORTED
2062   // Perform the display affinity functionality
2063   if (__kmp_display_affinity) {
2064     if (team->t.t_display_affinity
2065 #if KMP_AFFINITY_SUPPORTED
2066         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2067 #endif
2068     ) {
2069       // NULL means use the affinity-format-var ICV
2070       __kmp_aux_display_affinity(gtid, NULL);
2071       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2072       this_thr->th.th_prev_level = team->t.t_level;
2073     }
2074   }
2075   if (!KMP_MASTER_TID(tid))
2076     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2077 
2078 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2079   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2080     if (!KMP_MASTER_TID(tid)) {
2081       // Get correct barrier object
2082       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2083       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2084     } // (prepare called inside barrier_release)
2085   }
2086 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2087   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2088                 team->t.t_id, tid));
2089 }
2090 
2091 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2092                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2093   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2094 
2095   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2096   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2097 
2098 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2099    __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2100    implicit task has this data before this function is called. */
2101 #if KMP_BARRIER_ICV_PULL
2102   /* Copy the ICVs into th_fixed_icvs in the primary thread's thread structure
2103      (which is not otherwise modified during the barrier), where all of the
2104      worker threads can access them and make their own copies after the barrier. */
2105   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2106   // allocated at this point
2107   copy_icvs(
2108       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2109       new_icvs);
2110   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2111                 team->t.t_threads[0], team));
2112 #elif KMP_BARRIER_ICV_PUSH
2113   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2114   // done here.
2115   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2116                 team->t.t_threads[0], team));
2117 #else
2118   // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
2119   // time.
2120   ngo_load(new_icvs);
2121   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2122   // allocated at this point
2123   for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2124     // TODO: GEH - pass in better source location info since usually NULL here
2125     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2126                   f, team->t.t_threads[f], team));
2127     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2128     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2129     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2130                   f, team->t.t_threads[f], team));
2131   }
2132   ngo_sync();
2133 #endif // KMP_BARRIER_ICV_PULL
2134 }
2135