1 /*
2 * kmp_barrier.cpp
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24
25 #include "tsan_annotations.h"
26
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39
40 void __kmp_print_structure(void); // Forward declaration
41
42 // ---------------------------- Barrier Algorithms ----------------------------
43
44 // Linear Barrier
// Linear barrier, gather phase.  Each worker signals its arrival directly to
// the master (tid 0) via its own b_arrived flag; the master spins on every
// worker's b_arrived in turn and, when a reduction callback is supplied,
// folds each worker's reduce_data into its own as it collects them.
// The return value is meaningful only for the cancellable==true
// instantiation: true means the wait was interrupted by cancellation of the
// enclosing parallel region; the false instantiation always returns false.
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    // Worker path: publish arrival to the master and return immediately.
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    // Master path: wait for each worker, optionally reducing its data.
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        // Cancellable flag variant: wait() returns true if the wait was
        // aborted by cancellation, which we propagate to the caller.
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold this worker's contribution into the master's reduce_data.
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
151
// Linear barrier, release phase.  The master optionally pushes ICVs to every
// worker's implicit task, then releases each worker by bumping its b_go flag;
// workers spin on their own b_go until released.  The return value is
// meaningful only for the cancellable==true instantiation: true means the
// worker's wait was interrupted by cancellation.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    // Master path: copy ICVs (if requested) and release all workers.
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy master's ICVs into each worker's implicit task (ngo_* use
          // non-temporal stores on KMP_MIC builds).
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    // Reset own b_go so the next barrier starts from a clean state.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
261
__kmp_linear_barrier_gather(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))262 static void __kmp_linear_barrier_gather(
263 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265 __kmp_linear_barrier_gather_template<false>(
266 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268
__kmp_linear_barrier_gather_cancellable(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))269 static bool __kmp_linear_barrier_gather_cancellable(
270 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272 return __kmp_linear_barrier_gather_template<true>(
273 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275
__kmp_linear_barrier_release(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))276 static void __kmp_linear_barrier_release(
277 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279 __kmp_linear_barrier_release_template<false>(
280 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282
__kmp_linear_barrier_release_cancellable(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))283 static bool __kmp_linear_barrier_release_cancellable(
284 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286 return __kmp_linear_barrier_release_template<true>(
287 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289
290 // Tree barrier
// Tree barrier, gather phase.  Thread tid waits for its children
// (tids (tid << branch_bits)+1 .. +branch_factor, bounded by nproc),
// optionally reducing their data, then signals arrival to its parent
// (tid-1) >> branch_bits.  The master (tid 0) has no parent and instead
// publishes the new team arrived state.
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                          int tid, void (*reduce)(void *, void *)
                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1; // tid of this thread's first child
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold this child's contribution into our own reduce_data.
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the master thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
402
// Tree barrier, release phase.  Non-master threads first wait on their own
// b_go flag; once released (or if master), each thread releases its children
// in tid order, optionally pushing ICVs into each child's implicit task
// before the release.
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    // Reset own b_go so the next barrier starts from a clean state.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1; // tid of this thread's first child

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Push master's ICVs into the child's implicit task before release.
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
509
510 // Hyper Barrier
// Hypercube barrier, gather phase.  A hypercube-embedded tree: at each level
// a thread either signals its parent (if its tid has a non-zero digit at that
// level) and stops, or waits for its children at that level, reducing their
// data as they arrive.  The master updates the team arrived state at the end.
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                           int tid, void (*reduce)(void *, void *)
                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  // Sentinel value: replaced by the real bumped state the first time this
  // thread acts as a parent (see the UNUSED_STATE checks below).
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      // This thread is a child at this level: signal its parent and stop.
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the master thread at any time. */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold this child's contribution into our own reduce_data.
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
633
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
// Hypercube barrier, release phase.  The master (or a released parent)
// releases its children level by level; with KMP_REVERSE_HYPER_BAR (the
// default) children are released in the reverse order of the gather.  ICVs
// travel through each thread's th_fixed_icvs staging area and are copied to
// the final destination after the thread is released.
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // master
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // master already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    // Reset own b_go so the next barrier starts from a clean state.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        ANNOTATE_BARRIER_BEGIN(child_thr);
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}
792
793 // Hierarchical Barrier
794
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797 Performs the minimum amount of initialization required based on how the team
798 has changed. Returns true if leaf children will require both on-core and
799 traditional wake-up mechanisms. For example, if the team size increases,
800 threads already in the team will respond to on-core wakeup on their parent
801 thread, but threads newly added to the team will only be listening on the
802 their local b_go. */
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt,kmp_bstate_t * thr_bar,kmp_uint32 nproc,int gtid,int tid,kmp_team_t * team)803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804 kmp_bstate_t *thr_bar,
805 kmp_uint32 nproc, int gtid,
806 int tid, kmp_team_t *team) {
807 // Checks to determine if (re-)initialization is needed
808 bool uninitialized = thr_bar->team == NULL;
809 bool team_changed = team != thr_bar->team;
810 bool team_sz_changed = nproc != thr_bar->nproc;
811 bool tid_changed = tid != thr_bar->old_tid;
812 bool retval = false;
813
814 if (uninitialized || team_sz_changed) {
815 __kmp_get_hierarchy(nproc, thr_bar);
816 }
817
818 if (uninitialized || team_sz_changed || tid_changed) {
819 thr_bar->my_level = thr_bar->depth - 1; // default for master
820 thr_bar->parent_tid = -1; // default for master
821 if (!KMP_MASTER_TID(
822 tid)) { // if not master, find parent thread in hierarchy
823 kmp_uint32 d = 0;
824 while (d < thr_bar->depth) { // find parent based on level of thread in
825 // hierarchy, and note level
826 kmp_uint32 rem;
827 if (d == thr_bar->depth - 2) { // reached level right below the master
828 thr_bar->parent_tid = 0;
829 thr_bar->my_level = d;
830 break;
831 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
832 // TODO: can we make the above op faster?
833 // thread is not a subtree root at next level, so this is max
834 thr_bar->parent_tid = tid - rem;
835 thr_bar->my_level = d;
836 break;
837 }
838 ++d;
839 }
840 }
841 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
842 (thr_bar->skip_per_level[thr_bar->my_level])),
843 &(thr_bar->offset));
844 thr_bar->old_tid = tid;
845 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846 thr_bar->team = team;
847 thr_bar->parent_bar =
848 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849 }
850 if (uninitialized || team_changed || tid_changed) {
851 thr_bar->team = team;
852 thr_bar->parent_bar =
853 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854 retval = true;
855 }
856 if (uninitialized || team_sz_changed || tid_changed) {
857 thr_bar->nproc = nproc;
858 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859 if (thr_bar->my_level == 0)
860 thr_bar->leaf_kids = 0;
861 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
863 thr_bar->leaf_state = 0;
864 for (int i = 0; i < thr_bar->leaf_kids; ++i)
865 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866 }
867 return retval;
868 }
869
__kmp_hierarchical_barrier_gather(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))870 static void __kmp_hierarchical_barrier_gather(
871 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874 kmp_team_t *team = this_thr->th.th_team;
875 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876 kmp_uint32 nproc = this_thr->th.th_team_nproc;
877 kmp_info_t **other_threads = team->t.t_threads;
878 kmp_uint64 new_state;
879
880 int level = team->t.t_level;
881 if (other_threads[0]
882 ->th.th_teams_microtask) // are we inside the teams construct?
883 if (this_thr->th.th_teams_size.nteams > 1)
884 ++level; // level was not increased in teams construct for team_of_masters
885 if (level == 1)
886 thr_bar->use_oncore_barrier = 1;
887 else
888 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889
890 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891 "barrier type %d\n",
892 gtid, team->t.t_id, tid, bt));
893 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896 // Barrier imbalance - save arrive time to the thread
897 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899 }
900 #endif
901
902 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903 team);
904
905 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906 kmp_int32 child_tid;
907 new_state =
908 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910 thr_bar->use_oncore_barrier) {
911 if (thr_bar->leaf_kids) {
912 // First, wait for leaf children to check-in on my b_arrived flag
913 kmp_uint64 leaf_state =
914 KMP_MASTER_TID(tid)
915 ? thr_bar->b_arrived | thr_bar->leaf_state
916 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918 "for leaf kids\n",
919 gtid, team->t.t_id, tid));
920 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
921 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922 if (reduce) {
923 ANNOTATE_REDUCE_AFTER(reduce);
924 OMPT_REDUCTION_DECL(this_thr, gtid);
925 OMPT_REDUCTION_BEGIN;
926 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927 ++child_tid) {
928 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929 "T#%d(%d:%d)\n",
930 gtid, team->t.t_id, tid,
931 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932 child_tid));
933 ANNOTATE_BARRIER_END(other_threads[child_tid]);
934 (*reduce)(this_thr->th.th_local.reduce_data,
935 other_threads[child_tid]->th.th_local.reduce_data);
936 }
937 OMPT_REDUCTION_END;
938 ANNOTATE_REDUCE_BEFORE(reduce);
939 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940 }
941 // clear leaf_state bits
942 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943 }
944 // Next, wait for higher level children on each child's b_arrived flag
945 for (kmp_uint32 d = 1; d < thr_bar->my_level;
946 ++d) { // gather lowest level threads first, but skip 0
947 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948 skip = thr_bar->skip_per_level[d];
949 if (last > nproc)
950 last = nproc;
951 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952 kmp_info_t *child_thr = other_threads[child_tid];
953 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955 "T#%d(%d:%d) "
956 "arrived(%p) == %llu\n",
957 gtid, team->t.t_id, tid,
958 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959 child_tid, &child_bar->b_arrived, new_state));
960 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
961 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962 ANNOTATE_BARRIER_END(child_thr);
963 if (reduce) {
964 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965 "T#%d(%d:%d)\n",
966 gtid, team->t.t_id, tid,
967 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968 child_tid));
969 ANNOTATE_REDUCE_AFTER(reduce);
970 (*reduce)(this_thr->th.th_local.reduce_data,
971 child_thr->th.th_local.reduce_data);
972 ANNOTATE_REDUCE_BEFORE(reduce);
973 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974 }
975 }
976 }
977 } else { // Blocktime is not infinite
978 for (kmp_uint32 d = 0; d < thr_bar->my_level;
979 ++d) { // Gather lowest level threads first
980 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981 skip = thr_bar->skip_per_level[d];
982 if (last > nproc)
983 last = nproc;
984 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985 kmp_info_t *child_thr = other_threads[child_tid];
986 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988 "T#%d(%d:%d) "
989 "arrived(%p) == %llu\n",
990 gtid, team->t.t_id, tid,
991 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992 child_tid, &child_bar->b_arrived, new_state));
993 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
994 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995 ANNOTATE_BARRIER_END(child_thr);
996 if (reduce) {
997 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998 "T#%d(%d:%d)\n",
999 gtid, team->t.t_id, tid,
1000 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001 child_tid));
1002 ANNOTATE_REDUCE_AFTER(reduce);
1003 (*reduce)(this_thr->th.th_local.reduce_data,
1004 child_thr->th.th_local.reduce_data);
1005 ANNOTATE_REDUCE_BEFORE(reduce);
1006 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007 }
1008 }
1009 }
1010 }
1011 }
1012 // All subordinates are gathered; now release parent if not master thread
1013
1014 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017 gtid, team->t.t_id, tid,
1018 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021 /* Mark arrival to parent: After performing this write, a worker thread may
1022 not assume that the team is valid any more - it could be deallocated by
1023 the master thread at any time. */
1024 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026 // flag; release it
1027 ANNOTATE_BARRIER_BEGIN(this_thr);
1028 kmp_flag_64<> flag(&thr_bar->b_arrived,
1029 other_threads[thr_bar->parent_tid]);
1030 flag.release();
1031 } else {
1032 // Leaf does special release on "offset" bits of parent's b_arrived flag
1033 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1034 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1035 thr_bar->offset + 1);
1036 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1037 flag.release();
1038 }
1039 } else { // Master thread needs to update the team's b_arrived value
1040 team->t.t_bar[bt].b_arrived = new_state;
1041 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1042 "arrived(%p) = %llu\n",
1043 gtid, team->t.t_id, tid, team->t.t_id,
1044 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1045 }
1046 // Is the team access below unsafe or just technically invalid?
1047 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1048 "barrier type %d\n",
1049 gtid, team->t.t_id, tid, bt));
1050 }
1051
__kmp_hierarchical_barrier_release(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))1052 static void __kmp_hierarchical_barrier_release(
1053 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1054 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1055 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1056 kmp_team_t *team;
1057 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1058 kmp_uint32 nproc;
1059 bool team_change = false; // indicates on-core barrier shouldn't be used
1060
1061 if (KMP_MASTER_TID(tid)) {
1062 team = __kmp_threads[gtid]->th.th_team;
1063 KMP_DEBUG_ASSERT(team != NULL);
1064 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1065 "entered barrier type %d\n",
1066 gtid, team->t.t_id, tid, bt));
1067 } else { // Worker threads
1068 // Wait for parent thread to release me
1069 if (!thr_bar->use_oncore_barrier ||
1070 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1071 thr_bar->team == NULL) {
1072 // Use traditional method of waiting on my own b_go flag
1073 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1074 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1075 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1076 ANNOTATE_BARRIER_END(this_thr);
1077 TCW_8(thr_bar->b_go,
1078 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1079 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1080 // infinite, not nested
1081 // Wait on my "offset" bits on parent's b_go flag
1082 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1083 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1084 thr_bar->offset + 1, bt,
1085 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1086 flag.wait(this_thr, TRUE);
1087 if (thr_bar->wait_flag ==
1088 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1089 TCW_8(thr_bar->b_go,
1090 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1091 } else { // Reset my bits on parent's b_go flag
1092 (RCAST(volatile char *,
1093 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1094 }
1095 }
1096 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1097 // Early exit for reaping threads releasing forkjoin barrier
1098 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1099 return;
1100 // The worker thread may now assume that the team is valid.
1101 team = __kmp_threads[gtid]->th.th_team;
1102 KMP_DEBUG_ASSERT(team != NULL);
1103 tid = __kmp_tid_from_gtid(gtid);
1104
1105 KA_TRACE(
1106 20,
1107 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1108 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1109 KMP_MB(); // Flush all pending memory write invalidates.
1110 }
1111
1112 nproc = this_thr->th.th_team_nproc;
1113 int level = team->t.t_level;
1114 if (team->t.t_threads[0]
1115 ->th.th_teams_microtask) { // are we inside the teams construct?
1116 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1117 this_thr->th.th_teams_level == level)
1118 ++level; // level was not increased in teams construct for team_of_workers
1119 if (this_thr->th.th_teams_size.nteams > 1)
1120 ++level; // level was not increased in teams construct for team_of_masters
1121 }
1122 if (level == 1)
1123 thr_bar->use_oncore_barrier = 1;
1124 else
1125 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1126
1127 // If the team size has increased, we still communicate with old leaves via
1128 // oncore barrier.
1129 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1130 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1131 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1132 tid, team);
1133 // But if the entire team changes, we won't use oncore barrier at all
1134 if (team_change)
1135 old_leaf_kids = 0;
1136
1137 #if KMP_BARRIER_ICV_PUSH
1138 if (propagate_icvs) {
1139 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1140 FALSE);
1141 if (KMP_MASTER_TID(
1142 tid)) { // master already has copy in final destination; copy
1143 copy_icvs(&thr_bar->th_fixed_icvs,
1144 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1145 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1146 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1147 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1148 // leaves (on-core children) pull parent's fixed ICVs directly to local
1149 // ICV store
1150 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1151 &thr_bar->parent_bar->th_fixed_icvs);
1152 // non-leaves will get ICVs piggybacked with b_go via NGO store
1153 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1154 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1155 // access
1156 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1157 else // leaves copy parent's fixed ICVs directly to local ICV store
1158 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1159 &thr_bar->parent_bar->th_fixed_icvs);
1160 }
1161 }
1162 #endif // KMP_BARRIER_ICV_PUSH
1163
1164 // Now, release my children
1165 if (thr_bar->my_level) { // not a leaf
1166 kmp_int32 child_tid;
1167 kmp_uint32 last;
1168 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1169 thr_bar->use_oncore_barrier) {
1170 if (KMP_MASTER_TID(tid)) { // do a flat release
1171 // Set local b_go to bump children via NGO store of the cache line
1172 // containing IVCs and b_go.
1173 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1174 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1175 // the cache line
1176 ngo_load(&thr_bar->th_fixed_icvs);
1177 // This loops over all the threads skipping only the leaf nodes in the
1178 // hierarchy
1179 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1180 child_tid += thr_bar->skip_per_level[1]) {
1181 kmp_bstate_t *child_bar =
1182 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1183 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1184 "releasing T#%d(%d:%d)"
1185 " go(%p): %u => %u\n",
1186 gtid, team->t.t_id, tid,
1187 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1188 child_tid, &child_bar->b_go, child_bar->b_go,
1189 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1190 // Use ngo store (if available) to both store ICVs and release child
1191 // via child's b_go
1192 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1193 }
1194 ngo_sync();
1195 }
1196 TCW_8(thr_bar->b_go,
1197 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1198 // Now, release leaf children
1199 if (thr_bar->leaf_kids) { // if there are any
1200 // We test team_change on the off-chance that the level 1 team changed.
1201 if (team_change ||
1202 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1203 if (old_leaf_kids) { // release old leaf kids
1204 thr_bar->b_go |= old_leaf_state;
1205 }
1206 // Release new leaf kids
1207 last = tid + thr_bar->skip_per_level[1];
1208 if (last > nproc)
1209 last = nproc;
1210 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1211 ++child_tid) { // skip_per_level[0]=1
1212 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1213 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1214 KA_TRACE(
1215 20,
1216 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1217 " T#%d(%d:%d) go(%p): %u => %u\n",
1218 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1219 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1220 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1221 // Release child using child's b_go flag
1222 ANNOTATE_BARRIER_BEGIN(child_thr);
1223 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1224 flag.release();
1225 }
1226 } else { // Release all children at once with leaf_state bits on my own
1227 // b_go flag
1228 thr_bar->b_go |= thr_bar->leaf_state;
1229 }
1230 }
1231 } else { // Blocktime is not infinite; do a simple hierarchical release
1232 for (int d = thr_bar->my_level - 1; d >= 0;
1233 --d) { // Release highest level threads first
1234 last = tid + thr_bar->skip_per_level[d + 1];
1235 kmp_uint32 skip = thr_bar->skip_per_level[d];
1236 if (last > nproc)
1237 last = nproc;
1238 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1239 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1240 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1241 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1242 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1243 gtid, team->t.t_id, tid,
1244 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1245 child_tid, &child_bar->b_go, child_bar->b_go,
1246 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1247 // Release child using child's b_go flag
1248 ANNOTATE_BARRIER_BEGIN(child_thr);
1249 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1250 flag.release();
1251 }
1252 }
1253 }
1254 #if KMP_BARRIER_ICV_PUSH
1255 if (propagate_icvs && !KMP_MASTER_TID(tid))
1256 // non-leaves copy ICVs from fixed ICVs to local dest
1257 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1258 &thr_bar->th_fixed_icvs);
1259 #endif // KMP_BARRIER_ICV_PUSH
1260 }
1261 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1262 "barrier type %d\n",
1263 gtid, team->t.t_id, tid, bt));
1264 }
1265
1266 // End of Barrier Algorithms
1267
1268 // type traits for cancellable value
1269 // if cancellable is true, then is_cancellable is a normal boolean variable
1270 // if cancellable is false, then is_cancellable is a compile time constant
// Primary template; only the two explicit specializations below are used.
template <bool cancellable> struct is_cancellable {};
// Cancellable case: wraps a genuine runtime boolean (initially false).
template <> struct is_cancellable<true> {
  bool value = false;
  is_cancellable() {}
  is_cancellable(bool b) { value = b; }
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
// Non-cancellable case: a compile-time constant false; assignment is a no-op.
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
1286
1287 // Internal function to do a barrier.
1288 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1289 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1290 barrier
1291 When cancellable = false,
1292 Returns 0 if master thread, 1 if worker thread.
1293 When cancellable = true
1294 Returns 0 if not cancelled, 1 if cancelled. */
1295 template <bool cancellable = false>
__kmp_barrier_template(enum barrier_type bt,int gtid,int is_split,size_t reduce_size,void * reduce_data,void (* reduce)(void *,void *))1296 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1297 size_t reduce_size, void *reduce_data,
1298 void (*reduce)(void *, void *)) {
1299 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1300 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1301 int tid = __kmp_tid_from_gtid(gtid);
1302 kmp_info_t *this_thr = __kmp_threads[gtid];
1303 kmp_team_t *team = this_thr->th.th_team;
1304 int status = 0;
1305 is_cancellable<cancellable> cancelled;
1306 #if OMPT_SUPPORT && OMPT_OPTIONAL
1307 ompt_data_t *my_task_data;
1308 ompt_data_t *my_parallel_data;
1309 void *return_address;
1310 ompt_sync_region_t barrier_kind;
1311 #endif
1312
1313 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1314 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1315
1316 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1317 #if OMPT_SUPPORT
1318 if (ompt_enabled.enabled) {
1319 #if OMPT_OPTIONAL
1320 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1321 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1322 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1323 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1324 if (ompt_enabled.ompt_callback_sync_region) {
1325 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1326 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1327 return_address);
1328 }
1329 if (ompt_enabled.ompt_callback_sync_region_wait) {
1330 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1331 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1332 return_address);
1333 }
1334 #endif
1335 // It is OK to report the barrier state after the barrier begin callback.
1336 // According to the OMPT specification, a compliant implementation may
1337 // even delay reporting this state until the barrier begins to wait.
1338 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1339 }
1340 #endif
1341
1342 if (!team->t.t_serialized) {
1343 #if USE_ITT_BUILD
1344 // This value will be used in itt notify events below.
1345 void *itt_sync_obj = NULL;
1346 #if USE_ITT_NOTIFY
1347 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1348 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1349 #endif
1350 #endif /* USE_ITT_BUILD */
1351 if (__kmp_tasking_mode == tskm_extra_barrier) {
1352 __kmp_tasking_barrier(team, this_thr, gtid);
1353 KA_TRACE(15,
1354 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1355 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1356 }
1357
1358 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1359 access it when the team struct is not guaranteed to exist. */
1360 // See note about the corresponding code in __kmp_join_barrier() being
1361 // performance-critical.
1362 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1363 #if KMP_USE_MONITOR
1364 this_thr->th.th_team_bt_intervals =
1365 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1366 this_thr->th.th_team_bt_set =
1367 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1368 #else
1369 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1370 #endif
1371 }
1372
1373 #if USE_ITT_BUILD
1374 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1375 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1376 #endif /* USE_ITT_BUILD */
1377 #if USE_DEBUGGER
1378 // Let the debugger know: the thread arrived to the barrier and waiting.
1379 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1380 team->t.t_bar[bt].b_master_arrived += 1;
1381 } else {
1382 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1383 } // if
1384 #endif /* USE_DEBUGGER */
1385 if (reduce != NULL) {
1386 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1387 this_thr->th.th_local.reduce_data = reduce_data;
1388 }
1389
1390 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1391 // use 0 to only setup the current team if nthreads > 1
1392 __kmp_task_team_setup(this_thr, team, 0);
1393
1394 if (cancellable) {
1395 cancelled = __kmp_linear_barrier_gather_cancellable(
1396 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1397 } else {
1398 switch (__kmp_barrier_gather_pattern[bt]) {
1399 case bp_hyper_bar: {
1400 // don't set branch bits to 0; use linear
1401 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1402 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1403 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1404 break;
1405 }
1406 case bp_hierarchical_bar: {
1407 __kmp_hierarchical_barrier_gather(
1408 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1409 break;
1410 }
1411 case bp_tree_bar: {
1412 // don't set branch bits to 0; use linear
1413 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1414 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1415 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1416 break;
1417 }
1418 default: {
1419 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1420 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1421 }
1422 }
1423 }
1424
1425 KMP_MB();
1426
1427 if (KMP_MASTER_TID(tid)) {
1428 status = 0;
1429 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1430 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1431 }
1432 #if USE_DEBUGGER
1433 // Let the debugger know: All threads are arrived and starting leaving the
1434 // barrier.
1435 team->t.t_bar[bt].b_team_arrived += 1;
1436 #endif
1437
1438 if (__kmp_omp_cancellation) {
1439 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1440 // Reset cancellation flag for worksharing constructs
1441 if (cancel_request == cancel_loop ||
1442 cancel_request == cancel_sections) {
1443 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1444 }
1445 }
1446 #if USE_ITT_BUILD
1447 /* TODO: In case of split reduction barrier, master thread may send
1448 acquired event early, before the final summation into the shared
1449 variable is done (final summation can be a long operation for array
1450 reductions). */
1451 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1452 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1453 #endif /* USE_ITT_BUILD */
1454 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1455 // Barrier - report frame end (only if active_level == 1)
1456 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1457 __kmp_forkjoin_frames_mode &&
1458 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1459 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1460 team->t.t_active_level == 1) {
1461 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1462 kmp_uint64 cur_time = __itt_get_timestamp();
1463 kmp_info_t **other_threads = team->t.t_threads;
1464 int nproc = this_thr->th.th_team_nproc;
1465 int i;
1466 switch (__kmp_forkjoin_frames_mode) {
1467 case 1:
1468 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1469 loc, nproc);
1470 this_thr->th.th_frame_time = cur_time;
1471 break;
1472 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1473 // be fixed)
1474 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1475 1, loc, nproc);
1476 break;
1477 case 3:
1478 if (__itt_metadata_add_ptr) {
1479 // Initialize with master's wait time
1480 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1481 // Set arrive time to zero to be able to check it in
1482 // __kmp_invoke_task(); the same is done inside the loop below
1483 this_thr->th.th_bar_arrive_time = 0;
1484 for (i = 1; i < nproc; ++i) {
1485 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1486 other_threads[i]->th.th_bar_arrive_time = 0;
1487 }
1488 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1489 cur_time, delta,
1490 (kmp_uint64)(reduce != NULL));
1491 }
1492 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1493 loc, nproc);
1494 this_thr->th.th_frame_time = cur_time;
1495 break;
1496 }
1497 }
1498 #endif /* USE_ITT_BUILD */
1499 } else {
1500 status = 1;
1501 #if USE_ITT_BUILD
1502 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1503 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1504 #endif /* USE_ITT_BUILD */
1505 }
1506 if ((status == 1 || !is_split) && !cancelled) {
1507 if (cancellable) {
1508 cancelled = __kmp_linear_barrier_release_cancellable(
1509 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1510 } else {
1511 switch (__kmp_barrier_release_pattern[bt]) {
1512 case bp_hyper_bar: {
1513 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1514 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1515 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516 break;
1517 }
1518 case bp_hierarchical_bar: {
1519 __kmp_hierarchical_barrier_release(
1520 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1521 break;
1522 }
1523 case bp_tree_bar: {
1524 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1525 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1526 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1527 break;
1528 }
1529 default: {
1530 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1531 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1532 }
1533 }
1534 }
1535 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1536 __kmp_task_team_sync(this_thr, team);
1537 }
1538 }
1539
1540 #if USE_ITT_BUILD
1541 /* GEH: TODO: Move this under if-condition above and also include in
1542 __kmp_end_split_barrier(). This will more accurately represent the actual
1543 release time of the threads for split barriers. */
1544 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1545 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1546 #endif /* USE_ITT_BUILD */
1547 } else { // Team is serialized.
1548 status = 0;
1549 if (__kmp_tasking_mode != tskm_immediate_exec) {
1550 if (this_thr->th.th_task_team != NULL) {
1551 #if USE_ITT_NOTIFY
1552 void *itt_sync_obj = NULL;
1553 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1554 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1555 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1556 }
1557 #endif
1558
1559 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1560 TRUE);
1561 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1562 __kmp_task_team_setup(this_thr, team, 0);
1563
1564 #if USE_ITT_BUILD
1565 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1566 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1567 #endif /* USE_ITT_BUILD */
1568 }
1569 }
1570 }
1571 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1572 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1573 __kmp_tid_from_gtid(gtid), status));
1574
1575 #if OMPT_SUPPORT
1576 if (ompt_enabled.enabled) {
1577 #if OMPT_OPTIONAL
1578 if (ompt_enabled.ompt_callback_sync_region_wait) {
1579 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1580 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581 return_address);
1582 }
1583 if (ompt_enabled.ompt_callback_sync_region) {
1584 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1585 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1586 return_address);
1587 }
1588 #endif
1589 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1590 }
1591 #endif
1592 ANNOTATE_BARRIER_END(&team->t.t_bar);
1593
1594 if (cancellable)
1595 return (int)cancelled;
1596 return status;
1597 }
1598
1599 // Returns 0 if master thread, 1 if worker thread.
__kmp_barrier(enum barrier_type bt,int gtid,int is_split,size_t reduce_size,void * reduce_data,void (* reduce)(void *,void *))1600 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1601 size_t reduce_size, void *reduce_data,
1602 void (*reduce)(void *, void *)) {
1603 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1604 reduce);
1605 }
1606
#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (!__kmp_omp_cancellation) {
    // Cancellation disabled: behave as an ordinary plain barrier.
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }
  // Run the cancellable instantiation of the barrier template.
  int cancelled =
      __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  if (cancelled && !KMP_MASTER_TID(__kmp_tid_from_gtid(gtid))) {
    // The master has nothing to revert; each worker must undo the bump it
    // applied to its private b_arrived flag before cancellation.
    kmp_info_t *this_thr = __kmp_threads[gtid];
    this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
        KMP_BARRIER_STATE_BUMP;
  }
  return cancelled;
}
#endif
1630
__kmp_end_split_barrier(enum barrier_type bt,int gtid)1631 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1632 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1633 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1634 int tid = __kmp_tid_from_gtid(gtid);
1635 kmp_info_t *this_thr = __kmp_threads[gtid];
1636 kmp_team_t *team = this_thr->th.th_team;
1637
1638 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1639 if (!team->t.t_serialized) {
1640 if (KMP_MASTER_GTID(gtid)) {
1641 switch (__kmp_barrier_release_pattern[bt]) {
1642 case bp_hyper_bar: {
1643 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1644 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1645 FALSE USE_ITT_BUILD_ARG(NULL));
1646 break;
1647 }
1648 case bp_hierarchical_bar: {
1649 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1650 FALSE USE_ITT_BUILD_ARG(NULL));
1651 break;
1652 }
1653 case bp_tree_bar: {
1654 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1655 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1656 FALSE USE_ITT_BUILD_ARG(NULL));
1657 break;
1658 }
1659 default: {
1660 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1661 FALSE USE_ITT_BUILD_ARG(NULL));
1662 }
1663 }
1664 if (__kmp_tasking_mode != tskm_immediate_exec) {
1665 __kmp_task_team_sync(this_thr, team);
1666 } // if
1667 }
1668 }
1669 ANNOTATE_BARRIER_END(&team->t.t_bar);
1670 }
1671
__kmp_join_barrier(int gtid)1672 void __kmp_join_barrier(int gtid) {
1673 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1674 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1675 kmp_info_t *this_thr = __kmp_threads[gtid];
1676 kmp_team_t *team;
1677 kmp_uint nproc;
1678 kmp_info_t *master_thread;
1679 int tid;
1680 #ifdef KMP_DEBUG
1681 int team_id;
1682 #endif /* KMP_DEBUG */
1683 #if USE_ITT_BUILD
1684 void *itt_sync_obj = NULL;
1685 #if USE_ITT_NOTIFY
1686 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1687 // Get object created at fork_barrier
1688 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1689 #endif
1690 #endif /* USE_ITT_BUILD */
1691 KMP_MB();
1692
1693 // Get current info
1694 team = this_thr->th.th_team;
1695 nproc = this_thr->th.th_team_nproc;
1696 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1697 tid = __kmp_tid_from_gtid(gtid);
1698 #ifdef KMP_DEBUG
1699 team_id = team->t.t_id;
1700 #endif /* KMP_DEBUG */
1701 master_thread = this_thr->th.th_team_master;
1702 #ifdef KMP_DEBUG
1703 if (master_thread != team->t.t_threads[0]) {
1704 __kmp_print_structure();
1705 }
1706 #endif /* KMP_DEBUG */
1707 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1708 KMP_MB();
1709
1710 // Verify state
1711 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1712 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1713 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1714 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1715 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1716 gtid, team_id, tid));
1717
1718 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1719 #if OMPT_SUPPORT
1720 if (ompt_enabled.enabled) {
1721 #if OMPT_OPTIONAL
1722 ompt_data_t *my_task_data;
1723 ompt_data_t *my_parallel_data;
1724 void *codeptr = NULL;
1725 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1726 if (KMP_MASTER_TID(ds_tid) &&
1727 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1728 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1729 codeptr = team->t.ompt_team_info.master_return_address;
1730 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1731 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1732 if (ompt_enabled.ompt_callback_sync_region) {
1733 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1734 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735 my_task_data, codeptr);
1736 }
1737 if (ompt_enabled.ompt_callback_sync_region_wait) {
1738 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1739 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1740 my_task_data, codeptr);
1741 }
1742 if (!KMP_MASTER_TID(ds_tid))
1743 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1744 #endif
1745 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1746 }
1747 #endif
1748
1749 if (__kmp_tasking_mode == tskm_extra_barrier) {
1750 __kmp_tasking_barrier(team, this_thr, gtid);
1751 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1752 team_id, tid));
1753 }
1754 #ifdef KMP_DEBUG
1755 if (__kmp_tasking_mode != tskm_immediate_exec) {
1756 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1757 "%p, th_task_team = %p\n",
1758 __kmp_gtid_from_thread(this_thr), team_id,
1759 team->t.t_task_team[this_thr->th.th_task_state],
1760 this_thr->th.th_task_team));
1761 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1762 team->t.t_task_team[this_thr->th.th_task_state]);
1763 }
1764 #endif /* KMP_DEBUG */
1765
1766 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1767 access it when the team struct is not guaranteed to exist. Doing these
1768 loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1769 we do not perform the copy if blocktime=infinite, since the values are not
1770 used by __kmp_wait_template() in that case. */
1771 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1772 #if KMP_USE_MONITOR
1773 this_thr->th.th_team_bt_intervals =
1774 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1775 this_thr->th.th_team_bt_set =
1776 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1777 #else
1778 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1779 #endif
1780 }
1781
1782 #if USE_ITT_BUILD
1783 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1784 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1785 #endif /* USE_ITT_BUILD */
1786
1787 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1788 case bp_hyper_bar: {
1789 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1790 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792 break;
1793 }
1794 case bp_hierarchical_bar: {
1795 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1796 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1797 break;
1798 }
1799 case bp_tree_bar: {
1800 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1801 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803 break;
1804 }
1805 default: {
1806 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1807 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1808 }
1809 }
1810
1811 /* From this point on, the team data structure may be deallocated at any time
1812 by the master thread - it is unsafe to reference it in any of the worker
1813 threads. Any per-team data items that need to be referenced before the
1814 end of the barrier should be moved to the kmp_task_team_t structs. */
1815 if (KMP_MASTER_TID(tid)) {
1816 if (__kmp_tasking_mode != tskm_immediate_exec) {
1817 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1818 }
1819 if (__kmp_display_affinity) {
1820 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1821 }
1822 #if KMP_STATS_ENABLED
1823 // Have master thread flag the workers to indicate they are now waiting for
1824 // next parallel region, Also wake them up so they switch their timers to
1825 // idle.
1826 for (int i = 0; i < team->t.t_nproc; ++i) {
1827 kmp_info_t *team_thread = team->t.t_threads[i];
1828 if (team_thread == this_thr)
1829 continue;
1830 team_thread->th.th_stats->setIdleFlag();
1831 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1832 team_thread->th.th_sleep_loc != NULL)
1833 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1834 team_thread->th.th_sleep_loc);
1835 }
1836 #endif
1837 #if USE_ITT_BUILD
1838 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1839 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1840 #endif /* USE_ITT_BUILD */
1841
1842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1843 // Join barrier - report frame end
1844 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1845 __kmp_forkjoin_frames_mode &&
1846 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1847 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1848 team->t.t_active_level == 1) {
1849 kmp_uint64 cur_time = __itt_get_timestamp();
1850 ident_t *loc = team->t.t_ident;
1851 kmp_info_t **other_threads = team->t.t_threads;
1852 int nproc = this_thr->th.th_team_nproc;
1853 int i;
1854 switch (__kmp_forkjoin_frames_mode) {
1855 case 1:
1856 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1857 loc, nproc);
1858 break;
1859 case 2:
1860 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1861 loc, nproc);
1862 break;
1863 case 3:
1864 if (__itt_metadata_add_ptr) {
1865 // Initialize with master's wait time
1866 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1867 // Set arrive time to zero to be able to check it in
1868 // __kmp_invoke_task(); the same is done inside the loop below
1869 this_thr->th.th_bar_arrive_time = 0;
1870 for (i = 1; i < nproc; ++i) {
1871 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1872 other_threads[i]->th.th_bar_arrive_time = 0;
1873 }
1874 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1875 cur_time, delta, 0);
1876 }
1877 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1878 loc, nproc);
1879 this_thr->th.th_frame_time = cur_time;
1880 break;
1881 }
1882 }
1883 #endif /* USE_ITT_BUILD */
1884 }
1885 #if USE_ITT_BUILD
1886 else {
1887 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1888 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1889 }
1890 #endif /* USE_ITT_BUILD */
1891
1892 #if KMP_DEBUG
1893 if (KMP_MASTER_TID(tid)) {
1894 KA_TRACE(
1895 15,
1896 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1897 gtid, team_id, tid, nproc));
1898 }
1899 #endif /* KMP_DEBUG */
1900
1901 // TODO now, mark worker threads as done so they may be disbanded
1902 KMP_MB(); // Flush all pending memory write invalidates.
1903 KA_TRACE(10,
1904 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1905
1906 ANNOTATE_BARRIER_END(&team->t.t_bar);
1907 }
1908
1909 // TODO release worker threads' fork barriers as we are ready instead of all at
1910 // once
/* Fork barrier: entered by all threads at the start of a parallel region.
   The master (tid 0) prepares the new team (debug checks, task team setup,
   blocktime copy) and then releases the workers; each worker blocks in the
   release routine until signaled. On return, every thread has re-read its
   team pointer and synchronized with the team's task team. */
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  // Workers cannot trust th_team yet; they re-load it after being released.
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for master thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      // Before release, each worker's b_go flag (ignoring a possible sleep
      // bit) must still be in the initial barrier state.
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The master thread may have changed its blocktime between the join barrier
       and the fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // master

  // Master releases workers / workers wait to be released, using the
  // configured release pattern for the fork/join barrier.
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  // If this thread was marked as waiting at an implicit barrier (by the join
  // barrier), report the end of the sync region, and for workers the end of
  // the implicit task, now that the thread has left the barrier.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Workers use the copy stashed in the thread struct, since the old team
    // (and its task data) may already be gone.
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    // Library shutdown in progress: detach from any task team and leave.
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     master and propagated to all worker threads. The current thread, however,
     may not be part of the team, so we can't blindly assume that the team
     pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
     struct, because it is not always the case that the threads arrays have
     been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
      // Copy the initial ICVs from the master's thread struct to the implicit
      // task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // Pick up the (possibly new) task team pointer for this parallel region.
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  // Workers adopt the team's default allocator (master set it up already).
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
2134
/* Propagate the internal control variables (ICVs) in new_icvs to the
   new_nproc threads of team, using the mechanism selected at compile time:
   PULL (stage the ICVs in the master's thread struct; workers copy them
   after the fork barrier), PUSH (propagation happens inside the fork
   barrier, nothing to do here), or a linear per-thread copy done here. */
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
     untouched), where all of the worker threads can access them and make their
     own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
  // time.
  // ngo_load/ngo_store_icvs are non-temporal-store copies on KMP_MIC and a
  // plain copy_icvs otherwise (see the macros at the top of this file).
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
2179