/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24
25 #include "tsan_annotations.h"
26
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39
40 void __kmp_print_structure(void); // Forward declaration
41
// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
45 template <bool cancellable = false>
__kmp_linear_barrier_gather_template(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))46 static bool __kmp_linear_barrier_gather_template(
47 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50 kmp_team_t *team = this_thr->th.th_team;
51 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52 kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(
55 20,
56 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61 // Barrier imbalance - save arrive time to the thread
62 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64 __itt_get_timestamp();
65 }
66 #endif
67 // We now perform a linear reduction to signal that all of the threads have
68 // arrived.
69 if (!KMP_MASTER_TID(tid)) {
70 KA_TRACE(20,
71 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72 "arrived(%p): %llu => %llu\n",
73 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76 // Mark arrival to master thread
77 /* After performing this write, a worker thread may not assume that the team
78 is valid any more - it could be deallocated by the master thread at any
79 time. */
80 ANNOTATE_BARRIER_BEGIN(this_thr);
81 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82 flag.release();
83 } else {
84 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85 int nproc = this_thr->th.th_team_nproc;
86 int i;
87 // Don't have to worry about sleep bit here or atomic since team setting
88 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89
90 // Collect all the worker team member threads.
91 for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93 // Prefetch next thread's arrived count
94 if (i + 1 < nproc)
95 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98 "arrived(%p) == %llu\n",
99 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100 team->t.t_id, i,
101 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102
103 // Wait for worker thread to arrive
104 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105 new_state);
106 if (cancellable) {
107 bool cancelled = flag.wait_cancellable_nosleep(
108 this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109 if (cancelled)
110 return true;
111 } else {
112 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113 }
114 ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116 // Barrier imbalance - write min of the thread time and the other thread
117 // time to the thread.
118 if (__kmp_forkjoin_frames_mode == 2) {
119 this_thr->th.th_bar_min_time = KMP_MIN(
120 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121 }
122 #endif
123 if (reduce) {
124 KA_TRACE(100,
125 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127 team->t.t_id, i));
128 ANNOTATE_REDUCE_AFTER(reduce);
129 OMPT_REDUCTION_DECL(this_thr, gtid);
130 OMPT_REDUCTION_BEGIN;
131 (*reduce)(this_thr->th.th_local.reduce_data,
132 other_threads[i]->th.th_local.reduce_data);
133 OMPT_REDUCTION_END;
134 ANNOTATE_REDUCE_BEFORE(reduce);
135 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136 }
137 }
138 // Don't have to worry about sleep bit here or atomic since team setting
139 team_bar->b_arrived = new_state;
140 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141 "arrived(%p) = %llu\n",
142 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143 new_state));
144 }
145 KA_TRACE(
146 20,
147 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148 gtid, team->t.t_id, tid, bt));
149 return false;
150 }
151
152 template <bool cancellable = false>
__kmp_linear_barrier_release_template(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))153 static bool __kmp_linear_barrier_release_template(
154 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158 kmp_team_t *team;
159
160 if (KMP_MASTER_TID(tid)) {
161 unsigned int i;
162 kmp_uint32 nproc = this_thr->th.th_team_nproc;
163 kmp_info_t **other_threads;
164
165 team = __kmp_threads[gtid]->th.th_team;
166 KMP_DEBUG_ASSERT(team != NULL);
167 other_threads = team->t.t_threads;
168
169 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170 "barrier type %d\n",
171 gtid, team->t.t_id, tid, bt));
172
173 if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175 {
176 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177 if (propagate_icvs) {
178 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179 for (i = 1; i < nproc; ++i) {
180 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181 team, i, FALSE);
182 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183 &team->t.t_implicit_task_taskdata[0].td_icvs);
184 }
185 ngo_sync();
186 }
187 }
188 #endif // KMP_BARRIER_ICV_PUSH
189
190 // Now, release all of the worker threads
191 for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193 // Prefetch next thread's go flag
194 if (i + 1 < nproc)
195 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197 KA_TRACE(
198 20,
199 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200 "go(%p): %u => %u\n",
201 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203 other_threads[i]->th.th_bar[bt].bb.b_go,
204 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207 other_threads[i]);
208 flag.release();
209 }
210 }
211 } else { // Wait for the MASTER thread to release us
212 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215 if (cancellable) {
216 bool cancelled = flag.wait_cancellable_nosleep(
217 this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218 if (cancelled) {
219 return true;
220 }
221 } else {
222 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223 }
224 ANNOTATE_BARRIER_END(this_thr);
225 #if USE_ITT_BUILD && USE_ITT_NOTIFY
226 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228 // disabled)
229 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230 // Cancel wait on previous parallel region...
231 __kmp_itt_task_starting(itt_sync_obj);
232
233 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234 return false;
235
236 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237 if (itt_sync_obj != NULL)
238 // Call prepare as early as possible for "new" barrier
239 __kmp_itt_task_finished(itt_sync_obj);
240 } else
241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242 // Early exit for reaping threads releasing forkjoin barrier
243 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244 return false;
245 // The worker thread may now assume that the team is valid.
246 #ifdef KMP_DEBUG
247 tid = __kmp_tid_from_gtid(gtid);
248 team = __kmp_threads[gtid]->th.th_team;
249 #endif
250 KMP_DEBUG_ASSERT(team != NULL);
251 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252 KA_TRACE(20,
253 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255 KMP_MB(); // Flush all pending memory write invalidates.
256 }
257 KA_TRACE(
258 20,
259 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260 gtid, team->t.t_id, tid, bt));
261 return false;
262 }
263
__kmp_linear_barrier_gather(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))264 static void __kmp_linear_barrier_gather(
265 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267 __kmp_linear_barrier_gather_template<false>(
268 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270
__kmp_linear_barrier_gather_cancellable(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))271 static bool __kmp_linear_barrier_gather_cancellable(
272 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274 return __kmp_linear_barrier_gather_template<true>(
275 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277
__kmp_linear_barrier_release(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))278 static void __kmp_linear_barrier_release(
279 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281 __kmp_linear_barrier_release_template<false>(
282 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284
__kmp_linear_barrier_release_cancellable(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))285 static bool __kmp_linear_barrier_release_cancellable(
286 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288 return __kmp_linear_barrier_release_template<true>(
289 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
291
// Tree barrier
293 static void
__kmp_tree_barrier_gather(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295 int tid, void (*reduce)(void *, void *)
296 USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298 kmp_team_t *team = this_thr->th.th_team;
299 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300 kmp_info_t **other_threads = team->t.t_threads;
301 kmp_uint32 nproc = this_thr->th.th_team_nproc;
302 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303 kmp_uint32 branch_factor = 1 << branch_bits;
304 kmp_uint32 child;
305 kmp_uint32 child_tid;
306 kmp_uint64 new_state;
307
308 KA_TRACE(
309 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310 gtid, team->t.t_id, tid, bt));
311 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
314 // Barrier imbalance - save arrive time to the thread
315 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317 __itt_get_timestamp();
318 }
319 #endif
320 // Perform tree gather to wait until all threads have arrived; reduce any
321 // required data as we go
322 child_tid = (tid << branch_bits) + 1;
323 if (child_tid < nproc) {
324 // Parent threads wait for all their children to arrive
325 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326 child = 1;
327 do {
328 kmp_info_t *child_thr = other_threads[child_tid];
329 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330 #if KMP_CACHE_MANAGE
331 // Prefetch next thread's arrived count
332 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333 KMP_CACHE_PREFETCH(
334 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 #endif /* KMP_CACHE_MANAGE */
336 KA_TRACE(20,
337 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338 "arrived(%p) == %llu\n",
339 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341 // Wait for child to arrive
342 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344 ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346 // Barrier imbalance - write min of the thread time and a child time to
347 // the thread.
348 if (__kmp_forkjoin_frames_mode == 2) {
349 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350 child_thr->th.th_bar_min_time);
351 }
352 #endif
353 if (reduce) {
354 KA_TRACE(100,
355 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357 team->t.t_id, child_tid));
358 ANNOTATE_REDUCE_AFTER(reduce);
359 OMPT_REDUCTION_DECL(this_thr, gtid);
360 OMPT_REDUCTION_BEGIN;
361 (*reduce)(this_thr->th.th_local.reduce_data,
362 child_thr->th.th_local.reduce_data);
363 OMPT_REDUCTION_END;
364 ANNOTATE_REDUCE_BEFORE(reduce);
365 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366 }
367 child++;
368 child_tid++;
369 } while (child <= branch_factor && child_tid < nproc);
370 }
371
372 if (!KMP_MASTER_TID(tid)) { // Worker threads
373 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374
375 KA_TRACE(20,
376 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377 "arrived(%p): %llu => %llu\n",
378 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381
382 // Mark arrival to parent thread
383 /* After performing this write, a worker thread may not assume that the team
384 is valid any more - it could be deallocated by the master thread at any
385 time. */
386 ANNOTATE_BARRIER_BEGIN(this_thr);
387 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388 flag.release();
389 } else {
390 // Need to update the team arrived pointer if we are the master thread
391 if (nproc > 1) // New value was already computed above
392 team->t.t_bar[bt].b_arrived = new_state;
393 else
394 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396 "arrived(%p) = %llu\n",
397 gtid, team->t.t_id, tid, team->t.t_id,
398 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399 }
400 KA_TRACE(20,
401 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402 gtid, team->t.t_id, tid, bt));
403 }
404
__kmp_tree_barrier_release(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))405 static void __kmp_tree_barrier_release(
406 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409 kmp_team_t *team;
410 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411 kmp_uint32 nproc;
412 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413 kmp_uint32 branch_factor = 1 << branch_bits;
414 kmp_uint32 child;
415 kmp_uint32 child_tid;
416
417 // Perform a tree release for all of the threads that have been gathered
418 if (!KMP_MASTER_TID(
419 tid)) { // Handle fork barrier workers who aren't part of a team yet
420 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422 // Wait for parent thread to release us
423 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425 ANNOTATE_BARRIER_END(this_thr);
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428 // In fork barrier where we could not get the object reliably (or
429 // ITTNOTIFY is disabled)
430 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431 // Cancel wait on previous parallel region...
432 __kmp_itt_task_starting(itt_sync_obj);
433
434 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435 return;
436
437 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438 if (itt_sync_obj != NULL)
439 // Call prepare as early as possible for "new" barrier
440 __kmp_itt_task_finished(itt_sync_obj);
441 } else
442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443 // Early exit for reaping threads releasing forkjoin barrier
444 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445 return;
446
447 // The worker thread may now assume that the team is valid.
448 team = __kmp_threads[gtid]->th.th_team;
449 KMP_DEBUG_ASSERT(team != NULL);
450 tid = __kmp_tid_from_gtid(gtid);
451
452 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453 KA_TRACE(20,
454 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456 KMP_MB(); // Flush all pending memory write invalidates.
457 } else {
458 team = __kmp_threads[gtid]->th.th_team;
459 KMP_DEBUG_ASSERT(team != NULL);
460 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461 "barrier type %d\n",
462 gtid, team->t.t_id, tid, bt));
463 }
464 nproc = this_thr->th.th_team_nproc;
465 child_tid = (tid << branch_bits) + 1;
466
467 if (child_tid < nproc) {
468 kmp_info_t **other_threads = team->t.t_threads;
469 child = 1;
470 // Parent threads release all their children
471 do {
472 kmp_info_t *child_thr = other_threads[child_tid];
473 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474 #if KMP_CACHE_MANAGE
475 // Prefetch next thread's go count
476 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477 KMP_CACHE_PREFETCH(
478 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479 #endif /* KMP_CACHE_MANAGE */
480
481 #if KMP_BARRIER_ICV_PUSH
482 {
483 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484 if (propagate_icvs) {
485 __kmp_init_implicit_task(team->t.t_ident,
486 team->t.t_threads[child_tid], team,
487 child_tid, FALSE);
488 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489 &team->t.t_implicit_task_taskdata[0].td_icvs);
490 }
491 }
492 #endif // KMP_BARRIER_ICV_PUSH
493 KA_TRACE(20,
494 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495 "go(%p): %u => %u\n",
496 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499 // Release child from barrier
500 ANNOTATE_BARRIER_BEGIN(child_thr);
501 kmp_flag_64 flag(&child_bar->b_go, child_thr);
502 flag.release();
503 child++;
504 child_tid++;
505 } while (child <= branch_factor && child_tid < nproc);
506 }
507 KA_TRACE(
508 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509 gtid, team->t.t_id, tid, bt));
510 }
511
// Hyper Barrier
513 static void
__kmp_hyper_barrier_gather(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,void (* reduce)(void *,void *)USE_ITT_BUILD_ARG (void * itt_sync_obj))514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515 int tid, void (*reduce)(void *, void *)
516 USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518 kmp_team_t *team = this_thr->th.th_team;
519 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520 kmp_info_t **other_threads = team->t.t_threads;
521 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524 kmp_uint32 branch_factor = 1 << branch_bits;
525 kmp_uint32 offset;
526 kmp_uint32 level;
527
528 KA_TRACE(
529 20,
530 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531 gtid, team->t.t_id, tid, bt));
532 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
535 // Barrier imbalance - save arrive time to the thread
536 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538 __itt_get_timestamp();
539 }
540 #endif
541 /* Perform a hypercube-embedded tree gather to wait until all of the threads
542 have arrived, and reduce any required data as we go. */
543 kmp_flag_64 p_flag(&thr_bar->b_arrived);
544 for (level = 0, offset = 1; offset < num_threads;
545 level += branch_bits, offset <<= branch_bits) {
546 kmp_uint32 child;
547 kmp_uint32 child_tid;
548
549 if (((tid >> level) & (branch_factor - 1)) != 0) {
550 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551
552 KMP_MB(); // Synchronize parent and child threads.
553 KA_TRACE(20,
554 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
555 "arrived(%p): %llu => %llu\n",
556 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
557 team->t.t_id, parent_tid, &thr_bar->b_arrived,
558 thr_bar->b_arrived,
559 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
560 // Mark arrival to parent thread
561 /* After performing this write (in the last iteration of the enclosing for
562 loop), a worker thread may not assume that the team is valid any more
563 - it could be deallocated by the master thread at any time. */
564 ANNOTATE_BARRIER_BEGIN(this_thr);
565 p_flag.set_waiter(other_threads[parent_tid]);
566 p_flag.release();
567 break;
568 }
569
570 // Parent threads wait for children to arrive
571 if (new_state == KMP_BARRIER_UNUSED_STATE)
572 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
573 for (child = 1, child_tid = tid + (1 << level);
574 child < branch_factor && child_tid < num_threads;
575 child++, child_tid += (1 << level)) {
576 kmp_info_t *child_thr = other_threads[child_tid];
577 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
578 #if KMP_CACHE_MANAGE
579 kmp_uint32 next_child_tid = child_tid + (1 << level);
580 // Prefetch next thread's arrived count
581 if (child + 1 < branch_factor && next_child_tid < num_threads)
582 KMP_CACHE_PREFETCH(
583 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
584 #endif /* KMP_CACHE_MANAGE */
585 KA_TRACE(20,
586 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
587 "arrived(%p) == %llu\n",
588 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
589 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
590 // Wait for child to arrive
591 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
592 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
593 ANNOTATE_BARRIER_END(child_thr);
594 KMP_MB(); // Synchronize parent and child threads.
595 #if USE_ITT_BUILD && USE_ITT_NOTIFY
596 // Barrier imbalance - write min of the thread time and a child time to
597 // the thread.
598 if (__kmp_forkjoin_frames_mode == 2) {
599 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
600 child_thr->th.th_bar_min_time);
601 }
602 #endif
603 if (reduce) {
604 KA_TRACE(100,
605 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
606 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
607 team->t.t_id, child_tid));
608 ANNOTATE_REDUCE_AFTER(reduce);
609 OMPT_REDUCTION_DECL(this_thr, gtid);
610 OMPT_REDUCTION_BEGIN;
611 (*reduce)(this_thr->th.th_local.reduce_data,
612 child_thr->th.th_local.reduce_data);
613 OMPT_REDUCTION_END;
614 ANNOTATE_REDUCE_BEFORE(reduce);
615 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
616 }
617 }
618 }
619
620 if (KMP_MASTER_TID(tid)) {
621 // Need to update the team arrived pointer if we are the master thread
622 if (new_state == KMP_BARRIER_UNUSED_STATE)
623 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
624 else
625 team->t.t_bar[bt].b_arrived = new_state;
626 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
627 "arrived(%p) = %llu\n",
628 gtid, team->t.t_id, tid, team->t.t_id,
629 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630 }
631 KA_TRACE(
632 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
633 gtid, team->t.t_id, tid, bt));
634 }
635
636 // The reverse versions seem to beat the forward versions overall
637 #define KMP_REVERSE_HYPER_BAR
__kmp_hyper_barrier_release(enum barrier_type bt,kmp_info_t * this_thr,int gtid,int tid,int propagate_icvs USE_ITT_BUILD_ARG (void * itt_sync_obj))638 static void __kmp_hyper_barrier_release(
639 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
640 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
641 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
642 kmp_team_t *team;
643 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
644 kmp_info_t **other_threads;
645 kmp_uint32 num_threads;
646 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
647 kmp_uint32 branch_factor = 1 << branch_bits;
648 kmp_uint32 child;
649 kmp_uint32 child_tid;
650 kmp_uint32 offset;
651 kmp_uint32 level;
652
653 /* Perform a hypercube-embedded tree release for all of the threads that have
654 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
655 are released in the reverse order of the corresponding gather, otherwise
656 threads are released in the same order. */
657 if (KMP_MASTER_TID(tid)) { // master
658 team = __kmp_threads[gtid]->th.th_team;
659 KMP_DEBUG_ASSERT(team != NULL);
660 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
661 "barrier type %d\n",
662 gtid, team->t.t_id, tid, bt));
663 #if KMP_BARRIER_ICV_PUSH
664 if (propagate_icvs) { // master already has ICVs in final destination; copy
665 copy_icvs(&thr_bar->th_fixed_icvs,
666 &team->t.t_implicit_task_taskdata[tid].td_icvs);
667 }
668 #endif
669 } else { // Handle fork barrier workers who aren't part of a team yet
670 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
671 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
672 // Wait for parent thread to release us
673 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
674 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
675 ANNOTATE_BARRIER_END(this_thr);
676 #if USE_ITT_BUILD && USE_ITT_NOTIFY
677 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
678 // In fork barrier where we could not get the object reliably
679 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
680 // Cancel wait on previous parallel region...
681 __kmp_itt_task_starting(itt_sync_obj);
682
683 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684 return;
685
686 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
687 if (itt_sync_obj != NULL)
688 // Call prepare as early as possible for "new" barrier
689 __kmp_itt_task_finished(itt_sync_obj);
690 } else
691 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
692 // Early exit for reaping threads releasing forkjoin barrier
693 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
694 return;
695
696 // The worker thread may now assume that the team is valid.
697 team = __kmp_threads[gtid]->th.th_team;
698 KMP_DEBUG_ASSERT(team != NULL);
699 tid = __kmp_tid_from_gtid(gtid);
700
701 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
702 KA_TRACE(20,
703 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
704 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705 KMP_MB(); // Flush all pending memory write invalidates.
706 }
707 num_threads = this_thr->th.th_team_nproc;
708 other_threads = team->t.t_threads;
709
710 #ifdef KMP_REVERSE_HYPER_BAR
711 // Count up to correct level for parent
712 for (level = 0, offset = 1;
713 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
714 level += branch_bits, offset <<= branch_bits)
715 ;
716
717 // Now go down from there
718 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
719 level -= branch_bits, offset >>= branch_bits)
720 #else
721 // Go down the tree, level by level
722 for (level = 0, offset = 1; offset < num_threads;
723 level += branch_bits, offset <<= branch_bits)
724 #endif // KMP_REVERSE_HYPER_BAR
725 {
726 #ifdef KMP_REVERSE_HYPER_BAR
727 /* Now go in reverse order through the children, highest to lowest.
728 Initial setting of child is conservative here. */
729 child = num_threads >> ((level == 0) ? level : level - 1);
730 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
731 child_tid = tid + (child << level);
732 child >= 1; child--, child_tid -= (1 << level))
733 #else
734 if (((tid >> level) & (branch_factor - 1)) != 0)
735 // No need to go lower than this, since this is the level parent would be
736 // notified
737 break;
738 // Iterate through children on this level of the tree
739 for (child = 1, child_tid = tid + (1 << level);
740 child < branch_factor && child_tid < num_threads;
741 child++, child_tid += (1 << level))
742 #endif // KMP_REVERSE_HYPER_BAR
743 {
744 if (child_tid >= num_threads)
745 continue; // Child doesn't exist so keep going
746 else {
747 kmp_info_t *child_thr = other_threads[child_tid];
748 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
749 #if KMP_CACHE_MANAGE
750 kmp_uint32 next_child_tid = child_tid - (1 << level);
751 // Prefetch next thread's go count
752 #ifdef KMP_REVERSE_HYPER_BAR
753 if (child - 1 >= 1 && next_child_tid < num_threads)
754 #else
755 if (child + 1 < branch_factor && next_child_tid < num_threads)
756 #endif // KMP_REVERSE_HYPER_BAR
757 KMP_CACHE_PREFETCH(
758 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759 #endif /* KMP_CACHE_MANAGE */
760
761 #if KMP_BARRIER_ICV_PUSH
762 if (propagate_icvs) // push my fixed ICVs to my child
763 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
764 #endif // KMP_BARRIER_ICV_PUSH
765
766 KA_TRACE(
767 20,
768 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
769 "go(%p): %u => %u\n",
770 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
771 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
772 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
773 // Release child from barrier
774 ANNOTATE_BARRIER_BEGIN(child_thr);
775 kmp_flag_64 flag(&child_bar->b_go, child_thr);
776 flag.release();
777 }
778 }
779 }
780 #if KMP_BARRIER_ICV_PUSH
781 if (propagate_icvs &&
782 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
783 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
784 FALSE);
785 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
786 &thr_bar->th_fixed_icvs);
787 }
788 #endif
789 KA_TRACE(
790 20,
791 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
792 gtid, team->t.t_id, tid, bt));
793 }
794
795 // Hierarchical Barrier
796
797 // Initialize thread barrier data
798 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
799 Performs the minimum amount of initialization required based on how the team
800 has changed. Returns true if leaf children will require both on-core and
801 traditional wake-up mechanisms. For example, if the team size increases,
802 threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed. Only the parts of
  // thr_bar affected by the detected change are recomputed below.
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false; // becomes true when team or tid changed (see below)

  if (uninitialized || team_sz_changed) {
    // (Re-)derive the machine hierarchy (depth, skip_per_level, leaf counts)
    // for this team size.
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for master
    thr_bar->parent_tid = -1; // default for master
    if (!KMP_MASTER_TID(
            tid)) { // if not master, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the master
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
                   0) { // TODO: can we make this op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    // Byte index within the parent's 64-bit flag used for one-byte-per-child
    // on-core signaling; mirrors the [7 - i] indexing of leaf_state below.
    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    // Clamp leaf_kids so the last parent does not claim children past nproc.
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      thr_bar->leaf_kids = nproc - tid - 1;
    // Build the bitmask of leaf-child check-in bytes: child i owns byte 7-i
    // of the 64-bit leaf_state word.
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
869
// Gather phase of the hierarchical (topology-aware) barrier. Each non-leaf
// thread waits for its children to check in — leaf children signal one byte
// each on this thread's b_arrived flag when the on-core optimization applies;
// higher-level children are waited on via their own b_arrived flags. If
// 'reduce' is non-NULL, each child's reduce_data is folded into this thread's
// reduce_data as it arrives. Finally a worker signals its parent, while the
// master publishes the bumped arrived state on the team.
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  // NOTE(review): new_state is only assigned on the non-leaf path below;
  // the master is assumed to be non-leaf when it stores it to the team.
  kmp_uint64 new_state;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  // (Re-)initialize this thread's hierarchy data; return value is not needed
  // in the gather phase.
  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag;
        // each leaf sets one byte of leaf_state within the 64-bit flag.
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          ANNOTATE_REDUCE_AFTER(reduce);
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          // Leaf children occupy the tids immediately after mine.
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_BARRIER_END(other_threads[child_tid]);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
          ANNOTATE_REDUCE_BEFORE(reduce);
          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    } else { // Blocktime is not infinite
      // No on-core optimization: wait on every child's own b_arrived flag,
      // including the lowest (d == 0) level.
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not master thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the master thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      ANNOTATE_BARRIER_BEGIN(this_thr);
      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Master thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1049
// Release phase of the hierarchical barrier. Workers first wait to be
// released by their parent (either on their own b_go flag, or — for leaves
// with infinite blocktime on a non-nested team — on their "offset" byte of
// the parent's b_go flag). After (re-)initializing hierarchy data and
// optionally propagating ICVs, each non-leaf thread releases its own
// children, level by level.
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // master already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing IVCs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64 flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1263
1264 // End of Barrier Algorithms
1265
1266 // type traits for cancellable value
1267 // if cancellable is true, then is_cancellable is a normal boolean variable
1268 // if cancellable is false, then is_cancellable is a compile time constant
// Helper type used by __kmp_barrier_template to carry cancellation status.
// For cancellable == true it is an ordinary boolean; for cancellable == false
// it is a compile-time constant false, letting the optimizer discard every
// cancellation check in the non-cancellable instantiation.
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value = false;
  is_cancellable() = default;
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool rhs) {
    value = rhs;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1284
1285 // Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, also perform a reduction as part of the barrier.
   When cancellable = false,
   Returns 0 if master thread, 1 if worker thread.
   When cancellable = true
   Returns 0 if not cancelled, 1 if cancelled. */
// Core barrier driver shared by all entry points. Dispatches the gather and
// release phases to the pattern selected in __kmp_barrier_gather_pattern /
// __kmp_barrier_release_pattern (linear/tree/hyper/hierarchical), handles
// task-team setup/wait/sync, OMPT sync-region callbacks, ITT frame reporting,
// and cancellation bookkeeping. See the comment above for the is_split /
// reduce / cancellable contract.
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and waiting.
    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // use 0 to only setup the current team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);

    // Gather phase: cancellable barriers always use the linear pattern;
    // otherwise dispatch on the configured gather pattern.
    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads are arrived and starting leaving the
      // barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, master thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          this_thr->th.th_teams_microtask == NULL &&
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with master's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    // Release phase: workers always release; the master releases only for a
    // non-split barrier, and never after a cancellation.
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the actual
       release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
        // A serialized team only has a task team when proxy tasks exist;
        // flush them here.
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif
  ANNOTATE_BARRIER_END(&team->t.t_bar);

  if (cancellable)
    return (int)cancelled;
  return status;
}
1595
// Returns 0 if master thread, 1 if worker thread.
// Public, non-cancellable entry point for an explicit/split barrier: forwards
// to __kmp_barrier_template with the default template argument
// (cancellable = false). reduce_size/reduce_data/reduce support a barrier
// combined with a reduction; pass 0/NULL/NULL for a plain barrier.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
1603
#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  // With cancellation disabled, this degenerates to an ordinary plain
  // barrier that can never report cancellation.
  if (!__kmp_omp_cancellation) {
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }
  // Run a cancellable plain barrier via the template instantiation.
  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 0,
                                               NULL, NULL);
  if (cancelled) {
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *thr = __kmp_threads[gtid];
    if (!KMP_MASTER_TID(tid)) {
      // A cancelled worker already bumped its private b_arrived counter on
      // arrival; undo that so the next barrier instance starts consistent.
      // The master has nothing to revert.
      thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= KMP_BARRIER_STATE_BUMP;
    }
  }
  return cancelled;
}
#endif
1627
__kmp_end_split_barrier(enum barrier_type bt,int gtid)1628 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1629 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1630 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1631 int tid = __kmp_tid_from_gtid(gtid);
1632 kmp_info_t *this_thr = __kmp_threads[gtid];
1633 kmp_team_t *team = this_thr->th.th_team;
1634
1635 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1636 if (!team->t.t_serialized) {
1637 if (KMP_MASTER_GTID(gtid)) {
1638 switch (__kmp_barrier_release_pattern[bt]) {
1639 case bp_hyper_bar: {
1640 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1641 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1642 FALSE USE_ITT_BUILD_ARG(NULL));
1643 break;
1644 }
1645 case bp_hierarchical_bar: {
1646 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1647 FALSE USE_ITT_BUILD_ARG(NULL));
1648 break;
1649 }
1650 case bp_tree_bar: {
1651 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1652 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1653 FALSE USE_ITT_BUILD_ARG(NULL));
1654 break;
1655 }
1656 default: {
1657 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1658 FALSE USE_ITT_BUILD_ARG(NULL));
1659 }
1660 }
1661 if (__kmp_tasking_mode != tskm_immediate_exec) {
1662 __kmp_task_team_sync(this_thr, team);
1663 } // if
1664 }
1665 }
1666 ANNOTATE_BARRIER_END(&team->t.t_bar);
1667 }
1668
// Join barrier: executed by every thread of a team at the end of a parallel
// region. All threads gather using the configured pattern; afterwards the
// master waits for outstanding tasks and may reclaim/reuse the team
// structures, so workers must not touch the team struct past the gather.
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  // Dump runtime structures before the assert below fires, to aid debugging.
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  // OMPT: report entry into the implicit barrier (scope_begin) and switch
  // this thread's tool-visible state to waiting-at-implicit-barrier.
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    // Workers copy their task data aside: the team (and the task data it
    // holds) may be deallocated by the master before the worker resumes.
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
                  team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
     we do not perform the copy if blocktime=infinite, since the values are not
     used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  // Gather phase: wait for all team threads using the configured pattern.
  // No reduction callback at the join barrier, hence the NULL argument.
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the master thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Wait for (and participate in) completion of outstanding tasks.
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have master thread flag the workers to indicate they are now waiting for
    // next parallel region, Also wake them up so they switch their timers to
    // idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    // Only for the outermost active region of a non-teams construct; mode 3
    // additionally reports per-thread arrival imbalance as ITT metadata.
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with master's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  ANNOTATE_BARRIER_END(&team->t.t_bar);
}
1903
// TODO release worker threads' fork barriers as we are ready instead of all at
// once
// Fork barrier: workers block here until released into the next parallel
// region. Before releasing, the master sets up the task team and copies its
// blocktime ICVs; after release, workers re-read the (possibly new) team
// pointer, sync task teams, pull ICVs (under KMP_BARRIER_ICV_PULL), and
// apply affinity/display-affinity handling. Returns early on shutdown.
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  // Workers cannot trust th_team yet; only the master (tid == 0) uses it here.
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for master thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    // Check each worker's fork-barrier go flag is still at its initial value
    // (ignoring the sleep bit) and that it belongs to this team.
    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The master thread may have changed its blocktime between the join barrier
       and the fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // master

  // Release phase: the master wakes the workers using the configured pattern;
  // workers block inside the release routine until released.
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  // OMPT: close out the implicit-barrier wait state entered at the join
  // barrier; for workers also end the implicit task.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Workers (team == NULL here) use the task data stashed at the join
    // barrier, since the old team may already be gone.
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    // Library is shutting down: drop the task-team reference and bail out.
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     master and propagated to all worker threads. The current thread, however,
     may not be part of the team, so we can't blindly assume that the team
     pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
     struct, because it is not always the case that the threads arrays have
     been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
      // Copy the initial ICVs from the master's thread struct to the implicit
      // task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  // Workers adopt the team's default allocator (avoid a redundant store if
  // it already matches).
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
2129
// Propagate the internal control variables (ICVs) in new_icvs to the
// new_nproc threads of team. Three compile-time strategies exist:
// PULL (stage on the master for workers to copy at the fork barrier),
// PUSH (propagation happens inside the fork barrier itself; nothing here),
// or a linear O(nthreads) copy into each worker's implicit task.
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
     untouched), where all of the worker threads can access them and make their
     own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
  // time. ngo_* are non-globally-ordered streaming stores on KMP_MIC builds,
  // plain copies elsewhere (see macros at the top of this file).
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
2174