/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
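
// Note on the ngo_* helpers above: on KMP_MIC (KNC) they broadcast a full
// cache line (the ICVs, with b_go piggybacked in the last 8 bytes) using
// non-globally-ordered 512-bit stores, and ngo_sync() then publishes those
// stores with a locked no-op that acts as a store fence. On all other targets
// they degrade to a plain copy_icvs()/KMP_MEMCPY, so callers may use them
// unconditionally.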

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
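  /* Tree layout in tid space (illustrative): thread tid parents the
     contiguous block (tid << branch_bits) + 1 .. (tid << branch_bits) +
     branch_factor, clipped at nproc. E.g. with branch_bits == 2
     (branch_factor == 4) and nproc == 10, tid 0 gathers from tids 1-4,
     tid 1 from tids 5-8, and tid 2 from tid 9; a worker's parent below is
     recovered as (tid - 1) >> branch_bits. */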
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Hyper Barrier
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
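  /* Illustration (assuming branch_factor == 2 and 8 threads): at level 0 the
     pairs are (0,1), (2,3), (4,5), (6,7); odd tids signal their partner and
     drop out. At level 1 the survivors pair as (0,2) and (4,6); at level 2,
     (0,4). A thread leaves the loop at the first level where its tid has a
     nonzero digit in base branch_factor, releasing parent_tid = tid with the
     digits at and below that level cleared. */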
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing
         for loop), a worker thread may not assume that the team is valid any
         more - it could be deallocated by the primary thread at any time. */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

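  /* Sketch of the reverse traversal below (illustrative): the first loop
     climbs to the highest level at which this thread is a subtree root, i.e.
     the level at which it was itself released; the second loop then walks
     back down, releasing children at each level from the highest child tid to
     the lowest, the reverse of the order in which the gather collected
     them. */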
#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
        // Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their
   local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
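    /* Record this thread's byte slot within its parent's 64-bit flag word:
       a child's index under its parent is (tid - parent_tid) / skip, and
       offset stores that index counted down from 7; the on-core flag code
       below addresses the byte at offset + 1. */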
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
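    /* Each leaf child owns one byte of the 64-bit leaf_state word, assigned
       from the most significant byte downward (on a little-endian target,
       e.g., 3 leaf kids give leaf_state == 0x0101010000000000). A leaf checks
       in by setting its byte in the parent's b_arrived word, so the parent
       can wait for all of its leaves with a single 64-bit comparison. */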
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}

static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
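  // The packed byte flags of the on-core barrier carry no sleep bit, so that
  // path is only taken when blocktime is infinite (threads spin rather than
  // suspend) and only at nesting level 1; nested teams fall back to the
  // per-thread b_go / b_arrived flags.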

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

// End of Barrier Algorithms

// type traits for cancellable value
// if cancellable is true, then is_cancellable is a normal boolean variable
// if cancellable is false, then is_cancellable is a compile time constant
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
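
// With cancellable == false, the conversion operator is a constexpr false, so
// checks such as `if (cancelled)` in __kmp_barrier_template below fold away
// at compile time and the non-cancellable barrier pays nothing for the
// bookkeeping; with cancellable == true it behaves as a plain bool.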

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
   barrier
   When cancellable = false,
   Returns 0 if primary thread, 1 if worker thread.
   When cancellable = true
   Returns 0 if not cancelled, 1 if cancelled. */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread has arrived at the barrier and is
    // waiting.
    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }
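    // The gather algorithms combine buffers through th_local: each arriving
    // worker's reduce_data is folded into its parent's (the primary's, for the
    // linear algorithm) via the reduce callback, so the primary thread ends up
    // holding the fully combined result.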

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // 0 => only set up the current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);

    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }
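    // The gather/release patterns and branch bits are fixed per barrier type
    // at startup (from the KMP_*_BARRIER_PATTERN and KMP_*_BARRIER settings);
    // patterns that need branch bits assert they are non-zero rather than
    // silently degrading to the linear fallback.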

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
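      // Only worksharing (loop/sections) cancellation is consumed here; a
      // pending cancel_parallel request is deliberately left in place and is
      // expected to be handled on the parallel-region teardown path instead.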
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
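    // Workers (status == 1) always wait for the release below; in a split
    // barrier the primary returns without releasing and completes the release
    // later from __kmp_end_split_barrier().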
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the
       actual release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
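        // A serialized team can still have work outstanding at a barrier:
        // proxy tasks may have been attached from outside the team (hence the
        // assertion below). Drain them, then reset the task team for the next
        // region.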
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  if (cancellable)
    return (int)cancelled;
  return status;
}

// Returns 0 if primary thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}

#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Primary thread does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag: the cancelled
        // gather already bumped it, and without the matching release it would
        // be out of step with the team's arrival count at the next barrier.
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif

void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      } // if
    }
  }
}
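
// A sketch of the intended pairing (assuming the reduction code path): the
// primary gathers via __kmp_barrier(bt, gtid, TRUE /* is_split */, ...),
// finishes the combined result into the shared variable, then calls
// __kmp_end_split_barrier(bt, gtid) to release the workers, who have been
// waiting inside __kmp_barrier() the whole time.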

void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }
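  // The join barrier performs no reduction, hence the NULL reduce callback in
  // every gather variant above.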

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have the primary thread flag the workers to indicate they are now
    // waiting for the next parallel region, and also wake them up so they
    // switch their timers to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}

// TODO release worker threads' fork barriers as we are ready instead of all at
// once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 => only set up the current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }
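  // The TRUE argument is propagate_icvs: unlike the plain and join barriers,
  // the fork barrier release is also where, under KMP_BARRIER_ICV_PUSH, the
  // primary thread's ICVs are pushed out to each worker as it is woken.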

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL
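  // Under KMP_BARRIER_ICV_PUSH the equivalent copy already happened during the
  // release above, driven by the primary thread; with neither PUSH nor PULL
  // configured, the ICVs were copied linearly in __kmp_setup_icv_copy() when
  // the team was set up.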

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy the ICVs into th_fixed_icvs in the primary thread's thread structure
     (a location that is otherwise left untouched), where all of the worker
     threads can access them and make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
