1 /*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65
66 char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76 /* ------------------------------------------------------------------------ */
77
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81
82 /* Forward declarations */
83
84 void __kmp_cleanup(void);
85
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
116 /* Calculate the identifier of the current thread */
117 /* fast (and somewhat portable) way to get unique identifier of executing
118 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
/* Determine the global thread id (gtid) of the calling thread.
   Selection of method depends on __kmp_gtid_mode:
     mode >= 3: read the dedicated TLS variable directly (KMP_TDATA_GTID);
     mode >= 2: query the keyed thread-specific-data API;
     otherwise: scan the recorded stack bounds of all registered threads and
                return the index whose stack contains a local of this frame.
   Returns KMP_GTID_DNE if gtid bookkeeping is not yet initialized. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data; // used only for its address: marks current stack position
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        // __kmp_gtid_get_specific can return negative value because this
        // function can be called by thread destructor. However, before the
        // thread destructor is called, the value of the corresponding
        // thread-specific data will be reset to NULL.
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
                         __kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could have already been destructed. It can happen when this function
  // is called in end library routine.
  if (!TCR_SYNC_PTR(other_threads[i]))
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  // Grow the recorded stack window downward (stack grows down) so later
  // lookups by the scan above succeed without the TLS call.
  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
239
/* Like __kmp_get_global_thread_id(), but if the caller has no gtid yet
   (a new foreign/root thread entering the runtime), perform serial
   initialization and/or register the thread as a new root under the
   bootstrap initz lock.  Always returns a valid (non-negative) gtid. */
int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      // Serial init also registers this thread; re-read its freshly set gtid.
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
281
282 /* caller must hold forkjoin_lock */
/* Verify that th's stack does not overlap any other registered thread's
   stack; aborts via __kmp_fatal on overlap.  Optionally prints the storage
   map for th's stack.  caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      // Bounds not computed above (storage map disabled); compute them now.
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    // Compare th's [stack_beg, stack_end) against every other live thread.
    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
346
347 /* ------------------------------------------------------------------------ */
348
__kmp_infinite_loop(void)349 void __kmp_infinite_loop(void) {
350 static int done = FALSE;
351
352 while (!done) {
353 KMP_YIELD(TRUE);
354 }
355 }
356
357 #define MAX_MESSAGE 512
358
__kmp_print_storage_map_gtid(int gtid,void * p1,void * p2,size_t size,char const * format,...)359 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
360 char const *format, ...) {
361 char buffer[MAX_MESSAGE];
362 va_list ap;
363
364 va_start(ap, format);
365 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
366 p2, (unsigned long)size, format);
367 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
368 __kmp_vprintf(kmp_err, buffer, ap);
369 #if KMP_PRINT_DATA_PLACEMENT
370 int node;
371 if (gtid >= 0) {
372 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
373 if (__kmp_storage_map_verbose) {
374 node = __kmp_get_host_node(p1);
375 if (node < 0) /* doesn't work, so don't try this next time */
376 __kmp_storage_map_verbose = FALSE;
377 else {
378 char *last;
379 int lastNode;
380 int localProc = __kmp_get_cpu_from_gtid(gtid);
381
382 const int page_size = KMP_GET_PAGE_SIZE();
383
384 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
385 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
386 if (localProc >= 0)
387 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
388 localProc >> 1);
389 else
390 __kmp_printf_no_lock(" GTID %d\n", gtid);
391 #if KMP_USE_PRCTL
392 /* The more elaborate format is disabled for now because of the prctl
393 * hanging bug. */
394 do {
395 last = p1;
396 lastNode = node;
397 /* This loop collates adjacent pages with the same host node. */
398 do {
399 (char *)p1 += page_size;
400 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
401 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
402 lastNode);
403 } while (p1 <= p2);
404 #else
405 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
406 (char *)p1 + (page_size - 1),
407 __kmp_get_host_node(p1));
408 if (p1 < p2) {
409 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
410 (char *)p2 + (page_size - 1),
411 __kmp_get_host_node(p2));
412 }
413 #endif
414 }
415 }
416 } else
417 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
418 }
419 #endif /* KMP_PRINT_DATA_PLACEMENT */
420 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
421
422 va_end(ap);
423 }
424
__kmp_warn(char const * format,...)425 void __kmp_warn(char const *format, ...) {
426 char buffer[MAX_MESSAGE];
427 va_list ap;
428
429 if (__kmp_generate_warnings == kmp_warnings_off) {
430 return;
431 }
432
433 va_start(ap, format);
434
435 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
436 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
437 __kmp_vprintf(kmp_err, buffer, ap);
438 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
439
440 va_end(ap);
441 }
442
/* Terminate the whole process abnormally (library-detected fatal error).
   Dumps the debug buffer if enabled, then raises SIGABRT.  Does not return;
   the trailing statements exist only as a safety net. */
void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

#if KMP_OS_WINDOWS
  // Let other threads know of abnormal termination and prevent deadlock
  // if abort happened during library initialization or shutdown
  __kmp_global.g.g_abort = SIGABRT;

  /* On Windows* OS by default abort() causes pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not problem for DLL, but it is a problem for
     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
     help, at least in some versions of MS C RTL.

     It seems following sequence is the only way to simulate abort() and
     avoid pop-up error box. */
  raise(SIGABRT);
  _exit(3); // Just in case, if signal ignored, exit anyway.
#else
  __kmp_unregister_library();
  abort();
#endif

  // Unreachable: raise/_exit/abort above never return.  Kept defensively.
  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process
476
/* Park (rather than kill) the calling thread after a fatal error; the
   process-wide abort is expected to take the whole process down. */
void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
482
483 /* Print out the storage map for the major kmp_info_t thread data structures
484 that are allocated together. */
485
/* Emit one storage-map line for the kmp_info_t as a whole, then one per
   major sub-structure (info, local, and each barrier slot), for thread
   'gtid'.  Lines are printed in declaration order. */
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  // Whole barrier array first, then each individual barrier slot.
  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}
517
518 /* Print out the storage map for the major kmp_team_t team data structures
519 that are allocated together. */
520
/* Emit storage-map lines for a kmp_team_t and its major sub-arrays
   (barriers, dispatch, thread pointers, dispatch buffers).  'header' names
   the team kind in the output; 'num_thr' sizes the per-thread arrays. */
static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  // Teams of max one thread still allocate 2 dispatch buffers.
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}
562
// Initialize memory-allocator support (memkind and target memory).
static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
// Tear down allocator support.  NOTE(review): only memkind is finalized here;
// there is no visible target-memory counterpart to __kmp_init_target_mem() —
// confirm that is intentional.
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
568
569 /* ------------------------------------------------------------------------ */
570
571 #if ENABLE_LIBOMPTARGET
// Initialize libomptarget interop: set up target-task support.
static void __kmp_init_omptarget() {
  __kmp_init_target_task();
}
575 #endif
576
577 /* ------------------------------------------------------------------------ */
578
579 #if KMP_DYNAMIC_LIB
580 #if KMP_OS_WINDOWS
581
/* DLL entry point for the dynamic Windows runtime: routes process/thread
   attach/detach notifications to the runtime's shutdown hooks.  Always
   returns TRUE (never vetoes a load). */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}
627
628 #endif /* KMP_OS_WINDOWS */
629 #endif /* KMP_DYNAMIC_LIB */
630
631 /* __kmp_parallel_deo -- Wait until it's our turn. */
/* __kmp_parallel_deo -- Wait until it's our turn.
   Ordered-section entry: spin until the team's ordered counter equals this
   thread's tid.  cid_ref is unused; the triple-pointer signature is the
   dispatch-callback ABI. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    // Block until t_ordered.dt.t_value == our tid (our turn in the order).
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
655
656 /* __kmp_parallel_dxo -- Signal the next task. */
/* __kmp_parallel_dxo -- Signal the next task.
   Ordered-section exit: advance the team's ordered counter to the next tid
   (mod team size).  cid_ref is unused; signature matches the dispatch ABI. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
680
681 /* ------------------------------------------------------------------------ */
682 /* The BARRIER for a SINGLE process section is always explicit */
683
/* Compete for a SINGLE construct.  Returns nonzero if the calling thread won
   the single block (or the team is serialized), zero otherwise.  push_ws
   selects whether the workshare is pushed (winner) or only checked when
   consistency checking is on. */
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    // Serialized team: the (only) thread trivially owns the single block.
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      // First thread to CAS the team counter forward wins the single block.
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}
736
/* Leave a SINGLE construct: end ITT tracking and, when consistency checking
   is enabled, pop the workshare pushed by __kmp_enter_single. */
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
744
745 /* determine if we can go parallel or must use a serialized parallel region and
746 * how many threads we can use
747 * set_nproc is the number of threads requested for the team
748 * returns 0 if we should serialize or only use one thread,
749 * otherwise the number of threads to use
750 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    // dyn-var false: honor the request as-is; clamps below still apply.
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    // Limit by currently-available processors; the master's slot (or the
    // whole hot team when the root is inactive) is already counted in
    // __kmp_nth, hence the add-back term.
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    // Randomly pick 1..set_nthreads (only meaningful for requests > 2).
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}
935
936 /* Allocate threads from the thread pool and assign them to the new team. We are
937 assured that there are enough threads available, because we checked on that
938 earlier within critical section forkjoin */
// root:  the root under which the team runs
// team:  the (already allocated) team to populate with threads
// master_th / master_gtid: the primary thread and its global thread id
// fork_teams_workers: nonzero when forking the workers of a teams construct
//                     (suppresses place partitioning; see below)
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid,
                                    int fork_teams_workers) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  // Without nested hot teams there is a single hot team per root.
  use_hot_team = team == root->r.r_hot_team;
#endif
  // A hot team already has its workers attached; only a cold team needs
  // threads allocated and initialized here.
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      // Propagate the teams-construct context from the primary thread so
      // workers agree on whether/where they are nested inside teams.
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          // Each worker's per-barrier arrived counter must match the team's
          // so the first barrier after the fork synchronizes correctly.
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    // Do not partition the places list for teams construct workers who
    // haven't actually been forked to do real work yet. This partitioning
    // will take place in the parallel region nested within the teams construct.
    if (!fork_teams_workers) {
      __kmp_partition_places(team);
    }
#endif

    // Distributed barrier keeps its own thread count; keep it in sync with
    // the team size just established above.
    if (team->t.t_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
    }
  }

  // Request affinity display if any thread's previous team size or nesting
  // level differs from the current ones (OMP_DISPLAY_AFFINITY support).
  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}
1058
1059 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1060 // Propagate any changes to the floating point control registers out to the team
1061 // We try to avoid unnecessary writes to the relevant cache line in the team
1062 // structure, so we don't make changes unless they are needed.
propagateFPControl(kmp_team_t * team)1063 inline static void propagateFPControl(kmp_team_t *team) {
1064 if (__kmp_inherit_fp_control) {
1065 kmp_int16 x87_fpu_control_word;
1066 kmp_uint32 mxcsr;
1067
1068 // Get primary thread's values of FPU control flags (both X87 and vector)
1069 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070 __kmp_store_mxcsr(&mxcsr);
1071 mxcsr &= KMP_X86_MXCSR_MASK;
1072
1073 // There is no point looking at t_fp_control_saved here.
1074 // If it is TRUE, we still have to update the values if they are different
1075 // from those we now have. If it is FALSE we didn't save anything yet, but
1076 // our objective is the same. We have to ensure that the values in the team
1077 // are the same as those we have.
1078 // So, this code achieves what we need whether or not t_fp_control_saved is
1079 // true. By checking whether the value needs updating we avoid unnecessary
1080 // writes that would put the cache-line into a written state, causing all
1081 // threads in the team to have to read it again.
1082 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1083 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1084 // Although we don't use this value, other code in the runtime wants to know
1085 // whether it should restore them. So we must ensure it is correct.
1086 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1087 } else {
1088 // Similarly here. Don't write to this cache-line in the team structure
1089 // unless we have to.
1090 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1091 }
1092 }
1093
1094 // Do the opposite, setting the hardware registers to the updated values from
1095 // the team.
updateHWFPControl(kmp_team_t * team)1096 inline static void updateHWFPControl(kmp_team_t *team) {
1097 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1098 // Only reset the fp control regs if they have been changed in the team.
1099 // the parallel region that we are exiting.
1100 kmp_int16 x87_fpu_control_word;
1101 kmp_uint32 mxcsr;
1102 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1103 __kmp_store_mxcsr(&mxcsr);
1104 mxcsr &= KMP_X86_MXCSR_MASK;
1105
1106 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1107 __kmp_clear_x87_fpu_status_word();
1108 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1109 }
1110
1111 if (team->t.t_mxcsr != mxcsr) {
1112 __kmp_load_mxcsr(&team->t.t_mxcsr);
1113 }
1114 }
1115 }
1116 #else
1117 #define propagateFPControl(x) ((void)0)
1118 #define updateHWFPControl(x) ((void)0)
1119 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1120
1121 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1122 int realloc); // forward declaration
1123
1124 /* Run a parallel region that has been serialized, so runs only in a team of the
1125 single primary thread. */
// loc:        source location of the parallel construct (may be NULL)
// global_tid: global thread id of the (single) executing thread
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  // The serialized region has no task team; detach from the current one.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  // Resolve the proc_bind policy for this (serialized) parallel region:
  // proc-bind-var == false wins; otherwise an explicit clause wins; otherwise
  // fall back to the current proc-bind-var.
  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

  // Reset num_threads for next parallel region
  this_thr->th.th_set_nproc = 0;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  // First serialization at this point: switch this thread onto its serial
  // team (allocating a fresh one if the cached team is already in use).
  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making this locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    // Inherit the schedule from the team being suspended.
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }

    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      // One dispatch buffer per nesting level, linked LIFO; the matching pop
      // happens when the serialized region ends.
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swaped

    /* OMPT implicit task begin */
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}
1384
1385 // Test if this fork is for a team closely nested in a teams construct
__kmp_is_fork_in_teams(kmp_info_t * master_th,microtask_t microtask,int level,int teams_level,kmp_va_list ap)1386 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1387 microtask_t microtask, int level,
1388 int teams_level, kmp_va_list ap) {
1389 return (master_th->th.th_teams_microtask && ap &&
1390 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391 }
1392
1393 // Test if this fork is for the teams construct, i.e. to form the outer league
1394 // of teams
__kmp_is_entering_teams(int active_level,int level,int teams_level,kmp_va_list ap)1395 static inline bool __kmp_is_entering_teams(int active_level, int level,
1396 int teams_level, kmp_va_list ap) {
1397 return ((ap == NULL && active_level == 0) ||
1398 (ap && teams_level > 0 && teams_level == level));
1399 }
1400
1401 // AC: This is start of parallel that is nested inside teams construct.
1402 // The team is actual (hot), all workers are ready at the fork barrier.
1403 // No lock needed to initialize the team a bit, then free workers.
// Fork a parallel region nested inside a teams construct. The parent (teams)
// team is hot: its workers are already waiting at the fork barrier, so the
// team only needs light initialization before releasing them.
// Returns TRUE; for fork_context_gnu the caller invokes the microtask itself.
static inline int
__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
                    kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
                    enum fork_context_e call_context, microtask_t microtask,
                    launch_t invoker, int master_set_numthreads, int level,
#if OMPT_SUPPORT
                    ompt_data_t ompt_parallel_data, void *return_address,
#endif
                    kmp_va_list ap) {
  void **argv;
  int i;

  // Copy the microtask arguments into the parent team's argv.
  parent_team->t.t_ident = loc;
  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
  parent_team->t.t_argc = argc;
  argv = (void **)parent_team->t.t_argv;
  for (i = argc - 1; i >= 0; --i) {
    *argv++ = va_arg(kmp_va_deref(ap), void *);
  }
  // Increment our nested depth levels, but not increase the serialization
  if (parent_team == master_th->th.th_serial_team) {
    // AC: we are in serialized parallel
    __kmpc_serialized_parallel(loc, gtid);
    KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

    if (call_context == fork_context_gnu) {
      // AC: need to decrement t_serialized for enquiry functions to work
      // correctly, will restore at join time
      parent_team->t.t_serialized--;
      return TRUE;
    }

#if OMPD_SUPPORT
    parent_team->t.t_pkfn = microtask;
#endif

#if OMPT_SUPPORT
    void *dummy;
    void **exit_frame_p;
    ompt_data_t *implicit_task_data;
    ompt_lw_taskteam_t lw_taskteam;

    if (ompt_enabled.enabled) {
      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                              &ompt_parallel_data, return_address);
      exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
      // Don't use lw_taskteam after linking. Content was swapped.

      /* OMPT implicit task begin */
      implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
      if (ompt_enabled.ompt_callback_implicit_task) {
        OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
            1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }

      /* OMPT state */
      master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
    } else {
      exit_frame_p = &dummy;
    }
#endif

    // AC: need to decrement t_serialized for enquiry functions to work
    // correctly, will restore at join time
    parent_team->t.t_serialized--;

    // Serialized path: the primary thread runs the microtask directly.
    {
      KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
      KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
      __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                             ,
                             exit_frame_p
#endif
      );
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      *exit_frame_p = NULL;
      OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
      if (ompt_enabled.ompt_callback_implicit_task) {
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, implicit_task_data, 1,
            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }
      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
      if (ompt_enabled.ompt_callback_parallel_end) {
        ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
            &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
            OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
    return TRUE;
  }

  // Non-serialized path: initialize the hot parent team for this parallel.
  parent_team->t.t_pkfn = microtask;
  parent_team->t.t_invoke = invoker;
  KMP_ATOMIC_INC(&root->r.r_in_parallel);
  parent_team->t.t_active_level++;
  parent_team->t.t_level++;
  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

  // If the threads allocated to the team are less than the thread limit, update
  // the thread limit here. th_teams_size.nth is specific to this team nested
  // in a teams construct, the team is fully created, and we're about to do
  // the actual fork. Best to do this here so that the subsequent uses below
  // and in the join have the correct value.
  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
                            return_address);
    __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
  }
#endif

  /* Change number of threads in the team if requested */
  if (master_set_numthreads) { // The parallel has num_threads clause
    if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
      // AC: only can reduce number of threads dynamically, can't increase
      kmp_info_t **other_threads = parent_team->t.t_threads;
      // NOTE: if using distributed barrier, we need to run this code block
      // even when the team size appears not to have changed from the max.
      int old_proc = master_th->th.th_teams_size.nth;
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
        __kmp_add_threads_to_team(parent_team, master_set_numthreads);
      }
      parent_team->t.t_nproc = master_set_numthreads;
      for (i = 0; i < master_set_numthreads; ++i) {
        other_threads[i]->th.th_team_nproc = master_set_numthreads;
      }
    }
    // Keep extra threads hot in the team for possible next parallels
    master_th->th.th_set_nproc = 0;
  }

#if USE_DEBUGGER
  if (__kmp_debugging) { // Let debugger override number of threads.
    int nth = __kmp_omp_num_threads(loc);
    if (nth > 0) { // 0 means debugger doesn't want to change num threads
      master_set_numthreads = nth;
    }
  }
#endif

  // Figure out the proc_bind policy for the nested parallel within teams
  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
  // proc_bind_default means don't update
  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else {
    // No proc_bind clause specified; use current proc-bind-var
    if (proc_bind == proc_bind_default) {
      proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
    }
    /* else: The proc_bind policy was specified explicitly on parallel clause.
       This overrides proc-bind-var for this parallel region, but does not
       change proc-bind-var. */
    // Figure the value of proc-bind-var for the child threads.
    if ((level + 1 < __kmp_nested_proc_bind.used) &&
        (__kmp_nested_proc_bind.bind_types[level + 1] !=
         master_th->th.th_current_task->td_icvs.proc_bind)) {
      proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
    }
  }
  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
  // Need to change the bind-var ICV to correct value for each implicit task
  if (proc_bind_icv != proc_bind_default &&
      master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
    kmp_info_t **other_threads = parent_team->t.t_threads;
    for (i = 0; i < master_th->th.th_team_nproc; ++i) {
      other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
    }
  }
  // Reset for next parallel region
  master_th->th.th_set_proc_bind = proc_bind_default;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
       KMP_ITT_DEBUG) &&
      __kmp_forkjoin_frames_mode == 3 &&
      parent_team->t.t_active_level == 1 // only report frames at level 1
      && master_th->th.th_teams_size.nteams == 1) {
    // Record the region start time for ITT frame reporting.
    kmp_uint64 tmp_time = __itt_get_timestamp();
    master_th->th.th_frame_time = tmp_time;
    parent_team->t.t_region_time = tmp_time;
  }
  if (__itt_stack_caller_create_ptr) {
    KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
    // create new stack stitching id before entering fork barrier
    parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
#if KMP_AFFINITY_SUPPORTED
  __kmp_partition_places(parent_team);
#endif

  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
                "master_th=%p, gtid=%d\n",
                root, parent_team, master_th, gtid));
  // Release the waiting workers to execute the microtask.
  __kmp_internal_fork(loc, gtid, parent_team);
  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
                "master_th=%p, gtid=%d\n",
                root, parent_team, master_th, gtid));

  // For GNU entry points the caller invokes the microtask itself.
  if (call_context == fork_context_gnu)
    return TRUE;

  /* Invoke microtask for PRIMARY thread */
  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
                parent_team->t.t_id, parent_team->t.t_pkfn));

  if (!parent_team->t.t_invoke(gtid)) {
    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
  }
  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
                parent_team->t.t_id, parent_team->t.t_pkfn));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));

  return TRUE;
}
1639
1640 // Create a serialized parallel region
1641 static inline int
__kmp_serial_fork_call(ident_t * loc,int gtid,enum fork_context_e call_context,kmp_int32 argc,microtask_t microtask,launch_t invoker,kmp_info_t * master_th,kmp_team_t * parent_team,ompt_data_t * ompt_parallel_data,void ** return_address,ompt_data_t ** parent_task_data,kmp_va_list ap)1642 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1643 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1644 kmp_info_t *master_th, kmp_team_t *parent_team,
1645 #if OMPT_SUPPORT
1646 ompt_data_t *ompt_parallel_data, void **return_address,
1647 ompt_data_t **parent_task_data,
1648 #endif
1649 kmp_va_list ap) {
1650 kmp_team_t *team;
1651 int i;
1652 void **argv;
1653
1654 /* josh todo: hypothetical question: what do we do for OS X*? */
1655 #if KMP_OS_LINUX && \
1656 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1657 SimpleVLA<void *> args(argc);
1658 #else
1659 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1660 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1661 KMP_ARCH_AARCH64) */
1662
1663 KA_TRACE(
1664 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1665
1666 __kmpc_serialized_parallel(loc, gtid);
1667
1668 #if OMPD_SUPPORT
1669 master_th->th.th_serial_team->t.t_pkfn = microtask;
1670 #endif
1671
1672 if (call_context == fork_context_intel) {
1673 /* TODO this sucks, use the compiler itself to pass args! :) */
1674 master_th->th.th_serial_team->t.t_ident = loc;
1675 if (!ap) {
1676 // revert change made in __kmpc_serialized_parallel()
1677 master_th->th.th_serial_team->t.t_level--;
1678 // Get args from parent team for teams construct
1679
1680 #if OMPT_SUPPORT
1681 void *dummy;
1682 void **exit_frame_p;
1683 ompt_task_info_t *task_info;
1684 ompt_lw_taskteam_t lw_taskteam;
1685
1686 if (ompt_enabled.enabled) {
1687 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1688 ompt_parallel_data, *return_address);
1689
1690 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1691 // don't use lw_taskteam after linking. content was swaped
1692 task_info = OMPT_CUR_TASK_INFO(master_th);
1693 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1694 if (ompt_enabled.ompt_callback_implicit_task) {
1695 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1696 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1697 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1698 &(task_info->task_data), 1,
1699 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700 }
1701
1702 /* OMPT state */
1703 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1704 } else {
1705 exit_frame_p = &dummy;
1706 }
1707 #endif
1708
1709 {
1710 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1711 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1712 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1713 #if OMPT_SUPPORT
1714 ,
1715 exit_frame_p
1716 #endif
1717 );
1718 }
1719
1720 #if OMPT_SUPPORT
1721 if (ompt_enabled.enabled) {
1722 *exit_frame_p = NULL;
1723 if (ompt_enabled.ompt_callback_implicit_task) {
1724 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725 ompt_scope_end, NULL, &(task_info->task_data), 1,
1726 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1727 }
1728 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1729 __ompt_lw_taskteam_unlink(master_th);
1730 if (ompt_enabled.ompt_callback_parallel_end) {
1731 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1732 ompt_parallel_data, *parent_task_data,
1733 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1734 }
1735 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1736 }
1737 #endif
1738 } else if (microtask == (microtask_t)__kmp_teams_master) {
1739 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1740 team = master_th->th.th_team;
1741 // team->t.t_pkfn = microtask;
1742 team->t.t_invoke = invoker;
1743 __kmp_alloc_argv_entries(argc, team, TRUE);
1744 team->t.t_argc = argc;
1745 argv = (void **)team->t.t_argv;
1746 if (ap) {
1747 for (i = argc - 1; i >= 0; --i)
1748 *argv++ = va_arg(kmp_va_deref(ap), void *);
1749 } else {
1750 for (i = 0; i < argc; ++i)
1751 // Get args from parent team for teams construct
1752 argv[i] = parent_team->t.t_argv[i];
1753 }
1754 // AC: revert change made in __kmpc_serialized_parallel()
1755 // because initial code in teams should have level=0
1756 team->t.t_level--;
1757 // AC: call special invoker for outer "parallel" of teams construct
1758 invoker(gtid);
1759 #if OMPT_SUPPORT
1760 if (ompt_enabled.enabled) {
1761 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1762 if (ompt_enabled.ompt_callback_implicit_task) {
1763 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1764 ompt_scope_end, NULL, &(task_info->task_data), 0,
1765 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1766 }
1767 if (ompt_enabled.ompt_callback_parallel_end) {
1768 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1769 ompt_parallel_data, *parent_task_data,
1770 OMPT_INVOKER(call_context) | ompt_parallel_league,
1771 *return_address);
1772 }
1773 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774 }
1775 #endif
1776 } else {
1777 argv = args;
1778 for (i = argc - 1; i >= 0; --i)
1779 *argv++ = va_arg(kmp_va_deref(ap), void *);
1780 KMP_MB();
1781
1782 #if OMPT_SUPPORT
1783 void *dummy;
1784 void **exit_frame_p;
1785 ompt_task_info_t *task_info;
1786 ompt_lw_taskteam_t lw_taskteam;
1787 ompt_data_t *implicit_task_data;
1788
1789 if (ompt_enabled.enabled) {
1790 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1791 ompt_parallel_data, *return_address);
1792 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1793 // don't use lw_taskteam after linking. content was swaped
1794 task_info = OMPT_CUR_TASK_INFO(master_th);
1795 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1796
1797 /* OMPT implicit task begin */
1798 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1799 if (ompt_enabled.ompt_callback_implicit_task) {
1800 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1803 ompt_task_implicit);
1804 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1805 }
1806
1807 /* OMPT state */
1808 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809 } else {
1810 exit_frame_p = &dummy;
1811 }
1812 #endif
1813
1814 {
1815 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1818 #if OMPT_SUPPORT
1819 ,
1820 exit_frame_p
1821 #endif
1822 );
1823 }
1824
1825 #if OMPT_SUPPORT
1826 if (ompt_enabled.enabled) {
1827 *exit_frame_p = NULL;
1828 if (ompt_enabled.ompt_callback_implicit_task) {
1829 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1830 ompt_scope_end, NULL, &(task_info->task_data), 1,
1831 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1832 }
1833
1834 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1835 __ompt_lw_taskteam_unlink(master_th);
1836 if (ompt_enabled.ompt_callback_parallel_end) {
1837 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838 ompt_parallel_data, *parent_task_data,
1839 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1840 }
1841 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842 }
1843 #endif
1844 }
1845 } else if (call_context == fork_context_gnu) {
1846 #if OMPT_SUPPORT
1847 if (ompt_enabled.enabled) {
1848 ompt_lw_taskteam_t lwt;
1849 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1850 *return_address);
1851
1852 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1853 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1854 }
1855 // don't use lw_taskteam after linking. content was swaped
1856 #endif
1857
1858 // we were called from GNU native code
1859 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1860 return FALSE;
1861 } else {
1862 KMP_ASSERT2(call_context < fork_context_last,
1863 "__kmp_serial_fork_call: unknown fork_context parameter");
1864 }
1865
1866 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1867 KMP_MB();
1868 return FALSE;
1869 }
1870
/* Most of the work for a fork.

   Decides whether the new parallel region must be serialized or can go
   parallel, allocates and sets up the new team, forks the worker threads,
   and -- for the Intel entry point -- also invokes the microtask on the
   primary thread.

   loc          -- source location of the parallel construct
   gtid         -- global thread id of the forking (primary) thread
   call_context -- which compiler runtime entry called us (Intel, GNU, ...)
   argc         -- number of arguments to pass to the microtask
   microtask    -- outlined function implementing the parallel-region body
   invoker      -- launch routine stored in team->t.t_invoke
   ap           -- va_list holding the argc arguments; NULL means the
                  arguments come from the parent team (teams construct case)

   Return true if we really went parallel, false if serialized (for the GNU
   interface, false also tells the caller to run the microtask itself). */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int task_thread_limit = 0;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    // AC: potentially unsafe, not in sync with library shutdown,
    // __kmp_threads can be freed
    master_th = __kmp_threads[gtid];

    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;
    task_thread_limit =
        master_th->th.th_current_task->td_icvs.task_thread_limit;

#if OMPT_SUPPORT
    // OMPT bookkeeping: capture the parent task and return address so the
    // parallel-begin/end callbacks can be fired with the right context.
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Assign affinity to root thread if it hasn't happened yet
    __kmp_assign_root_init_mask();

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    // Parallel closely nested in teams construct:
    if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
      return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
                                 call_context, microtask, invoker,
                                 master_set_numthreads, level,
#if OMPT_SUPPORT
                                 ompt_parallel_data, return_address,
#endif
                                 ap);
    } // End parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif

    // Need this to happen before we determine the number of threads, not while
    // we are allocating the team
    //__kmp_push_current_task_to_thread(master_th, parent_team, 0);

    // Determine the number of threads
    int enter_teams =
        __kmp_is_entering_teams(active_level, level, teams_level, ap);
    if ((!enter_teams &&
         (parent_team->t.t_active_level >=
          master_th->th.th_current_task->td_icvs.max_active_levels)) ||
        (__kmp_library == library_serial)) {
      KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
      nthreads = 1;
    } else {
      nthreads = master_set_numthreads
                     ? master_set_numthreads
                     // TODO: get nproc directly from current task
                     : get__nproc_2(parent_team, master_tid);
      // Use the thread_limit set for the current target task if exists, else go
      // with the deduced nthreads
      nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
                     ? task_thread_limit
                     : nthreads;
      // Check if we need to take forkjoin lock? (no need for serialized
      // parallel out of teams construct).
      if (nthreads > 1) {
        /* determine how many new threads we can use */
        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from parallel region (on host), then teams
           should be created but each can only have 1 thread if nesting is
           disabled. If teams called from serial region, then teams and their
           threads should be created regardless of the nesting setting. */
        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
                                         nthreads, enter_teams);
        if (nthreads == 1) {
          // Free lock for single thread execution here; for multi-thread
          // execution it will be freed later after team of threads created
          // and initialized
          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
        }
        // NOTE(review): when nthreads > 1 the forkjoin lock is deliberately
        // still held here; it is released further below after the team and
        // its threads have been set up.
      }
    }
    KMP_DEBUG_ASSERT(nthreads > 0);

    // If we temporarily changed the set number of threads then restore it now
    master_th->th.th_set_nproc = 0;

    // Serialized execution: hand off to the dedicated serial fork path.
    if (nthreads == 1) {
      return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
                                    invoker, master_th, parent_team,
#if OMPT_SUPPORT
                                    &ompt_parallel_data, &return_address,
                                    &parent_task_data,
#endif
                                    ap);
    } // if (nthreads == 1)

    // GEH: only modify the executing flag in the case when not serialized
    //      serialized case is handled in kmpc_serialized_parallel
    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
                  "curtask=%p, curtask_max_aclevel=%d\n",
                  parent_team->t.t_active_level, master_th,
                  master_th->th.th_current_task,
                  master_th->th.th_current_task->td_icvs.max_active_levels));
    // TODO: GEH - cannot do this assertion because root thread not set up as
    // executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
    master_th->th.th_current_task->td_flags.executing = 0;

    if (!master_th->th.th_teams_microtask || level > teams_level) {
      /* Increment our nested depth level */
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
    }

    // See if we need to make a copy of the ICVs.
    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
    if ((level + 1 < __kmp_nested_nth.used) &&
        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
      nthreads_icv = __kmp_nested_nth.nth[level + 1];
    } else {
      nthreads_icv = 0; // don't update
    }

    // Figure out the proc_bind_policy for the new team.
    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
    // proc_bind_default means don't update
    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
      proc_bind = proc_bind_false;
    } else {
      // No proc_bind clause specified; use current proc-bind-var for this
      // parallel region
      if (proc_bind == proc_bind_default) {
        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
      }
      // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
      if (master_th->th.th_teams_microtask &&
          microtask == (microtask_t)__kmp_teams_master) {
        proc_bind = __kmp_teams_proc_bind;
      }
      /* else: The proc_bind policy was specified explicitly on parallel clause.
         This overrides proc-bind-var for this parallel region, but does not
         change proc-bind-var. */
      // Figure the value of proc-bind-var for the child threads.
      if ((level + 1 < __kmp_nested_proc_bind.used) &&
          (__kmp_nested_proc_bind.bind_types[level + 1] !=
           master_th->th.th_current_task->td_icvs.proc_bind)) {
        // Do not modify the proc bind icv for the two teams construct forks
        // They just let the proc bind icv pass through
        if (!master_th->th.th_teams_microtask ||
            !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
      }
    }

    // Reset for next parallel region
    master_th->th.th_set_proc_bind = proc_bind_default;

    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
      kmp_internal_control_t new_icvs;
      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
      new_icvs.next = NULL;
      if (nthreads_icv > 0) {
        new_icvs.nproc = nthreads_icv;
      }
      if (proc_bind_icv != proc_bind_default) {
        new_icvs.proc_bind = proc_bind_icv;
      }

      /* allocate a new parallel team */
      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
      team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
                                 ompt_parallel_data,
#endif
                                 proc_bind, &new_icvs,
                                 argc USE_NESTED_HOT_ARG(master_th));
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
    } else {
      /* allocate a new parallel team */
      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
      team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
                                 ompt_parallel_data,
#endif
                                 proc_bind,
                                 &master_th->th.th_current_task->td_icvs,
                                 argc USE_NESTED_HOT_ARG(master_th));
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
                  &master_th->th.th_current_task->td_icvs);
    }
    KF_TRACE(
        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));

    /* setup the new team */
    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
    KMP_CHECK_UPDATE(team->t.t_ident, loc);
    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
#if OMPT_SUPPORT
    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
                          return_address);
#endif
    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
    // TODO: parent_team->t.t_level == INT_MAX ???
    if (!master_th->th.th_teams_microtask || level > teams_level) {
      int new_level = parent_team->t.t_level + 1;
      KMP_CHECK_UPDATE(team->t.t_level, new_level);
      new_level = parent_team->t.t_active_level + 1;
      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
    } else {
      // AC: Do not increase parallel level at start of the teams construct
      int new_level = parent_team->t.t_level;
      KMP_CHECK_UPDATE(team->t.t_level, new_level);
      new_level = parent_team->t.t_active_level;
      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
    }
    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
    // set primary thread's schedule as new run-time schedule
    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);

    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);

    // Update the floating point rounding in the team if required.
    propagateFPControl(team);
#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_parallel_begin();
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set primary thread's task team to team's task team. Unless this is hot
      // team, it should be NULL.
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
      KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
                    "%p, new task_team %p / team %p\n",
                    __kmp_gtid_from_thread(master_th),
                    master_th->th.th_task_team, parent_team,
                    team->t.t_task_team[master_th->th.th_task_state], team));

      if (active_level || master_th->th.th_task_team) {
        // Take a memo of primary thread's task_state
        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
        if (master_th->th.th_task_state_top >=
            master_th->th.th_task_state_stack_sz) { // increase size
          // Grow the memo stack by doubling: copy old contents, zero-fill the
          // new tail, then swap the buffers and free the old one.
          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
          kmp_uint8 *old_stack, *new_stack;
          kmp_uint32 i;
          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
          }
          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
               ++i) { // zero-init rest of stack
            new_stack[i] = 0;
          }
          old_stack = master_th->th.th_task_state_memo_stack;
          master_th->th.th_task_state_memo_stack = new_stack;
          master_th->th.th_task_state_stack_sz = new_size;
          __kmp_free(old_stack);
        }
        // Store primary thread's task_state on stack
        master_th->th
            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
            master_th->th.th_task_state;
        master_th->th.th_task_state_top++;
#if KMP_NESTED_HOT_TEAMS
        if (master_th->th.th_hot_teams &&
            active_level < __kmp_hot_teams_max_level &&
            team == master_th->th.th_hot_teams[active_level].hot_team) {
          // Restore primary thread's nested state if nested hot team
          master_th->th.th_task_state =
              master_th->th
                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
        } else {
#endif
          master_th->th.th_task_state = 0;
#if KMP_NESTED_HOT_TEAMS
        }
#endif
      }
#if !KMP_NESTED_HOT_TEAMS
      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
                       (team == root->r.r_hot_team));
#endif
    }

    KA_TRACE(
        20,
        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
         team->t.t_nproc));
    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
                     (team->t.t_master_tid == 0 &&
                      (team->t.t_parent == root->r.r_root_team ||
                       team->t.t_parent->t.t_serialized)));
    KMP_MB();

    /* now, setup the arguments */
    argv = (void **)team->t.t_argv;
    if (ap) {
      for (i = argc - 1; i >= 0; --i) {
        void *new_argv = va_arg(kmp_va_deref(ap), void *);
        KMP_CHECK_UPDATE(*argv, new_argv);
        argv++;
      }
    } else {
      for (i = 0; i < argc; ++i) {
        // Get args from parent team for teams construct
        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
      }
    }

    /* now actually fork the threads */
    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
      root->r.r_active = TRUE;

    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
    __kmp_setup_icv_copy(team, nthreads,
                         &master_th->th.th_current_task->td_icvs, loc);

#if OMPT_SUPPORT
    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
#endif

    // Release the forkjoin lock acquired above when reserving threads.
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

#if USE_ITT_BUILD
    if (team->t.t_active_level == 1 // only report frames at level 1
        && !master_th->th.th_teams_microtask) { // not in teams construct
#if USE_ITT_NOTIFY
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          (__kmp_forkjoin_frames_mode == 3 ||
           __kmp_forkjoin_frames_mode == 1)) {
        kmp_uint64 tmp_time = 0;
        if (__itt_get_timestamp_ptr)
          tmp_time = __itt_get_timestamp();
        // Internal fork - report frame begin
        master_th->th.th_frame_time = tmp_time;
        if (__kmp_forkjoin_frames_mode == 3)
          team->t.t_region_time = tmp_time;
      } else
      // only one notification scheme (either "submit" or "forking/joined", not both)
#endif /* USE_ITT_NOTIFY */
        if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
            __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
          // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
          __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
        }
    }
#endif /* USE_ITT_BUILD */

    /* now go on and do the work */
    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
    KMP_MB();
    KF_TRACE(10,
             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
              root, team, master_th, gtid));

#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr) {
      // create new stack stitching id before entering fork barrier
      if (!enter_teams) {
        KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
        team->t.t_stack_id = __kmp_itt_stack_caller_create();
      } else if (parent_team->t.t_serialized) {
        // keep stack stitching id in the serialized parent_team;
        // current team will be used for parallel inside the teams;
        // if parent_team is active, then it already keeps stack stitching id
        // for the league of teams
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
    }
#endif /* USE_ITT_BUILD */

    // AC: skip __kmp_internal_fork at teams construct, let only primary
    // threads execute
    if (ap) {
      __kmp_internal_fork(loc, gtid, team);
      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, team, master_th, gtid));
    }

    if (call_context == fork_context_gnu) {
      // GNU interface: the caller invokes the microtask itself.
      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
      return TRUE;
    }

    /* Invoke microtask for PRIMARY thread */
    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                  team->t.t_id, team->t.t_pkfn));
  } // END of timer KMP_fork_call block

#if KMP_STATS_ENABLED
  // If beginning a teams construct, then change thread state
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (!ap) {
    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
  }
#endif

  if (!team->t.t_invoke(gtid)) {
    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
  }

#if KMP_STATS_ENABLED
  // If was beginning of a teams construct, then reset thread state
  if (!ap) {
    KMP_SET_THREAD_STATE(previous_state);
  }
#endif

  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                team->t.t_id, team->t.t_pkfn));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  return TRUE;
}
2394
2395 #if OMPT_SUPPORT
__kmp_join_restore_state(kmp_info_t * thread,kmp_team_t * team)2396 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2397 kmp_team_t *team) {
2398 // restore state outside the region
2399 thread->th.ompt_thread_info.state =
2400 ((team->t.t_serialized) ? ompt_state_work_serial
2401 : ompt_state_work_parallel);
2402 }
2403
__kmp_join_ompt(int gtid,kmp_info_t * thread,kmp_team_t * team,ompt_data_t * parallel_data,int flags,void * codeptr)2404 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2405 kmp_team_t *team, ompt_data_t *parallel_data,
2406 int flags, void *codeptr) {
2407 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2408 if (ompt_enabled.ompt_callback_parallel_end) {
2409 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2410 parallel_data, &(task_info->task_data), flags, codeptr);
2411 }
2412
2413 task_info->frame.enter_frame = ompt_data_none;
2414 __kmp_join_restore_state(thread, team);
2415 }
2416 #endif
2417
__kmp_join_call(ident_t * loc,int gtid,enum fork_context_e fork_context,int exit_teams)2418 void __kmp_join_call(ident_t *loc, int gtid
2419 #if OMPT_SUPPORT
2420 ,
2421 enum fork_context_e fork_context
2422 #endif
2423 ,
2424 int exit_teams) {
2425 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2426 kmp_team_t *team;
2427 kmp_team_t *parent_team;
2428 kmp_info_t *master_th;
2429 kmp_root_t *root;
2430 int master_active;
2431
2432 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2433
2434 /* setup current data */
2435 master_th = __kmp_threads[gtid];
2436 root = master_th->th.th_root;
2437 team = master_th->th.th_team;
2438 parent_team = team->t.t_parent;
2439
2440 master_th->th.th_ident = loc;
2441
2442 #if OMPT_SUPPORT
2443 void *team_microtask = (void *)team->t.t_pkfn;
2444 // For GOMP interface with serialized parallel, need the
2445 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2446 // and end-parallel events.
2447 if (ompt_enabled.enabled &&
2448 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2449 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2450 }
2451 #endif
2452
2453 #if KMP_DEBUG
2454 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2455 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2456 "th_task_team = %p\n",
2457 __kmp_gtid_from_thread(master_th), team,
2458 team->t.t_task_team[master_th->th.th_task_state],
2459 master_th->th.th_task_team));
2460 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2461 team->t.t_task_team[master_th->th.th_task_state]);
2462 }
2463 #endif
2464
2465 if (team->t.t_serialized) {
2466 if (master_th->th.th_teams_microtask) {
2467 // We are in teams construct
2468 int level = team->t.t_level;
2469 int tlevel = master_th->th.th_teams_level;
2470 if (level == tlevel) {
2471 // AC: we haven't incremented it earlier at start of teams construct,
2472 // so do it here - at the end of teams construct
2473 team->t.t_level++;
2474 } else if (level == tlevel + 1) {
2475 // AC: we are exiting parallel inside teams, need to increment
2476 // serialization in order to restore it in the next call to
2477 // __kmpc_end_serialized_parallel
2478 team->t.t_serialized++;
2479 }
2480 }
2481 __kmpc_end_serialized_parallel(loc, gtid);
2482
2483 #if OMPT_SUPPORT
2484 if (ompt_enabled.enabled) {
2485 if (fork_context == fork_context_gnu) {
2486 __ompt_lw_taskteam_unlink(master_th);
2487 }
2488 __kmp_join_restore_state(master_th, parent_team);
2489 }
2490 #endif
2491
2492 return;
2493 }
2494
2495 master_active = team->t.t_master_active;
2496
2497 if (!exit_teams) {
2498 // AC: No barrier for internal teams at exit from teams construct.
2499 // But there is barrier for external team (league).
2500 __kmp_internal_join(loc, gtid, team);
2501 #if USE_ITT_BUILD
2502 if (__itt_stack_caller_create_ptr) {
2503 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2504 // destroy the stack stitching id after join barrier
2505 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2506 team->t.t_stack_id = NULL;
2507 }
2508 #endif
2509 } else {
2510 master_th->th.th_task_state =
2511 0; // AC: no tasking in teams (out of any parallel)
2512 #if USE_ITT_BUILD
2513 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2514 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2515 // destroy the stack stitching id on exit from the teams construct
2516 // if parent_team is active, then the id will be destroyed later on
2517 // by master of the league of teams
2518 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2519 parent_team->t.t_stack_id = NULL;
2520 }
2521 #endif
2522 }
2523
2524 KMP_MB();
2525
2526 #if OMPT_SUPPORT
2527 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2528 void *codeptr = team->t.ompt_team_info.master_return_address;
2529 #endif
2530
2531 #if USE_ITT_BUILD
2532 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2533 if (team->t.t_active_level == 1 &&
2534 (!master_th->th.th_teams_microtask || /* not in teams construct */
2535 master_th->th.th_teams_size.nteams == 1)) {
2536 master_th->th.th_ident = loc;
2537 // only one notification scheme (either "submit" or "forking/joined", not
2538 // both)
2539 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2540 __kmp_forkjoin_frames_mode == 3)
2541 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2542 master_th->th.th_frame_time, 0, loc,
2543 master_th->th.th_team_nproc, 1);
2544 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2545 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2546 __kmp_itt_region_joined(gtid);
2547 } // active_level == 1
2548 #endif /* USE_ITT_BUILD */
2549
2550 #if KMP_AFFINITY_SUPPORTED
2551 if (!exit_teams) {
2552 // Restore master thread's partition.
2553 master_th->th.th_first_place = team->t.t_first_place;
2554 master_th->th.th_last_place = team->t.t_last_place;
2555 }
2556 #endif // KMP_AFFINITY_SUPPORTED
2557
2558 if (master_th->th.th_teams_microtask && !exit_teams &&
2559 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2560 team->t.t_level == master_th->th.th_teams_level + 1) {
2561 // AC: We need to leave the team structure intact at the end of parallel
2562 // inside the teams construct, so that at the next parallel same (hot) team
2563 // works, only adjust nesting levels
2564 #if OMPT_SUPPORT
2565 ompt_data_t ompt_parallel_data = ompt_data_none;
2566 if (ompt_enabled.enabled) {
2567 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2568 if (ompt_enabled.ompt_callback_implicit_task) {
2569 int ompt_team_size = team->t.t_nproc;
2570 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2571 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2572 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2573 }
2574 task_info->frame.exit_frame = ompt_data_none;
2575 task_info->task_data = ompt_data_none;
2576 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2577 __ompt_lw_taskteam_unlink(master_th);
2578 }
2579 #endif
2580 /* Decrement our nested depth level */
2581 team->t.t_level--;
2582 team->t.t_active_level--;
2583 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2584
2585 // Restore number of threads in the team if needed. This code relies on
2586 // the proper adjustment of th_teams_size.nth after the fork in
2587 // __kmp_teams_master on each teams primary thread in the case that
2588 // __kmp_reserve_threads reduced it.
2589 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2590 int old_num = master_th->th.th_team_nproc;
2591 int new_num = master_th->th.th_teams_size.nth;
2592 kmp_info_t **other_threads = team->t.t_threads;
2593 team->t.t_nproc = new_num;
2594 for (int i = 0; i < old_num; ++i) {
2595 other_threads[i]->th.th_team_nproc = new_num;
2596 }
2597 // Adjust states of non-used threads of the team
2598 for (int i = old_num; i < new_num; ++i) {
2599 // Re-initialize thread's barrier data.
2600 KMP_DEBUG_ASSERT(other_threads[i]);
2601 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2602 for (int b = 0; b < bs_last_barrier; ++b) {
2603 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2604 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2605 #if USE_DEBUGGER
2606 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2607 #endif
2608 }
2609 if (__kmp_tasking_mode != tskm_immediate_exec) {
2610 // Synchronize thread's task state
2611 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2612 }
2613 }
2614 }
2615
2616 #if OMPT_SUPPORT
2617 if (ompt_enabled.enabled) {
2618 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2619 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2620 }
2621 #endif
2622
2623 return;
2624 }
2625
2626 /* do cleanup and restore the parent team */
2627 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2628 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2629
2630 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2631
2632 /* jc: The following lock has instructions with REL and ACQ semantics,
2633 separating the parallel user code called in this parallel region
2634 from the serial user code called after this function returns. */
2635 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2636
2637 if (!master_th->th.th_teams_microtask ||
2638 team->t.t_level > master_th->th.th_teams_level) {
2639 /* Decrement our nested depth level */
2640 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2641 }
2642 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2643
2644 #if OMPT_SUPPORT
2645 if (ompt_enabled.enabled) {
2646 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2647 if (ompt_enabled.ompt_callback_implicit_task) {
2648 int flags = (team_microtask == (void *)__kmp_teams_master)
2649 ? ompt_task_initial
2650 : ompt_task_implicit;
2651 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2652 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2653 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2654 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2655 }
2656 task_info->frame.exit_frame = ompt_data_none;
2657 task_info->task_data = ompt_data_none;
2658 }
2659 #endif
2660
2661 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2662 master_th, team));
2663 __kmp_pop_current_task_from_thread(master_th);
2664
2665 master_th->th.th_def_allocator = team->t.t_def_allocator;
2666
2667 #if OMPD_SUPPORT
2668 if (ompd_state & OMPD_ENABLE_BP)
2669 ompd_bp_parallel_end();
2670 #endif
2671 updateHWFPControl(team);
2672
2673 if (root->r.r_active != master_active)
2674 root->r.r_active = master_active;
2675
2676 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2677 master_th)); // this will free worker threads
2678
2679 /* this race was fun to find. make sure the following is in the critical
2680 region otherwise assertions may fail occasionally since the old team may be
2681 reallocated and the hierarchy appears inconsistent. it is actually safe to
2682 run and won't cause any bugs, but will cause those assertion failures. it's
2683 only one deref&assign so might as well put this in the critical region */
2684 master_th->th.th_team = parent_team;
2685 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2686 master_th->th.th_team_master = parent_team->t.t_threads[0];
2687 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2688
2689 /* restore serialized team, if need be */
2690 if (parent_team->t.t_serialized &&
2691 parent_team != master_th->th.th_serial_team &&
2692 parent_team != root->r.r_root_team) {
2693 __kmp_free_team(root,
2694 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2695 master_th->th.th_serial_team = parent_team;
2696 }
2697
2698 if (__kmp_tasking_mode != tskm_immediate_exec) {
2699 if (master_th->th.th_task_state_top >
2700 0) { // Restore task state from memo stack
2701 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2702 // Remember primary thread's state if we re-use this nested hot team
2703 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2704 master_th->th.th_task_state;
2705 --master_th->th.th_task_state_top; // pop
2706 // Now restore state at this level
2707 master_th->th.th_task_state =
2708 master_th->th
2709 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2710 } else if (team != root->r.r_hot_team) {
2711 // Reset the task state of primary thread if we are not hot team because
2712 // in this case all the worker threads will be free, and their task state
2713 // will be reset. If not reset the primary's, the task state will be
2714 // inconsistent.
2715 master_th->th.th_task_state = 0;
2716 }
2717 // Copy the task team from the parent team to the primary thread
2718 master_th->th.th_task_team =
2719 parent_team->t.t_task_team[master_th->th.th_task_state];
2720 KA_TRACE(20,
2721 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2722 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2723 parent_team));
2724 }
2725
2726 // TODO: GEH - cannot do this assertion because root thread not set up as
2727 // executing
2728 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2729 master_th->th.th_current_task->td_flags.executing = 1;
2730
2731 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732
2733 #if KMP_AFFINITY_SUPPORTED
2734 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2735 __kmp_reset_root_init_mask(gtid);
2736 }
2737 #endif
2738 #if OMPT_SUPPORT
2739 int flags =
2740 OMPT_INVOKER(fork_context) |
2741 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2742 : ompt_parallel_team);
2743 if (ompt_enabled.enabled) {
2744 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2745 codeptr);
2746 }
2747 #endif
2748
2749 KMP_MB();
2750 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751 }
2752
2753 /* Check whether we should push an internal control record onto the
2754 serial team stack. If so, do it. */
__kmp_save_internal_controls(kmp_info_t * thread)2755 void __kmp_save_internal_controls(kmp_info_t *thread) {
2756
2757 if (thread->th.th_team != thread->th.th_serial_team) {
2758 return;
2759 }
2760 if (thread->th.th_team->t.t_serialized > 1) {
2761 int push = 0;
2762
2763 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2764 push = 1;
2765 } else {
2766 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2767 thread->th.th_team->t.t_serialized) {
2768 push = 1;
2769 }
2770 }
2771 if (push) { /* push a record on the serial team's stack */
2772 kmp_internal_control_t *control =
2773 (kmp_internal_control_t *)__kmp_allocate(
2774 sizeof(kmp_internal_control_t));
2775
2776 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2777
2778 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2779
2780 control->next = thread->th.th_team->t.t_control_stack_top;
2781 thread->th.th_team->t.t_control_stack_top = control;
2782 }
2783 }
2784 }
2785
2786 /* Changes set_nproc */
// Change the nproc ICV (thread count for subsequent parallel regions) of the
// calling thread, clamping the request to [1, __kmp_max_nth]. If the request
// shrinks the root's hot team while no parallel region is active, the hot
// team is shrunk immediately instead of waiting for the next fork.
void __kmp_set_num_threads(int new_nth, int gtid) {
  kmp_info_t *thread;
  kmp_root_t *root;

  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Clamp the requested value into the supported range.
  if (new_nth < 1)
    new_nth = 1;
  else if (new_nth > __kmp_max_nth)
    new_nth = __kmp_max_nth;

  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
  thread = __kmp_threads[gtid];
  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
    return; // nothing to do

  // Preserve the current ICVs on the serial team's control stack (if needed)
  // before mutating them.
  __kmp_save_internal_controls(thread);

  set__nproc(thread, new_nth);

  // If this omp_set_num_threads() call will cause the hot team size to be
  // reduced (in the absence of a num_threads clause), then reduce it now,
  // rather than waiting for the next parallel region.
  root = thread->th.th_root;
  if (__kmp_init_parallel && (!root->r.r_active) &&
      (root->r.r_hot_team->t.t_nproc > new_nth)
#if KMP_NESTED_HOT_TEAMS
      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
  ) {
    kmp_team_t *hot_team = root->r.r_hot_team;
    int f;

    // Serialize against concurrent fork/join while restructuring the team.
    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
    }
    // Release the extra threads we don't need any more.
    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing team size, threads no longer in the team should unref
        // task team.
        hot_team->t.t_threads[f]->th.th_task_team = NULL;
      }
      __kmp_free_thread(hot_team->t.t_threads[f]);
      hot_team->t.t_threads[f] = NULL;
    }
    hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
    if (thread->th.th_hot_teams) {
      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
    }
#endif

    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      hot_team->t.b->update_num_threads(new_nth);
      __kmp_add_threads_to_team(hot_team, new_nth);
    }

    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

    // Update the t_nproc field in the threads that are still active.
    for (f = 0; f < new_nth; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
    }
    // Special flag in case omp_set_num_threads() call
    hot_team->t.t_size_changed = -1;
  }
}
2861
2862 /* Changes max_active_levels */
__kmp_set_max_active_levels(int gtid,int max_active_levels)2863 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2864 kmp_info_t *thread;
2865
2866 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2867 "%d = (%d)\n",
2868 gtid, max_active_levels));
2869 KMP_DEBUG_ASSERT(__kmp_init_serial);
2870
2871 // validate max_active_levels
2872 if (max_active_levels < 0) {
2873 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2874 // We ignore this call if the user has specified a negative value.
2875 // The current setting won't be changed. The last valid setting will be
2876 // used. A warning will be issued (if warnings are allowed as controlled by
2877 // the KMP_WARNINGS env var).
2878 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2879 "max_active_levels for thread %d = (%d)\n",
2880 gtid, max_active_levels));
2881 return;
2882 }
2883 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2884 // it's OK, the max_active_levels is within the valid range: [ 0;
2885 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2886 // We allow a zero value. (implementation defined behavior)
2887 } else {
2888 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2889 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2890 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2891 // Current upper limit is MAX_INT. (implementation defined behavior)
2892 // If the input exceeds the upper limit, we correct the input to be the
2893 // upper limit. (implementation defined behavior)
2894 // Actually, the flow should never get here until we use MAX_INT limit.
2895 }
2896 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2897 "max_active_levels for thread %d = (%d)\n",
2898 gtid, max_active_levels));
2899
2900 thread = __kmp_threads[gtid];
2901
2902 __kmp_save_internal_controls(thread);
2903
2904 set__max_active_levels(thread, max_active_levels);
2905 }
2906
2907 /* Gets max_active_levels */
__kmp_get_max_active_levels(int gtid)2908 int __kmp_get_max_active_levels(int gtid) {
2909 kmp_info_t *thread;
2910
2911 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2912 KMP_DEBUG_ASSERT(__kmp_init_serial);
2913
2914 thread = __kmp_threads[gtid];
2915 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2916 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2917 "curtask_maxaclevel=%d\n",
2918 gtid, thread->th.th_current_task,
2919 thread->th.th_current_task->td_icvs.max_active_levels));
2920 return thread->th.th_current_task->td_icvs.max_active_levels;
2921 }
2922
2923 // nteams-var per-device ICV
__kmp_set_num_teams(int num_teams)2924 void __kmp_set_num_teams(int num_teams) {
2925 if (num_teams > 0)
2926 __kmp_nteams = num_teams;
2927 }
__kmp_get_max_teams(void)2928 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2929 // teams-thread-limit-var per-device ICV
__kmp_set_teams_thread_limit(int limit)2930 void __kmp_set_teams_thread_limit(int limit) {
2931 if (limit > 0)
2932 __kmp_teams_thread_limit = limit;
2933 }
__kmp_get_teams_thread_limit(void)2934 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2935
// Compile-time guarantees: schedule kinds are cast to/from int (see the
// (int)kind cast in __kmp_set_schedule below), so both the public kmp_sched_t
// and the internal sched_type enums must be exactly int-sized.
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2938
2939 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
// Set the run-time schedule ICV ({kind, chunk} of def-sched-var) for the
// calling thread, as used by omp_set_schedule(). Out-of-range kinds fall back
// to the default schedule (with a warning) and the chunk is ignored.
void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
  kmp_info_t *thread;
  kmp_sched_t orig_kind;
  // kmp_team_t *team;

  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
                gtid, (int)kind, chunk));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Check if the kind parameter is valid, correct if needed.
  // Valid parameters should fit in one of two intervals - standard or extended:
  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
  orig_kind = kind;
  // Validate against the modifier-free kind; modifiers are re-applied at the
  // end via __kmp_sched_apply_mods_intkind().
  kind = __kmp_sched_without_mods(kind);

  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
    // TODO: Hint needs attention in case we change the default schedule.
    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
              __kmp_msg_null);
    kind = kmp_sched_default;
    chunk = 0; // ignore chunk value in case of bad kind
  }

  thread = __kmp_threads[gtid];

  // Preserve current ICVs before mutating them.
  __kmp_save_internal_controls(thread);

  if (kind < kmp_sched_upper_std) {
    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differ static chunked vs. unchunked: chunk should be invalid to
      // indicate unchunked schedule (which is the default)
      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
    } else {
      // Standard kinds map through __kmp_sch_map at offset (kind - lower - 1).
      thread->th.th_current_task->td_icvs.sched.r_sched_type =
          __kmp_sch_map[kind - kmp_sched_lower - 1];
    }
  } else {
    // Extended kinds occupy a second region of __kmp_sch_map, immediately
    // after the standard kinds; the offset below folds the two ranges
    // together.
    // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
    // kmp_sched_lower - 2 ];
    thread->th.th_current_task->td_icvs.sched.r_sched_type =
        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
                      kmp_sched_lower - 2];
  }
  // Re-apply any schedule modifiers carried on the original (unvalidated)
  // kind to the stored internal schedule.
  __kmp_sched_apply_mods_intkind(
      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
  if (kind == kmp_sched_auto || chunk < 1) {
    // ignore parameter chunk for schedule auto
    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
  } else {
    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
  }
}
2995
2996 /* Gets def_sched_var ICV values */
// Translate the calling thread's internal run-time schedule (sched_type in
// def-sched-var) back to the public kmp_sched_t kind and chunk, as used by
// omp_get_schedule(). Unknown internal kinds are fatal.
void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
  kmp_info_t *thread;
  enum sched_type th_type;

  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  thread = __kmp_threads[gtid];

  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
  // Map on the modifier-free kind; schedule modifier bits are re-applied to
  // *kind via __kmp_sched_apply_mods_stdkind().
  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
  case kmp_sch_static:
  case kmp_sch_static_greedy:
  case kmp_sch_static_balanced:
    // Unchunked static variants return early with a zero chunk.
    *kind = kmp_sched_static;
    __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set, try to show this fact via zero value
    return;
  case kmp_sch_static_chunked:
    *kind = kmp_sched_static;
    break;
  case kmp_sch_dynamic_chunked:
    *kind = kmp_sched_dynamic;
    break;
  case kmp_sch_guided_chunked:
  case kmp_sch_guided_iterative_chunked:
  case kmp_sch_guided_analytical_chunked:
    *kind = kmp_sched_guided;
    break;
  case kmp_sch_auto:
    *kind = kmp_sched_auto;
    break;
  case kmp_sch_trapezoidal:
    *kind = kmp_sched_trapezoidal;
    break;
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal:
    *kind = kmp_sched_static_steal;
    break;
#endif
  default:
    KMP_FATAL(UnknownSchedulingType, th_type);
  }

  __kmp_sched_apply_mods_stdkind(kind, th_type);
  // All chunked kinds report the stored chunk value.
  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
}
3044
// Return the thread number this thread's ancestor had at the given nesting
// level, or -1 if the level is invalid (negative or deeper than the current
// nesting). Level 0 (the initial level) always reports thread number 0.
int __kmp_get_ancestor_thread_num(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 0;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // current nesting depth of this thread's team
  if (level > ii)
    return -1; // requested level deeper than the current nesting

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  if (ii == level)
    return __kmp_tid_from_gtid(gtid); // requesting this thread's own level

  // Walk up the team hierarchy toward the requested level. Serialized teams
  // collapse several logical levels into one kmp_team_t; t_serialized (dd)
  // counts those collapsed levels so they are consumed before moving to the
  // parent team.
  dd = team->t.t_serialized;
  level++;
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if ((team->t.t_serialized) && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      dd = team->t.t_serialized;
      ii--;
    }
  }

  // Within a serialized span (dd > 1) the ancestor tid is 0; otherwise it is
  // the tid the team's primary thread had in its parent team.
  return (dd > 1) ? (0) : (team->t.t_master_tid);
}
3102
// Return the size of the team this thread belonged to at the given nesting
// level, or -1 if the level is invalid (negative or deeper than the current
// nesting). Level 0 (the initial level) always has team size 1.
int __kmp_get_team_size(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 1;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // current nesting depth of this thread's team
  if (level > ii)
    return -1; // requested level deeper than the current nesting

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  // Walk up the team hierarchy toward the requested level, consuming the
  // collapsed serialized levels (t_serialized) of each team before moving to
  // its parent. Same traversal as __kmp_get_ancestor_thread_num above.
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if (team->t.t_serialized && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      ii--;
    }
  }

  return team->t.t_nproc;
}
3154
__kmp_get_schedule_global()3155 kmp_r_sched_t __kmp_get_schedule_global() {
3156 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3157 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3158 // independently. So one can get the updated schedule here.
3159
3160 kmp_r_sched_t r_sched;
3161
3162 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3163 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3164 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3165 // different roots (even in OMP 2.5)
3166 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3167 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3168 if (s == kmp_sch_static) {
3169 // replace STATIC with more detailed schedule (balanced or greedy)
3170 r_sched.r_sched_type = __kmp_static;
3171 } else if (s == kmp_sch_guided_chunked) {
3172 // replace GUIDED with more detailed schedule (iterative or analytical)
3173 r_sched.r_sched_type = __kmp_guided;
3174 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3175 r_sched.r_sched_type = __kmp_sched;
3176 }
3177 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3178
3179 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3180 // __kmp_chunk may be wrong here (if it was not ever set)
3181 r_sched.chunk = KMP_DEFAULT_CHUNK;
3182 } else {
3183 r_sched.chunk = __kmp_chunk;
3184 }
3185
3186 return r_sched;
3187 }
3188
3189 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3190 at least argc number of *t_argv entries for the requested team. */
__kmp_alloc_argv_entries(int argc,kmp_team_t * team,int realloc)3191 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3192
3193 KMP_DEBUG_ASSERT(team);
3194 if (!realloc || argc > team->t.t_max_argc) {
3195
3196 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3197 "current entries=%d\n",
3198 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3199 /* if previously allocated heap space for args, free them */
3200 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3201 __kmp_free((void *)team->t.t_argv);
3202
3203 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3204 /* use unused space in the cache line for arguments */
3205 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3206 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3207 "argv entries\n",
3208 team->t.t_id, team->t.t_max_argc));
3209 team->t.t_argv = &team->t.t_inline_argv[0];
3210 if (__kmp_storage_map) {
3211 __kmp_print_storage_map_gtid(
3212 -1, &team->t.t_inline_argv[0],
3213 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3214 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3215 team->t.t_id);
3216 }
3217 } else {
3218 /* allocate space for arguments in the heap */
3219 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3220 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3221 : 2 * argc;
3222 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3223 "argv entries\n",
3224 team->t.t_id, team->t.t_max_argc));
3225 team->t.t_argv =
3226 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3227 if (__kmp_storage_map) {
3228 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3229 &team->t.t_argv[team->t.t_max_argc],
3230 sizeof(void *) * team->t.t_max_argc,
3231 "team_%d.t_argv", team->t.t_id);
3232 }
3233 }
3234 }
3235 }
3236
__kmp_allocate_team_arrays(kmp_team_t * team,int max_nth)3237 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3238 int i;
3239 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3240 team->t.t_threads =
3241 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3242 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3243 sizeof(dispatch_shared_info_t) * num_disp_buff);
3244 team->t.t_dispatch =
3245 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3246 team->t.t_implicit_task_taskdata =
3247 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3248 team->t.t_max_nproc = max_nth;
3249
3250 /* setup dispatch buffers */
3251 for (i = 0; i < num_disp_buff; ++i) {
3252 team->t.t_disp_buffer[i].buffer_index = i;
3253 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3254 }
3255 }
3256
__kmp_free_team_arrays(kmp_team_t * team)3257 static void __kmp_free_team_arrays(kmp_team_t *team) {
3258 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3259 int i;
3260 for (i = 0; i < team->t.t_max_nproc; ++i) {
3261 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3262 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3263 team->t.t_dispatch[i].th_disp_buffer = NULL;
3264 }
3265 }
3266 #if KMP_USE_HIER_SCHED
3267 __kmp_dispatch_free_hierarchies(team);
3268 #endif
3269 __kmp_free(team->t.t_threads);
3270 __kmp_free(team->t.t_disp_buffer);
3271 __kmp_free(team->t.t_dispatch);
3272 __kmp_free(team->t.t_implicit_task_taskdata);
3273 team->t.t_threads = NULL;
3274 team->t.t_disp_buffer = NULL;
3275 team->t.t_dispatch = NULL;
3276 team->t.t_implicit_task_taskdata = 0;
3277 }
3278
__kmp_reallocate_team_arrays(kmp_team_t * team,int max_nth)3279 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3280 kmp_info_t **oldThreads = team->t.t_threads;
3281
3282 __kmp_free(team->t.t_disp_buffer);
3283 __kmp_free(team->t.t_dispatch);
3284 __kmp_free(team->t.t_implicit_task_taskdata);
3285 __kmp_allocate_team_arrays(team, max_nth);
3286
3287 KMP_MEMCPY(team->t.t_threads, oldThreads,
3288 team->t.t_nproc * sizeof(kmp_info_t *));
3289
3290 __kmp_free(oldThreads);
3291 }
3292
// Build a kmp_internal_control_t snapshot of the current global ICV settings,
// used to seed newly created teams. NOTE: this is a positional aggregate
// initializer -- entries must stay in kmp_internal_control_t field order.
static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
      // on task. This is used in the case of target thread_limit
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0], // proc_bind ICV (outermost level)
      __kmp_default_device, // default-device-var ICV
      NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}
3328
// Build an ICV snapshot from the given team's thread 0's current task,
// terminated with next = NULL.
static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {

  kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serial like in save_inter_controls
  // NOTE(review): copy_icvs() below copies the task's ICVs wholesale; whether
  // it overwrites the serial_nesting_level written above depends on which
  // fields copy_icvs covers -- confirm before relying on the 0 here.
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
  gx_icvs.next = NULL;

  return gx_icvs;
}
3339
// One-time initialization of a root: resets the root state flags, allocates
// and initializes the (serialized, single-thread) root team, then allocates
// and initializes the root's hot team.
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin); // must not be initialized twice

  /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;
#if KMP_AFFINITY_SUPPORTED
  root->r.r_affinity_assigned = FALSE;
#endif

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1; // root team starts serialized
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.sched = r_sched.sched;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  // NOTE(review): this re-clears root_team's control stack -- hot_team's
  // stack top is never assigned here. Looks like a copy-paste of the line
  // above; confirm whether hot_team->t.t_control_stack_top was intended.
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.sched = r_sched.sched;
  hot_team->t.t_size_changed = 0;
}
3431
3432 #ifdef KMP_DEBUG
3433
// Singly-linked list item used by the __kmp_print_structure* helpers below to
// accumulate the set of teams reachable from threads and roots. The list is
// kept sorted by team id and is terminated by a sentinel item whose entry and
// next are both NULL.
typedef struct kmp_team_list_item {
  kmp_team_p const *entry; // Team stored in this item (NULL in the sentinel).
  struct kmp_team_list_item *next; // Next item; NULL terminates the list.
} kmp_team_list_item_t;
typedef kmp_team_list_item_t *kmp_team_list_t;
3439
__kmp_print_structure_team_accum(kmp_team_list_t list,kmp_team_p const * team)3440 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3441 kmp_team_list_t list, // List of teams.
3442 kmp_team_p const *team // Team to add.
3443 ) {
3444
3445 // List must terminate with item where both entry and next are NULL.
3446 // Team is added to the list only once.
3447 // List is sorted in ascending order by team id.
3448 // Team id is *not* a key.
3449
3450 kmp_team_list_t l;
3451
3452 KMP_DEBUG_ASSERT(list != NULL);
3453 if (team == NULL) {
3454 return;
3455 }
3456
3457 __kmp_print_structure_team_accum(list, team->t.t_parent);
3458 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3459
3460 // Search list for the team.
3461 l = list;
3462 while (l->next != NULL && l->entry != team) {
3463 l = l->next;
3464 }
3465 if (l->next != NULL) {
3466 return; // Team has been added before, exit.
3467 }
3468
3469 // Team is not found. Search list again for insertion point.
3470 l = list;
3471 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3472 l = l->next;
3473 }
3474
3475 // Insert team.
3476 {
3477 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3478 sizeof(kmp_team_list_item_t));
3479 *item = *l;
3480 l->entry = team;
3481 l->next = item;
3482 }
3483 }
3484
__kmp_print_structure_team(char const * title,kmp_team_p const * team)3485 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3486
3487 ) {
3488 __kmp_printf("%s", title);
3489 if (team != NULL) {
3490 __kmp_printf("%2x %p\n", team->t.t_id, team);
3491 } else {
3492 __kmp_printf(" - (nil)\n");
3493 }
3494 }
3495
__kmp_print_structure_thread(char const * title,kmp_info_p const * thread)3496 static void __kmp_print_structure_thread(char const *title,
3497 kmp_info_p const *thread) {
3498 __kmp_printf("%s", title);
3499 if (thread != NULL) {
3500 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3501 } else {
3502 __kmp_printf(" - (nil)\n");
3503 }
3504 }
3505
/* Debug helper: dump the global thread table, the __kmp_threads array, the
   __kmp_root (uber thread) array, every team reachable from them, and the
   thread/team free pools. Builds a temporary sorted team list via
   __kmp_print_structure_team_accum and frees it before returning. */
void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams (a single sentinel item).
  list =
      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }

  // Print out __kmp_threads array, accumulating each thread's teams into the
  // list as we go.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
        __kmp_print_structure_team("    Serial Team:  ",
                                   thread->th.th_serial_team);
        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread("    Primary:      ",
                                     thread->th.th_team_master);
        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread("    Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }

  // Print out __kmp_root array, accumulating each root's teams as well.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
        __kmp_print_structure_thread("    Uber Thread:  ",
                                     root->r.r_uber_thread);
        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
        __kmp_printf("    In Parallel:  %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }

  // Walk the accumulated (sorted) team list; the final sentinel item has a
  // NULL next pointer and is not printed.
  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
    __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf("    Thread %2d:      ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool:          ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool:            ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");

  // Free team list.
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }
}
3623
3624 #endif
3625
//---------------------------------------------------------------------------
// Stuff for per-thread fast random number generator
// Table of 64 odd 32-bit constants ("primes") used as LCG multipliers;
// __kmp_init_random picks one per thread (indexed by thread id) so different
// threads generate different sequences.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3641
3642 //---------------------------------------------------------------------------
3643 // __kmp_get_random: Get a random number using a linear congruential method.
__kmp_get_random(kmp_info_t * thread)3644 unsigned short __kmp_get_random(kmp_info_t *thread) {
3645 unsigned x = thread->th.th_x;
3646 unsigned short r = (unsigned short)(x >> 16);
3647
3648 thread->th.th_x = x * thread->th.th_a + 1;
3649
3650 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3651 thread->th.th_info.ds.ds_tid, r));
3652
3653 return r;
3654 }
3655 //--------------------------------------------------------
3656 // __kmp_init_random: Initialize a random number generator
__kmp_init_random(kmp_info_t * thread)3657 void __kmp_init_random(kmp_info_t *thread) {
3658 unsigned seed = thread->th.th_info.ds.ds_tid;
3659
3660 thread->th.th_a =
3661 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3662 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3663 KA_TRACE(30,
3664 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3665 }
3666
3667 #if KMP_OS_WINDOWS
3668 /* reclaim array entries for root threads that are already dead, returns number
3669 * reclaimed */
__kmp_reclaim_dead_roots(void)3670 static int __kmp_reclaim_dead_roots(void) {
3671 int i, r = 0;
3672
3673 for (i = 0; i < __kmp_threads_capacity; ++i) {
3674 if (KMP_UBER_GTID(i) &&
3675 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3676 !__kmp_root[i]
3677 ->r.r_active) { // AC: reclaim only roots died in non-active state
3678 r += __kmp_unregister_root_other_thread(i);
3679 }
3680 }
3681 return r;
3682 }
3683 #endif
3684
3685 /* This function attempts to create free entries in __kmp_threads and
3686 __kmp_root, and returns the number of free entries generated.
3687
3688 For Windows* OS static library, the first mechanism used is to reclaim array
3689 entries for root threads that are already dead.
3690
3691 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3692 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3693 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3694 threadprivate cache array has been created. Synchronization with
3695 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3696
3697 After any dead root reclamation, if the clipping value allows array expansion
3698 to result in the generation of a total of nNeed free slots, the function does
3699 that expansion. If not, nothing is done beyond the possible initial root
3700 thread reclamation.
3701
3702 If any argument is negative, the behavior is undefined. */
static int __kmp_expand_threads(int nNeed) {
  int added = 0;
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  // Reclaimed slots count against the request; never let it go negative.
  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
  // may not be reused by another thread, so we may need to increase
  // __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered. We always register new foreign
  // roots. This may cause a smaller # of threads to be allocated at
  // subsequent parallel regions, but the worker threads hang around (and
  // eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  // Double the capacity until it covers the requirement, clipping at
  // __kmp_sys_max_nth; the headroom check above guarantees termination with
  // newCapacity >= minimumRequiredCapacity.
  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  // __kmp_threads and __kmp_root share a single allocation: thread pointers
  // first, then root pointers, plus one cache line of padding.
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
  // Put old __kmp_threads array on a list. Any ongoing references to the old
  // list will be valid. This list is cleaned up at library shutdown.
  kmp_old_threads_list_t *node =
      (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
  node->threads = __kmp_threads;
  node->next = __kmp_old_threads_list;
  __kmp_old_threads_list = node;

  // Publish the new arrays and capacity through volatile stores so readers
  // observe the new pointers before the enlarged capacity.
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  // Keep the threadprivate cache in sync with the new thread-array size.
  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}
3793
3794 /* Register the current thread as a root thread and obtain our gtid. We must
3795 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3796 thread that calls from __kmp_do_serial_initialize() */
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
     work as expected -- it may return false (that means there is at least one
     empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for initial thread and so cannot be used for this
     one. Following code workarounds this bug.

     However, right solution seems to be not reserving slot #0 for initial
     thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect initial thread reliably (the first thread which does
     serial initialization may be not a real initial thread).
  */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }

  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }

  /* see if there are too many threads */
  // NOTE(review): both branches call __kmp_fatal, which presumably does not
  // return -- confirm; no recovery path exists here.
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
    if (__kmp_tp_cached) {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
                  __kmp_msg_null);
    }
  }

  // When hidden helper task is enabled, __kmp_threads is organized as follows:
  // 0: initial thread, also a regular OpenMP thread.
  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
  // regular OpenMP threads.
  if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for hidden helper thread. Slots for hidden
    // helper threads start from 1 to __kmp_hidden_helper_threads_num.
    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
                   gtid <= __kmp_hidden_helper_threads_num;
         gtid++)
      ;
    KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
                 "hidden helper thread: T#%d\n",
                 gtid));
  } else {
    /* find an available thread slot */
    // Don't reassign the zero slot since we need that to only be used by
    // initial thread. Slots for hidden helper threads should also be skipped.
    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
      gtid = 0;
    } else {
      for (gtid = __kmp_hidden_helper_threads_num + 1;
           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
        ;
    }
    KA_TRACE(
        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
    KMP_ASSERT(gtid < __kmp_threads_capacity);
  }

  /* update global accounting */
  __kmp_all_nth++;
  TCW_4(__kmp_nth, __kmp_nth + 1);

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* setup this new hierarchy */
  // Allocate the root structure for this gtid on first use; it is reused if
  // a root previously lived in this slot.
  if (!(root = __kmp_root[gtid])) {
    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
    KMP_DEBUG_ASSERT(!root->r.r_root_team);
  }

#if KMP_STATS_ENABLED
  // Initialize stats as soon as possible (right after gtid assignment).
  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
  __kmp_stats_thread_ptr->startLife();
  KMP_SET_THREAD_STATE(SERIAL_REGION);
  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
  __kmp_initialize_root(root);

  /* setup new root thread structure */
  // Reuse the uber thread if this root had one; otherwise allocate and do
  // one-time per-thread initialization (consistency stack, fast memory,
  // bget, RNG).
  if (root->r.r_uber_thread) {
    root_thread = root->r.r_uber_thread;
  } else {
    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
    if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
    }
    root_thread->th.th_info.ds.ds_gtid = gtid;
#if OMPT_SUPPORT
    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
#endif
    root_thread->th.th_root = root;
    if (__kmp_env_consistency_check) {
      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
    }
#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
#endif
    __kmp_init_random(root_thread); // Initialize random number generator
  }

  /* setup the serial team held in reserve by the root thread */
  if (!root_thread->th.th_serial_team) {
    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, 1, 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(root_thread->th.th_serial_team);
  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
                root_thread->th.th_serial_team));

  /* drop root_thread into place */
  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);

  root->r.r_root_team->t.t_threads[0] = root_thread;
  root->r.r_hot_team->t.t_threads[0] = root_thread;
  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team created in reserve, not for execution (it is unused for now).
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the primary thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  // Every root thread starts its own contention group.
  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (!initial_thread && ompt_enabled.enabled) {

    // NOTE: intentionally shadows the outer root_thread declaration.
    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}
4060
4061 #if KMP_NESTED_HOT_TEAMS
// Recursively free the nested hot teams below thread 'thr', starting at
// nesting depth 'level' (levels range over [0, max_level)). Deeper levels are
// freed before the team at this level. Returns the number of worker threads
// released; each team's primary thread is not counted.
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  // Nothing to free if this thread carries no hot-team info at this level.
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // primary thread is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      // Free deeper levels first, then the worker's own hot-team array
      // (skipping i == 0, the primary thread).
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
4086 #endif
4087
4088 // Resets a root thread and clear its root and hot teams.
4089 // Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  // Report the end of the root's implicit task and of the thread itself.
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  // Drop this root's reference on its contention-group node; 'i' holds the
  // pre-decrement count, so i == 1 means this was the last reference.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put root thread to __kmp_thread_pool, so we have to reap it
  // instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}
4182
/* Unregister the calling root thread: wait for any outstanding proxy or
   hidden-helper tasks, then reset the root and release its resources.
   Acquires and releases __kmp_forkjoin_lock internally. */
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close. furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  // Nothing to do if the library already shut down (or never initialized).
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
                            task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
  }

  __kmp_reset_root(gtid, root);

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
4227
4228 #if KMP_OS_WINDOWS
4229 /* __kmp_forkjoin_lock must be already held
4230 Unregisters a root thread that is not the current thread. Returns the number
4231 of __kmp_threads entries freed as a result. */
__kmp_unregister_root_other_thread(int gtid)4232 static int __kmp_unregister_root_other_thread(int gtid) {
4233 kmp_root_t *root = __kmp_root[gtid];
4234 int r;
4235
4236 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4237 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4238 KMP_ASSERT(KMP_UBER_GTID(gtid));
4239 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4240 KMP_ASSERT(root->r.r_active == FALSE);
4241
4242 r = __kmp_reset_root(gtid, root);
4243 KC_TRACE(10,
4244 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4245 return r;
4246 }
4247 #endif
4248
4249 #if KMP_DEBUG
__kmp_task_info()4250 void __kmp_task_info() {
4251
4252 kmp_int32 gtid = __kmp_entry_gtid();
4253 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4254 kmp_info_t *this_thr = __kmp_threads[gtid];
4255 kmp_team_t *steam = this_thr->th.th_serial_team;
4256 kmp_team_t *team = this_thr->th.th_team;
4257
4258 __kmp_printf(
4259 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4260 "ptask=%p\n",
4261 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4262 team->t.t_implicit_task_taskdata[tid].td_parent);
4263 }
4264 #endif // KMP_DEBUG
4265
/* TODO optimize with one big memclr, take out what isn't needed, split
   responsibility to workers as much as possible, and delay initialization of
   features as much as possible */
// Bind a (new or pooled) thread to slot `tid` of `team`: cache team fields on
// the thread, set up its implicit task, private-common table, CG root
// membership, per-thread dispatch buffers and the task-state memo stack.
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;
#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

  // Lazily allocate the thread's private-common (threadprivate) table.
  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }

  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
    // Make new thread's CG root same as primary thread's
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // worker changes CG, need to check if old CG should be freed
      // `i` is the pre-decrement count; 1 means this thread was the last member
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp); // last thread left CG --> free it
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    // Increment new thread's CG root's counter to add the new thread
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    // Thread inherits the thread-limit ICV of its (new) contention group.
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    // A serialized team (max_nproc == 1) only ever needs one buffer.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    // Allocate the buffers on first use; otherwise just zero the existing ones.
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size,
            "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  // Lazily allocate and zero the task-state memo stack (initial depth 4).
  if (!this_thr->th.th_task_state_memo_stack) {
    size_t i;
    this_thr->th.th_task_state_memo_stack =
        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
    this_thr->th.th_task_state_top = 0;
    this_thr->th.th_task_state_stack_sz = 4;
    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
         ++i) // zero init the stack
      this_thr->th.th_task_state_memo_stack[i] = 0;
  }

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}
4421
/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first. */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool */
  if (__kmp_thread_pool) {
    // Pop the head of the pool list.
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    // Under the thread's suspend mutex, move it out of the active-in-pool
    // accounting if needed.
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    TCW_4(__kmp_nth, __kmp_nth + 1);

    new_thr->th.th_task_state = 0;
    new_thr->th.th_task_state_top = 0;
    new_thr->th.th_task_state_stack_sz = 4;

    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      // Make sure pool thread has transitioned to waiting on own thread struct
      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
      // Thread activated in __kmp_allocate_team when increasing team size
    }

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, well fork a new one */
  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread. We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    // Double-checked under __kmp_monitor_lock to create the monitor once.
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the primary thread is
      // in DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and primary thread has no means to inform the monitor that
      // the library has gone, because all the memory which the monitor can
      // access is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    // Pick the smallest unused gtid. While the hidden helper threads are being
    // initialized, search from 1 (the helpers' own range); otherwise skip past
    // the gtids reserved for hidden helpers.
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race conditions detection on synchronization flags in debug mode
  // this helps to analyze library internals eliminating false positives
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  // TODO: check if we need to also suppress b_arrived flags
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's primary thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  // Reset all barrier state on the brand-new thread.
  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
  new_thr->th.th_sleep_loc_type = flag_unset;

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_AFFINITY_SUPPORTED
  // Set the affinity and topology information for new thread
  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
#endif

  /* actually fork it and create the new worker thread */
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}
4714
/* Reinitialize team for reuse.
   The hot team code calls this case at every fork barrier, so EPCC barrier
   test are extremely sensitive to changes in it, esp. writes to the team
   struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
// Refreshes a reused team's ident/id and propagates new ICVs into the primary
// thread's implicit task. KMP_CHECK_UPDATE is used so the team struct is only
// written when the value actually changes (see cache-invalidation note above).
static void __kmp_reinitialize_team(kmp_team_t *team,
                                    kmp_internal_control_t *new_icvs,
                                    ident_t *loc) {
  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
  KMP_DEBUG_ASSERT(team && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
  KMP_CHECK_UPDATE(team->t.t_ident, loc);

  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
  // Copy ICVs to the primary thread's implicit taskdata
  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);

  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
}
4737
/* Initialize the team data structure.
   This assumes the t_threads and t_max_nproc are already set.
   Also, we don't touch the arguments */
// Resets the team's per-fork fields for `new_nproc` threads, then delegates
// ident/id/ICV propagation to __kmp_reinitialize_team. Fields marked
// "not needed" are cleared defensively rather than read later.
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc) {
  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));

  /* verify */
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_MB();

  team->t.t_master_tid = 0; /* not needed */
  /* team->t.t_master_bar; not needed */
  // A one-thread team is a serialized team.
  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
  team->t.t_nproc = new_nproc;

  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
  team->t.t_next_pool = NULL;
  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
   * up hot team */

  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
  team->t.t_invoke = NULL; /* not needed */

  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
  team->t.t_sched.sched = new_icvs->sched.sched;

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  team->t.t_fp_control_saved = FALSE; /* not needed */
  team->t.t_x87_fpu_control_word = 0; /* not needed */
  team->t.t_mxcsr = 0; /* not needed */
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  team->t.t_construct = 0;

  team->t.t_ordered.dt.t_value = 0;
  team->t.t_master_active = FALSE;

#ifdef KMP_DEBUG
  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
#endif
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
#endif

  team->t.t_control_stack_top = NULL;

  __kmp_reinitialize_team(team, new_icvs, loc);

  KMP_MB();
  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}
4793
4794 #if KMP_AFFINITY_SUPPORTED
__kmp_set_thread_place(kmp_team_t * team,kmp_info_t * th,int first,int last,int newp)4795 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4796 int first, int last, int newp) {
4797 th->th.th_first_place = first;
4798 th->th.th_last_place = last;
4799 th->th.th_new_place = newp;
4800 if (newp != th->th.th_current_place) {
4801 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4802 team->t.t_display_affinity = 1;
4803 // Copy topology information associated with the new place
4804 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4805 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4806 }
4807 }
4808
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and binds each worker to a thread in their partition.
// The primary thread's partition should already include its current binding.
// `update_master_only != 0` restricts the spread-policy loops to thread 0.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Do not partition places for the hidden helper team
  if (KMP_HIDDEN_HELPER_TEAM(team))
    return;
  // Copy the primary thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  int num_masks = __kmp_affinity.num_masks;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // Not an issue -- we don't rebind primary thread for any proc_bind policy.
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_primary: {
    // All workers share the primary thread's place and partition.
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      __kmp_set_thread_place(team, th, first_place, last_place, masters_place);

      KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    // The partition may wrap around the end of the mask list.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // One thread per consecutive place, starting next to the primary.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        // Advance to the next place, wrapping within [first,last] and at
        // the end of the global mask list.
        if (place == last_place) {
          place = first_place;
        } else if (place == (num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }
        __kmp_set_thread_place(team, th, first_place, last_place, place);

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: pack S threads per place, with one extra
      // thread on every `gap`-th place until `rem` extras are used up.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        __kmp_set_thread_place(team, th, first_place, last_place, place);
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      // Thread 0 is assigned first, so the walk must end back where it began.
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx;
    // The partition may wrap around the end of the mask list.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      int place = -1;

      if (n_places != num_masks) {
        // Sub-partition the parent's places: each thread gets a contiguous
        // chunk of S (or S+1) places and is bound to the chunk's first place.
        int S = n_places / n_th;
        int s_count, rem, gap, gap_ct;

        place = masters_place;
        rem = n_places - n_th * S;
        gap = rem ? n_th / rem : 1;
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);

          int fplace = place, nplace = place;
          s_count = 1;
          // Walk forward S-1 places to find the end of this thread's chunk.
          while (s_count < S) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            s_count++;
          }
          // Every `gap`-th thread absorbs one of the `rem` leftover places.
          if (rem && (gap_ct == gap)) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            rem--;
            gap_ct = 0;
          }
          __kmp_set_thread_place(team, th, fplace, place, nplace);
          gap_ct++;

          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }

          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        double current = static_cast<double>(masters_place);
        double spacing =
            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
        int first, last;
        kmp_info_t *th;

        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          first = static_cast<int>(current);
          last = static_cast<int>(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          // Wrap partitions that ran past the end of the place list.
          if (first >= n_places) {
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            __kmp_set_thread_place(team, th, first, last, place);
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: each thread gets exactly one place; pack
      // S (or S+1, for the remainder) threads per place.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        __kmp_set_thread_place(team, th, place, place, place);
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}
5124
5125 #endif // KMP_AFFINITY_SUPPORTED
5126
/* Allocate a team data structure to use; take one off of the free pool if
   available.

   Resolution order:
     1. Reuse the per-root (or, with KMP_NESTED_HOT_TEAMS, per-level) "hot"
        team, growing or shrinking it to new_nproc as needed.
     2. Otherwise take a large-enough team from the global team pool,
        reaping undersized pool entries along the way.
     3. Otherwise allocate and initialize a brand-new kmp_team_t.

   root          -- root under which the team will run
   new_nproc     -- number of threads the region will actually use (>= 1)
   max_nproc     -- upper bound on team size (>= new_nproc)
   new_proc_bind -- proc-bind policy to install on the team
   new_icvs      -- internal control values to (re)initialize the team with
   argc          -- number of microtask arguments to reserve space for
   master        -- (KMP_NESTED_HOT_TEAMS only) primary thread of the parent
                    team, used to locate the per-level hot team */
kmp_team_t *
__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if OMPT_SUPPORT
                    ompt_data_t ompt_parallel_data,
#endif
                    kmp_proc_bind_t new_proc_bind,
                    kmp_internal_control_t *new_icvs,
                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
  int f;
  kmp_team_t *team;
  int use_hot_team = !root->r.r_active;
  int level = 0;
  int do_place_partition = 1;

  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
  KMP_MB();

#if KMP_NESTED_HOT_TEAMS
  // Determine the nesting level whose hot team (if any) should be reused.
  kmp_hot_team_ptr_t *hot_teams;
  if (master) {
    team = master->th.th_team;
    level = team->t.t_active_level;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1 &&
          ( // #teams > 1
              team->t.t_pkfn ==
                  (microtask_t)__kmp_teams_master || // inner fork of the teams
              master->th.th_teams_level <
                  team->t.t_level)) { // or nested parallel inside the teams
        ++level; // not increment if #teams==1, or for outer fork of the teams;
        // increment otherwise
      }
      // Do not perform the place partition if inner fork of the teams
      // Wait until nested parallel region encountered inside teams construct
      if ((master->th.th_teams_size.nteams == 1 &&
           master->th.th_teams_level >= team->t.t_level) ||
          (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
        do_place_partition = 0;
    }
    hot_teams = master->th.th_hot_teams;
    if (level < __kmp_hot_teams_max_level && hot_teams &&
        hot_teams[level].hot_team) {
      // hot team has already been allocated for given level
      use_hot_team = 1;
    } else {
      use_hot_team = 0;
    }
  } else {
    // check we won't access uninitialized hot_teams, just in case
    KMP_DEBUG_ASSERT(new_nproc == 1);
  }
#endif
  // Optimization to use a "hot" team
  if (use_hot_team && new_nproc > 1) {
    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
#if KMP_NESTED_HOT_TEAMS
    team = hot_teams[level].hot_team;
#else
    team = root->r.r_hot_team;
#endif
#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
                    "task_team[1] = %p before reinit\n",
                    team->t.t_task_team[0], team->t.t_task_team[1]));
    }
#endif

    if (team->t.t_nproc != new_nproc &&
        __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      // Distributed barrier may need a resize
      int old_nthr = team->t.t_nproc;
      __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
    }

    // If not doing the place partition, then reset the team's proc bind
    // to indicate that partitioning of all threads still needs to take place
    if (do_place_partition == 0)
      team->t.t_proc_bind = proc_bind_default;
    // Has the number of threads changed?
    /* Let's assume the most common case is that the number of threads is
       unchanged, and put that case first. */
    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
      // This case can mean that omp_set_num_threads() was called and the hot
      // team size was already reduced, so we check the special flag
      if (team->t.t_size_changed == -1) {
        team->t.t_size_changed = 1;
      } else {
        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
      }

      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
      kmp_r_sched_t new_sched = new_icvs->sched;
      // set primary thread's schedule as new run-time schedule
      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);

      __kmp_reinitialize_team(team, new_icvs,
                              root->r.r_uber_thread->th.th_ident);

      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
                    team->t.t_threads[0], team));
      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);

#if KMP_AFFINITY_SUPPORTED
      if ((team->t.t_size_changed == 0) &&
          (team->t.t_proc_bind == new_proc_bind)) {
        if (new_proc_bind == proc_bind_spread) {
          if (do_place_partition) {
            // add flag to update only master for spread
            __kmp_partition_places(team, 1);
          }
        }
        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
                       "proc_bind = %d, partition = [%d,%d]\n",
                       team->t.t_id, new_proc_bind, team->t.t_first_place,
                       team->t.t_last_place));
      } else {
        if (do_place_partition) {
          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
          __kmp_partition_places(team);
        }
      }
#else
      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#endif /* KMP_AFFINITY_SUPPORTED */
    } else if (team->t.t_nproc > new_nproc) {
      // Hot team is larger than requested: release (or park) the surplus
      // threads, then reinitialize the survivors.
      KA_TRACE(20,
               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
                new_nproc));

      team->t.t_size_changed = 1;
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Barrier size already reduced earlier in this function
        // Activate team threads via th_used_in_team
        __kmp_add_threads_to_team(team, new_nproc);
      }
#if KMP_NESTED_HOT_TEAMS
      if (__kmp_hot_teams_mode == 0) {
        // AC: saved number of threads should correspond to team's value in this
        // mode, can be bigger in mode 1, when hot team has threads in reserve
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
        hot_teams[level].hot_team_nth = new_nproc;
#endif // KMP_NESTED_HOT_TEAMS
        /* release the extra threads we don't need any more */
        for (f = new_nproc; f < team->t.t_nproc; f++) {
          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
          if (__kmp_tasking_mode != tskm_immediate_exec) {
            // When decreasing team size, threads no longer in the team should
            // unref task team.
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          __kmp_free_thread(team->t.t_threads[f]);
          team->t.t_threads[f] = NULL;
        }
#if KMP_NESTED_HOT_TEAMS
      } // (__kmp_hot_teams_mode == 0)
      else {
        // When keeping extra threads in team, switch threads to wait on own
        // b_go flag
        for (f = new_nproc; f < team->t.t_nproc; ++f) {
          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
          for (int b = 0; b < bs_last_barrier; ++b) {
            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
            }
            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
          }
        }
      }
#endif // KMP_NESTED_HOT_TEAMS
      team->t.t_nproc = new_nproc;
      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
      __kmp_reinitialize_team(team, new_icvs,
                              root->r.r_uber_thread->th.th_ident);

      // Update remaining threads
      for (f = 0; f < new_nproc; ++f) {
        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
      }

      // restore the current task state of the primary thread: should be the
      // implicit task
      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
                    team->t.t_threads[0], team));

      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);

#ifdef KMP_DEBUG
      for (f = 0; f < team->t.t_nproc; f++) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                         team->t.t_threads[f]->th.th_team_nproc ==
                             team->t.t_nproc);
      }
#endif

      if (do_place_partition) {
        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
        __kmp_partition_places(team);
#endif
      }
    } else { // team->t.t_nproc < new_nproc
      // Hot team is smaller than requested: involve reserved threads first
      // (nested hot teams, mode 1), then allocate any remaining workers.
      KA_TRACE(20,
               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
                new_nproc));
      int old_nproc = team->t.t_nproc; // save old value and use to update only
      team->t.t_size_changed = 1;

#if KMP_NESTED_HOT_TEAMS
      int avail_threads = hot_teams[level].hot_team_nth;
      if (new_nproc < avail_threads)
        avail_threads = new_nproc;
      kmp_info_t **other_threads = team->t.t_threads;
      for (f = team->t.t_nproc; f < avail_threads; ++f) {
        // Adjust barrier data of reserved threads (if any) of the team
        // Other data will be set in __kmp_initialize_info() below.
        int b;
        kmp_balign_t *balign = other_threads[f]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
      if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any
        // this only possible in mode 1, cannot have reserved threads in mode 0
        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
        team->t.t_nproc = new_nproc; // just get reserved threads involved
      } else {
        // We may have some threads in reserve, but not enough;
        // get reserved threads involved if any.
        team->t.t_nproc = hot_teams[level].hot_team_nth;
        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
#endif // KMP_NESTED_HOT_TEAMS
        if (team->t.t_max_nproc < new_nproc) {
          /* reallocate larger arrays */
          __kmp_reallocate_team_arrays(team, new_nproc);
          __kmp_reinitialize_team(team, new_icvs, NULL);
        }

#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set full mask for primary thread before creation of
           workers. The reason is that workers inherit the affinity from the
           primary thread, so if a lot of workers are created on the single
           core quickly, they don't get a chance to set their own affinity for
           a long time. */
        kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
#endif

        /* allocate new threads for the hot team */
        for (f = team->t.t_nproc; f < new_nproc; f++) {
          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
          KMP_DEBUG_ASSERT(new_worker);
          team->t.t_threads[f] = new_worker;

          KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d arrived: "
                    "join=%llu, plain=%llu\n",
                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));

          { // Initialize barrier data for new threads.
            int b;
            kmp_balign_t *balign = new_worker->th.th_bar;
            for (b = 0; b < bs_last_barrier; ++b) {
              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
                               KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
            }
          }
        }

#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Restore initial primary thread's affinity mask */
        new_temp_affinity.restore();
#endif
#if KMP_NESTED_HOT_TEAMS
      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
#endif // KMP_NESTED_HOT_TEAMS
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Barrier size already increased earlier in this function
        // Activate team threads via th_used_in_team
        __kmp_add_threads_to_team(team, new_nproc);
      }
      /* make sure everyone is synchronized */
      // new threads below
      __kmp_initialize_team(team, new_nproc, new_icvs,
                            root->r.r_uber_thread->th.th_ident);

      /* reinitialize the threads */
      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
      for (f = 0; f < team->t.t_nproc; ++f)
        __kmp_initialize_info(team->t.t_threads[f], team, f,
                              __kmp_gtid_from_tid(f, team));

      // set th_task_state for new threads in hot team with older thread's state
      kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
      for (f = old_nproc; f < team->t.t_nproc; ++f)
        team->t.t_threads[f]->th.th_task_state = old_state;

#ifdef KMP_DEBUG
      for (f = 0; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                         team->t.t_threads[f]->th.th_team_nproc ==
                             team->t.t_nproc);
      }
#endif

      if (do_place_partition) {
        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
        __kmp_partition_places(team);
#endif
      }
    } // Check changes in number of threads

    // Common tail for all three hot-team cases: propagate teams-construct
    // info, sync nested barrier state, and reserve argv space.
    kmp_info_t *master = team->t.t_threads[0];
    if (master->th.th_teams_microtask) {
      for (f = 1; f < new_nproc; ++f) {
        // propagate teams construct specific info to workers
        kmp_info_t *thr = team->t.t_threads[f];
        thr->th.th_teams_microtask = master->th.th_teams_microtask;
        thr->th.th_teams_level = master->th.th_teams_level;
        thr->th.th_teams_size = master->th.th_teams_size;
      }
    }
#if KMP_NESTED_HOT_TEAMS
    if (level) {
      // Sync barrier state for nested hot teams, not needed for outermost hot
      // team.
      for (f = 1; f < new_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        int b;
        kmp_balign_t *balign = thr->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }
#endif // KMP_NESTED_HOT_TEAMS

    /* reallocate space for arguments if necessary */
    __kmp_alloc_argv_entries(argc, team, TRUE);
    KMP_CHECK_UPDATE(team->t.t_argc, argc);
    // The hot team re-uses the previous task team,
    // if untouched during the previous release->gather phase.

    KF_TRACE(10, (" hot_team = %p\n", team));

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
                    "task_team[1] = %p after reinit\n",
                    team->t.t_task_team[0], team->t.t_task_team[1]));
    }
#endif

#if OMPT_SUPPORT
    __ompt_team_assign_id(team, ompt_parallel_data);
#endif

    KMP_MB();

    return team;
  }

  /* next, let's try to take one from the team pool */
  KMP_MB();
  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
    /* TODO: consider resizing undersized teams instead of reaping them, now
       that we have a resizing mechanism */
    if (team->t.t_max_nproc >= max_nproc) {
      /* take this team from the team pool */
      __kmp_team_pool = team->t.t_next_pool;

      if (max_nproc > 1 &&
          __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        if (!team->t.b) { // Allocate barrier structure
          team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
        }
      }

      /* setup the team for fresh use */
      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);

      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
                    "task_team[1] %p to NULL\n",
                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
      team->t.t_task_team[0] = NULL;
      team->t.t_task_team[1] = NULL;

      /* reallocate space for arguments if necessary */
      __kmp_alloc_argv_entries(argc, team, TRUE);
      KMP_CHECK_UPDATE(team->t.t_argc, argc);

      KA_TRACE(
          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
      { // Initialize barrier data.
        int b;
        for (b = 0; b < bs_last_barrier; ++b) {
          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
          team->t.t_bar[b].b_master_arrived = 0;
          team->t.t_bar[b].b_team_arrived = 0;
#endif
        }
      }

      team->t.t_proc_bind = new_proc_bind;

      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
                    team->t.t_id));

#if OMPT_SUPPORT
      __ompt_team_assign_id(team, ompt_parallel_data);
#endif

      KMP_MB();

      return team;
    }

    /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but, will be redone during the hot-teams
    // rewrite.
    /* TODO: Use technique to find the right size hot-team, don't reap them */
    team = __kmp_reap_team(team);
    __kmp_team_pool = team;
  }

  /* nothing available in the pool, no matter, make a new team! */
  KMP_MB();
  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));

  /* and set it up */
  team->t.t_max_nproc = max_nproc;
  if (max_nproc > 1 &&
      __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
    // Allocate barrier structure
    team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
  }

  /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so, let's not use this */
  __kmp_allocate_team_arrays(team, max_nproc);

  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);

  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
                "%p to NULL\n",
                &team->t.t_task_team[0], &team->t.t_task_team[1]));
  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
  // memory, no need to duplicate
  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
  // memory, no need to duplicate

  if (__kmp_storage_map) {
    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
  }

  /* allocate space for arguments */
  __kmp_alloc_argv_entries(argc, team, FALSE);
  team->t.t_argc = argc;

  KA_TRACE(20,
           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      team->t.t_bar[b].b_master_arrived = 0;
      team->t.t_bar[b].b_team_arrived = 0;
#endif
    }
  }

  team->t.t_proc_bind = new_proc_bind;

#if OMPT_SUPPORT
  __ompt_team_assign_id(team, ompt_parallel_data);
  team->t.ompt_serialized_team_info = NULL;
#endif

  KMP_MB();

  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
                team->t.t_id));

  return team;
}
5641
5642 /* TODO implement hot-teams at all levels */
5643 /* TODO implement lazy thread release on demand (disband request) */
5644
/* Free the team: return it to the team pool and release all the threads
   associated with it.

   Hot teams (the per-root hot team, or a per-level nested hot team) are kept
   alive: only their CG-root bookkeeping is cleaned up. Non-hot teams have
   their task teams deactivated, their workers returned to the thread free
   pool, and the team itself pushed onto __kmp_team_pool.

   root   -- root the team belongs to (used to recognize the hot team)
   team   -- team being released
   master -- (KMP_NESTED_HOT_TEAMS only) primary thread, used to locate the
             per-level hot team and to free task teams */
void __kmp_free_team(kmp_root_t *root,
                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  int f;
  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
                team->t.t_id));

  /* verify state */
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);

  int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
  int level;
  if (master) {
    level = team->t.t_active_level - 1;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
#if KMP_DEBUG
    kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
#endif
    if (level < __kmp_hot_teams_max_level) {
      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
      use_hot_team = 1;
    }
  }
#endif // KMP_NESTED_HOT_TEAMS

  /* team is done working */
  TCW_SYNC_PTR(team->t.t_pkfn,
               NULL); // Important for Debugging Support Library.
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; // init counter for possible reuse
#endif
  // Do not reset pointer to parent team to NULL for hot teams.

  /* if we are non-hot team, release our threads */
  if (!use_hot_team) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Wait for threads to reach reapable state
      for (f = 1; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
        kmp_info_t *th = team->t.t_threads[f];
        volatile kmp_uint32 *state = &th->th.th_reap_state;
        while (*state != KMP_SAFE_TO_REAP) {
#if KMP_OS_WINDOWS
          // On Windows a thread can be killed at any time, check this
          DWORD ecode;
          if (!__kmp_is_thread_alive(th, &ecode)) {
            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
            break;
          }
#endif
          // first check if thread is sleeping
          kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
          if (fl.is_sleeping())
            fl.resume(__kmp_gtid_from_thread(th));
          KMP_CPU_PAUSE();
        }
      }

      // Delete task teams
      int tt_idx;
      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
        if (task_team != NULL) {
          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          KA_TRACE(
              20,
              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
               __kmp_get_gtid(), task_team, team->t.t_id));
#if KMP_NESTED_HOT_TEAMS
          __kmp_free_task_team(master, task_team);
#endif
          team->t.t_task_team[tt_idx] = NULL;
        }
      }
    }

    // Reset pointer to parent team only for non-hot teams.
    team->t.t_parent = NULL;
    team->t.t_level = 0;
    team->t.t_active_level = 0;

    /* free the worker threads */
    for (f = 1; f < team->t.t_nproc; ++f) {
      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
                                    1, 2);
      }
      __kmp_free_thread(team->t.t_threads[f]);
    }

    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      if (team->t.b) {
        // wake up thread at old location
        team->t.b->go_release();
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
          for (f = 1; f < team->t.t_nproc; ++f) {
            if (team->t.b->sleep[f].sleep) {
              __kmp_atomic_resume_64(
                  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
                  (kmp_atomic_flag_64<> *)NULL);
            }
          }
        }
        // Wait for threads to be removed from team
        for (int f = 1; f < team->t.t_nproc; ++f) {
          while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
            KMP_CPU_PAUSE();
        }
      }
    }

    // Only now is it safe to clear the thread slots (the dist-barrier wait
    // above still reads team->t.t_threads[f]).
    for (f = 1; f < team->t.t_nproc; ++f) {
      team->t.t_threads[f] = NULL;
    }

    if (team->t.t_max_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      distributedBarrier::deallocate(team->t.b);
      team->t.b = NULL;
    }
    /* put the team back in the team pool */
    /* TODO limit size of team pool, call reap_team if pool too large */
    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
    __kmp_team_pool = (volatile kmp_team_t *)team;
  } else { // Check if team was created for primary threads in teams construct
    // See if first worker is a CG root
    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
                     team->t.t_threads[1]->th.th_cg_roots);
    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
      // Clean up the CG root nodes on workers so that this team can be re-used
      for (f = 1; f < team->t.t_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
                         thr->th.th_cg_roots->cg_root == thr);
        // Pop current CG root off list
        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
        thr->th.th_cg_roots = tmp->up;
        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
                       " up to node %p. cg_nthreads was %d\n",
                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
        int i = tmp->cg_nthreads--;
        if (i == 1) {
          __kmp_free(tmp); // free CG if we are the last thread in it
        }
        // Restore current task's thread_limit from CG root
        if (thr->th.th_cg_roots)
          thr->th.th_current_task->td_icvs.thread_limit =
              thr->th.th_cg_roots->cg_thread_limit;
      }
    }
  }

  KMP_MB();
}
5818
5819 /* reap the team. destroy it, reclaim all its resources and free its memory */
__kmp_reap_team(kmp_team_t * team)5820 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5821 kmp_team_t *next_pool = team->t.t_next_pool;
5822
5823 KMP_DEBUG_ASSERT(team);
5824 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5825 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5826 KMP_DEBUG_ASSERT(team->t.t_threads);
5827 KMP_DEBUG_ASSERT(team->t.t_argv);
5828
5829 /* TODO clean the threads that are a part of this? */
5830
5831 /* free stuff */
5832 __kmp_free_team_arrays(team);
5833 if (team->t.t_argv != &team->t.t_inline_argv[0])
5834 __kmp_free((void *)team->t.t_argv);
5835 __kmp_free(team);
5836
5837 KMP_MB();
5838 return next_pool;
5839 }
5840
5841 // Free the thread. Don't reap it, just place it on the pool of available
5842 // threads.
5843 //
5844 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5845 // binding for the affinity mechanism to be useful.
5846 //
5847 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5848 // However, we want to avoid a potential performance problem by always
5849 // scanning through the list to find the correct point at which to insert
5850 // the thread (potential N**2 behavior). To do this we keep track of the
5851 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5852 // With single-level parallelism, threads will always be added to the tail
5853 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5854 // parallelism, all bets are off and we may need to scan through the entire
5855 // free list.
5856 //
5857 // This change also has a potentially large performance benefit, for some
5858 // applications. Previously, as threads were freed from the hot team, they
5859 // would be placed back on the free list in inverse order. If the hot team
// grew back to its original size, then the freed thread would be placed
5861 // back on the hot team in reverse order. This could cause bad cache
5862 // locality problems on programs where the size of the hot team regularly
5863 // grew and shrunk.
5864 //
5865 // Now, for single-level parallelism, the OMP tid is always == gtid.
// Free the thread: detach it from its team and insert it into the
// gtid-sorted free pool (see the block comment above for why the pool is
// kept sorted and how __kmp_thread_pool_insert_pt avoids O(N^2) scans).
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  // Drop this thread's membership in its contention group(s), freeing any
  // CG node whose last member just left.
  while (this_th->th.th_cg_roots) {
    this_th->th.th_cg_roots->cg_nthreads--;
    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
                   " %p of thread %p to %d\n",
                   this_th, this_th->th.th_cg_roots,
                   this_th->th.th_cg_roots->cg_root,
                   this_th->th.th_cg_roots->cg_nthreads));
    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
    if (tmp->cg_root == this_th) { // Thread is a cg_root
      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
      KA_TRACE(
          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
      this_th->th.th_cg_roots = tmp->up;
      __kmp_free(tmp);
    } else { // Worker thread
      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
        __kmp_free(tmp);
      }
      this_th->th.th_cg_roots = NULL;
      break;
    }
  }

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when hot team is disabled but can occur even when
   * the hot team is enabled */
  __kmp_free_implicit_task(this_th);
  this_th->th.th_current_task = NULL;

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  // Sorted-pool invariant: our gtid is strictly less than our successor's.
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  __kmp_suspend_initialize_thread(this_th);
  __kmp_lock_suspend_mx(this_th);
  if (this_th->th.th_active == TRUE) {
    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
    this_th->th.th_active_in_pool = TRUE;
  }
#if KMP_DEBUG
  else {
    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
  }
#endif
  __kmp_unlock_suspend_mx(this_th);

  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}
5984
5985 /* ------------------------------------------------------------------------ */
5986
/* Main loop run by every OpenMP worker thread for its entire lifetime:
   wait at the fork barrier for work, invoke the microtask of the team the
   thread was assigned to, pass the join barrier, and repeat until global
   shutdown is signaled via __kmp_global.g.g_done. Returns this_thr (the
   thread's own descriptor), pthread start-routine style. */
void *__kmp_launch_thread(kmp_info_t *this_thr) {
#if OMP_PROFILING_SUPPORT
  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
  // TODO: add a configuration option for time granularity
  if (ProfileTraceFile)
    llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
#endif

  int gtid = this_thr->th.th_info.ds.ds_gtid;
  /* void *stack_data;*/
  // Double indirection on purpose: th_team can be re-pointed by the primary
  // thread while this worker sleeps, so it is re-read each iteration.
  kmp_team_t **volatile pteam;

  KMP_MB();
  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));

  if (__kmp_env_consistency_check) {
    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

#if OMPT_SUPPORT
  // Announce thread birth to the tool and keep its state field in sync with
  // what the thread is doing throughout the loop below.
  ompt_data_t *thread_data = nullptr;
  if (ompt_enabled.enabled) {
    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
    *thread_data = ompt_data_none;

    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    this_thr->th.ompt_thread_info.wait_id = 0;
    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
    this_thr->th.ompt_thread_info.parallel_flags = 0;
    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_worker, thread_data);
    }
    this_thr->th.ompt_thread_info.state = ompt_state_idle;
  }
#endif

  /* This is the place where threads wait for work */
  while (!TCR_4(__kmp_global.g.g_done)) {
    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
    KMP_MB();

    /* wait for work to do */
    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));

    /* No tid yet since not part of a team */
    __kmp_fork_barrier(gtid, KMP_GTID_DNE);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    pteam = &this_thr->th.th_team;

    /* have we been allocated? */
    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
      /* we were just woken up, so run our new task */
      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
        int rc;
        KA_TRACE(20,
                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                  (*pteam)->t.t_pkfn));

        updateHWFPControl(*pteam);

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
        }
#endif

        rc = (*pteam)->t.t_invoke(gtid);
        KMP_ASSERT(rc);

        KMP_MB();
        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                      (*pteam)->t.t_pkfn));
      }
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        /* no frame set while outside task */
        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;

        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
      /* join barrier after parallel region */
      __kmp_join_barrier(gtid);
    }
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
  }
#endif

  this_thr->th.th_task_team = NULL;
  /* run the destructors for the threadprivate data for this thread */
  __kmp_common_destroy_gtid(gtid);

  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
  KMP_MB();

#if OMP_PROFILING_SUPPORT
  llvm::timeTraceProfilerFinishThread();
#endif
  return this_thr;
}
6110
6111 /* ------------------------------------------------------------------------ */
6112
__kmp_internal_end_dest(void * specific_gtid)6113 void __kmp_internal_end_dest(void *specific_gtid) {
6114 // Make sure no significant bits are lost
6115 int gtid;
6116 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id);
6117
6118 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6119 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6120 * this is because 0 is reserved for the nothing-stored case */
6121
6122 __kmp_internal_end_thread(gtid);
6123 }
6124
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB

// Shared-library destructor (Unix dynamic builds only): runs when the library
// is unloaded and routes through the common atexit shutdown path.
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}

#endif
6132
/* [Windows] josh: when the atexit handler is called, there may still be more
   than one thread alive */
// Process-exit hook: performs a best-effort runtime shutdown via
// __kmp_internal_end_thread(-1) rather than a full library shutdown; see the
// historical note below for why.
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shutdown the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shutdown the library.

     // TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it get
     confused. This happens because allowing a thread to unregister and cleanup
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread. For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing. Thus, the workaround is applicable only for Windows static
     stat library. */
  __kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}
6165
/* Release all resources owned by a thread descriptor and free it.
   For non-root threads (is_root == 0) this first wakes the thread out of the
   fork barrier and reaps the underlying OS thread; root threads are assumed
   to have terminated on their own. Caller must hold __kmp_forkjoin_lock. */
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Distributed barrier: flip th_used_in_team from 0 to 3 to tell the
        // thread it is being reaped, then wake it.
        while (
            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
          KMP_CPU_PAUSE();
        __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
      } else {
        /* Need release fence here to prevent seg faults for tree forkjoin
           barrier (GEH) */
        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                           thread);
        __kmp_release_64(&flag);
      }
    }

    // Terminate OS thread.
    __kmp_reap_worker(thread);

    // The thread was killed asynchronously. If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset it's th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
    //
    // Currently, this can only happen as the library is unloaded,
    // so there are no harmful side effects.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
    }
  }

  __kmp_free_implicit_task(thread);

  // Free the fast memory for tasking
#if USE_FAST_MEMORY
  __kmp_free_fast_memory(thread);
#endif /* USE_FAST_MEMORY */

  __kmp_suspend_uninitialize_thread(thread);

  // Remove the descriptor from the global thread table before freeing it.
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  --__kmp_all_nth;
  // __kmp_nth was decremented when thread is added to the pool.

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* free the memory being used */
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(thread->th.th_cons);
      thread->th.th_cons = NULL;
    }
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }

  if (thread->th.th_task_state_memo_stack != NULL) {
    __kmp_free(thread->th.th_task_state_memo_stack);
    thread->th.th_task_state_memo_stack = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(thread);
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_HIER_SCHED
  if (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread
6286
__kmp_itthash_clean(kmp_info_t * th)6287 static void __kmp_itthash_clean(kmp_info_t *th) {
6288 #if USE_ITT_NOTIFY
6289 if (__kmp_itt_region_domains.count > 0) {
6290 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6291 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6292 while (bucket) {
6293 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6294 __kmp_thread_free(th, bucket);
6295 bucket = next;
6296 }
6297 }
6298 }
6299 if (__kmp_itt_barrier_domains.count > 0) {
6300 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6301 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6302 while (bucket) {
6303 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6304 __kmp_thread_free(th, bucket);
6305 bucket = next;
6306 }
6307 }
6308 }
6309 #endif
6310 }
6311
/* Common shutdown path shared by __kmp_internal_end_library and
   __kmp_internal_end_thread. Unregisters the library and sets the global
   done flag; if no root thread is still active, it also reaps all pooled
   worker threads, teams, and task teams, then runs final cleanup. Both
   callers hold __kmp_initz_lock and __kmp_forkjoin_lock when calling. */
static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies. */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
  // dead roots
#endif

  // Scan for any still-active root; i < __kmp_threads_capacity afterward
  // means at least one root is still running.
  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates. */
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates. */

    // Need to check that monitor was initialized before reaping it. If we are
    // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
    // __kmp_monitor will appear to contain valid data, but it is only valid in
    // the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
    /* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        //  KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
        //  there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, 0);
    }
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }

    __kmp_reap_task_teams();

#if KMP_OS_UNIX
    // Threads that are not reaped should not access any resources since they
    // are going to be deallocated soon, so the shutdown sequence should wait
    // until all threads either exit the final spin-waiting loop or begin
    // sleeping after the given blocktime.
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thr = __kmp_threads[i];
      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
        KMP_CPU_PAUSE();
    }
#endif

    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
  } /* else !__kmp_global.t_active */
  TCW_4(__kmp_init_gtid, FALSE);
  KMP_MB(); /* Flush all pending memory write invalidates. */

  __kmp_cleanup();
#if OMPT_SUPPORT
  ompt_fini();
#endif
}
6441
/* Shut down the runtime on behalf of the whole library (called e.g. from the
   atexit handler / library destructor). gtid_req is the caller's gtid, or a
   negative value when unknown, in which case it is looked up from
   thread-specific storage. Returns early without full shutdown when a root
   is still active, when shutdown already happened, or when the caller is a
   plain worker thread. */
void __kmp_internal_end_library(int gtid_req) {
  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */
  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* we don't know who we are, but we may still shutdown the library */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // Our root is still in a parallel region: flag abort-in-progress and
        // bail out; full shutdown is impossible right now.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        __kmp_unregister_library();
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        __kmp_itthash_clean(__kmp_threads[gtid]);
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* worker threads may call this function through the atexit handler, if
         they call exit() */
      /* For now, skip the usual subsequent processing and just dump the debug
         buffer. TODO: do a thorough shutdown instead. */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      // added unregister library call here when we switch to shm linux
      // if we don't, it will leave lots of files in /dev/shm
      // cleanup shared memory file before exiting.
      __kmp_unregister_library();
      return;
    }
  }
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  /* now we can safely conduct the actual termination */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();

} // __kmp_internal_end_library
6562
/* Per-thread shutdown entry point (called e.g. from the TLS destructor).
   gtid_req is the caller's gtid, or a negative value when unknown, in which
   case it is looked up from thread-specific storage. Unlike
   __kmp_internal_end_library, this only tears the runtime down when no
   sibling uber thread remains; otherwise it just detaches the caller. */
void __kmp_internal_end_thread(int gtid_req) {
  int i;

  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
      /* we don't know who we are */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // Root is still inside a parallel region: flag abort and bail.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* just a worker thread, let's leave */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));

      if (gtid >= 0) {
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }

      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if KMP_DYNAMIC_LIB
  if (__kmp_pause_status != kmp_hard_paused)
  // AC: lets not shutdown the dynamic library at the exit of uber thread,
  // because we will better shutdown later in the library destructor.
  {
    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
    return;
  }
#endif
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */

  /* should we finish the run-time?  are all siblings done? */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i)) {
      // Another root is still registered; leave teardown to the last one out.
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
      return;
    }
  }

  /* now we can safely conduct the actual termination */

  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
} // __kmp_internal_end_thread
6698
6699 // -----------------------------------------------------------------------------
6700 // Library registration stuff.
6701
// Random value used to indicate that this copy of the library has been
// initialized/registered (set in __kmp_register_library_startup).
static long __kmp_registration_flag = 0;
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>: encodes the flag's
// address, the flag's value, and the library file name.
static char *__kmp_registration_str = NULL;
6706
__kmp_reg_status_name()6707 static inline char *__kmp_reg_status_name() {
6708 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6709 each thread. If registration and unregistration go in different threads
6710 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6711 env var can not be found, because the name will contain different pid. */
6712 // macOS* complains about name being too long with additional getuid()
6713 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6714 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6715 (int)getuid());
6716 #else
6717 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6718 #endif
6719 } // __kmp_reg_status_get
6720
#if defined(KMP_USE_SHM)
// True when /dev/shm (POSIX shared memory) is usable for library registration.
bool __kmp_shm_available = false;
// True when a /tmp file can be used as the fallback registration mechanism.
bool __kmp_tmp_available = false;
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
char *temp_reg_status_file_name = nullptr;
#endif
6727
__kmp_register_library_startup(void)6728 void __kmp_register_library_startup(void) {
6729
6730 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6731 int done = 0;
6732 union {
6733 double dtime;
6734 long ltime;
6735 } time;
6736 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6737 __kmp_initialize_system_tick();
6738 #endif
6739 __kmp_read_system_time(&time.dtime);
6740 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6741 __kmp_registration_str =
6742 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6743 __kmp_registration_flag, KMP_LIBRARY_FILE);
6744
6745 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6746 __kmp_registration_str));
6747
6748 while (!done) {
6749
6750 char *value = NULL; // Actual value of the environment variable.
6751
6752 #if defined(KMP_USE_SHM)
6753 char *shm_name = nullptr;
6754 char *data1 = nullptr;
6755 __kmp_shm_available = __kmp_detect_shm();
6756 if (__kmp_shm_available) {
6757 int fd1 = -1;
6758 shm_name = __kmp_str_format("/%s", name);
6759 int shm_preexist = 0;
6760 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6761 if ((fd1 == -1) && (errno == EEXIST)) {
6762 // file didn't open because it already exists.
6763 // try opening existing file
6764 fd1 = shm_open(shm_name, O_RDWR, 0666);
6765 if (fd1 == -1) { // file didn't open
6766 KMP_WARNING(FunctionError, "Can't open SHM");
6767 __kmp_shm_available = false;
6768 } else { // able to open existing file
6769 shm_preexist = 1;
6770 }
6771 }
6772 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6773 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6774 KMP_WARNING(FunctionError, "Can't set size of SHM");
6775 __kmp_shm_available = false;
6776 }
6777 }
6778 if (__kmp_shm_available) { // SHM exists, now map it
6779 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6780 fd1, 0);
6781 if (data1 == MAP_FAILED) { // failed to map shared memory
6782 KMP_WARNING(FunctionError, "Can't map SHM");
6783 __kmp_shm_available = false;
6784 }
6785 }
6786 if (__kmp_shm_available) { // SHM mapped
6787 if (shm_preexist == 0) { // set data to SHM, set value
6788 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6789 }
6790 // Read value from either what we just wrote or existing file.
6791 value = __kmp_str_format("%s", data1); // read value from SHM
6792 munmap(data1, SHM_SIZE);
6793 }
6794 if (fd1 != -1)
6795 close(fd1);
6796 }
6797 if (!__kmp_shm_available)
6798 __kmp_tmp_available = __kmp_detect_tmp();
6799 if (!__kmp_shm_available && __kmp_tmp_available) {
6800 // SHM failed to work due to an error other than that the file already
6801 // exists. Try to create a temp file under /tmp.
6802 // If /tmp isn't accessible, fall back to using environment variable.
6803 // TODO: /tmp might not always be the temporary directory. For now we will
6804 // not consider TMPDIR.
6805 int fd1 = -1;
6806 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6807 int tmp_preexist = 0;
6808 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6809 if ((fd1 == -1) && (errno == EEXIST)) {
6810 // file didn't open because it already exists.
6811 // try opening existing file
6812 fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6813 if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6814 KMP_WARNING(FunctionError, "Can't open TEMP");
6815 __kmp_tmp_available = false;
6816 } else {
6817 tmp_preexist = 1;
6818 }
6819 }
6820 if (__kmp_tmp_available && tmp_preexist == 0) {
6821 // we created /tmp file now set size
6822 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6823 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6824 __kmp_tmp_available = false;
6825 }
6826 }
6827 if (__kmp_tmp_available) {
6828 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6829 fd1, 0);
6830 if (data1 == MAP_FAILED) { // failed to map /tmp
6831 KMP_WARNING(FunctionError, "Can't map /tmp");
6832 __kmp_tmp_available = false;
6833 }
6834 }
6835 if (__kmp_tmp_available) {
6836 if (tmp_preexist == 0) { // set data to TMP, set value
6837 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6838 }
6839 // Read value from either what we just wrote or existing file.
6840 value = __kmp_str_format("%s", data1); // read value from SHM
6841 munmap(data1, SHM_SIZE);
6842 }
6843 if (fd1 != -1)
6844 close(fd1);
6845 }
6846 if (!__kmp_shm_available && !__kmp_tmp_available) {
6847 // no /dev/shm and no /tmp -- fall back to environment variable
6848 // Set environment variable, but do not overwrite if it exists.
6849 __kmp_env_set(name, __kmp_registration_str, 0);
6850 // read value to see if it got set
6851 value = __kmp_env_get(name);
6852 }
6853 #else // Windows and unix with static library
6854 // Set environment variable, but do not overwrite if it exists.
6855 __kmp_env_set(name, __kmp_registration_str, 0);
6856 // read value to see if it got set
6857 value = __kmp_env_get(name);
6858 #endif
6859
6860 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6861 done = 1; // Ok, environment variable set successfully, exit the loop.
6862 } else {
6863 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6864 // Check whether it alive or dead.
6865 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6866 char *tail = value;
6867 char *flag_addr_str = NULL;
6868 char *flag_val_str = NULL;
6869 char const *file_name = NULL;
6870 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6871 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6872 file_name = tail;
6873 if (tail != NULL) {
6874 unsigned long *flag_addr = 0;
6875 unsigned long flag_val = 0;
6876 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6877 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6878 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6879 // First, check whether environment-encoded address is mapped into
6880 // addr space.
6881 // If so, dereference it to see if it still has the right value.
6882 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6883 neighbor = 1;
6884 } else {
6885 // If not, then we know the other copy of the library is no longer
6886 // running.
6887 neighbor = 2;
6888 }
6889 }
6890 }
6891 switch (neighbor) {
6892 case 0: // Cannot parse environment variable -- neighbor status unknown.
6893 // Assume it is the incompatible format of future version of the
6894 // library. Assume the other library is alive.
6895 // WARN( ... ); // TODO: Issue a warning.
6896 file_name = "unknown library";
6897 KMP_FALLTHROUGH();
6898 // Attention! Falling to the next case. That's intentional.
6899 case 1: { // Neighbor is alive.
6900 // Check it is allowed.
6901 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6902 if (!__kmp_str_match_true(duplicate_ok)) {
6903 // That's not allowed. Issue fatal error.
6904 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6905 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6906 }
6907 KMP_INTERNAL_FREE(duplicate_ok);
6908 __kmp_duplicate_library_ok = 1;
6909 done = 1; // Exit the loop.
6910 } break;
6911 case 2: { // Neighbor is dead.
6912
6913 #if defined(KMP_USE_SHM)
6914 if (__kmp_shm_available) { // close shared memory.
6915 shm_unlink(shm_name); // this removes file in /dev/shm
6916 } else if (__kmp_tmp_available) {
6917 unlink(temp_reg_status_file_name); // this removes the temp file
6918 } else {
6919 // Clear the variable and try to register library again.
6920 __kmp_env_unset(name);
6921 }
6922 #else
6923 // Clear the variable and try to register library again.
6924 __kmp_env_unset(name);
6925 #endif
6926 } break;
6927 default: {
6928 KMP_DEBUG_ASSERT(0);
6929 } break;
6930 }
6931 }
6932 KMP_INTERNAL_FREE((void *)value);
6933 #if defined(KMP_USE_SHM)
6934 if (shm_name)
6935 KMP_INTERNAL_FREE((void *)shm_name);
6936 #endif
6937 } // while
6938 KMP_INTERNAL_FREE((void *)name);
6939
6940 } // func __kmp_register_library_startup
6941
// Undo the registration done by __kmp_register_library_startup(): re-read the
// registration record from whichever backing store was used (POSIX shared
// memory, a /tmp file, or an environment variable) and remove it ONLY if it
// is the record this copy of the runtime wrote. A record written by another
// live copy of the library is deliberately left in place.
void __kmp_unregister_library(void) {

  char *name = __kmp_reg_status_name();
  char *value = NULL;

#if defined(KMP_USE_SHM)
  char *shm_name = nullptr;
  int fd1;
  if (__kmp_shm_available) {
    // Registration went to /dev/shm: map it read-only and copy the string out.
    shm_name = __kmp_str_format("/%s", name);
    fd1 = shm_open(shm_name, O_RDONLY, 0666);
    if (fd1 != -1) { // File opened successfully
      char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
      if (data1 != MAP_FAILED) {
        value = __kmp_str_format("%s", data1); // read value from SHM
        munmap(data1, SHM_SIZE);
      }
      close(fd1);
    }
  } else if (__kmp_tmp_available) { // try /tmp
    fd1 = open(temp_reg_status_file_name, O_RDONLY);
    if (fd1 != -1) { // File opened successfully
      char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
      if (data1 != MAP_FAILED) {
        value = __kmp_str_format("%s", data1); // read value from /tmp
        munmap(data1, SHM_SIZE);
      }
      close(fd1);
    }
  } else { // fall back to envirable
    value = __kmp_env_get(name);
  }
#else
  value = __kmp_env_get(name);
#endif

  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
    // Ok, this is our variable. Delete it.
#if defined(KMP_USE_SHM)
    if (__kmp_shm_available) {
      shm_unlink(shm_name); // this removes file in /dev/shm
    } else if (__kmp_tmp_available) {
      unlink(temp_reg_status_file_name); // this removes the temp file
    } else {
      __kmp_env_unset(name);
    }
#else
    __kmp_env_unset(name);
#endif
  }

#if defined(KMP_USE_SHM)
  if (shm_name)
    KMP_INTERNAL_FREE(shm_name);
  if (temp_reg_status_file_name)
    KMP_INTERNAL_FREE(temp_reg_status_file_name);
#endif

  KMP_INTERNAL_FREE(__kmp_registration_str);
  KMP_INTERNAL_FREE(value);
  KMP_INTERNAL_FREE(name);

  // Reset the registration state so a later re-register starts clean.
  __kmp_registration_flag = 0;
  __kmp_registration_str = NULL;

} // __kmp_unregister_library
7010
7011 // End of Library registration stuff.
7012 // -----------------------------------------------------------------------------
7013
7014 #if KMP_MIC_SUPPORTED
7015
__kmp_check_mic_type()7016 static void __kmp_check_mic_type() {
7017 kmp_cpuid_t cpuid_state = {0};
7018 kmp_cpuid_t *cs_p = &cpuid_state;
7019 __kmp_x86_cpuid(1, 0, cs_p);
7020 // We don't support mic1 at the moment
7021 if ((cs_p->eax & 0xff0) == 0xB10) {
7022 __kmp_mic_type = mic2;
7023 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7024 __kmp_mic_type = mic3;
7025 } else {
7026 __kmp_mic_type = non_mic;
7027 }
7028 }
7029
7030 #endif /* KMP_MIC_SUPPORTED */
7031
#if KMP_HAVE_UMWAIT
// Probe CPUID leaf 7 (ECX bit 5) for waitpkg support and derive the
// umwait/tpause enablement flags from it and the user-requested settings.
static void __kmp_user_level_mwait_init() {
  struct kmp_cpuid buf;
  __kmp_x86_cpuid(7, 0, &buf);
  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); // ECX bit 5 == waitpkg
  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
                __kmp_umwait_enabled));
}
#elif KMP_HAVE_MWAIT
#ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value when we know that.
#define AT_INTELPHIUSERMWAIT 10000
#endif
// getauxval() function is available in RHEL7 and SLES12. If a system with an
// earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
// Weak fallback: returns 0 so the auxval check below simply fails closed.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }

// Decide whether user-level mwait may be used on Intel MIC (mic3 only),
// based on the auxiliary vector and/or the KMP_USER_LEVEL_MWAIT setting.
static void __kmp_user_level_mwait_init() {
  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
  // KMP_USER_LEVEL_MWAIT was set to TRUE.
  if (__kmp_mic_type == mic3) {
    unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
    if ((res & 0x1) || __kmp_user_level_mwait) {
      __kmp_mwait_enabled = TRUE;
      if (__kmp_user_level_mwait) {
        KMP_INFORM(EnvMwaitWarn);
      }
    } else {
      __kmp_mwait_enabled = FALSE;
    }
  }
  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
                "__kmp_mwait_enabled = %d\n",
                __kmp_mic_type, __kmp_mwait_enabled));
}
#endif /* KMP_HAVE_UMWAIT */
7075
// One-time serial initialization of the runtime: sanity-checks type sizes,
// initializes allocators/locks/configuration, registers the library, sets up
// the threads/root arrays, registers the initial (uber) root thread, and
// installs exit/signal handlers. Must be called with __kmp_initz_lock held
// (callers go through __kmp_serial_initialize). Statement order here is
// significant — do not reorder casually.
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  size_t size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // The ABI depends on these exact sizes; fail fast in debug builds.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif
#if OMPD_SUPPORT
  __kmp_env_dump();
  ompd_init();
#endif

  __kmp_validate_locks();

#if ENABLE_LIBOMPTARGET
  /* Initialize functions from libomptarget */
  __kmp_init_omptarget();
#endif

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable or via mapped
     shared memory file and check to see whether another copy of the library is
     already registered. Since forked child process is often terminated, we
     postpone the registration till middle initialization in the child */
  if (__kmp_need_register_serial)
    __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

  /* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif // KMP_FAST_REDUCTION_BARRIER
  // Install default branch bits / patterns for every barrier kind; the
  // reduction barrier (if enabled) gets its own tuned values.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  __kmp_init_nesting_mode();

  // Parse environment variables; may override the defaults set above.
  __kmp_env_initialize(NULL);

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  __kmp_user_level_mwait_init();
#endif
// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  // __kmp_root shares the same allocation, placed right after the threads
  // pointer array.
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB ||                                                        \
    ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
  {
    /* Invoke the exit handler when the program finishes, only for static
       library and macOS* dynamic. For other dynamic libraries, we already
       have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_version) {
    __kmp_print_version_1();
  }

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
7380
__kmp_serial_initialize(void)7381 void __kmp_serial_initialize(void) {
7382 if (__kmp_init_serial) {
7383 return;
7384 }
7385 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7386 if (__kmp_init_serial) {
7387 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7388 return;
7389 }
7390 __kmp_do_serial_initialize();
7391 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7392 }
7393
// Middle-phase initialization: runs after serial init (triggering it if
// needed), performs any deferred library registration (forked child case),
// initializes affinity/topology, and settles the default team size
// (__kmp_dflt_team_nth), propagating it to already-registered root threads.
// Callers must hold __kmp_initz_lock (see __kmp_middle_initialize).
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  if (UNLIKELY(!__kmp_need_register_serial)) {
    // We are in a forked child process. The registration was skipped during
    // serial initialization in __kmp_atfork_child handler. Do it here.
    __kmp_register_library_startup();
  }

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
  __kmp_affinity_initialize(__kmp_affinity);

#endif /* KMP_AFFINITY_SUPPORTED */

  KMP_ASSERT(__kmp_xproc > 0);
  if (__kmp_avail_proc == 0) {
    __kmp_avail_proc = __kmp_xproc;
  }

  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now
  j = 0;
  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
        __kmp_avail_proc;
    j++;
  }

  if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
    // Default #threads = #cores
    __kmp_dflt_team_nth = __kmp_ncores;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_ncores (%d)\n",
                  __kmp_dflt_team_nth));
#else
    // Default #threads = #available OS procs
    __kmp_dflt_team_nth = __kmp_avail_proc;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_avail_proc(%d)\n",
                  __kmp_dflt_team_nth));
#endif /* KMP_DFLT_NTH_CORES */
  }

  // Clamp the default team size to the supported range.
  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
    __kmp_dflt_team_nth = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth = __kmp_sys_max_nth;
  }

  if (__kmp_nesting_mode > 0)
    __kmp_set_nesting_mode_threads();

  // There's no harm in continuing if the following check fails,
  // but it indicates an error in the previous logic.
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);

  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
    // Run through the __kmp_threads array and set the num threads icv for each
    // root thread that is currently registered with the RTL (which has not
    // already explicitly set its nthreads-var with a call to
    // omp_set_num_threads()).
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thread = __kmp_threads[i];
      if (thread == NULL)
        continue;
      if (thread->th.th_current_task->td_icvs.nproc != 0)
        continue; // nproc already explicitly set; leave it alone

      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
    }
  }
  KA_TRACE(
      20,
      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
       __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* we have finished middle initialization */
  TCW_SYNC_4(__kmp_init_middle, TRUE);

  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}
7500
__kmp_middle_initialize(void)7501 void __kmp_middle_initialize(void) {
7502 if (__kmp_init_middle) {
7503 return;
7504 }
7505 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7506 if (__kmp_init_middle) {
7507 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7508 return;
7509 }
7510 __kmp_do_middle_initialize();
7511 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7512 }
7513
// Final initialization phase, performed lazily before the first parallel
// region: ensures middle init has run, saves FP control state for workers,
// installs signal handlers, and selects the dynamic adjustment mode.
// Uses double-checked locking on __kmp_initz_lock; note the lock is held
// across the whole body, so nested *_initialize entry points must not be
// called from here (see the comment about __kmp_do_serial_initialize below).
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize would cause a deadlock.  So we call
     __kmp_do_serial_initialize directly. */
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  __kmp_resume_if_hard_paused();

  /* begin initialization */
  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
  KMP_ASSERT(KMP_UBER_GTID(gtid));

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // Save the FP control regs.
  // Worker threads will set theirs to these values at thread startup.
  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
  __kmp_store_mxcsr(&__kmp_init_mxcsr);
  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
  /*  must be after __kmp_serial_initialize  */
  __kmp_install_signals(TRUE);
#endif
#endif

  __kmp_suspend_initialize();

#if defined(USE_LOAD_BALANCE)
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
  }
#else
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
  }
#endif

  if (__kmp_version) {
    __kmp_print_version_2();
  }

  /* we have finished parallel initialization */
  TCW_SYNC_4(__kmp_init_parallel, TRUE);

  KMP_MB();
  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
7586
// Lazily initialize the hidden helper team/threads used for hidden helper
// tasks. Requires full parallel initialization first; the parallel init call
// deliberately happens BEFORE taking __kmp_initz_lock to avoid deadlock,
// since __kmp_parallel_initialize acquires the same lock.
void __kmp_hidden_helper_initialize() {
  if (TCR_4(__kmp_init_hidden_helper))
    return;

  // __kmp_parallel_initialize is required before we initialize hidden helper
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause dead lock.
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_hidden_helper)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

#if KMP_AFFINITY_SUPPORTED
  // Initialize hidden helper affinity settings.
  // The above __kmp_parallel_initialize() will initialize
  // regular affinity (and topology) if not already done.
  if (!__kmp_hh_affinity.flags.initialized)
    __kmp_affinity_initialize(__kmp_hh_affinity);
#endif

  // Set the count of hidden helper tasks to be executed to zero
  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);

  // Set the global variable indicating that we're initializing hidden helper
  // team/threads
  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);

  // Platform independent initialization
  __kmp_do_initialize_hidden_helper_threads();

  // Wait here for the finish of initialization of hidden helper teams
  __kmp_hidden_helper_threads_initz_wait();

  // We have finished hidden helper initialization
  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
7629
7630 /* ------------------------------------------------------------------------ */
7631
// Per-thread setup run just before a thread invokes the team's microtask:
// resets the construct counter and the dispatch buffer indices, and (when
// consistency checking is on) pushes the parallel region onto the
// consistency-check stack. Paired with __kmp_run_after_invoked_task().
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                   kmp_team_t *team) {
  kmp_disp_t *dispatch;

  KMP_MB();

  /* none of the threads have encountered any constructs, yet. */
  this_thr->th.th_local.this_construct = 0;
#if KMP_CACHE_MANAGE
  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
  KMP_DEBUG_ASSERT(dispatch);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
  // this_thr->th.th_info.ds.ds_tid ] );

  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
  if (__kmp_env_consistency_check)
    __kmp_push_parallel(gtid, team->t.t_ident);

  KMP_MB(); /* Flush all pending memory write invalidates.  */
}
7656
// Per-thread teardown run after the microtask returns: pops the parallel
// region from the consistency-check stack (if enabled) and finishes the
// thread's implicit task. Counterpart of __kmp_run_before_invoked_task().
void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                  kmp_team_t *team) {
  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(gtid, team->t.t_ident);

  __kmp_finish_implicit_task(this_thr);
}
7664
// Invoke the team's microtask (outlined parallel-region body) for the
// calling thread, wrapped with the before/after bookkeeping and with ITT,
// OMPT, and stats instrumentation. Returns the microtask's return code.
int __kmp_invoke_task_func(int gtid) {
  int rc;
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    // inform ittnotify about entering user's code
    // Use this team's stack id, or fall back to the parent team's.
    if (team->t.t_stack_id != NULL) {
      __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
    } else {
      KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
      __kmp_itt_stack_callee_enter(
          (__itt_caller)team->t.t_parent->t.t_stack_id);
    }
  }
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
  SSC_MARK_INVOKING();
#endif

#if OMPT_SUPPORT
  void *dummy;
  void **exit_frame_p;
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  int ompt_team_size;

  // Point exit_frame_p at the implicit task's exit frame when OMPT is on;
  // otherwise at a throwaway slot so the microtask call stays uniform.
  if (ompt_enabled.enabled) {
    exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
                         .ompt_task_info.frame.exit_frame.ptr);
  } else {
    exit_frame_p = &dummy;
  }

  my_task_data =
      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_team_size = team->t.t_nproc;
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
        __kmp_tid_from_gtid(gtid), ompt_task_implicit);
    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
  }
#endif

#if KMP_STATS_ENABLED
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
  }
  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
#endif

  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
#if OMPT_SUPPORT
                              ,
                              exit_frame_p
#endif
  );
#if OMPT_SUPPORT
  *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
#endif

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_SET_THREAD_STATE(previous_state);
  }
  KMP_POP_PARTITIONED_TIMER();
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    // inform ittnotify about leaving user's code
    if (team->t.t_stack_id != NULL) {
      __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
    } else {
      KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
      __kmp_itt_stack_callee_leave(
          (__itt_caller)team->t.t_parent->t.t_stack_id);
    }
  }
#endif /* USE_ITT_BUILD */
  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);

  return rc;
}
7759
// Body executed by each primary thread of a teams construct: establish this
// thread as a new contention-group (CG) root, then fork/join the "wrapped"
// teams microtask as an inner parallel region.
void __kmp_teams_master(int gtid) {
  // This routine is called by all primary threads in teams construct
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_team_t *team = thr->th.th_team;
  ident_t *loc = team->t.t_ident;
  // Request the stored per-team thread count for the inner parallel region.
  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));

  // This thread is a new CG root. Set up the proper variables.
  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = thr; // Make thr the CG root
  // Init to thread limit stored when league primary threads were forked
  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
                 " cg_nthreads to 1\n",
                 thr, tmp));
  // Push the new node onto this thread's CG-root list.
  tmp->up = thr->th.th_cg_roots;
  thr->th.th_cg_roots = tmp;

  // Launch league of teams now, but not let workers execute
  // (they hang on fork barrier until next parallel)
#if INCLUDE_SSC_MARKS
  SSC_MARK_FORKING();
#endif
  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
#if INCLUDE_SSC_MARKS
  SSC_MARK_JOINING();
#endif
  // If the team size was reduced from the limit, set it to the new size
  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates join barrier which won't work because
  // worker threads are in a fork barrier waiting for more parallel regions
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  ,
                  1);
}
7807
// Invoke __kmp_teams_master as this thread's implicit task, with the usual
// before/after bookkeeping and OMPT initial-task begin callback.
// Always returns 1 (the task was executed).
int __kmp_invoke_teams_master(int gtid) {
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
#if KMP_DEBUG
  // A non-serialized team here must have been forked with __kmp_teams_master
  // as its entry point.
  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
    KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
                     (void *)__kmp_teams_master);
#endif
  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
#if OMPT_SUPPORT
  int tid = __kmp_tid_from_gtid(gtid);
  ompt_data_t *task_data =
      &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
  if (ompt_enabled.ompt_callback_implicit_task) {
    // Teams primaries report to tools as an initial task, not an implicit one.
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
        ompt_task_initial);
    OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
  }
#endif
  __kmp_teams_master(gtid);
#if OMPT_SUPPORT
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
#endif
  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
  return 1;
}
7836
7837 /* this sets the requested number of threads for the next parallel region
7838 encountered by this team. since this should be enclosed in the forkjoin
7839 critical section it should avoid race conditions with asymmetrical nested
7840 parallelism */
7841
__kmp_push_num_threads(ident_t * id,int gtid,int num_threads)7842 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7843 kmp_info_t *thr = __kmp_threads[gtid];
7844
7845 if (num_threads > 0)
7846 thr->th.th_set_nproc = num_threads;
7847 }
7848
// Compute and record (in thr->th.th_teams_size.nth) the per-team thread count
// for a teams construct. num_threads == 0 means no thread_limit clause was
// given and defaults apply; num_threads != 0 is an explicit user request.
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
                                    int num_threads) {
  KMP_DEBUG_ASSERT(thr);
  // Remember the number of threads for inner parallel regions
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize(); // get internal globals calculated
  __kmp_assign_root_init_mask();
  KMP_DEBUG_ASSERT(__kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);

  if (num_threads == 0) {
    // No thread_limit clause: start from KMP_TEAMS_THREAD_LIMIT if set,
    // otherwise split the available procs evenly among the teams.
    if (__kmp_teams_thread_limit > 0) {
      num_threads = __kmp_teams_thread_limit;
    } else {
      num_threads = __kmp_avail_proc / num_teams;
    }
    // adjust num_threads w/o warning as it is not user setting
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // no thread_limit clause specified - do not change thread-limit-var ICV
    if (num_threads > __kmp_dflt_team_nth) {
      num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
    }
    if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
      num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size to exceed thread-limit-var
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      num_threads = __kmp_teams_max_nth / num_teams;
    }
    if (num_threads == 0) {
      num_threads = 1; // never let the per-team count collapse to zero
    }
  } else {
    if (num_threads < 0) {
      // Requested values must be positive; warn and run with 1.
      __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
                __kmp_msg_null);
      num_threads = 1;
    }
    // This thread will be the primary thread of the league primary threads
    // Store new thread limit; old limit is saved in th_cg_roots list
    thr->th.th_current_task->td_icvs.thread_limit = num_threads;
    // num_threads = min(num_threads, nthreads-var)
    if (num_threads > __kmp_dflt_team_nth) {
      num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
    }
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      // Total threads across the league would exceed the global cap: shrink
      // the per-team count and warn once if the user request can't be met.
      int new_threads = __kmp_teams_max_nth / num_teams;
      if (new_threads == 0) {
        new_threads = 1;
      }
      if (new_threads != num_threads) {
        if (!__kmp_reserve_warn) { // user asked for too many threads
          __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, num_threads, new_threads),
                    KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
        }
      }
      num_threads = new_threads;
    }
  }
  // Publish the per-team thread count (read back by __kmp_teams_master).
  thr->th.th_teams_size.nth = num_threads;
}
7911
7912 /* this sets the requested number of teams for the teams region and/or
7913 the number of threads for the next parallel region encountered */
__kmp_push_num_teams(ident_t * id,int gtid,int num_teams,int num_threads)7914 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7915 int num_threads) {
7916 kmp_info_t *thr = __kmp_threads[gtid];
7917 if (num_teams < 0) {
7918 // OpenMP specification requires requested values to be positive,
7919 // but people can send us any value, so we'd better check
7920 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7921 __kmp_msg_null);
7922 num_teams = 1;
7923 }
7924 if (num_teams == 0) {
7925 if (__kmp_nteams > 0) {
7926 num_teams = __kmp_nteams;
7927 } else {
7928 num_teams = 1; // default number of teams is 1.
7929 }
7930 }
7931 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7932 if (!__kmp_reserve_warn) {
7933 __kmp_reserve_warn = 1;
7934 __kmp_msg(kmp_ms_warning,
7935 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7936 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7937 }
7938 num_teams = __kmp_teams_max_nth;
7939 }
7940 // Set number of teams (number of threads in the outer "parallel" of the
7941 // teams)
7942 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7943
7944 __kmp_push_thread_limit(thr, num_teams, num_threads);
7945 }
7946
7947 /* This sets the requested number of teams for the teams region and/or
7948 the number of threads for the next parallel region encountered */
// OpenMP 5.1 num_teams(lower-bound : upper-bound) form: choose a team count
// within [num_teams_lb, num_teams_ub] subject to __kmp_teams_max_nth, then
// derive the per-team thread limit.
void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
                             int num_teams_ub, int num_threads) {
  kmp_info_t *thr = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
  KMP_DEBUG_ASSERT(num_threads >= 0);

  if (num_teams_lb > num_teams_ub) {
    __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
                KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
  }

  int num_teams = 1; // default number of teams is 1.

  // An upper bound with no lower bound means the bound is exact.
  if (num_teams_lb == 0 && num_teams_ub > 0)
    num_teams_lb = num_teams_ub;

  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
    // Use the __kmp_nteams global if positive, clipped (with a one-time
    // warning) to the global cap.
    num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
    if (num_teams > __kmp_teams_max_nth) {
      if (!__kmp_reserve_warn) {
        __kmp_reserve_warn = 1;
        __kmp_msg(kmp_ms_warning,
                  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
      }
      num_teams = __kmp_teams_max_nth;
    }
  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
    num_teams = num_teams_ub;
  } else { // num_teams_lb <= num_teams <= num_teams_ub
    if (num_threads <= 0) {
      // No usable thread_limit: prefer the upper bound when it fits the cap.
      if (num_teams_ub > __kmp_teams_max_nth) {
        num_teams = num_teams_lb;
      } else {
        num_teams = num_teams_ub;
      }
    } else {
      // With a thread_limit, fit as many teams as the cap allows, then
      // clamp into [num_teams_lb, num_teams_ub].
      num_teams = (num_threads > __kmp_teams_max_nth)
                      ? num_teams
                      : __kmp_teams_max_nth / num_threads;
      if (num_teams < num_teams_lb) {
        num_teams = num_teams_lb;
      } else if (num_teams > num_teams_ub) {
        num_teams = num_teams_ub;
      }
    }
  }
  // Set number of teams (number of threads in the outer "parallel" of the
  // teams)
  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;

  __kmp_push_thread_limit(thr, num_teams, num_threads);
}
8003
8004 // Set the proc_bind var to use in the following parallel region.
__kmp_push_proc_bind(ident_t * id,int gtid,kmp_proc_bind_t proc_bind)8005 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8006 kmp_info_t *thr = __kmp_threads[gtid];
8007 thr->th.th_set_proc_bind = proc_bind;
8008 }
8009
8010 /* Launch the worker threads into the microtask. */
8011
// Called by the team's primary thread: reset per-team dispatch state and then
// release the workers through the fork barrier so they start the microtask.
void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

#ifdef KMP_DEBUG
  int f;
#endif /* KMP_DEBUG */

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  team->t.t_construct = 0; /* no single directives seen yet */
  team->t.t_ordered.dt.t_value =
      0; /* thread 0 enters the ordered section first */

  /* Reset the identifiers on the dispatch buffer */
  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
  if (team->t.t_max_nproc > 1) {
    int i;
    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
      team->t.t_disp_buffer[i].buffer_index = i;
      team->t.t_disp_buffer[i].doacross_buf_idx = i;
    }
  } else {
    // A team of max size 1 keeps only a single dispatch buffer.
    team->t.t_disp_buffer[0].buffer_index = 0;
    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);

#ifdef KMP_DEBUG
  // Sanity check: every member already agrees on the team size.
  for (f = 0; f < team->t.t_nproc; f++) {
    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
  }
#endif /* KMP_DEBUG */

  /* release the worker threads so they may begin working */
  __kmp_fork_barrier(gtid, 0);
}
8054
// Primary thread's side of joining a parallel region: wait in the join
// barrier for the workers, then (when a tool is attached) fire the OMPT
// barrier-end and implicit-task-end callbacks.
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* Join barrier after fork */

#ifdef KMP_DEBUG
  // Dump runtime structures if this thread's view of the team size is stale.
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    // Only the primary thread supplies the region's return address, and only
    // if some sync-region callback is registered to consume it.
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;

    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    // Workers end their implicit task here; the primary thread's ends later.
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);
}
8116
8117 /* ------------------------------------------------------------------------ */
8118
8119 #ifdef USE_LOAD_BALANCE
8120
8121 // Return the worker threads actively spinning in the hot team, if we
8122 // are at the outermost level of parallelism. Otherwise, return 0.
__kmp_active_hot_team_nproc(kmp_root_t * root)8123 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8124 int i;
8125 int retval;
8126 kmp_team_t *hot_team;
8127
8128 if (root->r.r_active) {
8129 return 0;
8130 }
8131 hot_team = root->r.r_hot_team;
8132 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8133 return hot_team->t.t_nproc - 1; // Don't count primary thread
8134 }
8135
8136 // Skip the primary thread - it is accounted for elsewhere.
8137 retval = 0;
8138 for (i = 1; i < hot_team->t.t_nproc; i++) {
8139 if (hot_team->t.t_threads[i]->th.th_active) {
8140 retval++;
8141 }
8142 }
8143 return retval;
8144 }
8145
8146 // Perform an automatic adjustment to the number of
8147 // threads used by the next parallel region.
// Perform an automatic adjustment to the number of
// threads used by the next parallel region: pick a count <= set_nproc based
// on the current system load, falling back to the thread-limit algorithm if
// load information cannot be read. Result is clamped to
// [KMP_MIN_NTH, set_nproc].
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
  int retval;
  int pool_active;
  int hot_team_active;
  int team_curr_active;
  int system_active;

  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
                set_nproc));
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
                       ->th.th_current_task->td_icvs.dynamic == TRUE);
  KMP_DEBUG_ASSERT(set_nproc > 1);

  // NOTE(review): the assert above fires before this branch can run in debug
  // builds; the branch only guards release builds.
  if (set_nproc == 1) {
    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
    return 1;
  }

  // Threads that are active in the thread pool, active in the hot team for this
  // particular root (if we are at the outer par level), and the currently
  // executing thread (to become the primary thread) are available to add to the
  // new team, but are currently contributing to the system load, and must be
  // accounted for.
  pool_active = __kmp_thread_pool_active_nth;
  hot_team_active = __kmp_active_hot_team_nproc(root);
  team_curr_active = pool_active + hot_team_active + 1;

  // Check the system load.
  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
                "hot team active = %d\n",
                system_active, pool_active, hot_team_active));

  if (system_active < 0) {
    // There was an error reading the necessary info from /proc, so use the
    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
    // = dynamic_thread_limit, we shouldn't wind up getting back here.
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");

    // Make this call behave like the thread limit algorithm.
    retval = __kmp_avail_proc - __kmp_nth +
             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (retval > set_nproc) {
      retval = set_nproc;
    }
    if (retval < KMP_MIN_NTH) {
      retval = KMP_MIN_NTH;
    }

    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
                  retval));
    return retval;
  }

  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the #active omp thread that are available to add to the team.
  if (system_active < team_curr_active) {
    system_active = team_curr_active;
  }
  retval = __kmp_avail_proc - system_active + team_curr_active;
  if (retval > set_nproc) {
    retval = set_nproc;
  }
  if (retval < KMP_MIN_NTH) {
    retval = KMP_MIN_NTH;
  }

  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
  return retval;
} // __kmp_load_balance_nproc()
8221
8222 #endif /* USE_LOAD_BALANCE */
8223
8224 /* ------------------------------------------------------------------------ */
8225
8226 /* NOTE: this is called with the __kmp_init_lock held */
// Tear down runtime state in reverse order of initialization: the parallel,
// middle, and serial layers, then the thread/root arrays, user locks, and
// the remaining global tables. Caller holds __kmp_init_lock (see note above).
void __kmp_cleanup(void) {
  int f;

  KA_TRACE(10, ("__kmp_cleanup: enter\n"));

  if (TCR_4(__kmp_init_parallel)) {
#if KMP_HANDLE_SIGNALS
    __kmp_remove_signals();
#endif
    TCW_4(__kmp_init_parallel, FALSE);
  }

  if (TCR_4(__kmp_init_middle)) {
#if KMP_AFFINITY_SUPPORTED
    __kmp_affinity_uninitialize();
#endif /* KMP_AFFINITY_SUPPORTED */
    __kmp_cleanup_hierarchy();
    TCW_4(__kmp_init_middle, FALSE);
  }

  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));

  if (__kmp_init_serial) {
    __kmp_runtime_destroy();
    __kmp_init_serial = FALSE;
  }

  __kmp_cleanup_threadprivate_caches();

  // Free each root structure, then the combined threads/root allocation.
  for (f = 0; f < __kmp_threads_capacity; f++) {
    if (__kmp_root[f] != NULL) {
      __kmp_free(__kmp_root[f]);
      __kmp_root[f] = NULL;
    }
  }
  __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as single block, so
  // there is no need in freeing __kmp_root.
  __kmp_threads = NULL;
  __kmp_root = NULL;
  __kmp_threads_capacity = 0;

  // Free old __kmp_threads arrays if they exist.
  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
  while (ptr) {
    kmp_old_threads_list_t *next = ptr->next;
    __kmp_free(ptr->threads);
    __kmp_free(ptr);
    ptr = next;
  }

#if KMP_USE_DYNAMIC_LOCK
  __kmp_cleanup_indirect_user_locks();
#else
  __kmp_cleanup_user_locks();
#endif
#if OMPD_SUPPORT
  if (ompd_state) {
    __kmp_free(ompd_env_block);
    ompd_env_block = NULL;
    ompd_env_block_size = 0;
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
  __kmp_cpuinfo_file = NULL;
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_print_speculative_stats();
#endif
#endif
  // Release the nested nthreads / proc-bind tables and the affinity format.
  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
  __kmp_nested_nth.nth = NULL;
  __kmp_nested_nth.size = 0;
  __kmp_nested_nth.used = 0;
  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
  __kmp_nested_proc_bind.bind_types = NULL;
  __kmp_nested_proc_bind.size = 0;
  __kmp_nested_proc_bind.used = 0;
  if (__kmp_affinity_format) {
    KMP_INTERNAL_FREE(__kmp_affinity_format);
    __kmp_affinity_format = NULL;
  }

  __kmp_i18n_catclose();

#if KMP_USE_HIER_SCHED
  __kmp_hier_scheds.deallocate();
#endif

#if KMP_STATS_ENABLED
  __kmp_stats_fini();
#endif

  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
}
8326
8327 /* ------------------------------------------------------------------------ */
8328
__kmp_ignore_mppbeg(void)8329 int __kmp_ignore_mppbeg(void) {
8330 char *env;
8331
8332 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8333 if (__kmp_str_match_false(env))
8334 return FALSE;
8335 }
8336 // By default __kmpc_begin() is no-op.
8337 return TRUE;
8338 }
8339
__kmp_ignore_mppend(void)8340 int __kmp_ignore_mppend(void) {
8341 char *env;
8342
8343 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8344 if (__kmp_str_match_false(env))
8345 return FALSE;
8346 }
8347 // By default __kmpc_end() is no-op.
8348 return TRUE;
8349 }
8350
// Mark this root as begun. Uses the root's begin lock plus a re-check so that
// concurrent callers set r_begin exactly once.
void __kmp_internal_begin(void) {
  int gtid;
  kmp_root_t *root;

  /* this is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid */
  gtid = __kmp_entry_gtid();
  root = __kmp_threads[gtid]->th.th_root;
  KMP_ASSERT(KMP_UBER_GTID(gtid));

  if (root->r.r_begin)
    return; // fast path: already begun
  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
  if (root->r.r_begin) {
    // another thread completed the transition while we waited for the lock
    __kmp_release_lock(&root->r.r_begin_lock, gtid);
    return;
  }

  root->r.r_begin = TRUE;

  __kmp_release_lock(&root->r.r_begin_lock, gtid);
}
8373
8374 /* ------------------------------------------------------------------------ */
8375
__kmp_user_set_library(enum library_type arg)8376 void __kmp_user_set_library(enum library_type arg) {
8377 int gtid;
8378 kmp_root_t *root;
8379 kmp_info_t *thread;
8380
8381 /* first, make sure we are initialized so we can get our gtid */
8382
8383 gtid = __kmp_entry_gtid();
8384 thread = __kmp_threads[gtid];
8385
8386 root = thread->th.th_root;
8387
8388 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8389 library_serial));
8390 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8391 thread */
8392 KMP_WARNING(SetLibraryIncorrectCall);
8393 return;
8394 }
8395
8396 switch (arg) {
8397 case library_serial:
8398 thread->th.th_set_nproc = 0;
8399 set__nproc(thread, 1);
8400 break;
8401 case library_turnaround:
8402 thread->th.th_set_nproc = 0;
8403 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8404 : __kmp_dflt_team_nth_ub);
8405 break;
8406 case library_throughput:
8407 thread->th.th_set_nproc = 0;
8408 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8409 : __kmp_dflt_team_nth_ub);
8410 break;
8411 default:
8412 KMP_FATAL(UnknownLibraryType, arg);
8413 }
8414
8415 __kmp_aux_set_library(arg);
8416 }
8417
// Set the default stack size (in bytes) used for worker threads. Only takes
// effect before the first parallel region; the value is clamped to
// [__kmp_sys_min_stksize, KMP_MAX_STKSIZE].
void __kmp_aux_set_stacksize(size_t arg) {
  if (!__kmp_init_serial)
    __kmp_serial_initialize();

#if KMP_OS_DARWIN
  // Round a non-page-aligned size up to the next 4K boundary. After masking,
  // arg + 0x1000 wraps to exactly 0 only on overflow, in which case the
  // round-up is skipped.
  if (arg & (0x1000 - 1)) {
    arg &= ~(0x1000 - 1);
    if (arg + 0x1000) /* check for overflow if we round up */
      arg += 0x1000;
  }
#endif
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* only change the default stacksize before the first parallel region */
  if (!TCR_4(__kmp_init_parallel)) {
    size_t value = arg; /* argument is in bytes */

    if (value < __kmp_sys_min_stksize)
      value = __kmp_sys_min_stksize;
    else if (value > KMP_MAX_STKSIZE)
      value = KMP_MAX_STKSIZE;

    __kmp_stksize = value;

    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
  }

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
8447
8448 /* set the behaviour of the runtime library */
8449 /* TODO this can cause some odd behaviour with sibling parallelism... */
__kmp_aux_set_library(enum library_type arg)8450 void __kmp_aux_set_library(enum library_type arg) {
8451 __kmp_library = arg;
8452
8453 switch (__kmp_library) {
8454 case library_serial: {
8455 KMP_INFORM(LibraryIsSerial);
8456 } break;
8457 case library_turnaround:
8458 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8459 __kmp_use_yield = 2; // only yield when oversubscribed
8460 break;
8461 case library_throughput:
8462 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8463 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8464 break;
8465 default:
8466 KMP_FATAL(UnknownLibraryType, arg);
8467 }
8468 }
8469
8470 /* Getting team information common for all team API */
8471 // Returns NULL if not in teams construct
// Walk up from the current team to the team at the level just above the
// teams construct, counting serialized levels along the way into
// teams_serialized. Returns NULL (with teams_serialized = 0) when the
// calling thread is not inside a teams construct.
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level;
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1;
    KMP_DEBUG_ASSERT(ii >= tlevel);
    while (ii > level) {
      // Consume this team's serialized levels before moving to its parent.
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      if (team->t.t_serialized && (!teams_serialized)) {
        // Fully serialized team: step to the parent without dropping a level.
        team = team->t.t_parent;
        continue;
      }
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}
8499
__kmp_aux_get_team_num()8500 int __kmp_aux_get_team_num() {
8501 int serialized;
8502 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8503 if (team) {
8504 if (serialized > 1) {
8505 return 0; // teams region is serialized ( 1 team of 1 thread ).
8506 } else {
8507 return team->t.t_master_tid;
8508 }
8509 }
8510 return 0;
8511 }
8512
__kmp_aux_get_num_teams()8513 int __kmp_aux_get_num_teams() {
8514 int serialized;
8515 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8516 if (team) {
8517 if (serialized > 1) {
8518 return 1;
8519 } else {
8520 return team->t.t_parent->t.t_nproc;
8521 }
8522 }
8523 return 1;
8524 }
8525
8526 /* ------------------------------------------------------------------------ */
8527
8528 /*
8529 * Affinity Format Parser
8530 *
8531 * Field is in form of: %[[[0].]size]type
8532 * % and type are required (%% means print a literal '%')
8533 * type is either single char or long name surrounded by {},
8534 * e.g., N or {num_threads}
8535 * 0 => leading zeros
8536 * . => right justified when size is specified
8537 * by default output is left justified
8538 * size is the *minimum* field length
8539 * All other characters are printed as is
8540 *
8541 * Available field types:
 * Available field types (must match __kmp_affinity_format_table below):
 * t {team_num} - omp_get_team_num()
 * T {num_teams} - omp_get_num_teams()
 * L {nesting_level} - omp_get_level()
 * n {thread_num} - omp_get_thread_num()
 * N {num_threads} - omp_get_num_threads()
 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host} - name of host machine
 * P {process_id} - process id (integer)
 * i {native_thread_id} - native thread identifier (integer)
 * A {thread_affinity} - comma separated list of integers or integer ranges
 * (values of affinity mask)
8551 *
8552 * Implementation-specific field types can be added
8553 * If a type is unknown, print "undefined"
8554 */
8555
8556 // Structure holding the short name, long name, and corresponding data type
8557 // for snprintf. A table of these will represent the entire valid keyword
8558 // field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // single-character field type, e.g., 'L'
  const char *long_name; // brace-form name, e.g., "nesting_level"
  char field_format; // conversion character handed to KMP_SNPRINTF
  // ('d' for integer fields, 's' for string fields)
} kmp_affinity_format_field_t;
8565
// The authoritative table of affinity-format fields: the short/long names
// recognized by __kmp_aux_capture_affinity_field and the printf conversion
// used to render each one. 'A' is only available with affinity support.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};
8579
// Return the number of characters it takes to hold field
//
// Parses one "%[0][.][width]{name}" (or "%[0][.][width]<short-name>") field
// specifier of the affinity format starting at *ptr (which must point at the
// '%'), prints the corresponding value for thread 'th' into 'field_buffer',
// and advances *ptr just past the specifier. "%%" emits a literal '%'.
// Unknown field names print "undefined" per the spec. Returns the value of
// __kmp_str_buf_print(), i.e. the number of characters the field needs.
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  // Scratch snprintf format built from the parsed modifiers (e.g. "%-08d"):
  // '%' + up to 2 flags + at most 8 width digits + conversion char + NUL,
  // which always fits in FORMAT_SIZE.
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0; // canonical one-char field name; 0 == invalid

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  // Linear scan of the field table for a matching long or short name.
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      // Finish the snprintf format with the table's conversion character.
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    // A long name must be closed by '}'; otherwise the field is invalid and
    // we fall into the "undefined" default case below.
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't': // team_num
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T': // num_teams
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L': // nesting_level
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n': // thread_num
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': { // host
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P': // process_id
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i': // native_thread_id
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N': // num_threads
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a': // ancestor_tnum: thread num in the parent (one level up) team
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': { // thread_affinity: rendered from the thread's affinity mask
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to spec, If an implementation does not have info for field
    // type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
8743
8744 /*
8745 * Return number of characters needed to hold the affinity string
8746 * (not including null byte character)
8747 * The resultant string is printed to buffer, which the caller can then
8748 * handle afterwards
8749 */
__kmp_aux_capture_affinity(int gtid,const char * format,kmp_str_buf_t * buffer)8750 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8751 kmp_str_buf_t *buffer) {
8752 const char *parse_ptr;
8753 size_t retval;
8754 const kmp_info_t *th;
8755 kmp_str_buf_t field;
8756
8757 KMP_DEBUG_ASSERT(buffer);
8758 KMP_DEBUG_ASSERT(gtid >= 0);
8759
8760 __kmp_str_buf_init(&field);
8761 __kmp_str_buf_clear(buffer);
8762
8763 th = __kmp_threads[gtid];
8764 retval = 0;
8765
8766 // If format is NULL or zero-length string, then we use
8767 // affinity-format-var ICV
8768 parse_ptr = format;
8769 if (parse_ptr == NULL || *parse_ptr == '\0') {
8770 parse_ptr = __kmp_affinity_format;
8771 }
8772 KMP_DEBUG_ASSERT(parse_ptr);
8773
8774 while (*parse_ptr != '\0') {
8775 // Parse a field
8776 if (*parse_ptr == '%') {
8777 // Put field in the buffer
8778 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8779 __kmp_str_buf_catbuf(buffer, &field);
8780 retval += rc;
8781 } else {
8782 // Put literal character in buffer
8783 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8784 retval++;
8785 parse_ptr++;
8786 }
8787 }
8788 __kmp_str_buf_free(&field);
8789 return retval;
8790 }
8791
8792 // Displays the affinity string to stdout
__kmp_aux_display_affinity(int gtid,const char * format)8793 void __kmp_aux_display_affinity(int gtid, const char *format) {
8794 kmp_str_buf_t buf;
8795 __kmp_str_buf_init(&buf);
8796 __kmp_aux_capture_affinity(gtid, format, &buf);
8797 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8798 __kmp_str_buf_free(&buf);
8799 }
8800
8801 /* ------------------------------------------------------------------------ */
__kmp_aux_set_blocktime(int arg,kmp_info_t * thread,int tid)8802 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8803 int blocktime = arg; /* argument is in microseconds */
8804 #if KMP_USE_MONITOR
8805 int bt_intervals;
8806 #endif
8807 kmp_int8 bt_set;
8808
8809 __kmp_save_internal_controls(thread);
8810
8811 /* Normalize and set blocktime for the teams */
8812 if (blocktime < KMP_MIN_BLOCKTIME)
8813 blocktime = KMP_MIN_BLOCKTIME;
8814 else if (blocktime > KMP_MAX_BLOCKTIME)
8815 blocktime = KMP_MAX_BLOCKTIME;
8816
8817 set__blocktime_team(thread->th.th_team, tid, blocktime);
8818 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8819
8820 #if KMP_USE_MONITOR
8821 /* Calculate and set blocktime intervals for the teams */
8822 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8823
8824 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8825 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8826 #endif
8827
8828 /* Set whether blocktime has been set to "TRUE" */
8829 bt_set = TRUE;
8830
8831 set__bt_set_team(thread->th.th_team, tid, bt_set);
8832 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8833 #if KMP_USE_MONITOR
8834 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8835 "bt_intervals=%d, monitor_updates=%d\n",
8836 __kmp_gtid_from_tid(tid, thread->th.th_team),
8837 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8838 __kmp_monitor_wakeups));
8839 #else
8840 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8841 __kmp_gtid_from_tid(tid, thread->th.th_team),
8842 thread->th.th_team->t.t_id, tid, blocktime));
8843 #endif
8844 }
8845
__kmp_aux_set_defaults(char const * str,size_t len)8846 void __kmp_aux_set_defaults(char const *str, size_t len) {
8847 if (!__kmp_init_serial) {
8848 __kmp_serial_initialize();
8849 }
8850 __kmp_env_initialize(str);
8851
8852 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8853 __kmp_env_print();
8854 }
8855 } // __kmp_aux_set_defaults
8856
8857 /* ------------------------------------------------------------------------ */
8858 /* internal fast reduction routines */
8859
// Choose the fast-reduction method for the current reduction: one of
// critical_reduce_block, atomic_reduce_block, a tree reduce (with plain or
// reduction barrier), or empty_reduce_block for a serialized team. The
// decision uses the team size, per-architecture/OS tuning below, what the
// compiler made available (atomic flag in loc->flags; tree data/function via
// reduce_data/reduce_func), and the KMP_FORCE_REDUCTION override.
PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to OpenMP RTL to make a decision on which method to select
  // among generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

// True when the compiler flagged this reduction as atomic-capable.
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  (loc &&                                                                      \
   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
// True when the compiler supplied the data and combiner for tree reduction.
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic deference) is slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block; // single thread: no synchronization needed

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

// 64-bit architectures: small teams use atomics when available, larger
// teams switch to tree reduction with a reduction barrier.
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
    KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||        \
    KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    // MIC (Xeon Phi): atomics remain competitive up to 8 threads.
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
       // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX

// 32-bit (and other listed) architectures: atomics only pay off for very few
// reduction variables; otherwise fall back to the critical section.
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS ||       \
    KMP_ARCH_WASM || KMP_ARCH_PPC

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS ||       \
    KMP_OS_WASI || KMP_OS_AIX

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      // Tree reduce pays off only for a mid-sized payload on this OS.
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        // Forced method not generated by the compiler: warn and fall back.
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
9017 // this function is for testing set/get/determine reduce method
__kmp_get_reduce_method(void)9018 kmp_int32 __kmp_get_reduce_method(void) {
9019 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9020 }
9021
// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
// Called from __kmp_pause_resource() below; no thread wakeup is performed
// here — threads notice the flag on their own.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9025
// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  // NOTE(review): -1 appears to denote "the calling thread" — confirm against
  // __kmp_internal_end_thread()'s gtid parameter semantics.
  __kmp_internal_end_thread(-1);
}
9032
// Soft resume sets __kmp_pause_status, and wakes up all threads.
// Walks every registered thread slot (skipping gtid 0) and makes sure each
// one either is resumed from sleep or provably will not go to sleep, racing
// carefully with threads that currently hold their own suspend mutex.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}
9062
9063 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9064 // TODO: add warning messages
__kmp_pause_resource(kmp_pause_status_t level)9065 int __kmp_pause_resource(kmp_pause_status_t level) {
9066 if (level == kmp_not_paused) { // requesting resume
9067 if (__kmp_pause_status == kmp_not_paused) {
9068 // error message about runtime not being paused, so can't resume
9069 return 1;
9070 } else {
9071 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9072 __kmp_pause_status == kmp_hard_paused);
9073 __kmp_pause_status = kmp_not_paused;
9074 return 0;
9075 }
9076 } else if (level == kmp_soft_paused) { // requesting soft pause
9077 if (__kmp_pause_status != kmp_not_paused) {
9078 // error message about already being paused
9079 return 1;
9080 } else {
9081 __kmp_soft_pause();
9082 return 0;
9083 }
9084 } else if (level == kmp_hard_paused) { // requesting hard pause
9085 if (__kmp_pause_status != kmp_not_paused) {
9086 // error message about already being paused
9087 return 1;
9088 } else {
9089 __kmp_hard_pause();
9090 return 0;
9091 }
9092 } else {
9093 // error message about invalid level
9094 return 1;
9095 }
9096 }
9097
__kmp_omp_display_env(int verbose)9098 void __kmp_omp_display_env(int verbose) {
9099 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9100 if (__kmp_init_serial == 0)
9101 __kmp_do_serial_initialize();
9102 __kmp_display_env_impl(!verbose, verbose);
9103 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9104 }
9105
// The team size is changing, so distributed barrier must be modified
//
// Moves every active worker of 'team' out of the team (so none is waiting in
// the distributed barrier), then resizes the barrier for 'new_nthreads'.
// th_used_in_team protocol visible here: 1 = in team, 3 = transitioning into
// the team, 2 = told to leave, 0 = out of the team.
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads) {
  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
                   bp_dist_bar);
  kmp_info_t **other_threads = team->t.t_threads;

  // We want all the workers to stop waiting on the barrier while we adjust the
  // size of the team.
  for (int f = 1; f < old_nthreads; ++f) {
    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
    // Ignore threads that are already inactive or not present in the team
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
      // teams construct causes thread_limit to get passed in, and some of
      // those could be inactive; just ignore them
      continue;
    }
    // If thread is transitioning still to in_use state, wait for it
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
        KMP_CPU_PAUSE();
    }
    // The thread should be in_use now
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
    // Transition to unused state
    team->t.t_threads[f]->th.th_used_in_team.store(2);
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
  }
  // Release all the workers
  team->t.b->go_release();

  KMP_MFENCE();

  // Workers should see transition status 2 and move to 0; but may need to be
  // woken up first
  int count = old_nthreads - 1;
  while (count > 0) {
    // Re-count the still-present workers on every pass until none remain.
    count = old_nthreads - 1;
    for (int f = 1; f < old_nthreads; ++f) {
      if (other_threads[f]->th.th_used_in_team.load() != 0) {
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
              void *, other_threads[f]->th.th_sleep_loc);
          __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
        }
      } else {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
        count--;
      }
    }
  }
  // Now update the barrier size
  team->t.b->update_num_threads(new_nthreads);
  team->t.b->go_reset();
}
9161
__kmp_add_threads_to_team(kmp_team_t * team,int new_nthreads)9162 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9163 // Add the threads back to the team
9164 KMP_DEBUG_ASSERT(team);
9165 // Threads were paused and pointed at th_used_in_team temporarily during a
9166 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9167 // the thread that it should transition itself back into the team. Then, if
9168 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9169 // to wake it up.
9170 for (int f = 1; f < new_nthreads; ++f) {
9171 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9172 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9173 3);
9174 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9175 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9176 (kmp_flag_32<false, false> *)NULL);
9177 }
9178 }
9179 // The threads should be transitioning to the team; when they are done, they
9180 // should have set th_used_in_team to 1. This loop forces master to wait until
9181 // all threads have moved into the team and are waiting in the barrier.
9182 int count = new_nthreads - 1;
9183 while (count > 0) {
9184 count = new_nthreads - 1;
9185 for (int f = 1; f < new_nthreads; ++f) {
9186 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9187 count--;
9188 }
9189 }
9190 }
9191 }
9192
// Globals and functions for hidden helper task
// Slot in __kmp_threads where the hidden helper threads start.
kmp_info_t **__kmp_hidden_helper_threads;
// Root/main thread of the hidden helper team.
kmp_info_t *__kmp_hidden_helper_main_thread;
// Counter of hidden helper tasks; maintained elsewhere in the runtime
// (presumably incremented on push and decremented on execution — see the
// tasking code for the authoritative protocol).
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
// Hidden helper threads are enabled by default on Linux, with 8 threads.
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
// Disabled by default on all other platforms.
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
9204
namespace {
// Rendezvous counter: number of hidden helper threads that have entered the
// wrapper below. Reset by __kmp_hidden_helper_threads_initz_routine().
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

// Outlined parallel-region body executed by every hidden helper thread.
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads in case
  // that when a regular thread pushes a hidden helper task to one hidden
  // helper thread, the thread has not been awaken once since they're released
  // by the main thread after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  // Spin until every hidden helper thread has arrived.
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace
9231
__kmp_hidden_helper_threads_initz_routine()9232 void __kmp_hidden_helper_threads_initz_routine() {
9233 // Create a new root for hidden helper team/threads
9234 const int gtid = __kmp_register_root(TRUE);
9235 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9236 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9237 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9238 __kmp_hidden_helper_threads_num;
9239
9240 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9241
9242 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9243
9244 // Set the initialization flag to FALSE
9245 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9246
9247 __kmp_hidden_helper_threads_deinitz_release();
9248 }
9249
9250 /* Nesting Mode:
9251 Set via KMP_NESTING_MODE, which takes an integer.
9252 Note: we skip duplicate topology levels, and skip levels with only
9253 one entity.
9254 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9255 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9256 in the topology, and initializes the number of threads at each of those
9257 levels to the number of entities at each level, respectively, below the
9258 entity at the parent level.
9259 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9260 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9261 the user to turn nesting on explicitly. This is an even more experimental
9262 option to this experimental feature, and may change or go away in the
9263 future.
9264 */
9265
9266 // Allocate space to store nesting levels
__kmp_init_nesting_mode()9267 void __kmp_init_nesting_mode() {
9268 int levels = KMP_HW_LAST;
9269 __kmp_nesting_mode_nlevels = levels;
9270 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9271 for (int i = 0; i < levels; ++i)
9272 __kmp_nesting_nth_level[i] = 0;
9273 if (__kmp_nested_nth.size < levels) {
9274 __kmp_nested_nth.nth =
9275 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9276 __kmp_nested_nth.size = levels;
9277 }
9278 }
9279
// Set # threads for top levels of nesting; must be called after topology set
//
// Fills __kmp_nesting_nth_level from the detected topology (or a guess when
// none is available), copies it into the nested-nth ICV, and sets the current
// thread's nproc and, for KMP_NESTING_MODE=1 only, its max-active-levels.
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
      // Skip levels that contribute only one entity: loc-- makes the next
      // hw_level overwrite this slot.
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      // If the product of all levels falls short of the core count, widen the
      // innermost kept level to cover the remaining cores.
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guesstimation
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to same
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
9335
// Empty symbols to export (see exports_so.txt) when feature is disabled
extern "C" {
#if !KMP_STATS_ENABLED
// Stats collection disabled: provide a no-op so the export list still links.
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
// Debugger support disabled: export inert flags in place of the live ones.
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
// ITT notification disabled: stub out the ittnotify init/fini hooks.
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}
9350
9351 // end of file
9352