1 /*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41
42 #include "tsan_annotations.h"
43
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46 KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48
49 char const __kmp_version_omp_api[] =
50 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] =
54 KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58
59 /* ------------------------------------------------------------------------ */
60
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64
65 /* Forward declarations */
66
67 void __kmp_cleanup(void);
68
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70 int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72 kmp_internal_control_t *new_icvs,
73 ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76 int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82 kmp_internal_control_t *new_icvs, ident_t *loc);
83
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get unique identifier of executing
98 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
// Slow-path lookup of the calling thread's global thread id (gtid).
// Prefers thread-local storage (__kmp_gtid / keyed TLS) when the gtid mode
// allows it; otherwise scans __kmp_threads and matches the caller by stack
// address. Returns KMP_GTID_DNE when gtid bookkeeping is not initialized.
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data; // local whose address probes the current stack
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    // Window is frozen (stackgrow disabled) but the probe fell outside it.
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    // Probe above the recorded base: raise the base and widen the size.
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    // Probe below the base: only extend the recorded size downward.
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
208
// Like __kmp_get_global_thread_id(), but registers the caller as a new root
// thread when no gtid exists yet (KMP_GTID_DNE). Always returns a valid
// gtid (asserted >= 0).
int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      // Serial init registers this thread as a side effect; re-read its gtid.
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
250
251 /* caller must hold forkjoin_lock */
// Check that th's stack region does not overlap any other registered
// thread's stack; on overlap, print the offending region (if storage map
// output is on) and abort via __kmp_fatal. Caller must hold forkjoin_lock.
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      // Bounds were not computed above (storage map disabled); compute now.
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        // Overlap if either endpoint of th's stack lies strictly inside the
        // other thread's [beg, end) range.
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
315
316 /* ------------------------------------------------------------------------ */
317
__kmp_infinite_loop(void)318 void __kmp_infinite_loop(void) {
319 static int done = FALSE;
320
321 while (!done) {
322 KMP_YIELD(TRUE);
323 }
324 }
325
326 #define MAX_MESSAGE 512
327
__kmp_print_storage_map_gtid(int gtid,void * p1,void * p2,size_t size,char const * format,...)328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329 char const *format, ...) {
330 char buffer[MAX_MESSAGE];
331 va_list ap;
332
333 va_start(ap, format);
334 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335 p2, (unsigned long)size, format);
336 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337 __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339 int node;
340 if (gtid >= 0) {
341 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342 if (__kmp_storage_map_verbose) {
343 node = __kmp_get_host_node(p1);
344 if (node < 0) /* doesn't work, so don't try this next time */
345 __kmp_storage_map_verbose = FALSE;
346 else {
347 char *last;
348 int lastNode;
349 int localProc = __kmp_get_cpu_from_gtid(gtid);
350
351 const int page_size = KMP_GET_PAGE_SIZE();
352
353 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355 if (localProc >= 0)
356 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
357 localProc >> 1);
358 else
359 __kmp_printf_no_lock(" GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361 /* The more elaborate format is disabled for now because of the prctl
362 * hanging bug. */
363 do {
364 last = p1;
365 lastNode = node;
366 /* This loop collates adjacent pages with the same host node. */
367 do {
368 (char *)p1 += page_size;
369 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
371 lastNode);
372 } while (p1 <= p2);
373 #else
374 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
375 (char *)p1 + (page_size - 1),
376 __kmp_get_host_node(p1));
377 if (p1 < p2) {
378 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
379 (char *)p2 + (page_size - 1),
380 __kmp_get_host_node(p2));
381 }
382 #endif
383 }
384 }
385 } else
386 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
387 }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390 }
391
__kmp_warn(char const * format,...)392 void __kmp_warn(char const *format, ...) {
393 char buffer[MAX_MESSAGE];
394 va_list ap;
395
396 if (__kmp_generate_warnings == kmp_warnings_off) {
397 return;
398 }
399
400 va_start(ap, format);
401
402 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404 __kmp_vprintf(kmp_err, buffer, ap);
405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406
407 va_end(ap);
408 }
409
// Terminate the whole process after a fatal runtime error: dump the debug
// buffer (if enabled), then raise SIGABRT / abort(). Never returns.
void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not problem for DLL, but it is a problem for
       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
       help, at least in some versions of MS C RTL.

       It seems following sequence is the only way to simulate abort() and
       avoid pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    abort();
  }

  // Not reached: both branches above terminate the process.
  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process
442
// Park the calling thread forever after a per-thread fatal condition;
// process-wide termination is handled elsewhere via g_abort.
void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
448
449 /* Print out the storage map for the major kmp_info_t thread data structures
450 that are allocated together. */
451
__kmp_print_thread_storage_map(kmp_info_t * thr,int gtid)452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454 gtid);
455
456 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458
459 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460 sizeof(kmp_local_t), "th_%d.th_local", gtid);
461
462 __kmp_print_storage_map_gtid(
463 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465
466 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467 &thr->th.th_bar[bs_plain_barrier + 1],
468 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469 gtid);
470
471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472 &thr->th.th_bar[bs_forkjoin_barrier + 1],
473 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474 gtid);
475
476 #if KMP_FAST_REDUCTION_BARRIER
477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478 &thr->th.th_bar[bs_reduction_barrier + 1],
479 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480 gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483
484 /* Print out the storage map for the major kmp_team_t team data structures
485 that are allocated together. */
486
__kmp_print_team_storage_map(const char * header,kmp_team_t * team,int team_id,int num_thr)487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488 int team_id, int num_thr) {
489 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491 header, team_id);
492
493 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494 &team->t.t_bar[bs_last_barrier],
495 sizeof(kmp_balign_team_t) * bs_last_barrier,
496 "%s_%d.t_bar", header, team_id);
497
498 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499 &team->t.t_bar[bs_plain_barrier + 1],
500 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501 header, team_id);
502
503 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504 &team->t.t_bar[bs_forkjoin_barrier + 1],
505 sizeof(kmp_balign_team_t),
506 "%s_%d.t_bar[forkjoin]", header, team_id);
507
508 #if KMP_FAST_REDUCTION_BARRIER
509 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510 &team->t.t_bar[bs_reduction_barrier + 1],
511 sizeof(kmp_balign_team_t),
512 "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514
515 __kmp_print_storage_map_gtid(
516 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518
519 __kmp_print_storage_map_gtid(
520 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522
523 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524 &team->t.t_disp_buffer[num_disp_buff],
525 sizeof(dispatch_shared_info_t) * num_disp_buff,
526 "%s_%d.t_disp_buffer", header, team_id);
527 }
528
// Initialize / finalize memkind-based allocator support.
static void __kmp_init_allocator() { __kmp_init_memkind(); }
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531
532 /* ------------------------------------------------------------------------ */
533
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536
// Forcibly return a bootstrap lock to the released state. Used on process
// detach, where the owning thread may already have been killed by the OS.
static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
  // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // make the lock released
}
541
// On DLL_PROCESS_DETACH: spin until no registered OMP thread other than
// gtid_req is still alive, then reset key bootstrap locks so shutdown does
// not deadlock on a lock held by a thread the OS already terminated.
static void __kmp_reset_locks_on_process_detach(int gtid_req) {
  int i;
  int thread_count;

  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
  // threads can be still alive here, although being about to be terminated. The
  // threads in the array with ds_thread==0 are most suspicious. Actually, it
  // can be not safe to access the __kmp_threads[].

  // TODO: does it make sense to check __kmp_roots[] ?

  // Let's check that there are no other alive threads registered with the OMP
  // lib.
  while (1) {
    thread_count = 0;
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      if (!__kmp_threads)
        continue;
      kmp_info_t *th = __kmp_threads[i];
      if (th == NULL)
        continue;
      int gtid = th->th.th_info.ds.ds_gtid;
      if (gtid == gtid_req)
        continue; // skip the thread performing the detach itself
      if (gtid < 0)
        continue; // slot not holding a registered gtid
      DWORD exit_val;
      int alive = __kmp_is_thread_alive(th, &exit_val);
      if (alive) {
        ++thread_count;
      }
    }
    if (thread_count == 0)
      break; // success
  }

  // Assume that I'm alone. Now it might be safe to check and reset locks.
  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
  __kmp_reset_lock(&__kmp_forkjoin_lock);
#ifdef KMP_DEBUG
  __kmp_reset_lock(&__kmp_stdio_lock);
#endif // KMP_DEBUG
}
588
// Windows DLL entry point: ties library teardown to process/thread
// lifecycle events. Always returns TRUE (never vetoes attach/detach).
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    if (lpReserved != NULL) {
      // lpReserved is used for telling the difference:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive. So they will
      // release the forkjoin lock by themselves. When the process terminates,
      // worker threads disappear triggering the problem of unreleased forkjoin
      // lock as described below.

      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread becomes dead before it releases the forkjoin lock.
      // The forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. It looks like it's not a corner
      // case, but common cases:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - application terminates by calling C exit(), Fortran CALL EXIT() or
      //   Fortran STOP.
      // - alive foreign thread prevented __kmpc_end from doing cleanup.
      //
      // This is a hack to work around the problem.
      // TODO: !!! figure out something better.
      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
    }

    __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}
651
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
// Ordered-section entry: block the calling thread until the team's ordered
// counter equals its tid (i.e. it is this thread's turn). Also records the
// construct for consistency checking when enabled.
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    // Wait until t_ordered.dt.t_value == our tid, i.e. it is our turn.
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
679
680 /* __kmp_parallel_dxo -- Signal the next task. */
// Ordered-section exit: pass the turn to the next thread in the team by
// advancing the team's ordered counter (modulo team size). Pops the
// construct from the consistency-check stack when enabled.
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
704
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit */
707
// Contend for a SINGLE region: returns 1 if the calling thread won the
// construct (executes the single block), 0 otherwise. A serialized team
// always wins. push_ws controls whether the workshare is pushed on the
// consistency-check stack (when checking is enabled).
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    // First thread to CAS t_construct from old_this to its incremented
    // per-thread counter wins the single region.
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}
760
// Leave a SINGLE region won by this thread: end ITT tracking and pop the
// workshare from the consistency-check stack when checking is enabled.
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
768
769 /* determine if we can go parallel or must use a serialized parallel region and
770 * how many threads we can use
771 * set_nproc is the number of threads requested for the team
772 * returns 0 if we should serialize or only use one thread,
773 * otherwise the number of threads to use
774 * The forkjoin lock is held by the caller. */
/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    // dyn-var false: honor the request as-is (clamping below still applies).
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    // Limit to free processors; the master's hot team threads (or the master
    // itself for an inactive root) are already counted in __kmp_nth.
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      // Pick a random team size in [1, set_nthreads].
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0); // unknown dynamic mode
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}
953
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked that
   earlier, within the fork/join critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team; // nonzero => team's worker slots are already populated

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0; // master is always tid 0 in its team
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

  /* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  // Without nested hot teams there is a single hot team per root.
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      // Copy the master's teams-construct context into each worker.
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        // Each worker's per-barrier arrived count is synchronized with the
        // team's so the first barrier after the fork works correctly.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  // Mark the team for affinity display if any member's previous team size or
  // nesting level differs from the current one (t_display_affinity is read
  // elsewhere to decide whether to emit the affinity format).
  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}
1064
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the team
1067 // We try to avoid unnecessary writes to the relevant cache line in the team
1068 // structure, so we don't make changes unless they are needed.
propagateFPControl(kmp_team_t * team)1069 inline static void propagateFPControl(kmp_team_t *team) {
1070 if (__kmp_inherit_fp_control) {
1071 kmp_int16 x87_fpu_control_word;
1072 kmp_uint32 mxcsr;
1073
1074 // Get master values of FPU control flags (both X87 and vector)
1075 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076 __kmp_store_mxcsr(&mxcsr);
1077 mxcsr &= KMP_X86_MXCSR_MASK;
1078
1079 // There is no point looking at t_fp_control_saved here.
1080 // If it is TRUE, we still have to update the values if they are different
1081 // from those we now have. If it is FALSE we didn't save anything yet, but
1082 // our objective is the same. We have to ensure that the values in the team
1083 // are the same as those we have.
1084 // So, this code achieves what we need whether or not t_fp_control_saved is
1085 // true. By checking whether the value needs updating we avoid unnecessary
1086 // writes that would put the cache-line into a written state, causing all
1087 // threads in the team to have to read it again.
1088 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090 // Although we don't use this value, other code in the runtime wants to know
1091 // whether it should restore them. So we must ensure it is correct.
1092 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093 } else {
1094 // Similarly here. Don't write to this cache-line in the team structure
1095 // unless we have to.
1096 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097 }
1098 }
1099
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
updateHWFPControl(kmp_team_t * team)1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104 // Only reset the fp control regs if they have been changed in the team.
1105 // the parallel region that we are exiting.
1106 kmp_int16 x87_fpu_control_word;
1107 kmp_uint32 mxcsr;
1108 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109 __kmp_store_mxcsr(&mxcsr);
1110 mxcsr &= KMP_X86_MXCSR_MASK;
1111
1112 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113 __kmp_clear_x87_fpu_status_word();
1114 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115 }
1116
1117 if (team->t.t_mxcsr != mxcsr) {
1118 __kmp_load_mxcsr(&team->t.t_mxcsr);
1119 }
1120 }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128 int realloc); // forward declaration
1129
1130 /* Run a parallel region that has been serialized, so runs only in a team of the
1131 single master thread. */
__kmp_serialized_parallel(ident_t * loc,kmp_int32 global_tid)1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133 kmp_info_t *this_thr;
1134 kmp_team_t *serial_team;
1135
1136 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137
1138 /* Skip all this code for autopar serialized loops since it results in
1139 unacceptable overhead */
1140 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141 return;
1142
1143 if (!TCR_4(__kmp_init_parallel))
1144 __kmp_parallel_initialize();
1145 __kmp_resume_if_soft_paused();
1146
1147 this_thr = __kmp_threads[global_tid];
1148 serial_team = this_thr->th.th_serial_team;
1149
1150 /* utilize the serialized team held by this thread */
1151 KMP_DEBUG_ASSERT(serial_team);
1152 KMP_MB();
1153
1154 if (__kmp_tasking_mode != tskm_immediate_exec) {
1155 KMP_DEBUG_ASSERT(
1156 this_thr->th.th_task_team ==
1157 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159 NULL);
1160 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161 "team %p, new task_team = NULL\n",
1162 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163 this_thr->th.th_task_team = NULL;
1164 }
1165
1166 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168 proc_bind = proc_bind_false;
1169 } else if (proc_bind == proc_bind_default) {
1170 // No proc_bind clause was specified, so use the current value
1171 // of proc-bind-var for this parallel region.
1172 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173 }
1174 // Reset for next parallel region
1175 this_thr->th.th_set_proc_bind = proc_bind_default;
1176
1177 #if OMPT_SUPPORT
1178 ompt_data_t ompt_parallel_data = ompt_data_none;
1179 ompt_data_t *implicit_task_data;
1180 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181 if (ompt_enabled.enabled &&
1182 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183
1184 ompt_task_info_t *parent_task_info;
1185 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186
1187 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188 if (ompt_enabled.ompt_callback_parallel_begin) {
1189 int team_size = 1;
1190
1191 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192 &(parent_task_info->task_data), &(parent_task_info->frame),
1193 &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1194 codeptr);
1195 }
1196 }
1197 #endif // OMPT_SUPPORT
1198
1199 if (this_thr->th.th_team != serial_team) {
1200 // Nested level will be an index in the nested nthreads array
1201 int level = this_thr->th.th_team->t.t_level;
1202
1203 if (serial_team->t.t_serialized) {
1204 /* this serial team was already used
1205 TODO increase performance by making this locks more specific */
1206 kmp_team_t *new_team;
1207
1208 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209
1210 new_team =
1211 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213 ompt_parallel_data,
1214 #endif
1215 proc_bind, &this_thr->th.th_current_task->td_icvs,
1216 0 USE_NESTED_HOT_ARG(NULL));
1217 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218 KMP_ASSERT(new_team);
1219
1220 /* setup new serialized team and install it */
1221 new_team->t.t_threads[0] = this_thr;
1222 new_team->t.t_parent = this_thr->th.th_team;
1223 serial_team = new_team;
1224 this_thr->th.th_serial_team = serial_team;
1225
1226 KF_TRACE(
1227 10,
1228 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229 global_tid, serial_team));
1230
1231 /* TODO the above breaks the requirement that if we run out of resources,
1232 then we can still guarantee that serialized teams are ok, since we may
1233 need to allocate a new one */
1234 } else {
1235 KF_TRACE(
1236 10,
1237 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238 global_tid, serial_team));
1239 }
1240
1241 /* we have to initialize this serial team */
1242 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245 serial_team->t.t_ident = loc;
1246 serial_team->t.t_serialized = 1;
1247 serial_team->t.t_nproc = 1;
1248 serial_team->t.t_parent = this_thr->th.th_team;
1249 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250 this_thr->th.th_team = serial_team;
1251 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252
1253 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1254 this_thr->th.th_current_task));
1255 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256 this_thr->th.th_current_task->td_flags.executing = 0;
1257
1258 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259
1260 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261 implicit task for each serialized task represented by
1262 team->t.t_serialized? */
1263 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264 &this_thr->th.th_current_task->td_parent->td_icvs);
1265
1266 // Thread value exists in the nested nthreads array for the next nested
1267 // level
1268 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269 this_thr->th.th_current_task->td_icvs.nproc =
1270 __kmp_nested_nth.nth[level + 1];
1271 }
1272
1273 if (__kmp_nested_proc_bind.used &&
1274 (level + 1 < __kmp_nested_proc_bind.used)) {
1275 this_thr->th.th_current_task->td_icvs.proc_bind =
1276 __kmp_nested_proc_bind.bind_types[level + 1];
1277 }
1278
1279 #if USE_DEBUGGER
1280 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282 this_thr->th.th_info.ds.ds_tid = 0;
1283
1284 /* set thread cache values */
1285 this_thr->th.th_team_nproc = 1;
1286 this_thr->th.th_team_master = this_thr;
1287 this_thr->th.th_team_serialized = 1;
1288
1289 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292
1293 propagateFPControl(serial_team);
1294
1295 /* check if we need to allocate dispatch buffers stack */
1296 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298 serial_team->t.t_dispatch->th_disp_buffer =
1299 (dispatch_private_info_t *)__kmp_allocate(
1300 sizeof(dispatch_private_info_t));
1301 }
1302 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303
1304 KMP_MB();
1305
1306 } else {
1307 /* this serialized team is already being used,
1308 * that's fine, just add another nested level */
1309 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312 ++serial_team->t.t_serialized;
1313 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314
1315 // Nested level will be an index in the nested nthreads array
1316 int level = this_thr->th.th_team->t.t_level;
1317 // Thread value exists in the nested nthreads array for the next nested
1318 // level
1319 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320 this_thr->th.th_current_task->td_icvs.nproc =
1321 __kmp_nested_nth.nth[level + 1];
1322 }
1323 serial_team->t.t_level++;
1324 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325 "of serial team %p to %d\n",
1326 global_tid, serial_team, serial_team->t.t_level));
1327
1328 /* allocate/push dispatch buffers stack */
1329 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330 {
1331 dispatch_private_info_t *disp_buffer =
1332 (dispatch_private_info_t *)__kmp_allocate(
1333 sizeof(dispatch_private_info_t));
1334 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336 }
1337 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338
1339 KMP_MB();
1340 }
1341 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342
1343 // Perform the display affinity functionality for
1344 // serialized parallel regions
1345 if (__kmp_display_affinity) {
1346 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347 this_thr->th.th_prev_num_threads != 1) {
1348 // NULL means use the affinity-format-var ICV
1349 __kmp_aux_display_affinity(global_tid, NULL);
1350 this_thr->th.th_prev_level = serial_team->t.t_level;
1351 this_thr->th.th_prev_num_threads = 1;
1352 }
1353 }
1354
1355 if (__kmp_env_consistency_check)
1356 __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358 serial_team->t.ompt_team_info.master_return_address = codeptr;
1359 if (ompt_enabled.enabled &&
1360 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362
1363 ompt_lw_taskteam_t lw_taskteam;
1364 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365 &ompt_parallel_data, codeptr);
1366
1367 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368 // don't use lw_taskteam after linking. content was swaped
1369
1370 /* OMPT implicit task begin */
1371 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372 if (ompt_enabled.ompt_callback_implicit_task) {
1373 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376 OMPT_CUR_TASK_INFO(this_thr)
1377 ->thread_num = __kmp_tid_from_gtid(global_tid);
1378 }
1379
1380 /* OMPT state */
1381 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383 }
1384 #endif
1385 }
1386
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
__kmp_fork_call(ident_t * loc,int gtid,enum fork_context_e call_context,kmp_int32 argc,microtask_t microtask,launch_t invoker,va_list * ap)1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390 enum fork_context_e call_context, // Intel, GNU, ...
1391 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394 va_list *ap
1395 #else
1396 va_list ap
1397 #endif
1398 ) {
1399 void **argv;
1400 int i;
1401 int master_tid;
1402 int master_this_cons;
1403 kmp_team_t *team;
1404 kmp_team_t *parent_team;
1405 kmp_info_t *master_th;
1406 kmp_root_t *root;
1407 int nthreads;
1408 int master_active;
1409 int master_set_numthreads;
1410 int level;
1411 int active_level;
1412 int teams_level;
1413 #if KMP_NESTED_HOT_TEAMS
1414 kmp_hot_team_ptr_t **p_hot_teams;
1415 #endif
1416 { // KMP_TIME_BLOCK
1417 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419
1420 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422 /* Some systems prefer the stack for the root thread(s) to start with */
1423 /* some gap from the parent stack to prevent false sharing. */
1424 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425 /* These 2 lines below are so this does not get optimized out */
1426 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427 __kmp_stkpadding += (short)((kmp_int64)dummy);
1428 }
1429
1430 /* initialize if needed */
1431 KMP_DEBUG_ASSERT(
1432 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433 if (!TCR_4(__kmp_init_parallel))
1434 __kmp_parallel_initialize();
1435 __kmp_resume_if_soft_paused();
1436
1437 /* setup current data */
1438 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439 // shutdown
1440 parent_team = master_th->th.th_team;
1441 master_tid = master_th->th.th_info.ds.ds_tid;
1442 master_this_cons = master_th->th.th_local.this_construct;
1443 root = master_th->th.th_root;
1444 master_active = root->r.r_active;
1445 master_set_numthreads = master_th->th.th_set_nproc;
1446
1447 #if OMPT_SUPPORT
1448 ompt_data_t ompt_parallel_data = ompt_data_none;
1449 ompt_data_t *parent_task_data;
1450 ompt_frame_t *ompt_frame;
1451 ompt_data_t *implicit_task_data;
1452 void *return_address = NULL;
1453
1454 if (ompt_enabled.enabled) {
1455 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456 NULL, NULL);
1457 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458 }
1459 #endif
1460
1461 // Nested level will be an index in the nested nthreads array
1462 level = parent_team->t.t_level;
1463 // used to launch non-serial teams even if nested is not allowed
1464 active_level = parent_team->t.t_active_level;
1465 // needed to check nesting inside the teams
1466 teams_level = master_th->th.th_teams_level;
1467 #if KMP_NESTED_HOT_TEAMS
1468 p_hot_teams = &master_th->th.th_hot_teams;
1469 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473 // it is either actual or not needed (when active_level > 0)
1474 (*p_hot_teams)[0].hot_team_nth = 1;
1475 }
1476 #endif
1477
1478 #if OMPT_SUPPORT
1479 if (ompt_enabled.enabled) {
1480 if (ompt_enabled.ompt_callback_parallel_begin) {
1481 int team_size = master_set_numthreads
1482 ? master_set_numthreads
1483 : get__nproc_2(parent_team, master_tid);
1484 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1485 parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1486 OMPT_INVOKER(call_context), return_address);
1487 }
1488 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489 }
1490 #endif
1491
1492 master_th->th.th_ident = loc;
1493
1494 if (master_th->th.th_teams_microtask && ap &&
1495 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1496 // AC: This is start of parallel that is nested inside teams construct.
1497 // The team is actual (hot), all workers are ready at the fork barrier.
1498 // No lock needed to initialize the team a bit, then free workers.
1499 parent_team->t.t_ident = loc;
1500 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1501 parent_team->t.t_argc = argc;
1502 argv = (void **)parent_team->t.t_argv;
1503 for (i = argc - 1; i >= 0; --i)
1504 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1505 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1506 *argv++ = va_arg(*ap, void *);
1507 #else
1508 *argv++ = va_arg(ap, void *);
1509 #endif
1510 // Increment our nested depth levels, but not increase the serialization
1511 if (parent_team == master_th->th.th_serial_team) {
1512 // AC: we are in serialized parallel
1513 __kmpc_serialized_parallel(loc, gtid);
1514 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1515 // AC: need this in order enquiry functions work
1516 // correctly, will restore at join time
1517 parent_team->t.t_serialized--;
1518 #if OMPT_SUPPORT
1519 void *dummy;
1520 void **exit_runtime_p;
1521
1522 ompt_lw_taskteam_t lw_taskteam;
1523
1524 if (ompt_enabled.enabled) {
1525 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1526 &ompt_parallel_data, return_address);
1527 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1528
1529 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1530 // don't use lw_taskteam after linking. content was swaped
1531
1532 /* OMPT implicit task begin */
1533 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1534 if (ompt_enabled.ompt_callback_implicit_task) {
1535 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1536 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1537 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1538 OMPT_CUR_TASK_INFO(master_th)
1539 ->thread_num = __kmp_tid_from_gtid(gtid);
1540 }
1541
1542 /* OMPT state */
1543 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1544 } else {
1545 exit_runtime_p = &dummy;
1546 }
1547 #endif
1548
1549 {
1550 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1551 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1552 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1553 #if OMPT_SUPPORT
1554 ,
1555 exit_runtime_p
1556 #endif
1557 );
1558 }
1559
1560 #if OMPT_SUPPORT
1561 *exit_runtime_p = NULL;
1562 if (ompt_enabled.enabled) {
1563 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1564 if (ompt_enabled.ompt_callback_implicit_task) {
1565 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1566 ompt_scope_end, NULL, implicit_task_data, 1,
1567 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1568 }
1569 __ompt_lw_taskteam_unlink(master_th);
1570
1571 if (ompt_enabled.ompt_callback_parallel_end) {
1572 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1573 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1574 OMPT_INVOKER(call_context), return_address);
1575 }
1576 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1577 }
1578 #endif
1579 return TRUE;
1580 }
1581
1582 parent_team->t.t_pkfn = microtask;
1583 parent_team->t.t_invoke = invoker;
1584 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1585 parent_team->t.t_active_level++;
1586 parent_team->t.t_level++;
1587 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1588
1589 /* Change number of threads in the team if requested */
1590 if (master_set_numthreads) { // The parallel has num_threads clause
1591 if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1592 // AC: only can reduce number of threads dynamically, can't increase
1593 kmp_info_t **other_threads = parent_team->t.t_threads;
1594 parent_team->t.t_nproc = master_set_numthreads;
1595 for (i = 0; i < master_set_numthreads; ++i) {
1596 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1597 }
1598 // Keep extra threads hot in the team for possible next parallels
1599 }
1600 master_th->th.th_set_nproc = 0;
1601 }
1602
1603 #if USE_DEBUGGER
1604 if (__kmp_debugging) { // Let debugger override number of threads.
1605 int nth = __kmp_omp_num_threads(loc);
1606 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1607 master_set_numthreads = nth;
1608 }
1609 }
1610 #endif
1611
1612 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1613 "master_th=%p, gtid=%d\n",
1614 root, parent_team, master_th, gtid));
1615 __kmp_internal_fork(loc, gtid, parent_team);
1616 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1617 "master_th=%p, gtid=%d\n",
1618 root, parent_team, master_th, gtid));
1619
1620 /* Invoke microtask for MASTER thread */
1621 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1622 parent_team->t.t_id, parent_team->t.t_pkfn));
1623
1624 if (!parent_team->t.t_invoke(gtid)) {
1625 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1626 }
1627 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1628 parent_team->t.t_id, parent_team->t.t_pkfn));
1629 KMP_MB(); /* Flush all pending memory write invalidates. */
1630
1631 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1632
1633 return TRUE;
1634 } // Parallel closely nested in teams construct
1635
1636 #if KMP_DEBUG
1637 if (__kmp_tasking_mode != tskm_immediate_exec) {
1638 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1639 parent_team->t.t_task_team[master_th->th.th_task_state]);
1640 }
1641 #endif
1642
1643 if (parent_team->t.t_active_level >=
1644 master_th->th.th_current_task->td_icvs.max_active_levels) {
1645 nthreads = 1;
1646 } else {
1647 int enter_teams = ((ap == NULL && active_level == 0) ||
1648 (ap && teams_level > 0 && teams_level == level));
1649 nthreads =
1650 master_set_numthreads
1651 ? master_set_numthreads
1652 : get__nproc_2(
1653 parent_team,
1654 master_tid); // TODO: get nproc directly from current task
1655
1656 // Check if we need to take forkjoin lock? (no need for serialized
1657 // parallel out of teams construct). This code moved here from
1658 // __kmp_reserve_threads() to speedup nested serialized parallels.
1659 if (nthreads > 1) {
1660 if ((get__max_active_levels(master_th) == 1 &&
1661 (root->r.r_in_parallel && !enter_teams)) ||
1662 (__kmp_library == library_serial)) {
1663 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1664 " threads\n",
1665 gtid, nthreads));
1666 nthreads = 1;
1667 }
1668 }
1669 if (nthreads > 1) {
1670 /* determine how many new threads we can use */
1671 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1672 /* AC: If we execute teams from parallel region (on host), then teams
1673 should be created but each can only have 1 thread if nesting is
1674 disabled. If teams called from serial region, then teams and their
1675 threads should be created regardless of the nesting setting. */
1676 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1677 nthreads, enter_teams);
1678 if (nthreads == 1) {
1679 // Free lock for single thread execution here; for multi-thread
1680 // execution it will be freed later after team of threads created
1681 // and initialized
1682 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1683 }
1684 }
1685 }
1686 KMP_DEBUG_ASSERT(nthreads > 0);
1687
1688 // If we temporarily changed the set number of threads then restore it now
1689 master_th->th.th_set_nproc = 0;
1690
1691 /* create a serialized parallel region? */
1692 if (nthreads == 1) {
1693 /* josh todo: hypothetical question: what do we do for OS X*? */
1694 #if KMP_OS_LINUX && \
1695 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1696 void *args[argc];
1697 #else
1698 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1700 KMP_ARCH_AARCH64) */
1701
1702 KA_TRACE(20,
1703 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1704
1705 __kmpc_serialized_parallel(loc, gtid);
1706
1707 if (call_context == fork_context_intel) {
1708 /* TODO this sucks, use the compiler itself to pass args! :) */
1709 master_th->th.th_serial_team->t.t_ident = loc;
1710 if (!ap) {
1711 // revert change made in __kmpc_serialized_parallel()
1712 master_th->th.th_serial_team->t.t_level--;
1713 // Get args from parent team for teams construct
1714
1715 #if OMPT_SUPPORT
1716 void *dummy;
1717 void **exit_runtime_p;
1718 ompt_task_info_t *task_info;
1719
1720 ompt_lw_taskteam_t lw_taskteam;
1721
1722 if (ompt_enabled.enabled) {
1723 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724 &ompt_parallel_data, return_address);
1725
1726 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727 // don't use lw_taskteam after linking. content was swaped
1728
1729 task_info = OMPT_CUR_TASK_INFO(master_th);
1730 exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1731 if (ompt_enabled.ompt_callback_implicit_task) {
1732 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1733 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1734 &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1735 OMPT_CUR_TASK_INFO(master_th)
1736 ->thread_num = __kmp_tid_from_gtid(gtid);
1737 }
1738
1739 /* OMPT state */
1740 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1741 } else {
1742 exit_runtime_p = &dummy;
1743 }
1744 #endif
1745
1746 {
1747 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1748 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1749 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1750 parent_team->t.t_argv
1751 #if OMPT_SUPPORT
1752 ,
1753 exit_runtime_p
1754 #endif
1755 );
1756 }
1757
1758 #if OMPT_SUPPORT
1759 if (ompt_enabled.enabled) {
1760 exit_runtime_p = NULL;
1761 if (ompt_enabled.ompt_callback_implicit_task) {
1762 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763 ompt_scope_end, NULL, &(task_info->task_data), 1,
1764 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1765 }
1766
1767 __ompt_lw_taskteam_unlink(master_th);
1768 if (ompt_enabled.ompt_callback_parallel_end) {
1769 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1770 OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1771 OMPT_INVOKER(call_context), return_address);
1772 }
1773 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774 }
1775 #endif
1776 } else if (microtask == (microtask_t)__kmp_teams_master) {
1777 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1778 master_th->th.th_serial_team);
1779 team = master_th->th.th_team;
1780 // team->t.t_pkfn = microtask;
1781 team->t.t_invoke = invoker;
1782 __kmp_alloc_argv_entries(argc, team, TRUE);
1783 team->t.t_argc = argc;
1784 argv = (void **)team->t.t_argv;
1785 if (ap) {
1786 for (i = argc - 1; i >= 0; --i)
1787 // TODO: revert workaround for Intel(R) 64 tracker #96
1788 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1789 *argv++ = va_arg(*ap, void *);
1790 #else
1791 *argv++ = va_arg(ap, void *);
1792 #endif
1793 } else {
1794 for (i = 0; i < argc; ++i)
1795 // Get args from parent team for teams construct
1796 argv[i] = parent_team->t.t_argv[i];
1797 }
1798 // AC: revert change made in __kmpc_serialized_parallel()
1799 // because initial code in teams should have level=0
1800 team->t.t_level--;
1801 // AC: call special invoker for outer "parallel" of teams construct
1802 invoker(gtid);
1803 } else {
1804 argv = args;
1805 for (i = argc - 1; i >= 0; --i)
1806 // TODO: revert workaround for Intel(R) 64 tracker #96
1807 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1808 *argv++ = va_arg(*ap, void *);
1809 #else
1810 *argv++ = va_arg(ap, void *);
1811 #endif
1812 KMP_MB();
1813
1814 #if OMPT_SUPPORT
1815 void *dummy;
1816 void **exit_runtime_p;
1817 ompt_task_info_t *task_info;
1818
1819 ompt_lw_taskteam_t lw_taskteam;
1820
1821 if (ompt_enabled.enabled) {
1822 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1823 &ompt_parallel_data, return_address);
1824 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking. Content was swapped.
1826 task_info = OMPT_CUR_TASK_INFO(master_th);
1827 exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1828
1829 /* OMPT implicit task begin */
1830 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1831 if (ompt_enabled.ompt_callback_implicit_task) {
1832 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1833 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1834 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1835 OMPT_CUR_TASK_INFO(master_th)
1836 ->thread_num = __kmp_tid_from_gtid(gtid);
1837 }
1838
1839 /* OMPT state */
1840 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1841 } else {
1842 exit_runtime_p = &dummy;
1843 }
1844 #endif
1845
1846 {
1847 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1848 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1849 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1850 #if OMPT_SUPPORT
1851 ,
1852 exit_runtime_p
1853 #endif
1854 );
1855 }
1856
1857 #if OMPT_SUPPORT
1858 if (ompt_enabled.enabled) {
1859 *exit_runtime_p = NULL;
1860 if (ompt_enabled.ompt_callback_implicit_task) {
1861 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1862 ompt_scope_end, NULL, &(task_info->task_data), 1,
1863 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1864 }
1865
1866 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1867 __ompt_lw_taskteam_unlink(master_th);
1868 if (ompt_enabled.ompt_callback_parallel_end) {
1869 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1870 &ompt_parallel_data, parent_task_data,
1871 OMPT_INVOKER(call_context), return_address);
1872 }
1873 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874 }
1875 #endif
1876 }
1877 } else if (call_context == fork_context_gnu) {
1878 #if OMPT_SUPPORT
1879 ompt_lw_taskteam_t lwt;
1880 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1881 return_address);
1882
1883 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1884 __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. Content was swapped.
1886 #endif
1887
1888 // we were called from GNU native code
1889 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890 return FALSE;
1891 } else {
1892 KMP_ASSERT2(call_context < fork_context_last,
1893 "__kmp_fork_call: unknown fork_context parameter");
1894 }
1895
1896 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1897 KMP_MB();
1898 return FALSE;
1899 } // if (nthreads == 1)
1900
1901 // GEH: only modify the executing flag in the case when not serialized
1902 // serialized case is handled in kmpc_serialized_parallel
1903 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1904 "curtask=%p, curtask_max_aclevel=%d\n",
1905 parent_team->t.t_active_level, master_th,
1906 master_th->th.th_current_task,
1907 master_th->th.th_current_task->td_icvs.max_active_levels));
1908 // TODO: GEH - cannot do this assertion because root thread not set up as
1909 // executing
1910 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1911 master_th->th.th_current_task->td_flags.executing = 0;
1912
1913 if (!master_th->th.th_teams_microtask || level > teams_level) {
1914 /* Increment our nested depth level */
1915 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1916 }
1917
1918 // See if we need to make a copy of the ICVs.
1919 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1920 if ((level + 1 < __kmp_nested_nth.used) &&
1921 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1922 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1923 } else {
1924 nthreads_icv = 0; // don't update
1925 }
1926
1927 // Figure out the proc_bind_policy for the new team.
1928 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1929 kmp_proc_bind_t proc_bind_icv =
1930 proc_bind_default; // proc_bind_default means don't update
1931 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1932 proc_bind = proc_bind_false;
1933 } else {
1934 if (proc_bind == proc_bind_default) {
1935 // No proc_bind clause specified; use current proc-bind-var for this
1936 // parallel region
1937 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1938 }
1939 /* else: The proc_bind policy was specified explicitly on parallel clause.
1940 This overrides proc-bind-var for this parallel region, but does not
1941 change proc-bind-var. */
1942 // Figure the value of proc-bind-var for the child threads.
1943 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1944 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1945 master_th->th.th_current_task->td_icvs.proc_bind)) {
1946 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1947 }
1948 }
1949
1950 // Reset for next parallel region
1951 master_th->th.th_set_proc_bind = proc_bind_default;
1952
1953 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1954 kmp_internal_control_t new_icvs;
1955 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1956 new_icvs.next = NULL;
1957 if (nthreads_icv > 0) {
1958 new_icvs.nproc = nthreads_icv;
1959 }
1960 if (proc_bind_icv != proc_bind_default) {
1961 new_icvs.proc_bind = proc_bind_icv;
1962 }
1963
1964 /* allocate a new parallel team */
1965 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1966 team = __kmp_allocate_team(root, nthreads, nthreads,
1967 #if OMPT_SUPPORT
1968 ompt_parallel_data,
1969 #endif
1970 proc_bind, &new_icvs,
1971 argc USE_NESTED_HOT_ARG(master_th));
1972 } else {
1973 /* allocate a new parallel team */
1974 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1975 team = __kmp_allocate_team(root, nthreads, nthreads,
1976 #if OMPT_SUPPORT
1977 ompt_parallel_data,
1978 #endif
1979 proc_bind,
1980 &master_th->th.th_current_task->td_icvs,
1981 argc USE_NESTED_HOT_ARG(master_th));
1982 }
1983 KF_TRACE(
1984 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1985
1986 /* setup the new team */
1987 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1988 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1989 KMP_CHECK_UPDATE(team->t.t_ident, loc);
1990 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1991 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1992 #if OMPT_SUPPORT
1993 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1994 return_address);
1995 #endif
1996 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1997 // TODO: parent_team->t.t_level == INT_MAX ???
1998 if (!master_th->th.th_teams_microtask || level > teams_level) {
1999 int new_level = parent_team->t.t_level + 1;
2000 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2001 new_level = parent_team->t.t_active_level + 1;
2002 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2003 } else {
2004 // AC: Do not increase parallel level at start of the teams construct
2005 int new_level = parent_team->t.t_level;
2006 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2007 new_level = parent_team->t.t_active_level;
2008 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2009 }
2010 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2011 // set master's schedule as new run-time schedule
2012 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2013
2014 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2015 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2016
2017 // Update the floating point rounding in the team if required.
2018 propagateFPControl(team);
2019
2020 if (__kmp_tasking_mode != tskm_immediate_exec) {
2021 // Set master's task team to team's task team. Unless this is hot team, it
2022 // should be NULL.
2023 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2024 parent_team->t.t_task_team[master_th->th.th_task_state]);
2025 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2026 "%p, new task_team %p / team %p\n",
2027 __kmp_gtid_from_thread(master_th),
2028 master_th->th.th_task_team, parent_team,
2029 team->t.t_task_team[master_th->th.th_task_state], team));
2030
2031 if (active_level || master_th->th.th_task_team) {
2032 // Take a memo of master's task_state
2033 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2034 if (master_th->th.th_task_state_top >=
2035 master_th->th.th_task_state_stack_sz) { // increase size
2036 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2037 kmp_uint8 *old_stack, *new_stack;
2038 kmp_uint32 i;
2039 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2040 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2041 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2042 }
2043 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2044 ++i) { // zero-init rest of stack
2045 new_stack[i] = 0;
2046 }
2047 old_stack = master_th->th.th_task_state_memo_stack;
2048 master_th->th.th_task_state_memo_stack = new_stack;
2049 master_th->th.th_task_state_stack_sz = new_size;
2050 __kmp_free(old_stack);
2051 }
2052 // Store master's task_state on stack
2053 master_th->th
2054 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2055 master_th->th.th_task_state;
2056 master_th->th.th_task_state_top++;
2057 #if KMP_NESTED_HOT_TEAMS
2058 if (master_th->th.th_hot_teams &&
2059 active_level < __kmp_hot_teams_max_level &&
2060 team == master_th->th.th_hot_teams[active_level].hot_team) {
2061 // Restore master's nested state if nested hot team
2062 master_th->th.th_task_state =
2063 master_th->th
2064 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2065 } else {
2066 #endif
2067 master_th->th.th_task_state = 0;
2068 #if KMP_NESTED_HOT_TEAMS
2069 }
2070 #endif
2071 }
2072 #if !KMP_NESTED_HOT_TEAMS
2073 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2074 (team == root->r.r_hot_team));
2075 #endif
2076 }
2077
2078 KA_TRACE(
2079 20,
2080 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2081 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2082 team->t.t_nproc));
2083 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2084 (team->t.t_master_tid == 0 &&
2085 (team->t.t_parent == root->r.r_root_team ||
2086 team->t.t_parent->t.t_serialized)));
2087 KMP_MB();
2088
2089 /* now, setup the arguments */
2090 argv = (void **)team->t.t_argv;
2091 if (ap) {
2092 for (i = argc - 1; i >= 0; --i) {
2093 // TODO: revert workaround for Intel(R) 64 tracker #96
2094 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2095 void *new_argv = va_arg(*ap, void *);
2096 #else
2097 void *new_argv = va_arg(ap, void *);
2098 #endif
2099 KMP_CHECK_UPDATE(*argv, new_argv);
2100 argv++;
2101 }
2102 } else {
2103 for (i = 0; i < argc; ++i) {
2104 // Get args from parent team for teams construct
2105 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2106 }
2107 }
2108
2109 /* now actually fork the threads */
2110 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2111 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2112 root->r.r_active = TRUE;
2113
2114 __kmp_fork_team_threads(root, team, master_th, gtid);
2115 __kmp_setup_icv_copy(team, nthreads,
2116 &master_th->th.th_current_task->td_icvs, loc);
2117
2118 #if OMPT_SUPPORT
2119 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2120 #endif
2121
2122 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2123
2124 #if USE_ITT_BUILD
2125 if (team->t.t_active_level == 1 // only report frames at level 1
2126 && !master_th->th.th_teams_microtask) { // not in teams construct
2127 #if USE_ITT_NOTIFY
2128 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2129 (__kmp_forkjoin_frames_mode == 3 ||
2130 __kmp_forkjoin_frames_mode == 1)) {
2131 kmp_uint64 tmp_time = 0;
2132 if (__itt_get_timestamp_ptr)
2133 tmp_time = __itt_get_timestamp();
2134 // Internal fork - report frame begin
2135 master_th->th.th_frame_time = tmp_time;
2136 if (__kmp_forkjoin_frames_mode == 3)
2137 team->t.t_region_time = tmp_time;
2138 } else
2139 // only one notification scheme (either "submit" or "forking/joined", not both)
2140 #endif /* USE_ITT_NOTIFY */
2141 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2142 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2143 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2144 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2145 }
2146 }
2147 #endif /* USE_ITT_BUILD */
2148
2149 /* now go on and do the work */
2150 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2151 KMP_MB();
2152 KF_TRACE(10,
2153 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2154 root, team, master_th, gtid));
2155
2156 #if USE_ITT_BUILD
2157 if (__itt_stack_caller_create_ptr) {
2158 team->t.t_stack_id =
2159 __kmp_itt_stack_caller_create(); // create new stack stitching id
2160 // before entering fork barrier
2161 }
2162 #endif /* USE_ITT_BUILD */
2163
2164 // AC: skip __kmp_internal_fork at teams construct, let only master
2165 // threads execute
2166 if (ap) {
2167 __kmp_internal_fork(loc, gtid, team);
2168 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2169 "master_th=%p, gtid=%d\n",
2170 root, team, master_th, gtid));
2171 }
2172
2173 if (call_context == fork_context_gnu) {
2174 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2175 return TRUE;
2176 }
2177
2178 /* Invoke microtask for MASTER thread */
2179 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2180 team->t.t_id, team->t.t_pkfn));
2181 } // END of timer KMP_fork_call block
2182
2183 #if KMP_STATS_ENABLED
2184 // If beginning a teams construct, then change thread state
2185 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2186 if (!ap) {
2187 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2188 }
2189 #endif
2190
2191 if (!team->t.t_invoke(gtid)) {
2192 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2193 }
2194
2195 #if KMP_STATS_ENABLED
2196 // If was beginning of a teams construct, then reset thread state
2197 if (!ap) {
2198 KMP_SET_THREAD_STATE(previous_state);
2199 }
2200 #endif
2201
2202 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2203 team->t.t_id, team->t.t_pkfn));
2204 KMP_MB(); /* Flush all pending memory write invalidates. */
2205
2206 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2207
2208 #if OMPT_SUPPORT
2209 if (ompt_enabled.enabled) {
2210 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2211 }
2212 #endif
2213
2214 return TRUE;
2215 }
2216
2217 #if OMPT_SUPPORT
__kmp_join_restore_state(kmp_info_t * thread,kmp_team_t * team)2218 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2219 kmp_team_t *team) {
2220 // restore state outside the region
2221 thread->th.ompt_thread_info.state =
2222 ((team->t.t_serialized) ? ompt_state_work_serial
2223 : ompt_state_work_parallel);
2224 }
2225
__kmp_join_ompt(int gtid,kmp_info_t * thread,kmp_team_t * team,ompt_data_t * parallel_data,fork_context_e fork_context,void * codeptr)2226 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2227 kmp_team_t *team, ompt_data_t *parallel_data,
2228 fork_context_e fork_context, void *codeptr) {
2229 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2230 if (ompt_enabled.ompt_callback_parallel_end) {
2231 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2232 parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2233 codeptr);
2234 }
2235
2236 task_info->frame.enter_frame = ompt_data_none;
2237 __kmp_join_restore_state(thread, team);
2238 }
2239 #endif
2240
/* Join the parallel region executed by the calling (master) thread's team and
   restore the master's state to that of the parent team. This is the join
   counterpart of __kmp_fork_call.

   loc          - source location of the join (tracing / ITT frame reporting)
   gtid         - global thread id of the master thread performing the join
   fork_context - (OMPT builds only) which entry-point flavor forked the
                  region; forwarded to the OMPT parallel-end callback
   exit_teams   - nonzero when leaving a teams construct; skips the join
                  barrier for the internal teams (see below) */
void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
                     ,
                     enum fork_context_e fork_context
#endif
                     ,
                     int exit_teams) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int master_active;

  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));

  /* setup current data */
  master_th = __kmp_threads[gtid];
  root = master_th->th.th_root;
  team = master_th->th.th_team;
  parent_team = team->t.t_parent;

  master_th->th.th_ident = loc;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

#if KMP_DEBUG
  // Sanity check: master's cached task team must match the team's slot for
  // the master's current task state (not meaningful when exiting teams).
  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
                  "th_task_team = %p\n",
                  __kmp_gtid_from_thread(master_th), team,
                  team->t.t_task_team[master_th->th.th_task_state],
                  master_th->th.th_task_team));
    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                     team->t.t_task_team[master_th->th.th_task_state]);
  }
#endif

  if (team->t.t_serialized) {
    // Serialized region: no worker threads to join; undo the bookkeeping done
    // at fork time and return.
    if (master_th->th.th_teams_microtask) {
      // We are in teams construct
      int level = team->t.t_level;
      int tlevel = master_th->th.th_teams_level;
      if (level == tlevel) {
        // AC: we haven't incremented it earlier at start of teams construct,
        // so do it here - at the end of teams construct
        team->t.t_level++;
      } else if (level == tlevel + 1) {
        // AC: we are exiting parallel inside teams, need to increment
        // serialization in order to restore it in the next call to
        // __kmpc_end_serialized_parallel
        team->t.t_serialized++;
      }
    }
    __kmpc_end_serialized_parallel(loc, gtid);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_restore_state(master_th, parent_team);
    }
#endif

    return;
  }

  master_active = team->t.t_master_active;

  if (!exit_teams) {
    // AC: No barrier for internal teams at exit from teams construct.
    // But there is barrier for external team (league).
    __kmp_internal_join(loc, gtid, team);
  } else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (out of any parallel)
  }

  KMP_MB();

#if OMPT_SUPPORT
  // Capture OMPT data now, before the team structure is freed/reused below.
  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
  void *codeptr = team->t.ompt_team_info.master_return_address;
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_caller_destroy(
        (__itt_caller)team->t
            .t_stack_id); // destroy the stack stitching id after join barrier
  }

  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
  if (team->t.t_active_level == 1 &&
      !master_th->th.th_teams_microtask) { /* not in teams construct */
    master_th->th.th_ident = loc;
    // only one notification scheme (either "submit" or "forking/joined", not
    // both)
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode == 3)
      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
                             master_th->th.th_frame_time, 0, loc,
                             master_th->th.th_team_nproc, 1);
    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
      __kmp_itt_region_joined(gtid);
  } // active_level == 1
#endif /* USE_ITT_BUILD */

  if (master_th->th.th_teams_microtask && !exit_teams &&
      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
      team->t.t_level == master_th->th.th_teams_level + 1) {
    // AC: We need to leave the team structure intact at the end of parallel
    // inside the teams construct, so that at the next parallel same (hot) team
    // works, only adjust nesting levels

    /* Decrement our nested depth level */
    team->t.t_level--;
    team->t.t_active_level--;
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);

    // Restore number of threads in the team if needed. This code relies on
    // the proper adjustment of th_teams_size.nth after the fork in
    // __kmp_teams_master on each teams master in the case that
    // __kmp_reserve_threads reduced it.
    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
      int old_num = master_th->th.th_team_nproc;
      int new_num = master_th->th.th_teams_size.nth;
      kmp_info_t **other_threads = team->t.t_threads;
      team->t.t_nproc = new_num;
      for (int i = 0; i < old_num; ++i) {
        other_threads[i]->th.th_team_nproc = new_num;
      }
      // Adjust states of non-used threads of the team
      for (int i = old_num; i < new_num; ++i) {
        // Re-initialize thread's barrier data.
        KMP_DEBUG_ASSERT(other_threads[i]);
        kmp_balign_t *balign = other_threads[i]->th.th_bar;
        for (int b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
        if (__kmp_tasking_mode != tskm_immediate_exec) {
          // Synchronize thread's task state
          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
        }
      }
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_ompt(gtid, master_th, parent_team, parallel_data,
                      fork_context, codeptr);
    }
#endif

    return;
  }

  /* do cleanup and restore the parent team */
  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
  master_th->th.th_local.this_construct = team->t.t_master_this_cons;

  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];

  /* jc: The following lock has instructions with REL and ACQ semantics,
     separating the parallel user code called in this parallel region
     from the serial user code called after this function returns. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  if (!master_th->th.th_teams_microtask ||
      team->t.t_level > master_th->th.th_teams_level) {
    /* Decrement our nested depth level */
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
  }
  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);

#if OMPT_SUPPORT
  // Report the end of the master's implicit task before its frame is torn
  // down.
  if (ompt_enabled.enabled) {
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    if (ompt_enabled.ompt_callback_implicit_task) {
      int ompt_team_size = team->t.t_nproc;
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
          OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      // TODO: Can this be ompt_task_initial?
    }

    task_info->frame.exit_frame = ompt_data_none;
    task_info->task_data = ompt_data_none;
  }
#endif

  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
                master_th, team));
  __kmp_pop_current_task_from_thread(master_th);

#if KMP_AFFINITY_SUPPORTED
  // Restore master thread's partition.
  master_th->th.th_first_place = team->t.t_first_place;
  master_th->th.th_last_place = team->t.t_last_place;
#endif // KMP_AFFINITY_SUPPORTED
  master_th->th.th_def_allocator = team->t.t_def_allocator;

  updateHWFPControl(team);

  if (root->r.r_active != master_active)
    root->r.r_active = master_active;

  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
                            master_th)); // this will free worker threads

  /* this race was fun to find. make sure the following is in the critical
     region otherwise assertions may fail occasionally since the old team may
     be reallocated and the hierarchy appears inconsistent. it is actually safe
     to run and won't cause any bugs, but will cause those assertion failures.
     it's only one deref&assign so might as well put this in the critical
     region */
  master_th->th.th_team = parent_team;
  master_th->th.th_team_nproc = parent_team->t.t_nproc;
  master_th->th.th_team_master = parent_team->t.t_threads[0];
  master_th->th.th_team_serialized = parent_team->t.t_serialized;

  /* restore serialized team, if need be */
  if (parent_team->t.t_serialized &&
      parent_team != master_th->th.th_serial_team &&
      parent_team != root->r.r_root_team) {
    __kmp_free_team(root,
                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
    master_th->th.th_serial_team = parent_team;
  }

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (master_th->th.th_task_state_top >
        0) { // Restore task state from memo stack
      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
      // Remember master's state if we re-use this nested hot team
      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
          master_th->th.th_task_state;
      --master_th->th.th_task_state_top; // pop
      // Now restore state at this level
      master_th->th.th_task_state =
          master_th->th
              .th_task_state_memo_stack[master_th->th.th_task_state_top];
    }
    // Copy the task team from the parent team to the master thread
    master_th->th.th_task_team =
        parent_team->t.t_task_team[master_th->th.th_task_state];
    KA_TRACE(20,
             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
              parent_team));
  }

  // TODO: GEH - cannot do this assertion because root thread not set up as
  // executing
  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
  master_th->th.th_current_task->td_flags.executing = 1;

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
                    codeptr);
  }
#endif

  KMP_MB();
  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
}
2515
2516 /* Check whether we should push an internal control record onto the
2517 serial team stack. If so, do it. */
__kmp_save_internal_controls(kmp_info_t * thread)2518 void __kmp_save_internal_controls(kmp_info_t *thread) {
2519
2520 if (thread->th.th_team != thread->th.th_serial_team) {
2521 return;
2522 }
2523 if (thread->th.th_team->t.t_serialized > 1) {
2524 int push = 0;
2525
2526 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2527 push = 1;
2528 } else {
2529 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530 thread->th.th_team->t.t_serialized) {
2531 push = 1;
2532 }
2533 }
2534 if (push) { /* push a record on the serial team's stack */
2535 kmp_internal_control_t *control =
2536 (kmp_internal_control_t *)__kmp_allocate(
2537 sizeof(kmp_internal_control_t));
2538
2539 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2540
2541 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2542
2543 control->next = thread->th.th_team->t.t_control_stack_top;
2544 thread->th.th_team->t.t_control_stack_top = control;
2545 }
2546 }
2547 }
2548
2549 /* Changes set_nproc */
/* Set the nproc ICV (number of threads for subsequent parallel regions) for
   the thread identified by gtid. If the runtime is fully initialized, the
   root is not in a parallel region, and the request shrinks the hot team,
   the extra hot-team threads are released immediately rather than at the
   next parallel region. */
void __kmp_set_num_threads(int new_nth, int gtid) {
  kmp_info_t *thread;
  kmp_root_t *root;

  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Silently clamp the request into the supported range [1, __kmp_max_nth].
  if (new_nth < 1)
    new_nth = 1;
  else if (new_nth > __kmp_max_nth)
    new_nth = __kmp_max_nth;

  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
  thread = __kmp_threads[gtid];
  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
    return; // nothing to do

  // Preserve the current ICVs (on the serial team's control stack) before
  // modifying them.
  __kmp_save_internal_controls(thread);

  set__nproc(thread, new_nth);

  // If this omp_set_num_threads() call will cause the hot team size to be
  // reduced (in the absence of a num_threads clause), then reduce it now,
  // rather than waiting for the next parallel region.
  root = thread->th.th_root;
  if (__kmp_init_parallel && (!root->r.r_active) &&
      (root->r.r_hot_team->t.t_nproc > new_nth)
#if KMP_NESTED_HOT_TEAMS
      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
  ) {
    kmp_team_t *hot_team = root->r.r_hot_team;
    int f;

    // Thread release/bookkeeping below must be serialized with fork/join.
    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

    // Release the extra threads we don't need any more.
    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing team size, threads no longer in the team should unref
        // task team.
        hot_team->t.t_threads[f]->th.th_task_team = NULL;
      }
      __kmp_free_thread(hot_team->t.t_threads[f]);
      hot_team->t.t_threads[f] = NULL;
    }
    hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
    // Keep the level-0 nested-hot-team bookkeeping in sync with the shrink.
    if (thread->th.th_hot_teams) {
      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
    }
#endif

    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

    // Update the t_nproc field in the threads that are still active.
    for (f = 0; f < new_nth; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
    }
    // Special flag in case omp_set_num_threads() call
    hot_team->t.t_size_changed = -1;
  }
}
2616
2617 /* Changes max_active_levels */
__kmp_set_max_active_levels(int gtid,int max_active_levels)2618 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2619 kmp_info_t *thread;
2620
2621 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2622 "%d = (%d)\n",
2623 gtid, max_active_levels));
2624 KMP_DEBUG_ASSERT(__kmp_init_serial);
2625
2626 // validate max_active_levels
2627 if (max_active_levels < 0) {
2628 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2629 // We ignore this call if the user has specified a negative value.
2630 // The current setting won't be changed. The last valid setting will be
2631 // used. A warning will be issued (if warnings are allowed as controlled by
2632 // the KMP_WARNINGS env var).
2633 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2634 "max_active_levels for thread %d = (%d)\n",
2635 gtid, max_active_levels));
2636 return;
2637 }
2638 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2639 // it's OK, the max_active_levels is within the valid range: [ 0;
2640 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2641 // We allow a zero value. (implementation defined behavior)
2642 } else {
2643 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2644 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2645 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2646 // Current upper limit is MAX_INT. (implementation defined behavior)
2647 // If the input exceeds the upper limit, we correct the input to be the
2648 // upper limit. (implementation defined behavior)
2649 // Actually, the flow should never get here until we use MAX_INT limit.
2650 }
2651 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2652 "max_active_levels for thread %d = (%d)\n",
2653 gtid, max_active_levels));
2654
2655 thread = __kmp_threads[gtid];
2656
2657 __kmp_save_internal_controls(thread);
2658
2659 set__max_active_levels(thread, max_active_levels);
2660 }
2661
2662 /* Gets max_active_levels */
__kmp_get_max_active_levels(int gtid)2663 int __kmp_get_max_active_levels(int gtid) {
2664 kmp_info_t *thread;
2665
2666 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2667 KMP_DEBUG_ASSERT(__kmp_init_serial);
2668
2669 thread = __kmp_threads[gtid];
2670 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2671 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2672 "curtask_maxaclevel=%d\n",
2673 gtid, thread->th.th_current_task,
2674 thread->th.th_current_task->td_icvs.max_active_levels));
2675 return thread->th.th_current_task->td_icvs.max_active_levels;
2676 }
2677
2678 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2679 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2680
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
/* Set the run-sched-var ICV (runtime schedule kind and chunk) for the thread
   identified by gtid. Invalid kinds fall back to the default schedule with
   chunk 0; monotonic/nonmonotonic modifiers on the incoming kind are
   preserved and re-applied after mapping. */
void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
  kmp_info_t *thread;
  kmp_sched_t orig_kind;
  // kmp_team_t *team;

  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
                gtid, (int)kind, chunk));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Check if the kind parameter is valid, correct if needed.
  // Valid parameters should fit in one of two intervals - standard or extended:
  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
  orig_kind = kind;
  // Strip schedule modifier bits before range-checking; they are re-applied
  // below via __kmp_sched_apply_mods_intkind().
  kind = __kmp_sched_without_mods(kind);

  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
    // TODO: Hint needs attention in case we change the default schedule.
    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
              __kmp_msg_null);
    kind = kmp_sched_default;
    chunk = 0; // ignore chunk value in case of bad kind
  }

  thread = __kmp_threads[gtid];

  // Preserve the current ICVs before changing them.
  __kmp_save_internal_controls(thread);

  if (kind < kmp_sched_upper_std) {
    // Standard schedule kind: translate through __kmp_sch_map.
    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differ static chunked vs. unchunked: chunk should be invalid to
      // indicate unchunked schedule (which is the default)
      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
    } else {
      thread->th.th_current_task->td_icvs.sched.r_sched_type =
          __kmp_sch_map[kind - kmp_sched_lower - 1];
    }
  } else {
    // Extended schedule kind: its __kmp_sch_map entries follow the standard
    // kinds, hence the shifted index below.
    // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
    // kmp_sched_lower - 2 ];
    thread->th.th_current_task->td_icvs.sched.r_sched_type =
        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
                      kmp_sched_lower - 2];
  }
  // Re-apply the modifiers that were stripped from the original kind.
  __kmp_sched_apply_mods_intkind(
      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
  if (kind == kmp_sched_auto || chunk < 1) {
    // ignore parameter chunk for schedule auto
    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
  } else {
    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
  }
}
2737
2738 /* Gets def_sched_var ICV values */
/* Retrieve the run-sched-var ICV of the thread identified by gtid, mapping
   the internal sched_type back to the external kmp_sched_t kind and chunk.
   Modifier bits on the internal type are re-applied to *kind. */
void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
  kmp_info_t *thread;
  enum sched_type th_type;

  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  thread = __kmp_threads[gtid];

  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
  // Compare against the base type with modifiers masked off; modifiers are
  // restored onto *kind via __kmp_sched_apply_mods_stdkind below.
  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
  case kmp_sch_static:
  case kmp_sch_static_greedy:
  case kmp_sch_static_balanced:
    // Unchunked static variants return early with a zero chunk.
    *kind = kmp_sched_static;
    __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set, try to show this fact via zero value
    return;
  case kmp_sch_static_chunked:
    *kind = kmp_sched_static;
    break;
  case kmp_sch_dynamic_chunked:
    *kind = kmp_sched_dynamic;
    break;
  case kmp_sch_guided_chunked:
  case kmp_sch_guided_iterative_chunked:
  case kmp_sch_guided_analytical_chunked:
    *kind = kmp_sched_guided;
    break;
  case kmp_sch_auto:
    *kind = kmp_sched_auto;
    break;
  case kmp_sch_trapezoidal:
    *kind = kmp_sched_trapezoidal;
    break;
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal:
    *kind = kmp_sched_static_steal;
    break;
#endif
  default:
    KMP_FATAL(UnknownSchedulingType, th_type);
  }

  __kmp_sched_apply_mods_stdkind(kind, th_type);
  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
}
2786
/* Return the thread number of the calling thread's ancestor at the given
   nesting level (omp_get_ancestor_thread_num semantics): 0 for level 0,
   -1 for an invalid level, the thread's own tid when level equals the
   current nesting level, otherwise the master tid of the ancestor team
   reached by walking up the team tree. */
int __kmp_get_ancestor_thread_num(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 0;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // current nesting level of the calling thread
  if (level > ii)
    return -1;

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  if (ii == level)
    return __kmp_tid_from_gtid(gtid);

  // Walk up the team tree, consuming serialized levels (t_serialized) first
  // at each team before stepping to the parent, until level is reached.
  dd = team->t.t_serialized;
  level++;
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if ((team->t.t_serialized) && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      dd = team->t.t_serialized;
      ii--;
    }
  }

  // In a serialized region deeper than one level the ancestor tid is 0.
  return (dd > 1) ? (0) : (team->t.t_master_tid);
}
2844
/* Return the team size at the given nesting level (omp_get_team_size
   semantics): 1 for level 0, -1 for an invalid level, otherwise the
   t_nproc of the ancestor team found by walking up the team tree. */
int __kmp_get_team_size(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 1;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // current nesting level of the calling thread
  if (level > ii)
    return -1;

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  // Walk up the team tree, consuming serialized levels at each team before
  // stepping to the parent, until the requested level is reached.
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if (team->t.t_serialized && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      ii--;
    }
  }

  return team->t.t_nproc;
}
2896
__kmp_get_schedule_global()2897 kmp_r_sched_t __kmp_get_schedule_global() {
2898 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2899 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2900 // independently. So one can get the updated schedule here.
2901
2902 kmp_r_sched_t r_sched;
2903
2904 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2905 // __kmp_guided. __kmp_sched should keep original value, so that user can set
2906 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2907 // different roots (even in OMP 2.5)
2908 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2909 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2910 if (s == kmp_sch_static) {
2911 // replace STATIC with more detailed schedule (balanced or greedy)
2912 r_sched.r_sched_type = __kmp_static;
2913 } else if (s == kmp_sch_guided_chunked) {
2914 // replace GUIDED with more detailed schedule (iterative or analytical)
2915 r_sched.r_sched_type = __kmp_guided;
2916 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2917 r_sched.r_sched_type = __kmp_sched;
2918 }
2919 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2920
2921 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2922 // __kmp_chunk may be wrong here (if it was not ever set)
2923 r_sched.chunk = KMP_DEFAULT_CHUNK;
2924 } else {
2925 r_sched.chunk = __kmp_chunk;
2926 }
2927
2928 return r_sched;
2929 }
2930
2931 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2932 at least argc number of *t_argv entries for the requested team. */
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE) at least argc
   *t_argv entries for the requested team. Small argument lists reuse the
   inline storage embedded in the team structure; larger ones get a heap
   allocation sized with growth headroom. */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {

  KMP_DEBUG_ASSERT(team);
  // Only (re)allocate when not reusing, or when the current capacity is
  // insufficient; note that existing argv contents are NOT preserved.
  if (!realloc || argc > team->t.t_max_argc) {

    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
                   "current entries=%d\n",
                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
    /* if previously allocated heap space for args, free them */
    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
      __kmp_free((void *)team->t.t_argv);

    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
      /* use unused space in the cache line for arguments */
      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv = &team->t.t_inline_argv[0];
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            -1, &team->t.t_inline_argv[0],
            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
            team->t.t_id);
      }
    } else {
      /* allocate space for arguments in the heap */
      // Grow to at least KMP_MIN_MALLOC_ARGV_ENTRIES, or double the request,
      // to amortize future reallocations.
      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
                               : 2 * argc;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv =
          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                     &team->t.t_argv[team->t.t_max_argc],
                                     sizeof(void *) * team->t.t_max_argc,
                                     "team_%d.t_argv", team->t.t_id);
      }
    }
  }
}
2978
__kmp_allocate_team_arrays(kmp_team_t * team,int max_nth)2979 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
2980 int i;
2981 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2982 team->t.t_threads =
2983 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
2984 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
2985 sizeof(dispatch_shared_info_t) * num_disp_buff);
2986 team->t.t_dispatch =
2987 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
2988 team->t.t_implicit_task_taskdata =
2989 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
2990 team->t.t_max_nproc = max_nth;
2991
2992 /* setup dispatch buffers */
2993 for (i = 0; i < num_disp_buff; ++i) {
2994 team->t.t_disp_buffer[i].buffer_index = i;
2995 team->t.t_disp_buffer[i].doacross_buf_idx = i;
2996 }
2997 }
2998
__kmp_free_team_arrays(kmp_team_t * team)2999 static void __kmp_free_team_arrays(kmp_team_t *team) {
3000 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3001 int i;
3002 for (i = 0; i < team->t.t_max_nproc; ++i) {
3003 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3004 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3005 team->t.t_dispatch[i].th_disp_buffer = NULL;
3006 }
3007 }
3008 #if KMP_USE_HIER_SCHED
3009 __kmp_dispatch_free_hierarchies(team);
3010 #endif
3011 __kmp_free(team->t.t_threads);
3012 __kmp_free(team->t.t_disp_buffer);
3013 __kmp_free(team->t.t_dispatch);
3014 __kmp_free(team->t.t_implicit_task_taskdata);
3015 team->t.t_threads = NULL;
3016 team->t.t_disp_buffer = NULL;
3017 team->t.t_dispatch = NULL;
3018 team->t.t_implicit_task_taskdata = 0;
3019 }
3020
__kmp_reallocate_team_arrays(kmp_team_t * team,int max_nth)3021 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3022 kmp_info_t **oldThreads = team->t.t_threads;
3023
3024 __kmp_free(team->t.t_disp_buffer);
3025 __kmp_free(team->t.t_dispatch);
3026 __kmp_free(team->t.t_implicit_task_taskdata);
3027 __kmp_allocate_team_arrays(team, max_nth);
3028
3029 KMP_MEMCPY(team->t.t_threads, oldThreads,
3030 team->t.t_nproc * sizeof(kmp_info_t *));
3031
3032 __kmp_free(oldThreads);
3033 }
3034
/* Build a kmp_internal_control_t snapshot of the current global default ICV
   values (dynamic mode, blocktime, nproc, thread limit, max_active_levels,
   runtime schedule, proc-bind, default device). Used to seed the ICVs of
   new roots/teams. NOTE: the aggregate below is positional and must stay in
   sync with the declaration order of kmp_internal_control_t. */
static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0],
      __kmp_default_device,
      NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}
3068
__kmp_get_x_global_icvs(const kmp_team_t * team)3069 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3070
3071 kmp_internal_control_t gx_icvs;
3072 gx_icvs.serial_nesting_level =
3073 0; // probably =team->t.t_serial like in save_inter_controls
3074 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3075 gx_icvs.next = NULL;
3076
3077 return gx_icvs;
3078 }
3079
/* First-time initialization of a root: set up the root state structure and
   allocate both its root team (serialized, one thread) and its hot team
   (reused across parallel regions, sized for the default team upper bound). */
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin); // must not have been initialized before

  /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
                          );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1;
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.sched = r_sched.sched;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
                          );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  // NOTE(review): this resets root_team's (not hot_team's) stack top a second
  // time; looks like a copy-paste of the line above — confirm intent.
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.sched = r_sched.sched;
  hot_team->t.t_size_changed = 0;
}
3168
3169 #ifdef KMP_DEBUG
3170
// Singly-linked list node used by __kmp_print_structure to accumulate the
// set of live teams. The list always terminates with a sentinel node whose
// entry and next are both NULL (see __kmp_print_structure_team_accum).
typedef struct kmp_team_list_item {
  kmp_team_p const *entry; // team recorded by this node (NULL in sentinel)
  struct kmp_team_list_item *next; // next node, or NULL in sentinel
} kmp_team_list_item_t;
typedef kmp_team_list_item_t *kmp_team_list_t;
3176
static void __kmp_print_structure_team_accum( // Add team to list of teams.
    kmp_team_list_t list, // List of teams.
    kmp_team_p const *team // Team to add.
    ) {

  // List must terminate with item where both entry and next are NULL.
  // Team is added to the list only once.
  // List is sorted in ascending order by team id.
  // Team id is *not* a key.

  kmp_team_list_t l;

  KMP_DEBUG_ASSERT(list != NULL);
  if (team == NULL) {
    return;
  }

  // Recursively accumulate the team's ancestors and pool successors first.
  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);

  // Search list for the team.
  l = list;
  while (l->next != NULL && l->entry != team) {
    l = l->next;
  }
  if (l->next != NULL) {
    return; // Team has been added before, exit.
  }

  // Team is not found. Search list again for insertion point.
  l = list;
  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
    l = l->next;
  }

  // Insert team.
  // The new node takes over l's current contents and l becomes the node for
  // this team, so insertion before l needs no back-pointer.
  {
    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
        sizeof(kmp_team_list_item_t));
    *item = *l;
    l->entry = team;
    l->next = item;
  }
}
3221
__kmp_print_structure_team(char const * title,kmp_team_p const * team)3222 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3223
3224 ) {
3225 __kmp_printf("%s", title);
3226 if (team != NULL) {
3227 __kmp_printf("%2x %p\n", team->t.t_id, team);
3228 } else {
3229 __kmp_printf(" - (nil)\n");
3230 }
3231 }
3232
__kmp_print_structure_thread(char const * title,kmp_info_p const * thread)3233 static void __kmp_print_structure_thread(char const *title,
3234 kmp_info_p const *thread) {
3235 __kmp_printf("%s", title);
3236 if (thread != NULL) {
3237 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3238 } else {
3239 __kmp_printf(" - (nil)\n");
3240 }
3241 }
3242
/* Debug dump of the runtime's global structure: the global thread table,
   every live thread, every root (uber thread), the set of teams reachable
   from them, and the thread/team pools. Teams are gathered into a sorted
   list via __kmp_print_structure_team_accum, then printed and the list is
   freed. KMP_DEBUG only. */
void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams (single sentinel node).
  list =
      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }

  // Print out __kmp_threads array.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
        __kmp_print_structure_team("    Serial Team:  ",
                                   thread->th.th_serial_team);
        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread("    Master:       ",
                                     thread->th.th_team_master);
        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread("    Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        // Accumulate every team reachable from this thread for the team dump.
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }

  // Print out __kmp_root array.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
        __kmp_print_structure_thread("    Uber Thread:  ",
                                     root->r.r_uber_thread);
        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
        __kmp_printf("    In Parallel:  %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }

  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  // Walk the accumulated (sorted, sentinel-terminated) team list.
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf("    Thread %2d:      ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool:          ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool:            ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");

  // Free team list.
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }
}
3360
3361 #endif
3362
3363 //---------------------------------------------------------------------------
3364 // Stuff for per-thread fast random number generator
3365 // Table of primes
// Table of prime multipliers; __kmp_init_random picks one per thread
// (indexed by tid) as the 'a' coefficient of its linear congruential
// generator, so different threads get different random streams.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3378
3379 //---------------------------------------------------------------------------
3380 // __kmp_get_random: Get a random number using a linear congruential method.
__kmp_get_random(kmp_info_t * thread)3381 unsigned short __kmp_get_random(kmp_info_t *thread) {
3382 unsigned x = thread->th.th_x;
3383 unsigned short r = x >> 16;
3384
3385 thread->th.th_x = x * thread->th.th_a + 1;
3386
3387 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3388 thread->th.th_info.ds.ds_tid, r));
3389
3390 return r;
3391 }
3392 //--------------------------------------------------------
3393 // __kmp_init_random: Initialize a random number generator
__kmp_init_random(kmp_info_t * thread)3394 void __kmp_init_random(kmp_info_t *thread) {
3395 unsigned seed = thread->th.th_info.ds.ds_tid;
3396
3397 thread->th.th_a =
3398 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3399 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3400 KA_TRACE(30,
3401 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3402 }
3403
3404 #if KMP_OS_WINDOWS
3405 /* reclaim array entries for root threads that are already dead, returns number
3406 * reclaimed */
__kmp_reclaim_dead_roots(void)3407 static int __kmp_reclaim_dead_roots(void) {
3408 int i, r = 0;
3409
3410 for (i = 0; i < __kmp_threads_capacity; ++i) {
3411 if (KMP_UBER_GTID(i) &&
3412 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3413 !__kmp_root[i]
3414 ->r.r_active) { // AC: reclaim only roots died in non-active state
3415 r += __kmp_unregister_root_other_thread(i);
3416 }
3417 }
3418 return r;
3419 }
3420 #endif
3421
3422 /* This function attempts to create free entries in __kmp_threads and
3423 __kmp_root, and returns the number of free entries generated.
3424
3425 For Windows* OS static library, the first mechanism used is to reclaim array
3426 entries for root threads that are already dead.
3427
3428 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3429 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3430 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3431 threadprivate cache array has been created. Synchronization with
3432 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3433
3434 After any dead root reclamation, if the clipping value allows array expansion
3435 to result in the generation of a total of nNeed free slots, the function does
3436 that expansion. If not, nothing is done beyond the possible initial root
3437 thread reclamation.
3438
3439 If any argument is negative, the behavior is undefined. */
static int __kmp_expand_threads(int nNeed) {
  int added = 0; // number of free entries generated (return value)
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  // Reclaimed slots count toward the request; never let nNeed go negative.
  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  //   1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //      may not be resused by another thread, so we may need to increase
  //      __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  //   2) New foreign root(s) are encountered.  We always register new foreign
  //      roots. This may cause a smaller # of threads to be allocated at
  //      subsequent parallel regions, but the worker threads hang around (and
  //      eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  // Grow by doubling, clipped at __kmp_sys_max_nth, until the requirement is
  // met.  The headroom check above guarantees this loop terminates.
  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  // Allocate both arrays in one chunk: __kmp_root immediately follows
  // __kmp_threads; the extra CACHE_LINE pads the combined allocation.
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));

  // Publish the new arrays through volatile stores so concurrent readers see
  // either the old or the new (fully copied) array, then free the old one.
  kmp_info_t **temp_threads = __kmp_threads;
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  __kmp_free(temp_threads);
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  // Keep the threadprivate cache capacity in sync with the new thread
  // capacity; synchronization with __kmpc_threadprivate_cached uses
  // __kmp_tp_cached_lock.
  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}
3525
3526 /* Register the current thread as a root thread and obtain our gtid. We must
3527 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3528 thread that calls from __kmp_do_serial_initialize() */
__kmp_register_root(int initial_thread)3529 int __kmp_register_root(int initial_thread) {
3530 kmp_info_t *root_thread;
3531 kmp_root_t *root;
3532 int gtid;
3533 int capacity;
3534 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3535 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3536 KMP_MB();
3537
3538 /* 2007-03-02:
3539 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3540 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3541 work as expected -- it may return false (that means there is at least one
3542 empty slot in __kmp_threads array), but it is possible the only free slot
3543 is #0, which is reserved for initial thread and so cannot be used for this
3544 one. Following code workarounds this bug.
3545
3546 However, right solution seems to be not reserving slot #0 for initial
3547 thread because:
3548 (1) there is no magic in slot #0,
3549 (2) we cannot detect initial thread reliably (the first thread which does
3550 serial initialization may be not a real initial thread).
3551 */
3552 capacity = __kmp_threads_capacity;
3553 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3554 --capacity;
3555 }
3556
3557 /* see if there are too many threads */
3558 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3559 if (__kmp_tp_cached) {
3560 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3561 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3562 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3563 } else {
3564 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3565 __kmp_msg_null);
3566 }
3567 }
3568
3569 /* find an available thread slot */
3570 /* Don't reassign the zero slot since we need that to only be used by initial
3571 thread */
3572 for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3573 gtid++)
3574 ;
3575 KA_TRACE(1,
3576 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3577 KMP_ASSERT(gtid < __kmp_threads_capacity);
3578
3579 /* update global accounting */
3580 __kmp_all_nth++;
3581 TCW_4(__kmp_nth, __kmp_nth + 1);
3582
3583 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3584 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3585 if (__kmp_adjust_gtid_mode) {
3586 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3587 if (TCR_4(__kmp_gtid_mode) != 2) {
3588 TCW_4(__kmp_gtid_mode, 2);
3589 }
3590 } else {
3591 if (TCR_4(__kmp_gtid_mode) != 1) {
3592 TCW_4(__kmp_gtid_mode, 1);
3593 }
3594 }
3595 }
3596
3597 #ifdef KMP_ADJUST_BLOCKTIME
3598 /* Adjust blocktime to zero if necessary */
3599 /* Middle initialization might not have occurred yet */
3600 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3601 if (__kmp_nth > __kmp_avail_proc) {
3602 __kmp_zero_bt = TRUE;
3603 }
3604 }
3605 #endif /* KMP_ADJUST_BLOCKTIME */
3606
3607 /* setup this new hierarchy */
3608 if (!(root = __kmp_root[gtid])) {
3609 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3610 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3611 }
3612
3613 #if KMP_STATS_ENABLED
3614 // Initialize stats as soon as possible (right after gtid assignment).
3615 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3616 __kmp_stats_thread_ptr->startLife();
3617 KMP_SET_THREAD_STATE(SERIAL_REGION);
3618 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3619 #endif
3620 __kmp_initialize_root(root);
3621
3622 /* setup new root thread structure */
3623 if (root->r.r_uber_thread) {
3624 root_thread = root->r.r_uber_thread;
3625 } else {
3626 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3627 if (__kmp_storage_map) {
3628 __kmp_print_thread_storage_map(root_thread, gtid);
3629 }
3630 root_thread->th.th_info.ds.ds_gtid = gtid;
3631 #if OMPT_SUPPORT
3632 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3633 #endif
3634 root_thread->th.th_root = root;
3635 if (__kmp_env_consistency_check) {
3636 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3637 }
3638 #if USE_FAST_MEMORY
3639 __kmp_initialize_fast_memory(root_thread);
3640 #endif /* USE_FAST_MEMORY */
3641
3642 #if KMP_USE_BGET
3643 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3644 __kmp_initialize_bget(root_thread);
3645 #endif
3646 __kmp_init_random(root_thread); // Initialize random number generator
3647 }
3648
3649 /* setup the serial team held in reserve by the root thread */
3650 if (!root_thread->th.th_serial_team) {
3651 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3652 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3653 root_thread->th.th_serial_team = __kmp_allocate_team(
3654 root, 1, 1,
3655 #if OMPT_SUPPORT
3656 ompt_data_none, // root parallel id
3657 #endif
3658 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3659 }
3660 KMP_ASSERT(root_thread->th.th_serial_team);
3661 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3662 root_thread->th.th_serial_team));
3663
3664 /* drop root_thread into place */
3665 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3666
3667 root->r.r_root_team->t.t_threads[0] = root_thread;
3668 root->r.r_hot_team->t.t_threads[0] = root_thread;
3669 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3670 // AC: the team created in reserve, not for execution (it is unused for now).
3671 root_thread->th.th_serial_team->t.t_serialized = 0;
3672 root->r.r_uber_thread = root_thread;
3673
3674 /* initialize the thread, get it ready to go */
3675 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3676 TCW_4(__kmp_init_gtid, TRUE);
3677
3678 /* prepare the master thread for get_gtid() */
3679 __kmp_gtid_set_specific(gtid);
3680
3681 #if USE_ITT_BUILD
3682 __kmp_itt_thread_name(gtid);
3683 #endif /* USE_ITT_BUILD */
3684
3685 #ifdef KMP_TDATA_GTID
3686 __kmp_gtid = gtid;
3687 #endif
3688 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3689 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3690
3691 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3692 "plain=%u\n",
3693 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3694 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3695 KMP_INIT_BARRIER_STATE));
3696 { // Initialize barrier data.
3697 int b;
3698 for (b = 0; b < bs_last_barrier; ++b) {
3699 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3700 #if USE_DEBUGGER
3701 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3702 #endif
3703 }
3704 }
3705 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3706 KMP_INIT_BARRIER_STATE);
3707
3708 #if KMP_AFFINITY_SUPPORTED
3709 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3710 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3711 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3712 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3713 if (TCR_4(__kmp_init_middle)) {
3714 __kmp_affinity_set_init_mask(gtid, TRUE);
3715 }
3716 #endif /* KMP_AFFINITY_SUPPORTED */
3717 root_thread->th.th_def_allocator = __kmp_def_allocator;
3718 root_thread->th.th_prev_level = 0;
3719 root_thread->th.th_prev_num_threads = 1;
3720
3721 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3722 tmp->cg_root = root_thread;
3723 tmp->cg_thread_limit = __kmp_cg_max_nth;
3724 tmp->cg_nthreads = 1;
3725 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3726 " cg_nthreads init to 1\n",
3727 root_thread, tmp));
3728 tmp->up = NULL;
3729 root_thread->th.th_cg_roots = tmp;
3730
3731 __kmp_root_counter++;
3732
3733 #if OMPT_SUPPORT
3734 if (!initial_thread && ompt_enabled.enabled) {
3735
3736 kmp_info_t *root_thread = ompt_get_thread();
3737
3738 ompt_set_thread_state(root_thread, ompt_state_overhead);
3739
3740 if (ompt_enabled.ompt_callback_thread_begin) {
3741 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3742 ompt_thread_initial, __ompt_get_thread_data_internal());
3743 }
3744 ompt_data_t *task_data;
3745 ompt_data_t *parallel_data;
3746 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL);
3747 if (ompt_enabled.ompt_callback_implicit_task) {
3748 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3749 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3750 }
3751
3752 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3753 }
3754 #endif
3755
3756 KMP_MB();
3757 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3758
3759 return gtid;
3760 }
3761
3762 #if KMP_NESTED_HOT_TEAMS
/* Recursively free the nested hot team kept at nesting depth 'level' by
   thread 'thr', including hot teams kept by its workers at deeper levels.
   Returns the number of threads released (masters are not counted, since
   they are not freed). */
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  // Nothing kept at this level -- nothing to free.
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // master is not freed
  if (level < max_level - 1) {
    // Descend first: free each worker's deeper hot teams before freeing this
    // team, then release the worker's own hot-teams array (skip the master,
    // i == 0, whose array is owned by the caller).
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
3787 #endif
3788
// Resets a root thread and clears its root and hot teams.
3790 // Returns the number of __kmp_threads entries directly and indirectly freed.
__kmp_reset_root(int gtid,kmp_root_t * root)3791 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3792 kmp_team_t *root_team = root->r.r_root_team;
3793 kmp_team_t *hot_team = root->r.r_hot_team;
3794 int n = hot_team->t.t_nproc;
3795 int i;
3796
3797 KMP_DEBUG_ASSERT(!root->r.r_active);
3798
3799 root->r.r_root_team = NULL;
3800 root->r.r_hot_team = NULL;
3801 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3802 // before call to __kmp_free_team().
3803 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3804 #if KMP_NESTED_HOT_TEAMS
3805 if (__kmp_hot_teams_max_level >
3806 0) { // need to free nested hot teams and their threads if any
3807 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3808 kmp_info_t *th = hot_team->t.t_threads[i];
3809 if (__kmp_hot_teams_max_level > 1) {
3810 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3811 }
3812 if (th->th.th_hot_teams) {
3813 __kmp_free(th->th.th_hot_teams);
3814 th->th.th_hot_teams = NULL;
3815 }
3816 }
3817 }
3818 #endif
3819 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3820
3821 // Before we can reap the thread, we need to make certain that all other
3822 // threads in the teams that had this root as ancestor have stopped trying to
3823 // steal tasks.
3824 if (__kmp_tasking_mode != tskm_immediate_exec) {
3825 __kmp_wait_to_unref_task_teams();
3826 }
3827
3828 #if KMP_OS_WINDOWS
3829 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3830 KA_TRACE(
3831 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3832 "\n",
3833 (LPVOID) & (root->r.r_uber_thread->th),
3834 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3835 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3836 #endif /* KMP_OS_WINDOWS */
3837
3838 #if OMPT_SUPPORT
3839 ompt_data_t *task_data;
3840 ompt_data_t *parallel_data;
3841 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL);
3842 if (ompt_enabled.ompt_callback_implicit_task) {
3843 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3844 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3845 }
3846 if (ompt_enabled.ompt_callback_thread_end) {
3847 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3848 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3849 }
3850 #endif
3851
3852 TCW_4(__kmp_nth,
3853 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3854 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3855 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3856 " to %d\n",
3857 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3858 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3859 if (i == 1) {
3860 // need to free contention group structure
3861 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3862 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3863 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3864 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3865 root->r.r_uber_thread->th.th_cg_roots = NULL;
3866 }
3867 __kmp_reap_thread(root->r.r_uber_thread, 1);
3868
3869 // We canot put root thread to __kmp_thread_pool, so we have to reap it istead
3870 // of freeing.
3871 root->r.r_uber_thread = NULL;
3872 /* mark root as no longer in use */
3873 root->r.r_begin = FALSE;
3874
3875 return n;
3876 }
3877
/* Unregister the calling root thread: wait for outstanding proxy tasks,
   reset the root, and release its gtid-specific state. */
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close. furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  // If the runtime is already shut down (or was never serially initialized),
  // there is nothing to unregister.
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
  }

  __kmp_reset_root(gtid, root);

  /* free up this thread slot */
  __kmp_gtid_set_specific(KMP_GTID_DNE);
#ifdef KMP_TDATA_GTID
  __kmp_gtid = KMP_GTID_DNE;
#endif

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
3927
3928 #if KMP_OS_WINDOWS
3929 /* __kmp_forkjoin_lock must be already held
3930 Unregisters a root thread that is not the current thread. Returns the number
3931 of __kmp_threads entries freed as a result. */
__kmp_unregister_root_other_thread(int gtid)3932 static int __kmp_unregister_root_other_thread(int gtid) {
3933 kmp_root_t *root = __kmp_root[gtid];
3934 int r;
3935
3936 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3937 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3938 KMP_ASSERT(KMP_UBER_GTID(gtid));
3939 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3940 KMP_ASSERT(root->r.r_active == FALSE);
3941
3942 r = __kmp_reset_root(gtid, root);
3943 KC_TRACE(10,
3944 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3945 return r;
3946 }
3947 #endif
3948
3949 #if KMP_DEBUG
__kmp_task_info()3950 void __kmp_task_info() {
3951
3952 kmp_int32 gtid = __kmp_entry_gtid();
3953 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3954 kmp_info_t *this_thr = __kmp_threads[gtid];
3955 kmp_team_t *steam = this_thr->th.th_serial_team;
3956 kmp_team_t *team = this_thr->th.th_team;
3957
3958 __kmp_printf(
3959 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
3960 "ptask=%p\n",
3961 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
3962 team->t.t_implicit_task_taskdata[tid].td_parent);
3963 }
3964 #endif // KMP_DEBUG
3965
3966 /* TODO optimize with one big memclr, take out what isn't needed, split
3967 responsibility to workers as much as possible, and delay initialization of
3968 features as much as possible */
/* Initialize 'this_thr' as member 'tid' (gtid 'gtid') of 'team': cache team
   info, set up the implicit task, contention-group membership, dynamic
   dispatch buffers, and the task-state memo stack. */
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;
#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;
  TCW_PTR(this_thr->th.th_sleep_loc, NULL);

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

  // Lazily allocate the threadprivate common table for this thread.
  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }

  if (this_thr != master && // Master's CG root is initialized elsewhere
      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
    // Make new thread's CG root same as master's
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // worker changes CG, need to check if old CG should be freed
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp); // last thread left CG --> free it
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    // Increment new thread's CG root's counter to add the new thread
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    // Inherit the thread limit of the (new) contention group.
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    // A serialized team (max_nproc == 1) needs only one buffer.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    // Allocate the dispatch buffer on first use; otherwise just clear it.
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size, "th_%d.th_dispatch.th_disp_buffer "
                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  // Lazily allocate and zero the task-state memo stack (initial depth 4).
  if (!this_thr->th.th_task_state_memo_stack) {
    size_t i;
    this_thr->th.th_task_state_memo_stack =
        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
    this_thr->th.th_task_state_top = 0;
    this_thr->th.th_task_state_stack_sz = 4;
    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
         ++i) // zero init the stack
      this_thr->th.th_task_state_memo_stack[i] = 0;
  }

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}
4121
4122 /* allocate a new thread for the requesting team. this is only called from
4123 within a forkjoin critical section. we will first try to get an available
4124 thread from the thread pool. if none is available, we will fork a new one
4125 assuming we are able to create a new one. this should be assured, as the
4126 caller should check on this first. */
__kmp_allocate_thread(kmp_root_t * root,kmp_team_t * team,int new_tid)4127 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4128 int new_tid) {
4129 kmp_team_t *serial_team;
4130 kmp_info_t *new_thr;
4131 int new_gtid;
4132
4133 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4134 KMP_DEBUG_ASSERT(root && team);
4135 #if !KMP_NESTED_HOT_TEAMS
4136 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4137 #endif
4138 KMP_MB();
4139
4140 /* first, try to get one from the thread pool */
4141 if (__kmp_thread_pool) {
4142 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4143 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4144 if (new_thr == __kmp_thread_pool_insert_pt) {
4145 __kmp_thread_pool_insert_pt = NULL;
4146 }
4147 TCW_4(new_thr->th.th_in_pool, FALSE);
4148 __kmp_suspend_initialize_thread(new_thr);
4149 __kmp_lock_suspend_mx(new_thr);
4150 if (new_thr->th.th_active_in_pool == TRUE) {
4151 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4152 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4153 new_thr->th.th_active_in_pool = FALSE;
4154 }
4155 __kmp_unlock_suspend_mx(new_thr);
4156
4157 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4158 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4159 KMP_ASSERT(!new_thr->th.th_team);
4160 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4161
4162 /* setup the thread structure */
4163 __kmp_initialize_info(new_thr, team, new_tid,
4164 new_thr->th.th_info.ds.ds_gtid);
4165 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4166
4167 TCW_4(__kmp_nth, __kmp_nth + 1);
4168
4169 new_thr->th.th_task_state = 0;
4170 new_thr->th.th_task_state_top = 0;
4171 new_thr->th.th_task_state_stack_sz = 4;
4172
4173 #ifdef KMP_ADJUST_BLOCKTIME
4174 /* Adjust blocktime back to zero if necessary */
4175 /* Middle initialization might not have occurred yet */
4176 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4177 if (__kmp_nth > __kmp_avail_proc) {
4178 __kmp_zero_bt = TRUE;
4179 }
4180 }
4181 #endif /* KMP_ADJUST_BLOCKTIME */
4182
4183 #if KMP_DEBUG
4184 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4185 // KMP_BARRIER_PARENT_FLAG.
4186 int b;
4187 kmp_balign_t *balign = new_thr->th.th_bar;
4188 for (b = 0; b < bs_last_barrier; ++b)
4189 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4190 #endif
4191
4192 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4193 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4194
4195 KMP_MB();
4196 return new_thr;
4197 }
4198
4199 /* no, well fork a new one */
4200 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4201 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4202
4203 #if KMP_USE_MONITOR
4204 // If this is the first worker thread the RTL is creating, then also
4205 // launch the monitor thread. We try to do this as early as possible.
4206 if (!TCR_4(__kmp_init_monitor)) {
4207 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4208 if (!TCR_4(__kmp_init_monitor)) {
4209 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4210 TCW_4(__kmp_init_monitor, 1);
4211 __kmp_create_monitor(&__kmp_monitor);
4212 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4213 #if KMP_OS_WINDOWS
4214 // AC: wait until monitor has started. This is a fix for CQ232808.
4215 // The reason is that if the library is loaded/unloaded in a loop with
4216 // small (parallel) work in between, then there is high probability that
4217 // monitor thread started after the library shutdown. At shutdown it is
4218 // too late to cope with the problem, because when the master is in
4219 // DllMain (process detach) the monitor has no chances to start (it is
4220 // blocked), and master has no means to inform the monitor that the
4221 // library has gone, because all the memory which the monitor can access
4222 // is going to be released/reset.
4223 while (TCR_4(__kmp_init_monitor) < 2) {
4224 KMP_YIELD(TRUE);
4225 }
4226 KF_TRACE(10, ("after monitor thread has started\n"));
4227 #endif
4228 }
4229 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4230 }
4231 #endif
4232
4233 KMP_MB();
4234 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4235 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4236 }
4237
4238 /* allocate space for it. */
4239 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4240
4241 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4242
4243 if (__kmp_storage_map) {
4244 __kmp_print_thread_storage_map(new_thr, new_gtid);
4245 }
4246
4247 // add the reserve serialized team, initialized from the team's master thread
4248 {
4249 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4250 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4251 new_thr->th.th_serial_team = serial_team =
4252 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4253 #if OMPT_SUPPORT
4254 ompt_data_none, // root parallel id
4255 #endif
4256 proc_bind_default, &r_icvs,
4257 0 USE_NESTED_HOT_ARG(NULL));
4258 }
4259 KMP_ASSERT(serial_team);
4260 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4261 // execution (it is unused for now).
4262 serial_team->t.t_threads[0] = new_thr;
4263 KF_TRACE(10,
4264 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4265 new_thr));
4266
4267 /* setup the thread structures */
4268 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4269
4270 #if USE_FAST_MEMORY
4271 __kmp_initialize_fast_memory(new_thr);
4272 #endif /* USE_FAST_MEMORY */
4273
4274 #if KMP_USE_BGET
4275 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4276 __kmp_initialize_bget(new_thr);
4277 #endif
4278
4279 __kmp_init_random(new_thr); // Initialize random number generator
4280
4281 /* Initialize these only once when thread is grabbed for a team allocation */
4282 KA_TRACE(20,
4283 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4284 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4285
4286 int b;
4287 kmp_balign_t *balign = new_thr->th.th_bar;
4288 for (b = 0; b < bs_last_barrier; ++b) {
4289 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4290 balign[b].bb.team = NULL;
4291 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4292 balign[b].bb.use_oncore_barrier = 0;
4293 }
4294
4295 new_thr->th.th_spin_here = FALSE;
4296 new_thr->th.th_next_waiting = 0;
4297 #if KMP_OS_UNIX
4298 new_thr->th.th_blocking = false;
4299 #endif
4300
4301 #if KMP_AFFINITY_SUPPORTED
4302 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4303 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4304 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4305 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4306 #endif
4307 new_thr->th.th_def_allocator = __kmp_def_allocator;
4308 new_thr->th.th_prev_level = 0;
4309 new_thr->th.th_prev_num_threads = 1;
4310
4311 TCW_4(new_thr->th.th_in_pool, FALSE);
4312 new_thr->th.th_active_in_pool = FALSE;
4313 TCW_4(new_thr->th.th_active, TRUE);
4314
4315 /* adjust the global counters */
4316 __kmp_all_nth++;
4317 __kmp_nth++;
4318
4319 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4320 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4321 if (__kmp_adjust_gtid_mode) {
4322 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4323 if (TCR_4(__kmp_gtid_mode) != 2) {
4324 TCW_4(__kmp_gtid_mode, 2);
4325 }
4326 } else {
4327 if (TCR_4(__kmp_gtid_mode) != 1) {
4328 TCW_4(__kmp_gtid_mode, 1);
4329 }
4330 }
4331 }
4332
4333 #ifdef KMP_ADJUST_BLOCKTIME
4334 /* Adjust blocktime back to zero if necessary */
4335 /* Middle initialization might not have occurred yet */
4336 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4337 if (__kmp_nth > __kmp_avail_proc) {
4338 __kmp_zero_bt = TRUE;
4339 }
4340 }
4341 #endif /* KMP_ADJUST_BLOCKTIME */
4342
4343 /* actually fork it and create the new worker thread */
4344 KF_TRACE(
4345 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4346 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4347 KF_TRACE(10,
4348 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4349
4350 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4351 new_gtid));
4352 KMP_MB();
4353 return new_thr;
4354 }
4355
/* Reinitialize team for reuse.
   The hot team code calls this case at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, esp. writes to the team
   struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
// Refresh a team's identity and ICVs so it can be reused (hot-team path).
// Per the EPCC warning above, writes to the team struct are kept to a
// minimum: KMP_CHECK_UPDATE only stores when the value actually changes,
// avoiding needless cache-line invalidation across the team's threads.
//   team     - team being reinitialized; t_threads[0] must be valid
//   new_icvs - internal control values to install for the implicit task
//   loc      - source location identifier recorded in t_ident (may be NULL)
static void __kmp_reinitialize_team(kmp_team_t *team,
                                    kmp_internal_control_t *new_icvs,
                                    ident_t *loc) {
  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
  KMP_DEBUG_ASSERT(team && new_icvs);
  // Once parallel init has completed, a zero nproc ICV would be invalid.
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
  KMP_CHECK_UPDATE(team->t.t_ident, loc);

  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
  // Copy ICVs to the master thread's implicit taskdata
  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);

  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
}
4378
4379 /* Initialize the team data structure.
4380 This assumes the t_threads and t_max_nproc are already set.
4381 Also, we don't touch the arguments */
// Initialize (or re-initialize) the team data structure for execution with
// new_nproc threads. Assumes t_threads and t_max_nproc are already set up;
// deliberately does not touch t_parent, t_threads contents, or t_argv so a
// hot team can pass through here without losing its thread pointers.
//   team      - team to initialize (must satisfy new_nproc <= t_max_nproc)
//   new_nproc - number of threads the team will run with
//   new_icvs  - ICVs forwarded to __kmp_reinitialize_team
//   loc       - source location forwarded to __kmp_reinitialize_team
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc) {
  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));

  /* verify */
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_MB();

  team->t.t_master_tid = 0; /* not needed */
  /* team->t.t_master_bar;        not needed */
  // A one-thread team executes serialized; only multi-thread teams fork.
  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
  team->t.t_nproc = new_nproc;

  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
  team->t.t_next_pool = NULL;
  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
   * up hot team */

  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
  team->t.t_invoke = NULL; /* not needed */

  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
  team->t.t_sched.sched = new_icvs->sched.sched;

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  team->t.t_fp_control_saved = FALSE; /* not needed */
  team->t.t_x87_fpu_control_word = 0; /* not needed */
  team->t.t_mxcsr = 0; /* not needed */
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  team->t.t_construct = 0;

  team->t.t_ordered.dt.t_value = 0;
  team->t.t_master_active = FALSE;

#ifdef KMP_DEBUG
  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
#endif
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
#endif

  team->t.t_control_stack_top = NULL;

  // Identity/ICV refresh is factored out so the hot-team path can call it
  // without redoing the full initialization above.
  __kmp_reinitialize_team(team, new_icvs, loc);

  KMP_MB();
  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}
4434
4435 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4436 /* Sets full mask for thread and returns old mask, no changes to structures. */
4437 static void
__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t * old_mask)4438 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4439 if (KMP_AFFINITY_CAPABLE()) {
4440 int status;
4441 if (old_mask != NULL) {
4442 status = __kmp_get_system_affinity(old_mask, TRUE);
4443 int error = errno;
4444 if (status != 0) {
4445 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4446 __kmp_msg_null);
4447 }
4448 }
4449 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4450 }
4451 }
4452 #endif
4453
4454 #if KMP_AFFINITY_SUPPORTED
4455
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master threads' partitions based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The master thread's partition should already include its current binding.
// Distribute the team's threads over the master's place partition according
// to the team's proc_bind policy. When update_master_only is nonzero, only
// the master thread's fields are refreshed (hot-team reuse with spread).
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default. It
    // doesn't matter, as we don't rebind master thread for any proc_bind policy
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_master: {
    // Every worker inherits the master's partition and is bound to the
    // master's own place. Worker loop starts at 1: master is not rebound.
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      th->th.th_first_place = first_place;
      th->th.th_last_place = last_place;
      th->th.th_new_place = masters_place;
      // Flag the team for affinity display if any thread's place changed.
      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
          team->t.t_display_affinity != 1) {
        team->t.t_display_affinity = 1;
      }

      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    // Partition may wrap around the end of the place list; count accordingly.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // One thread per place: walk consecutive places starting at the
      // master's, wrapping within the partition.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        // Advance to the next place, wrapping at the partition boundary and
        // at the end of the global place list.
        if (place == last_place) {
          place = first_place;
        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }
        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: pack S threads per place, and give one
      // extra thread to every gap-th place until the remainder rem is used.
      int S, rem, gap, s_count;
      S = n_th / n_places; // base number of threads per place
      s_count = 0; // threads assigned to the current place so far
      rem = n_th - (S * n_places); // places that must take one extra thread
      gap = rem > 0 ? n_places / rem : n_places; // spacing of "extra" places
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      // The walk must end back at the master's place.
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx;
    // Partition may wrap around the end of the place list; count accordingly.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      int place = -1;

      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
        // Partition is a strict subset of all places: carve it into n_th
        // sub-partitions of S (or S+1) consecutive places, one per thread.
        int S = n_places / n_th; // base places per sub-partition
        int s_count, rem, gap, gap_ct;

        place = masters_place;
        rem = n_places - n_th * S; // sub-partitions that get one extra place
        gap = rem ? n_th / rem : 1; // spacing of enlarged sub-partitions
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);

          // The thread's sub-partition starts at the current place; it is
          // bound to its first place.
          th->th.th_first_place = place;
          th->th.th_new_place = place;
          if (__kmp_display_affinity && place != th->th.th_current_place &&
              team->t.t_display_affinity != 1) {
            team->t.t_display_affinity = 1;
          }
          s_count = 1;
          // Walk forward S-1 places to reach the sub-partition's end.
          while (s_count < S) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            s_count++;
          }
          // Every gap-th thread absorbs one of the rem leftover places.
          if (rem && (gap_ct == gap)) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            rem--;
            gap_ct = 0;
          }
          th->th.th_last_place = place;
          gap_ct++;

          // Step to the first place of the next thread's sub-partition.
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }

          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, __kmp_affinity_num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        double current = static_cast<double>(masters_place);
        double spacing =
            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
        int first, last;
        kmp_info_t *th;

        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          // Each thread's sub-partition is [current, current+spacing), as
          // integer place indices, wrapped modulo the number of places.
          first = static_cast<int>(current);
          last = static_cast<int>(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          if (first >= n_places) {
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          // f == n_th is a rounding-overflow iteration; only real threads
          // (f < n_th) are updated.
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            th->th.th_first_place = first;
            th->th.th_new_place = place;
            th->th.th_last_place = last;
            if (__kmp_display_affinity && place != th->th.th_current_place &&
                team->t.t_display_affinity != 1) {
              team->t.t_display_affinity = 1;
            }
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: same S/rem/gap packing as proc_bind_close,
      // but each thread's partition is narrowed to its single place.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        // Pin the thread to exactly one place.
        th->th.th_first_place = place;
        th->th.th_last_place = place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}
4802
4803 #endif // KMP_AFFINITY_SUPPORTED
4804
4805 /* allocate a new team data structure to use. take one off of the free pool if
4806 available */
4807 kmp_team_t *
__kmp_allocate_team(kmp_root_t * root,int new_nproc,int max_nproc,ompt_data_t ompt_parallel_data,kmp_proc_bind_t new_proc_bind,kmp_internal_control_t * new_icvs,int argc USE_NESTED_HOT_ARG (kmp_info_t * master))4808 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4809 #if OMPT_SUPPORT
4810 ompt_data_t ompt_parallel_data,
4811 #endif
4812 kmp_proc_bind_t new_proc_bind,
4813 kmp_internal_control_t *new_icvs,
4814 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4815 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4816 int f;
4817 kmp_team_t *team;
4818 int use_hot_team = !root->r.r_active;
4819 int level = 0;
4820
4821 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4822 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4823 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4824 KMP_MB();
4825
4826 #if KMP_NESTED_HOT_TEAMS
4827 kmp_hot_team_ptr_t *hot_teams;
4828 if (master) {
4829 team = master->th.th_team;
4830 level = team->t.t_active_level;
4831 if (master->th.th_teams_microtask) { // in teams construct?
4832 if (master->th.th_teams_size.nteams > 1 &&
4833 ( // #teams > 1
4834 team->t.t_pkfn ==
4835 (microtask_t)__kmp_teams_master || // inner fork of the teams
4836 master->th.th_teams_level <
4837 team->t.t_level)) { // or nested parallel inside the teams
4838 ++level; // not increment if #teams==1, or for outer fork of the teams;
4839 // increment otherwise
4840 }
4841 }
4842 hot_teams = master->th.th_hot_teams;
4843 if (level < __kmp_hot_teams_max_level && hot_teams &&
4844 hot_teams[level]
4845 .hot_team) { // hot team has already been allocated for given level
4846 use_hot_team = 1;
4847 } else {
4848 use_hot_team = 0;
4849 }
4850 }
4851 #endif
4852 // Optimization to use a "hot" team
4853 if (use_hot_team && new_nproc > 1) {
4854 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4855 #if KMP_NESTED_HOT_TEAMS
4856 team = hot_teams[level].hot_team;
4857 #else
4858 team = root->r.r_hot_team;
4859 #endif
4860 #if KMP_DEBUG
4861 if (__kmp_tasking_mode != tskm_immediate_exec) {
4862 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4863 "task_team[1] = %p before reinit\n",
4864 team->t.t_task_team[0], team->t.t_task_team[1]));
4865 }
4866 #endif
4867
4868 // Has the number of threads changed?
4869 /* Let's assume the most common case is that the number of threads is
4870 unchanged, and put that case first. */
4871 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4872 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4873 // This case can mean that omp_set_num_threads() was called and the hot
4874 // team size was already reduced, so we check the special flag
4875 if (team->t.t_size_changed == -1) {
4876 team->t.t_size_changed = 1;
4877 } else {
4878 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4879 }
4880
4881 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4882 kmp_r_sched_t new_sched = new_icvs->sched;
4883 // set master's schedule as new run-time schedule
4884 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4885
4886 __kmp_reinitialize_team(team, new_icvs,
4887 root->r.r_uber_thread->th.th_ident);
4888
4889 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4890 team->t.t_threads[0], team));
4891 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4892
4893 #if KMP_AFFINITY_SUPPORTED
4894 if ((team->t.t_size_changed == 0) &&
4895 (team->t.t_proc_bind == new_proc_bind)) {
4896 if (new_proc_bind == proc_bind_spread) {
4897 __kmp_partition_places(
4898 team, 1); // add flag to update only master for spread
4899 }
4900 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4901 "proc_bind = %d, partition = [%d,%d]\n",
4902 team->t.t_id, new_proc_bind, team->t.t_first_place,
4903 team->t.t_last_place));
4904 } else {
4905 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4906 __kmp_partition_places(team);
4907 }
4908 #else
4909 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4910 #endif /* KMP_AFFINITY_SUPPORTED */
4911 } else if (team->t.t_nproc > new_nproc) {
4912 KA_TRACE(20,
4913 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4914 new_nproc));
4915
4916 team->t.t_size_changed = 1;
4917 #if KMP_NESTED_HOT_TEAMS
4918 if (__kmp_hot_teams_mode == 0) {
4919 // AC: saved number of threads should correspond to team's value in this
4920 // mode, can be bigger in mode 1, when hot team has threads in reserve
4921 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4922 hot_teams[level].hot_team_nth = new_nproc;
4923 #endif // KMP_NESTED_HOT_TEAMS
4924 /* release the extra threads we don't need any more */
4925 for (f = new_nproc; f < team->t.t_nproc; f++) {
4926 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4927 if (__kmp_tasking_mode != tskm_immediate_exec) {
4928 // When decreasing team size, threads no longer in the team should
4929 // unref task team.
4930 team->t.t_threads[f]->th.th_task_team = NULL;
4931 }
4932 __kmp_free_thread(team->t.t_threads[f]);
4933 team->t.t_threads[f] = NULL;
4934 }
4935 #if KMP_NESTED_HOT_TEAMS
4936 } // (__kmp_hot_teams_mode == 0)
4937 else {
4938 // When keeping extra threads in team, switch threads to wait on own
4939 // b_go flag
4940 for (f = new_nproc; f < team->t.t_nproc; ++f) {
4941 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4942 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4943 for (int b = 0; b < bs_last_barrier; ++b) {
4944 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4945 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4946 }
4947 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4948 }
4949 }
4950 }
4951 #endif // KMP_NESTED_HOT_TEAMS
4952 team->t.t_nproc = new_nproc;
4953 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4954 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4955 __kmp_reinitialize_team(team, new_icvs,
4956 root->r.r_uber_thread->th.th_ident);
4957
4958 // Update remaining threads
4959 for (f = 0; f < new_nproc; ++f) {
4960 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4961 }
4962
4963 // restore the current task state of the master thread: should be the
4964 // implicit task
4965 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4966 team->t.t_threads[0], team));
4967
4968 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4969
4970 #ifdef KMP_DEBUG
4971 for (f = 0; f < team->t.t_nproc; f++) {
4972 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4973 team->t.t_threads[f]->th.th_team_nproc ==
4974 team->t.t_nproc);
4975 }
4976 #endif
4977
4978 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4979 #if KMP_AFFINITY_SUPPORTED
4980 __kmp_partition_places(team);
4981 #endif
4982 } else { // team->t.t_nproc < new_nproc
4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984 kmp_affin_mask_t *old_mask;
4985 if (KMP_AFFINITY_CAPABLE()) {
4986 KMP_CPU_ALLOC(old_mask);
4987 }
4988 #endif
4989
4990 KA_TRACE(20,
4991 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4992 new_nproc));
4993
4994 team->t.t_size_changed = 1;
4995
4996 #if KMP_NESTED_HOT_TEAMS
4997 int avail_threads = hot_teams[level].hot_team_nth;
4998 if (new_nproc < avail_threads)
4999 avail_threads = new_nproc;
5000 kmp_info_t **other_threads = team->t.t_threads;
5001 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002 // Adjust barrier data of reserved threads (if any) of the team
5003 // Other data will be set in __kmp_initialize_info() below.
5004 int b;
5005 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006 for (b = 0; b < bs_last_barrier; ++b) {
5007 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009 #if USE_DEBUGGER
5010 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5011 #endif
5012 }
5013 }
5014 if (hot_teams[level].hot_team_nth >= new_nproc) {
5015 // we have all needed threads in reserve, no need to allocate any
5016 // this only possible in mode 1, cannot have reserved threads in mode 0
5017 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018 team->t.t_nproc = new_nproc; // just get reserved threads involved
5019 } else {
5020 // we may have some threads in reserve, but not enough
5021 team->t.t_nproc =
5022 hot_teams[level]
5023 .hot_team_nth; // get reserved threads involved if any
5024 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025 #endif // KMP_NESTED_HOT_TEAMS
5026 if (team->t.t_max_nproc < new_nproc) {
5027 /* reallocate larger arrays */
5028 __kmp_reallocate_team_arrays(team, new_nproc);
5029 __kmp_reinitialize_team(team, new_icvs, NULL);
5030 }
5031
5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5033 /* Temporarily set full mask for master thread before creation of
5034 workers. The reason is that workers inherit the affinity from master,
5035 so if a lot of workers are created on the single core quickly, they
5036 don't get a chance to set their own affinity for a long time. */
5037 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5038 #endif
5039
5040 /* allocate new threads for the hot team */
5041 for (f = team->t.t_nproc; f < new_nproc; f++) {
5042 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043 KMP_DEBUG_ASSERT(new_worker);
5044 team->t.t_threads[f] = new_worker;
5045
5046 KA_TRACE(20,
5047 ("__kmp_allocate_team: team %d init T#%d arrived: "
5048 "join=%llu, plain=%llu\n",
5049 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5050 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5051 team->t.t_bar[bs_plain_barrier].b_arrived));
5052
5053 { // Initialize barrier data for new threads.
5054 int b;
5055 kmp_balign_t *balign = new_worker->th.th_bar;
5056 for (b = 0; b < bs_last_barrier; ++b) {
5057 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059 KMP_BARRIER_PARENT_FLAG);
5060 #if USE_DEBUGGER
5061 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5062 #endif
5063 }
5064 }
5065 }
5066
5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068 if (KMP_AFFINITY_CAPABLE()) {
5069 /* Restore initial master thread's affinity mask */
5070 __kmp_set_system_affinity(old_mask, TRUE);
5071 KMP_CPU_FREE(old_mask);
5072 }
5073 #endif
5074 #if KMP_NESTED_HOT_TEAMS
5075 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076 #endif // KMP_NESTED_HOT_TEAMS
5077 /* make sure everyone is syncronized */
5078 int old_nproc = team->t.t_nproc; // save old value and use to update only
5079 // new threads below
5080 __kmp_initialize_team(team, new_nproc, new_icvs,
5081 root->r.r_uber_thread->th.th_ident);
5082
5083 /* reinitialize the threads */
5084 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085 for (f = 0; f < team->t.t_nproc; ++f)
5086 __kmp_initialize_info(team->t.t_threads[f], team, f,
5087 __kmp_gtid_from_tid(f, team));
5088
5089 if (level) { // set th_task_state for new threads in nested hot team
5090 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5091 // only need to set the th_task_state for the new threads. th_task_state
5092 // for master thread will not be accurate until after this in
5093 // __kmp_fork_call(), so we look to the master's memo_stack to get the
5094 // correct value.
5095 for (f = old_nproc; f < team->t.t_nproc; ++f)
5096 team->t.t_threads[f]->th.th_task_state =
5097 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5098 } else { // set th_task_state for new threads in non-nested hot team
5099 int old_state =
5100 team->t.t_threads[0]->th.th_task_state; // copy master's state
5101 for (f = old_nproc; f < team->t.t_nproc; ++f)
5102 team->t.t_threads[f]->th.th_task_state = old_state;
5103 }
5104
5105 #ifdef KMP_DEBUG
5106 for (f = 0; f < team->t.t_nproc; ++f) {
5107 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5108 team->t.t_threads[f]->th.th_team_nproc ==
5109 team->t.t_nproc);
5110 }
5111 #endif
5112
5113 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114 #if KMP_AFFINITY_SUPPORTED
5115 __kmp_partition_places(team);
5116 #endif
5117 } // Check changes in number of threads
5118
5119 kmp_info_t *master = team->t.t_threads[0];
5120 if (master->th.th_teams_microtask) {
5121 for (f = 1; f < new_nproc; ++f) {
5122 // propagate teams construct specific info to workers
5123 kmp_info_t *thr = team->t.t_threads[f];
5124 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5125 thr->th.th_teams_level = master->th.th_teams_level;
5126 thr->th.th_teams_size = master->th.th_teams_size;
5127 }
5128 }
5129 #if KMP_NESTED_HOT_TEAMS
5130 if (level) {
5131 // Sync barrier state for nested hot teams, not needed for outermost hot
5132 // team.
5133 for (f = 1; f < new_nproc; ++f) {
5134 kmp_info_t *thr = team->t.t_threads[f];
5135 int b;
5136 kmp_balign_t *balign = thr->th.th_bar;
5137 for (b = 0; b < bs_last_barrier; ++b) {
5138 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5139 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5140 #if USE_DEBUGGER
5141 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5142 #endif
5143 }
5144 }
5145 }
5146 #endif // KMP_NESTED_HOT_TEAMS
5147
5148 /* reallocate space for arguments if necessary */
5149 __kmp_alloc_argv_entries(argc, team, TRUE);
5150 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5151 // The hot team re-uses the previous task team,
5152 // if untouched during the previous release->gather phase.
5153
5154 KF_TRACE(10, (" hot_team = %p\n", team));
5155
5156 #if KMP_DEBUG
5157 if (__kmp_tasking_mode != tskm_immediate_exec) {
5158 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5159 "task_team[1] = %p after reinit\n",
5160 team->t.t_task_team[0], team->t.t_task_team[1]));
5161 }
5162 #endif
5163
5164 #if OMPT_SUPPORT
5165 __ompt_team_assign_id(team, ompt_parallel_data);
5166 #endif
5167
5168 KMP_MB();
5169
5170 return team;
5171 }
5172
5173 /* next, let's try to take one from the team pool */
5174 KMP_MB();
5175 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5176 /* TODO: consider resizing undersized teams instead of reaping them, now
5177 that we have a resizing mechanism */
5178 if (team->t.t_max_nproc >= max_nproc) {
5179 /* take this team from the team pool */
5180 __kmp_team_pool = team->t.t_next_pool;
5181
5182 /* setup the team for fresh use */
5183 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5184
5185 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5186 "task_team[1] %p to NULL\n",
5187 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5188 team->t.t_task_team[0] = NULL;
5189 team->t.t_task_team[1] = NULL;
5190
5191 /* reallocate space for arguments if necessary */
5192 __kmp_alloc_argv_entries(argc, team, TRUE);
5193 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5194
5195 KA_TRACE(
5196 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5197 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5198 { // Initialize barrier data.
5199 int b;
5200 for (b = 0; b < bs_last_barrier; ++b) {
5201 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5202 #if USE_DEBUGGER
5203 team->t.t_bar[b].b_master_arrived = 0;
5204 team->t.t_bar[b].b_team_arrived = 0;
5205 #endif
5206 }
5207 }
5208
5209 team->t.t_proc_bind = new_proc_bind;
5210
5211 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5212 team->t.t_id));
5213
5214 #if OMPT_SUPPORT
5215 __ompt_team_assign_id(team, ompt_parallel_data);
5216 #endif
5217
5218 KMP_MB();
5219
5220 return team;
5221 }
5222
5223 /* reap team if it is too small, then loop back and check the next one */
5224 // not sure if this is wise, but, will be redone during the hot-teams
5225 // rewrite.
5226 /* TODO: Use technique to find the right size hot-team, don't reap them */
5227 team = __kmp_reap_team(team);
5228 __kmp_team_pool = team;
5229 }
5230
5231 /* nothing available in the pool, no matter, make a new team! */
5232 KMP_MB();
5233 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5234
5235 /* and set it up */
5236 team->t.t_max_nproc = max_nproc;
5237 /* NOTE well, for some reason allocating one big buffer and dividing it up
5238 seems to really hurt performance a lot on the P4, so, let's not use this */
5239 __kmp_allocate_team_arrays(team, max_nproc);
5240
5241 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5242 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5243
5244 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5245 "%p to NULL\n",
5246 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5247 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5248 // memory, no need to duplicate
5249 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5250 // memory, no need to duplicate
5251
5252 if (__kmp_storage_map) {
5253 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5254 }
5255
5256 /* allocate space for arguments */
5257 __kmp_alloc_argv_entries(argc, team, FALSE);
5258 team->t.t_argc = argc;
5259
5260 KA_TRACE(20,
5261 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5262 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5263 { // Initialize barrier data.
5264 int b;
5265 for (b = 0; b < bs_last_barrier; ++b) {
5266 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5267 #if USE_DEBUGGER
5268 team->t.t_bar[b].b_master_arrived = 0;
5269 team->t.t_bar[b].b_team_arrived = 0;
5270 #endif
5271 }
5272 }
5273
5274 team->t.t_proc_bind = new_proc_bind;
5275
5276 #if OMPT_SUPPORT
5277 __ompt_team_assign_id(team, ompt_parallel_data);
5278 team->t.ompt_serialized_team_info = NULL;
5279 #endif
5280
5281 KMP_MB();
5282
5283 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5284 team->t.t_id));
5285
5286 return team;
5287 }
5288
5289 /* TODO implement hot-teams at all levels */
5290 /* TODO implement lazy thread release on demand (disband request) */
5291
5292 /* free the team. return it to the team pool. release all the threads
5293 * associated with it */
// Release a team after a parallel region completes. Non-hot teams have their
// workers returned to the thread pool and the team itself pushed onto
// __kmp_team_pool; hot teams are kept alive for reuse and only have their
// contention-group bookkeeping cleaned up. `master` (when nested hot teams
// are compiled in) identifies the masters thread used to locate the hot-team
// table; caller is expected to hold the fork/join lock (convention elsewhere
// in this file — TODO confirm for all call sites).
void __kmp_free_team(kmp_root_t *root,
                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  int f;
  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
                team->t.t_id));

  /* verify state */
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);

  // The outermost hot team is identified by the root; nested hot teams are
  // looked up below via the master's hot-team table.
  int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
  int level;
  kmp_hot_team_ptr_t *hot_teams;
  if (master) {
    // Compute this team's nesting level, compensating for the two cases in
    // which the teams construct does not bump t_active_level.
    level = team->t.t_active_level - 1;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    hot_teams = master->th.th_hot_teams;
    if (level < __kmp_hot_teams_max_level) {
      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
      use_hot_team = 1;
    }
  }
#endif // KMP_NESTED_HOT_TEAMS

  /* team is done working */
  TCW_SYNC_PTR(team->t.t_pkfn,
               NULL); // Important for Debugging Support Library.
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; // init counter for possible reuse
#endif
  // Do not reset pointer to parent team to NULL for hot teams.

  /* if we are non-hot team, release our threads */
  if (!use_hot_team) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Wait for threads to reach reapable state: a worker may still be
      // executing tasks from this team, so spin (resuming sleepers) until
      // each one publishes KMP_SAFE_TO_REAP.
      for (f = 1; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
        kmp_info_t *th = team->t.t_threads[f];
        volatile kmp_uint32 *state = &th->th.th_reap_state;
        while (*state != KMP_SAFE_TO_REAP) {
#if KMP_OS_WINDOWS
          // On Windows a thread can be killed at any time, check this
          DWORD ecode;
          if (!__kmp_is_thread_alive(th, &ecode)) {
            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
            break;
          }
#endif
          // first check if thread is sleeping; a sleeping thread cannot
          // advance to the reapable state on its own, so wake it.
          kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
          if (fl.is_sleeping())
            fl.resume(__kmp_gtid_from_thread(th));
          KMP_CPU_PAUSE();
        }
      }

      // Delete task teams (both parity slots): unhook every thread's
      // reference first so no one touches a freed task team.
      int tt_idx;
      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
        if (task_team != NULL) {
          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          KA_TRACE(
              20,
              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
               __kmp_get_gtid(), task_team, team->t.t_id));
#if KMP_NESTED_HOT_TEAMS
          __kmp_free_task_team(master, task_team);
#endif
          team->t.t_task_team[tt_idx] = NULL;
        }
      }
    }

    // Reset pointer to parent team only for non-hot teams.
    team->t.t_parent = NULL;
    team->t.t_level = 0;
    team->t.t_active_level = 0;

    /* free the worker threads (thread 0 is the master; it is not pooled) */
    for (f = 1; f < team->t.t_nproc; ++f) {
      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
      __kmp_free_thread(team->t.t_threads[f]);
      team->t.t_threads[f] = NULL;
    }

    /* put the team back in the team pool */
    /* TODO limit size of team pool, call reap_team if pool too large */
    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
    __kmp_team_pool = (volatile kmp_team_t *)team;
  } else { // Check if team was created for the masters in a teams construct
    // See if first worker is a CG root
    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
                     team->t.t_threads[1]->th.th_cg_roots);
    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
      // Clean up the CG root nodes on workers so that this team can be re-used
      for (f = 1; f < team->t.t_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
                         thr->th.th_cg_roots->cg_root == thr);
        // Pop current CG root off list
        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
        thr->th.th_cg_roots = tmp->up;
        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
                       " up to node %p. cg_nthreads was %d\n",
                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
        // Post-decrement: i holds the pre-decrement count, so i == 1 means
        // this thread was the last member of the contention group.
        int i = tmp->cg_nthreads--;
        if (i == 1) {
          __kmp_free(tmp); // free CG if we are the last thread in it
        }
        // Restore current task's thread_limit from CG root
        if (thr->th.th_cg_roots)
          thr->th.th_current_task->td_icvs.thread_limit =
              thr->th.th_cg_roots->cg_thread_limit;
      }
    }
  }

  KMP_MB();
}
5431
5432 /* reap the team. destroy it, reclaim all its resources and free its memory */
__kmp_reap_team(kmp_team_t * team)5433 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5434 kmp_team_t *next_pool = team->t.t_next_pool;
5435
5436 KMP_DEBUG_ASSERT(team);
5437 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5438 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5439 KMP_DEBUG_ASSERT(team->t.t_threads);
5440 KMP_DEBUG_ASSERT(team->t.t_argv);
5441
5442 /* TODO clean the threads that are a part of this? */
5443
5444 /* free stuff */
5445 __kmp_free_team_arrays(team);
5446 if (team->t.t_argv != &team->t.t_inline_argv[0])
5447 __kmp_free((void *)team->t.t_argv);
5448 __kmp_free(team);
5449
5450 KMP_MB();
5451 return next_pool;
5452 }
5453
5454 // Free the thread. Don't reap it, just place it on the pool of available
5455 // threads.
5456 //
5457 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5458 // binding for the affinity mechanism to be useful.
5459 //
5460 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5461 // However, we want to avoid a potential performance problem by always
5462 // scanning through the list to find the correct point at which to insert
5463 // the thread (potential N**2 behavior). To do this we keep track of the
5464 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5465 // With single-level parallelism, threads will always be added to the tail
5466 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5467 // parallelism, all bets are off and we may need to scan through the entire
5468 // free list.
5469 //
5470 // This change also has a potentially large performance benefit, for some
5471 // applications. Previously, as threads were freed from the hot team, they
5472 // would be placed back on the free list in inverse order. If the hot team
// grew back to its original size, then the freed thread would be placed
5474 // back on the hot team in reverse order. This could cause bad cache
5475 // locality problems on programs where the size of the hot team regularly
5476 // grew and shrunk.
5477 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
// Return a worker thread to the free pool (__kmp_thread_pool), keeping the
// pool sorted by gtid (see the rationale comment above). Does NOT reap the
// OS thread. Also detaches the thread from its team/root, pops its
// contention-group node, and releases its implicit task.
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  // Mark reapable so shutdown / team teardown need not wait on this thread.
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  // Walk up the contention-group chain, dropping this thread's membership.
  // A thread that is itself a cg_root frees its node and continues to the
  // parent; a plain worker detaches after at most one iteration (break).
  while (this_th->th.th_cg_roots) {
    this_th->th.th_cg_roots->cg_nthreads--;
    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
                   " %p of thread %p to %d\n",
                   this_th, this_th->th.th_cg_roots,
                   this_th->th.th_cg_roots->cg_root,
                   this_th->th.th_cg_roots->cg_nthreads));
    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
    if (tmp->cg_root == this_th) { // Thread is a cg_root
      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
      KA_TRACE(
          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
      this_th->th.th_cg_roots = tmp->up;
      __kmp_free(tmp);
    } else { // Worker thread
      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
        __kmp_free(tmp);
      }
      this_th->th.th_cg_roots = NULL;
      break;
    }
  }

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when hot team is disabled but can occurs even when
   * the hot team is enabled */
  __kmp_free_implicit_task(this_th);
  this_th->th.th_current_task = NULL;

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  // Sorted-order invariant: this thread's gtid precedes its successor's.
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  // Track actively-spinning pooled threads; the flag and global counter are
  // updated under the suspend mutex to keep them consistent.
  __kmp_suspend_initialize_thread(this_th);
  __kmp_lock_suspend_mx(this_th);
  if (this_th->th.th_active == TRUE) {
    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
    this_th->th.th_active_in_pool = TRUE;
  }
#if KMP_DEBUG
  else {
    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
  }
#endif
  __kmp_unlock_suspend_mx(this_th);

  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}
5597
5598 /* ------------------------------------------------------------------------ */
5599
// Main loop of an OpenMP worker thread: wait at the fork barrier for a team
// assignment, invoke the team's microtask, join, and repeat until global
// shutdown (__kmp_global.g.g_done). Also drives the OMPT thread-begin/end
// callbacks and state transitions when a tool is attached.
void *__kmp_launch_thread(kmp_info_t *this_thr) {
  int gtid = this_thr->th.th_info.ds.ds_gtid;
  /* void *stack_data;*/
  // Pointer to the thread's th_team slot, read with volatile semantics so
  // each loop iteration observes the team assigned by the master.
  kmp_team_t *(*volatile pteam);

  KMP_MB();
  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));

  if (__kmp_env_consistency_check) {
    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
  }

#if OMPT_SUPPORT
  // NOTE(review): thread_data is only initialized when ompt_enabled.enabled
  // is set here; the thread-end callback below assumes the tool did not
  // enable callbacks after this point — presumably guaranteed by OMPT init
  // ordering, TODO confirm.
  ompt_data_t *thread_data;
  if (ompt_enabled.enabled) {
    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
    *thread_data = ompt_data_none;

    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    this_thr->th.ompt_thread_info.wait_id = 0;
    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_worker, thread_data);
    }
  }
#endif

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    this_thr->th.ompt_thread_info.state = ompt_state_idle;
  }
#endif
  /* This is the place where threads wait for work */
  while (!TCR_4(__kmp_global.g.g_done)) {
    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
    KMP_MB();

    /* wait for work to do */
    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));

    /* No tid yet since not part of a team */
    __kmp_fork_barrier(gtid, KMP_GTID_DNE);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    pteam = (kmp_team_t * (*))(&this_thr->th.th_team);

    /* have we been allocated? */
    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
      /* we were just woken up, so run our new task */
      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
        int rc;
        KA_TRACE(20,
                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                  (*pteam)->t.t_pkfn));

        // Sync FP control state with the master before entering user code.
        updateHWFPControl(*pteam);

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
        }
#endif

        rc = (*pteam)->t.t_invoke(gtid);
        KMP_ASSERT(rc);

        KMP_MB();
        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                      (*pteam)->t.t_pkfn));
      }
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        /* no frame set while outside task */
        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;

        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
      /* join barrier after parallel region */
      __kmp_join_barrier(gtid);
    }
  }
  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);

#if OMPT_SUPPORT
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
  }
#endif

  this_thr->th.th_task_team = NULL;
  /* run the destructors for the threadprivate data for this thread */
  __kmp_common_destroy_gtid(gtid);

  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
  KMP_MB();
  return this_thr;
}
5706
5707 /* ------------------------------------------------------------------------ */
5708
__kmp_internal_end_dest(void * specific_gtid)5709 void __kmp_internal_end_dest(void *specific_gtid) {
5710 #if KMP_COMPILER_ICC
5711 #pragma warning(push)
5712 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5713 // significant bits
5714 #endif
5715 // Make sure no significant bits are lost
5716 int gtid = (kmp_intptr_t)specific_gtid - 1;
5717 #if KMP_COMPILER_ICC
5718 #pragma warning(pop)
5719 #endif
5720
5721 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5722 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5723 * this is because 0 is reserved for the nothing-stored case */
5724
5725 /* josh: One reason for setting the gtid specific data even when it is being
5726 destroyed by pthread is to allow gtid lookup through thread specific data
5727 (__kmp_gtid_get_specific). Some of the code, especially stat code,
5728 that gets executed in the call to __kmp_internal_end_thread, actually
5729 gets the gtid through the thread specific data. Setting it here seems
5730 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5731 to run smoothly.
5732 todo: get rid of this after we remove the dependence on
5733 __kmp_gtid_get_specific */
5734 if (gtid >= 0 && KMP_UBER_GTID(gtid))
5735 __kmp_gtid_set_specific(gtid);
5736 #ifdef KMP_TDATA_GTID
5737 __kmp_gtid = gtid;
5738 #endif
5739 __kmp_internal_end_thread(gtid);
5740 }
5741
5742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5743
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in real libomp.so I have no evidence it
// is ever called. However, the -fini linker option in makefile.mk works fine.
5747
__kmp_internal_end_dtor(void)5748 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5749 __kmp_internal_end_atexit();
5750 }
5751
__kmp_internal_end_fini(void)5752 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5753
5754 #endif
5755
5756 /* [Windows] josh: when the atexit handler is called, there may still be more
5757 than one thread alive */
__kmp_internal_end_atexit(void)5758 void __kmp_internal_end_atexit(void) {
5759 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5760 /* [Windows]
5761 josh: ideally, we want to completely shutdown the library in this atexit
5762 handler, but stat code that depends on thread specific data for gtid fails
5763 because that data becomes unavailable at some point during the shutdown, so
5764 we call __kmp_internal_end_thread instead. We should eventually remove the
5765 dependency on __kmp_get_specific_gtid in the stat code and use
5766 __kmp_internal_end_library to cleanly shutdown the library.
5767
5768 // TODO: Can some of this comment about GVS be removed?
5769 I suspect that the offending stat code is executed when the calling thread
5770 tries to clean up a dead root thread's data structures, resulting in GVS
5771 code trying to close the GVS structures for that thread, but since the stat
5772 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5773 the calling thread is cleaning up itself instead of another thread, it get
5774 confused. This happens because allowing a thread to unregister and cleanup
5775 another thread is a recent modification for addressing an issue.
5776 Based on the current design (20050722), a thread may end up
5777 trying to unregister another thread only if thread death does not trigger
5778 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5779 thread specific data destructor function to detect thread death. For
5780 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5781 is nothing. Thus, the workaround is applicable only for Windows static
5782 stat library. */
5783 __kmp_internal_end_library(-1);
5784 #if KMP_OS_WINDOWS
5785 __kmp_close_console();
5786 #endif
5787 }
5788
// Fully destroy a thread: terminate the OS thread (for non-root threads),
// release every per-thread resource (implicit task, fast memory, consistency
// stack, affinity mask, serialized team, ...), clear its __kmp_threads slot,
// and free the kmp_info_t itself.
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    // With a finite blocktime the worker may be sleeping on its fork-barrier
    // b_go flag; release it so the OS thread can exit and be joined below.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      /* Need release fence here to prevent seg faults for tree forkjoin barrier
       * (GEH) */
      ANNOTATE_HAPPENS_BEFORE(thread);
      kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
      __kmp_release_64(&flag);
    }

    // Terminate OS thread.
    __kmp_reap_worker(thread);

    // The thread was killed asynchronously. If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset it's th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
    //
    // Currently, this can only happen as the library is unloaded,
    // so there are no harmful side effects.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
    }
  }

  __kmp_free_implicit_task(thread);

// Free the fast memory for tasking
#if USE_FAST_MEMORY
  __kmp_free_fast_memory(thread);
#endif /* USE_FAST_MEMORY */

  __kmp_suspend_uninitialize_thread(thread);

  // Remove the thread from the global thread table before freeing anything
  // else, so no other code can look it up while it is being destroyed.
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  --__kmp_all_nth;
  // __kmp_nth was decremented when thread is added to the pool.

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* free the memory being used */
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(thread->th.th_cons);
      thread->th.th_cons = NULL;
    }
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }

  if (thread->th.th_task_state_memo_stack != NULL) {
    __kmp_free(thread->th.th_task_state_memo_stack);
    thread->th.th_task_state_memo_stack = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(thread);
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_HIER_SCHED
  if (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  // Each thread owns a serialized team; reap it along with the thread.
  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread
5902
// Core library shutdown: unregister the library, set the global "done" flag,
// and — when no root is still active — reap all pooled threads and teams,
// task teams, and (if compiled in) the monitor thread, then run final
// cleanup. If any root is still active only the monitor is reaped; full
// cleanup is deferred.
static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies. */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
// dead roots
#endif

  // Scan for any still-active root; loop exits early (i < capacity) if found.
  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates. */
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
    // At least one root is still active: do not tear everything down yet.
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates. */

    // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
    // __kmp_monitor will appear to contain valid data, but it is only valid in
    // the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
/* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
        // there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, 0);
    }
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }

    __kmp_reap_task_teams();

#if KMP_OS_UNIX
    // Threads that are not reaped should not access any resources since they
    // are going to be deallocated soon, so the shutdown sequence should wait
    // until all threads either exit the final spin-waiting loop or begin
    // sleeping after the given blocktime.
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thr = __kmp_threads[i];
      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
        KMP_CPU_PAUSE();
    }
#endif

    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
  } /* else !__kmp_global.t_active */
  TCW_4(__kmp_init_gtid, FALSE);
  KMP_MB(); /* Flush all pending memory write invalidates. */

  __kmp_cleanup();
#if OMPT_SUPPORT
  ompt_fini();
#endif
}
6032
// Shut down the runtime when the whole library is going away (e.g. called
// from a library destructor or an atexit handler). gtid_req is the caller's
// global thread id, or a negative KMP_GTID_* sentinel when unknown; a
// negative value makes us look it up from thread-specific storage.
void __kmp_internal_end_library(int gtid_req) {
  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* we don't know who we are, but we may still shutdown the library */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread. gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // Root still has an active parallel region: flag abort+done and bail
        // out; full termination would tear state out from under the team.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* worker threads may call this function through the atexit handler, if they
       * call exit() */
      /* For now, skip the usual subsequent processing and just dump the debug buffer.
         TODO: do a thorough shutdown instead */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      return;
    }
  }
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  // Re-check the abort/done flags now that we hold the lock; another thread
  // may have completed (or aborted) termination while we were unlocked.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  /* now we can safely conduct the actual termination */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();

} // __kmp_internal_end_library
6138
// Shut down the runtime from the perspective of a single exiting thread.
// Unlike __kmp_internal_end_library: a worker thread merely detaches (clears
// its task team pointer) and returns; for a dynamic library full shutdown is
// deferred to the library destructor unless the runtime is hard-paused; and
// termination only proceeds when no registered uber (root) threads remain.
void __kmp_internal_end_thread(int gtid_req) {
  int i;

  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
      /* we don't know who we are */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread. gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // Root still inside an active parallel region: flag abort+done and
        // leave; we cannot tear the runtime down from here.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* just a worker thread, let's leave */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));

      if (gtid >= 0) {
        // Detach this worker from its task team; no further teardown here.
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }

      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if KMP_DYNAMIC_LIB
  if (__kmp_pause_status != kmp_hard_paused)
  // AC: lets not shutdown the dynamic library at the exit of uber thread,
  // because we will better shutdown later in the library destructor.
  {
    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
    return;
  }
#endif
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  // Re-check the abort/done flags now that we hold the lock.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*. */

  /* should we finish the run-time? are all siblings done? */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  // If any uber thread is still registered, termination must wait for it:
  // release both locks and return without tearing the runtime down.
  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i)) {
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
      return;
    }
  }

  /* now we can safely conduct the actual termination */

  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
} // __kmp_internal_end_thread
6264
// -----------------------------------------------------------------------------
// Library registration stuff.

// Random value used to indicate library initialization; its address and value
// are encoded into the registration string published in the environment.
static long __kmp_registration_flag = 0;
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
static char *__kmp_registration_str = NULL;
6272
// Build the per-process environment variable name used to register this
// library instance. Returns a freshly formatted string; callers release it
// with KMP_INTERNAL_FREE.
static inline char *__kmp_reg_status_name() {
  /* On RHEL 3u5 if linked statically, getpid() returns different values in
     each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
     env var can not be found, because the name will contain different pid. */
  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6280
// Register this copy of the runtime in env var __KMP_REGISTERED_LIB_<pid> and
// detect whether another copy is already active in this process. The stored
// value encodes "<flag address>-<flag value>-<library file>". If a live
// duplicate is found and KMP_DUPLICATE_LIB_OK is not true, a fatal error is
// issued; a stale record left by a dead copy is cleared and registration is
// retried.
void __kmp_register_library_startup(void) {

  char *name = __kmp_reg_status_name(); // Name of the environment variable.
  int done = 0;
  union {
    double dtime;
    long ltime;
  } time;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  __kmp_initialize_system_tick();
#endif
  __kmp_read_system_time(&time.dtime);
  // Mix the low bits of the current time into the flag so each library
  // instance publishes a distinguishable value.
  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
  __kmp_registration_str =
      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
                       __kmp_registration_flag, KMP_LIBRARY_FILE);

  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
                __kmp_registration_str));

  while (!done) {

    char *value = NULL; // Actual value of the environment variable.

    // Set environment variable, but do not overwrite if it already exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check the variable is written.
    value = __kmp_env_get(name);
    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {

      done = 1; // Ok, environment variable set successfully, exit the loop.

    } else {

      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
      char *tail = value;
      char *flag_addr_str = NULL;
      char *flag_val_str = NULL;
      char const *file_name = NULL;
      // Parse "<addr>-<val>-<file>" written by the other library copy.
      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
      __kmp_str_split(tail, '-', &flag_val_str, &tail);
      file_name = tail;
      if (tail != NULL) {
        long *flag_addr = 0;
        long flag_val = 0;
        KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
          // First, check whether environment-encoded address is mapped into
          // addr space.
          // If so, dereference it to see if it still has the right value.
          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
            neighbor = 1;
          } else {
            // If not, then we know the other copy of the library is no longer
            // running.
            neighbor = 2;
          }
        }
      }
      switch (neighbor) {
      case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of future version of the
        // library. Assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
        KMP_FALLTHROUGH();
      // Attention! Falling to the next case. That's intentional.
      case 1: { // Neighbor is alive.
        // Check it is allowed.
        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
        if (!__kmp_str_match_true(duplicate_ok)) {
          // That's not allowed. Issue fatal error.
          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
        }
        KMP_INTERNAL_FREE(duplicate_ok);
        __kmp_duplicate_library_ok = 1;
        done = 1; // Exit the loop.
      } break;
      case 2: { // Neighbor is dead.
        // Clear the variable and try to register library again.
        __kmp_env_unset(name);
      } break;
      default: { KMP_DEBUG_ASSERT(0); } break;
      }
    }
    KMP_INTERNAL_FREE((void *)value);
  }
  KMP_INTERNAL_FREE((void *)name);

} // func __kmp_register_library_startup
6375
__kmp_unregister_library(void)6376 void __kmp_unregister_library(void) {
6377
6378 char *name = __kmp_reg_status_name();
6379 char *value = __kmp_env_get(name);
6380
6381 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6382 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6383 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6384 // Ok, this is our variable. Delete it.
6385 __kmp_env_unset(name);
6386 }
6387
6388 KMP_INTERNAL_FREE(__kmp_registration_str);
6389 KMP_INTERNAL_FREE(value);
6390 KMP_INTERNAL_FREE(name);
6391
6392 __kmp_registration_flag = 0;
6393 __kmp_registration_str = NULL;
6394
6395 } // __kmp_unregister_library
6396
6397 // End of Library registration stuff.
6398 // -----------------------------------------------------------------------------
6399
6400 #if KMP_MIC_SUPPORTED
6401
__kmp_check_mic_type()6402 static void __kmp_check_mic_type() {
6403 kmp_cpuid_t cpuid_state = {0};
6404 kmp_cpuid_t *cs_p = &cpuid_state;
6405 __kmp_x86_cpuid(1, 0, cs_p);
6406 // We don't support mic1 at the moment
6407 if ((cs_p->eax & 0xff0) == 0xB10) {
6408 __kmp_mic_type = mic2;
6409 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6410 __kmp_mic_type = mic3;
6411 } else {
6412 __kmp_mic_type = non_mic;
6413 }
6414 }
6415
6416 #endif /* KMP_MIC_SUPPORTED */
6417
// One-time serial initialization of the runtime: verifies primitive type
// sizes, creates the global/atomic/bootstrap locks, reads the environment,
// allocates the __kmp_threads/__kmp_root arrays, and registers the calling
// thread as the initial (uber) root. Callers must already hold
// __kmp_initz_lock (see __kmp_serial_initialize; __kmp_parallel_initialize
// calls this directly for the same reason).
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  int size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // The runtime's fixed-width typedefs must match their assumed sizes.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif

  __kmp_validate_locks();

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable and check to see
     whether another copy of the library is already registered. */

  __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  // One atomic lock per operand size/type class used by kmp_atomic.
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
  // AC: do not use analytical here, because it is non-monotonous
  //__kmp_guided = kmp_sch_guided_iterative_chunked;
  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
  // need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat bp_hyper_bar
#define kmp_reduction_barrier_release_pat bp_hyper_bar
#endif // KMP_FAST_REDUCTION_BARRIER
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  __kmp_env_initialize(NULL);

// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  // __kmp_root shares the same allocation: it begins immediately after the
  // __kmp_threads_capacity kmp_info_t* slots.
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates. */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB
  {
    /* Invoke the exit handler when the program finishes, only for static
       library. For dynamic library, we already have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
6700
__kmp_serial_initialize(void)6701 void __kmp_serial_initialize(void) {
6702 if (__kmp_init_serial) {
6703 return;
6704 }
6705 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6706 if (__kmp_init_serial) {
6707 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708 return;
6709 }
6710 __kmp_do_serial_initialize();
6711 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6712 }
6713
// Middle-tier initialization: runs serial initialization if needed, sets up
// affinity (when supported), and computes the default team size
// (__kmp_dflt_team_nth) from the environment and machine topology,
// propagating it to already-registered roots. Callers must hold
// __kmp_initz_lock (see __kmp_middle_initialize and
// __kmp_parallel_initialize).
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
  __kmp_affinity_initialize();

  // Run through the __kmp_threads array and set the affinity mask
  // for each root thread that is currently registered with the RTL.
  for (i = 0; i < __kmp_threads_capacity; i++) {
    if (TCR_PTR(__kmp_threads[i]) != NULL) {
      __kmp_affinity_set_init_mask(i, TRUE);
    }
  }
#endif /* KMP_AFFINITY_SUPPORTED */

  KMP_ASSERT(__kmp_xproc > 0);
  if (__kmp_avail_proc == 0) {
    __kmp_avail_proc = __kmp_xproc;
  }

  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now
  j = 0;
  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
        __kmp_avail_proc;
    j++;
  }

  if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
    // Default #threads = #cores
    __kmp_dflt_team_nth = __kmp_ncores;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_ncores (%d)\n",
                  __kmp_dflt_team_nth));
#else
    // Default #threads = #available OS procs
    __kmp_dflt_team_nth = __kmp_avail_proc;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_avail_proc(%d)\n",
                  __kmp_dflt_team_nth));
#endif /* KMP_DFLT_NTH_CORES */
  }

  // Clamp the default team size into [KMP_MIN_NTH, __kmp_sys_max_nth].
  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
    __kmp_dflt_team_nth = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth = __kmp_sys_max_nth;
  }

  // There's no harm in continuing if the following check fails,
  // but it indicates an error in the previous logic.
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);

  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
    // Run through the __kmp_threads array and set the num threads icv for each
    // root thread that is currently registered with the RTL (which has not
    // already explicitly set its nthreads-var with a call to
    // omp_set_num_threads()).
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thread = __kmp_threads[i];
      if (thread == NULL)
        continue;
      if (thread->th.th_current_task->td_icvs.nproc != 0)
        continue;

      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
    }
  }
  KA_TRACE(
      20,
      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
       __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* we have finished middle initialization */
  TCW_SYNC_4(__kmp_init_middle, TRUE);

  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}
6818
__kmp_middle_initialize(void)6819 void __kmp_middle_initialize(void) {
6820 if (__kmp_init_middle) {
6821 return;
6822 }
6823 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6824 if (__kmp_init_middle) {
6825 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6826 return;
6827 }
6828 __kmp_do_middle_initialize();
6829 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6830 }
6831
// Final initialization step before the first parallel region executes:
// ensures middle initialization has run, resumes a hard-paused runtime,
// captures the FP control state that worker threads will copy (x86),
// installs signal handlers, and picks the default dynamic-adjustment mode.
// Protected by double-checked locking on __kmp_init_parallel.
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize would cause a deadlock. So we call
     __kmp_do_serial_initialize directly. */
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }
  __kmp_resume_if_hard_paused();

  /* begin initialization */
  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
  KMP_ASSERT(KMP_UBER_GTID(gtid));

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // Save the FP control regs.
  // Worker threads will set theirs to these values at thread startup.
  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
  __kmp_store_mxcsr(&__kmp_init_mxcsr);
  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
  /* must be after __kmp_serial_initialize */
  __kmp_install_signals(TRUE);
#endif
#endif

  __kmp_suspend_initialize();

#if defined(USE_LOAD_BALANCE)
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
  }
#else
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
  }
#endif

  if (__kmp_version) {
    __kmp_print_version_2();
  }

  /* we have finished parallel initialization */
  TCW_SYNC_4(__kmp_init_parallel, TRUE);

  KMP_MB();
  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
6903
6904 /* ------------------------------------------------------------------------ */
6905
// Prepare a thread to execute the microtask of a parallel region: reset its
// per-construct state and dispatch buffer indices, and push a
// consistency-check record when KMP_CHECKS is enabled.
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                   kmp_team_t *team) {
  kmp_disp_t *dispatch;

  KMP_MB();

  /* none of the threads have encountered any constructs, yet. */
  this_thr->th.th_local.this_construct = 0;
#if KMP_CACHE_MANAGE
  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
  KMP_DEBUG_ASSERT(dispatch);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
  // this_thr->th.th_info.ds.ds_tid ] );

  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
  if (__kmp_env_consistency_check)
    __kmp_push_parallel(gtid, team->t.t_ident);

  KMP_MB(); /* Flush all pending memory write invalidates. */
}
6930
__kmp_run_after_invoked_task(int gtid,int tid,kmp_info_t * this_thr,kmp_team_t * team)6931 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6932 kmp_team_t *team) {
6933 if (__kmp_env_consistency_check)
6934 __kmp_pop_parallel(gtid, team->t.t_ident);
6935
6936 __kmp_finish_implicit_task(this_thr);
6937 }
6938
// Invoke the microtask stored in the calling thread's team descriptor.
// Wraps the call with dispatch-buffer setup/teardown, ITT stack
// annotations, OMPT implicit-task callbacks, and stats timers.
// Returns the value produced by __kmp_invoke_microtask.
int __kmp_invoke_task_func(int gtid) {
  int rc;
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_enter(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about entering user's code
  }
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
  SSC_MARK_INVOKING();
#endif

#if OMPT_SUPPORT
  void *dummy;
  void **exit_runtime_p;
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  int ompt_team_size;

  // When OMPT is active, expose the implicit task's exit-frame pointer to the
  // microtask; otherwise point at a throwaway slot so the call site is uniform.
  if (ompt_enabled.enabled) {
    exit_runtime_p = &(
        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
  } else {
    exit_runtime_p = &dummy;
  }

  my_task_data =
      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_team_size = team->t.t_nproc;
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
        __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
  }
#endif

#if KMP_STATS_ENABLED
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
  }
  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
#endif

  // Run the user's outlined parallel-region body.
  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
#if OMPT_SUPPORT
                              ,
                              exit_runtime_p
#endif
                              );
#if OMPT_SUPPORT
  *exit_runtime_p = NULL; // exit frame is gone once the microtask returns
#endif

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_SET_THREAD_STATE(previous_state);
  }
  KMP_POP_PARTITIONED_TIMER();
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_leave(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about leaving user's code
  }
#endif /* USE_ITT_BUILD */
  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);

  return rc;
}
7022
// Entry point executed by every master thread of the league created for a
// teams construct. Makes this master a new contention-group (CG) root,
// forks the "wrapped" teams microtask as a nested parallel region, then
// joins it without a join barrier (workers remain parked in the fork
// barrier until the next parallel region).
void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in teams construct
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_team_t *team = thr->th.th_team;
  ident_t *loc = team->t.t_ident;
  // Request the team size that was recorded when the league was set up.
  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));

  // This thread is a new CG root. Set up the proper variables.
  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = thr; // Make thr the CG root
  // Init to thread limit that was stored when league masters were forked
  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
                 " cg_nthreads to 1\n",
                 thr, tmp));
  // Link the new CG root onto this thread's CG-root stack.
  tmp->up = thr->th.th_cg_roots;
  thr->th.th_cg_roots = tmp;

// Launch league of teams now, but not let workers execute
// (they hang on fork barrier until next parallel)
#if INCLUDE_SSC_MARKS
  SSC_MARK_FORKING();
#endif
  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
#if INCLUDE_SSC_MARKS
  SSC_MARK_JOINING();
#endif
  // If the team size was reduced from the limit, set it to the new size
  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates join barrier which won't work because
  // worker threads are in a fork barrier waiting for more parallel regions
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  ,
                  1);
}
7070
__kmp_invoke_teams_master(int gtid)7071 int __kmp_invoke_teams_master(int gtid) {
7072 kmp_info_t *this_thr = __kmp_threads[gtid];
7073 kmp_team_t *team = this_thr->th.th_team;
7074 #if KMP_DEBUG
7075 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7076 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7077 (void *)__kmp_teams_master);
7078 #endif
7079 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7080 __kmp_teams_master(gtid);
7081 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7082 return 1;
7083 }
7084
7085 /* this sets the requested number of threads for the next parallel region
7086 encountered by this team. since this should be enclosed in the forkjoin
7087 critical section it should avoid race conditions with assymmetrical nested
7088 parallelism */
7089
__kmp_push_num_threads(ident_t * id,int gtid,int num_threads)7090 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7091 kmp_info_t *thr = __kmp_threads[gtid];
7092
7093 if (num_threads > 0)
7094 thr->th.th_set_nproc = num_threads;
7095 }
7096
7097 /* this sets the requested number of teams for the teams region and/or
7098 the number of threads for the next parallel region encountered */
__kmp_push_num_teams(ident_t * id,int gtid,int num_teams,int num_threads)7099 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7100 int num_threads) {
7101 kmp_info_t *thr = __kmp_threads[gtid];
7102 KMP_DEBUG_ASSERT(num_teams >= 0);
7103 KMP_DEBUG_ASSERT(num_threads >= 0);
7104
7105 if (num_teams == 0)
7106 num_teams = 1; // default number of teams is 1.
7107 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7108 if (!__kmp_reserve_warn) {
7109 __kmp_reserve_warn = 1;
7110 __kmp_msg(kmp_ms_warning,
7111 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7112 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7113 }
7114 num_teams = __kmp_teams_max_nth;
7115 }
7116 // Set number of teams (number of threads in the outer "parallel" of the
7117 // teams)
7118 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7119
7120 // Remember the number of threads for inner parallel regions
7121 if (num_threads == 0) {
7122 if (!TCR_4(__kmp_init_middle))
7123 __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7124 num_threads = __kmp_avail_proc / num_teams;
7125 if (num_teams * num_threads > __kmp_teams_max_nth) {
7126 // adjust num_threads w/o warning as it is not user setting
7127 num_threads = __kmp_teams_max_nth / num_teams;
7128 }
7129 } else {
7130 // This thread will be the master of the league masters
7131 // Store new thread limit; old limit is saved in th_cg_roots list
7132 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7133
7134 if (num_teams * num_threads > __kmp_teams_max_nth) {
7135 int new_threads = __kmp_teams_max_nth / num_teams;
7136 if (!__kmp_reserve_warn) { // user asked for too many threads
7137 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7138 __kmp_msg(kmp_ms_warning,
7139 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7140 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7141 }
7142 num_threads = new_threads;
7143 }
7144 }
7145 thr->th.th_teams_size.nth = num_threads;
7146 }
7147
7148 // Set the proc_bind var to use in the following parallel region.
__kmp_push_proc_bind(ident_t * id,int gtid,kmp_proc_bind_t proc_bind)7149 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7150 kmp_info_t *thr = __kmp_threads[gtid];
7151 thr->th.th_set_proc_bind = proc_bind;
7152 }
7153
7154 /* Launch the worker threads into the microtask. */
7155
// Master-side fork: reset the team's shared construct/ordered counters and
// dispatch-buffer indices, then release the workers through the fork
// barrier so they begin executing the microtask. Must be called by the
// team's master thread.
void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

#ifdef KMP_DEBUG
  int f;
#endif /* KMP_DEBUG */

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  team->t.t_construct = 0; /* no single directives seen yet */
  team->t.t_ordered.dt.t_value =
      0; /* thread 0 enters the ordered section first */

  /* Reset the identifiers on the dispatch buffer */
  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
  if (team->t.t_max_nproc > 1) {
    int i;
    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
      team->t.t_disp_buffer[i].buffer_index = i;
      team->t.t_disp_buffer[i].doacross_buf_idx = i;
    }
  } else {
    // Single-thread team: only buffer 0 is in use.
    team->t.t_disp_buffer[0].buffer_index = 0;
    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);

#ifdef KMP_DEBUG
  // Sanity check: every team member must agree on the team size.
  for (f = 0; f < team->t.t_nproc; f++) {
    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
  }
#endif /* KMP_DEBUG */

  /* release the worker threads so they may begin working */
  __kmp_fork_barrier(gtid, 0);
}
7198
// Master-side join: wait in the join barrier for all team members to finish
// the microtask; afterwards issue the OMPT barrier/implicit-task end
// callbacks when this thread was blocked in the implicit barrier. Must be
// called by the team's master thread.
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates. */

/* Join barrier after fork */

#ifdef KMP_DEBUG
  // Print diagnostics for a team-size mismatch before the assert below
  // fires, so the failure is debuggable.
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
  // Report the end of the implicit barrier / implicit task to the tool if
  // this thread was recorded as waiting in the implicit barrier.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;

    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);
}
7259
7260 /* ------------------------------------------------------------------------ */
7261
7262 #ifdef USE_LOAD_BALANCE
7263
7264 // Return the worker threads actively spinning in the hot team, if we
7265 // are at the outermost level of parallelism. Otherwise, return 0.
__kmp_active_hot_team_nproc(kmp_root_t * root)7266 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7267 int i;
7268 int retval;
7269 kmp_team_t *hot_team;
7270
7271 if (root->r.r_active) {
7272 return 0;
7273 }
7274 hot_team = root->r.r_hot_team;
7275 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7276 return hot_team->t.t_nproc - 1; // Don't count master thread
7277 }
7278
7279 // Skip the master thread - it is accounted for elsewhere.
7280 retval = 0;
7281 for (i = 1; i < hot_team->t.t_nproc; i++) {
7282 if (hot_team->t.t_threads[i]->th.th_active) {
7283 retval++;
7284 }
7285 }
7286 return retval;
7287 }
7288
7289 // Perform an automatic adjustment to the number of
7290 // threads used by the next parallel region.
__kmp_load_balance_nproc(kmp_root_t * root,int set_nproc)7291 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7292 int retval;
7293 int pool_active;
7294 int hot_team_active;
7295 int team_curr_active;
7296 int system_active;
7297
7298 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7299 set_nproc));
7300 KMP_DEBUG_ASSERT(root);
7301 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7302 ->th.th_current_task->td_icvs.dynamic == TRUE);
7303 KMP_DEBUG_ASSERT(set_nproc > 1);
7304
7305 if (set_nproc == 1) {
7306 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7307 return 1;
7308 }
7309
7310 // Threads that are active in the thread pool, active in the hot team for this
7311 // particular root (if we are at the outer par level), and the currently
7312 // executing thread (to become the master) are available to add to the new
7313 // team, but are currently contributing to the system load, and must be
7314 // accounted for.
7315 pool_active = __kmp_thread_pool_active_nth;
7316 hot_team_active = __kmp_active_hot_team_nproc(root);
7317 team_curr_active = pool_active + hot_team_active + 1;
7318
7319 // Check the system load.
7320 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7321 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7322 "hot team active = %d\n",
7323 system_active, pool_active, hot_team_active));
7324
7325 if (system_active < 0) {
7326 // There was an error reading the necessary info from /proc, so use the
7327 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7328 // = dynamic_thread_limit, we shouldn't wind up getting back here.
7329 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7330 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7331
7332 // Make this call behave like the thread limit algorithm.
7333 retval = __kmp_avail_proc - __kmp_nth +
7334 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7335 if (retval > set_nproc) {
7336 retval = set_nproc;
7337 }
7338 if (retval < KMP_MIN_NTH) {
7339 retval = KMP_MIN_NTH;
7340 }
7341
7342 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7343 retval));
7344 return retval;
7345 }
7346
7347 // There is a slight delay in the load balance algorithm in detecting new
7348 // running procs. The real system load at this instant should be at least as
7349 // large as the #active omp thread that are available to add to the team.
7350 if (system_active < team_curr_active) {
7351 system_active = team_curr_active;
7352 }
7353 retval = __kmp_avail_proc - system_active + team_curr_active;
7354 if (retval > set_nproc) {
7355 retval = set_nproc;
7356 }
7357 if (retval < KMP_MIN_NTH) {
7358 retval = KMP_MIN_NTH;
7359 }
7360
7361 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7362 return retval;
7363 } // __kmp_load_balance_nproc()
7364
7365 #endif /* USE_LOAD_BALANCE */
7366
7367 /* ------------------------------------------------------------------------ */
7368
7369 /* NOTE: this is called with the __kmp_init_lock held */
/* Tear down runtime state in reverse order of initialization: parallel,
   middle, then serial layers, followed by thread/root storage, locks, and
   i18n/stats resources.
   NOTE: this is called with the __kmp_init_lock held */
void __kmp_cleanup(void) {
  int f;

  KA_TRACE(10, ("__kmp_cleanup: enter\n"));

  if (TCR_4(__kmp_init_parallel)) {
#if KMP_HANDLE_SIGNALS
    __kmp_remove_signals();
#endif
    TCW_4(__kmp_init_parallel, FALSE);
  }

  if (TCR_4(__kmp_init_middle)) {
#if KMP_AFFINITY_SUPPORTED
    __kmp_affinity_uninitialize();
#endif /* KMP_AFFINITY_SUPPORTED */
    __kmp_cleanup_hierarchy();
    TCW_4(__kmp_init_middle, FALSE);
  }

  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));

  if (__kmp_init_serial) {
    __kmp_runtime_destroy();
    __kmp_init_serial = FALSE;
  }

  __kmp_cleanup_threadprivate_caches();

  // Free per-root descriptors first, then the combined threads/root block.
  for (f = 0; f < __kmp_threads_capacity; f++) {
    if (__kmp_root[f] != NULL) {
      __kmp_free(__kmp_root[f]);
      __kmp_root[f] = NULL;
    }
  }
  __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as single block, so
  // there is no need in freeing __kmp_root.
  __kmp_threads = NULL;
  __kmp_root = NULL;
  __kmp_threads_capacity = 0;

#if KMP_USE_DYNAMIC_LOCK
  __kmp_cleanup_indirect_user_locks();
#else
  __kmp_cleanup_user_locks();
#endif

#if KMP_AFFINITY_SUPPORTED
  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
  __kmp_cpuinfo_file = NULL;
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_print_speculative_stats();
#endif
#endif
  // Release env-derived nesting tables.
  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
  __kmp_nested_nth.nth = NULL;
  __kmp_nested_nth.size = 0;
  __kmp_nested_nth.used = 0;
  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
  __kmp_nested_proc_bind.bind_types = NULL;
  __kmp_nested_proc_bind.size = 0;
  __kmp_nested_proc_bind.used = 0;
  if (__kmp_affinity_format) {
    KMP_INTERNAL_FREE(__kmp_affinity_format);
    __kmp_affinity_format = NULL;
  }

  __kmp_i18n_catclose();

#if KMP_USE_HIER_SCHED
  __kmp_hier_scheds.deallocate();
#endif

#if KMP_STATS_ENABLED
  __kmp_stats_fini();
#endif

  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
}
7453
7454 /* ------------------------------------------------------------------------ */
7455
__kmp_ignore_mppbeg(void)7456 int __kmp_ignore_mppbeg(void) {
7457 char *env;
7458
7459 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7460 if (__kmp_str_match_false(env))
7461 return FALSE;
7462 }
7463 // By default __kmpc_begin() is no-op.
7464 return TRUE;
7465 }
7466
__kmp_ignore_mppend(void)7467 int __kmp_ignore_mppend(void) {
7468 char *env;
7469
7470 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7471 if (__kmp_str_match_false(env))
7472 return FALSE;
7473 }
7474 // By default __kmpc_end() is no-op.
7475 return TRUE;
7476 }
7477
// Lazily mark this uber thread's root as "begun" (backing __kmpc_begin).
// Uses a check / lock / re-check sequence on r_begin so concurrent callers
// perform the transition exactly once.
void __kmp_internal_begin(void) {
  int gtid;
  kmp_root_t *root;

  /* this is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid */
  gtid = __kmp_entry_gtid();
  root = __kmp_threads[gtid]->th.th_root;
  KMP_ASSERT(KMP_UBER_GTID(gtid));

  if (root->r.r_begin)
    return; // fast path: already begun
  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
  if (root->r.r_begin) {
    // Another thread completed the transition while we waited for the lock.
    __kmp_release_lock(&root->r.r_begin_lock, gtid);
    return;
  }

  root->r.r_begin = TRUE;

  __kmp_release_lock(&root->r.r_begin_lock, gtid);
}
7500
7501 /* ------------------------------------------------------------------------ */
7502
__kmp_user_set_library(enum library_type arg)7503 void __kmp_user_set_library(enum library_type arg) {
7504 int gtid;
7505 kmp_root_t *root;
7506 kmp_info_t *thread;
7507
7508 /* first, make sure we are initialized so we can get our gtid */
7509
7510 gtid = __kmp_entry_gtid();
7511 thread = __kmp_threads[gtid];
7512
7513 root = thread->th.th_root;
7514
7515 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7516 library_serial));
7517 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7518 thread */
7519 KMP_WARNING(SetLibraryIncorrectCall);
7520 return;
7521 }
7522
7523 switch (arg) {
7524 case library_serial:
7525 thread->th.th_set_nproc = 0;
7526 set__nproc(thread, 1);
7527 break;
7528 case library_turnaround:
7529 thread->th.th_set_nproc = 0;
7530 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7531 : __kmp_dflt_team_nth_ub);
7532 break;
7533 case library_throughput:
7534 thread->th.th_set_nproc = 0;
7535 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7536 : __kmp_dflt_team_nth_ub);
7537 break;
7538 default:
7539 KMP_FATAL(UnknownLibraryType, arg);
7540 }
7541
7542 __kmp_aux_set_library(arg);
7543 }
7544
// Set the default stack size (in bytes) for worker threads created later.
// Only honored before the first parallel region; the value is clamped to
// [__kmp_sys_min_stksize, KMP_MAX_STKSIZE].
void __kmp_aux_set_stacksize(size_t arg) {
  if (!__kmp_init_serial)
    __kmp_serial_initialize();

#if KMP_OS_DARWIN
  // Round a non-page-aligned request up to the next 0x1000 boundary; the
  // guard skips the add only when it would wrap the size_t to zero.
  if (arg & (0x1000 - 1)) {
    arg &= ~(0x1000 - 1);
    if (arg + 0x1000) /* check for overflow if we round up */
      arg += 0x1000;
  }
#endif
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* only change the default stacksize before the first parallel region */
  if (!TCR_4(__kmp_init_parallel)) {
    size_t value = arg; /* argument is in bytes */

    if (value < __kmp_sys_min_stksize)
      value = __kmp_sys_min_stksize;
    else if (value > KMP_MAX_STKSIZE)
      value = KMP_MAX_STKSIZE;

    __kmp_stksize = value;

    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
  }

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
7574
7575 /* set the behaviour of the runtime library */
7576 /* TODO this can cause some odd behaviour with sibling parallelism... */
__kmp_aux_set_library(enum library_type arg)7577 void __kmp_aux_set_library(enum library_type arg) {
7578 __kmp_library = arg;
7579
7580 switch (__kmp_library) {
7581 case library_serial: {
7582 KMP_INFORM(LibraryIsSerial);
7583 } break;
7584 case library_turnaround:
7585 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7586 __kmp_use_yield = 2; // only yield when oversubscribed
7587 break;
7588 case library_throughput:
7589 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7590 __kmp_dflt_blocktime = 200;
7591 break;
7592 default:
7593 KMP_FATAL(UnknownLibraryType, arg);
7594 }
7595 }
7596
7597 /* Getting team information common for all team API */
7598 // Returns NULL if not in teams construct
// Returns NULL if not in teams construct.
// Otherwise walks from the current team up through t_parent links to the
// team at the nesting level just above the teams construct, and stores that
// team's serialization count into teams_serialized.
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level;
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1;
    KMP_DEBUG_ASSERT(ii >= tlevel);
    // Step upward, consuming each team's serialized levels before moving to
    // its parent, until the target level is reached.
    while (ii > level) {
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      if (team->t.t_serialized && (!teams_serialized)) {
        // All of this team's serialized levels were consumed; continue with
        // the parent without decrementing ii again.
        team = team->t.t_parent;
        continue;
      }
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}
7626
__kmp_aux_get_team_num()7627 int __kmp_aux_get_team_num() {
7628 int serialized;
7629 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7630 if (team) {
7631 if (serialized > 1) {
7632 return 0; // teams region is serialized ( 1 team of 1 thread ).
7633 } else {
7634 return team->t.t_master_tid;
7635 }
7636 }
7637 return 0;
7638 }
7639
__kmp_aux_get_num_teams()7640 int __kmp_aux_get_num_teams() {
7641 int serialized;
7642 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7643 if (team) {
7644 if (serialized > 1) {
7645 return 1;
7646 } else {
7647 return team->t.t_parent->t.t_nproc;
7648 }
7649 }
7650 return 1;
7651 }
7652
7653 /* ------------------------------------------------------------------------ */
7654
7655 /*
7656 * Affinity Format Parser
7657 *
7658 * Field is in form of: %[[[0].]size]type
7659 * % and type are required (%% means print a literal '%')
7660 * type is either single char or long name surrounded by {},
7661 * e.g., N or {num_threads}
7662 * 0 => leading zeros
7663 * . => right justified when size is specified
7664 * by default output is left justified
7665 * size is the *minimum* field length
7666 * All other characters are printed as is
7667 *
 * Available field types:
 * t {team_num} - omp_get_team_num()
 * T {num_teams} - omp_get_num_teams()
 * L {nesting_level} - omp_get_level()
 * n {thread_num} - omp_get_thread_num()
 * N {num_threads} - omp_get_num_threads()
 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host} - name of host machine
 * P {process_id} - process id (integer)
 * i {native_thread_id} - native thread identifier (integer)
 * A {thread_affinity} - comma separated list of integers or integer ranges
 * (values of affinity mask)
7678 *
7679 * Implementation-specific field types can be added
7680 * If a type is unknown, print "undefined"
7681 */
7682
7683 // Structure holding the short name, long name, and corresponding data type
7684 // for snprintf. A table of these will represent the entire valid keyword
7685 // field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // single-character field type from spec, e.g., 'L'
  const char *long_name; // brace-enclosed long form, e.g., "nesting_level"
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;
7692
// Table of all valid affinity-format field types. NOTE: entries must stay in
// sync with the switch on absolute_short_name in
// __kmp_aux_capture_affinity_field below.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};
7706
7707 // Return the number of characters it takes to hold field
// Parse one %-field starting at *ptr, expand its value for thread th into
// field_buffer, and advance *ptr past the field.
// Returns the number of characters it takes to hold the field's value.
// Unknown field types expand to the literal string "undefined".
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0}; // snprintf format built incrementally below
  char absolute_short_name = 0; // canonical short name; 0 => unknown field

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      int length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      // Complete the snprintf format with the table's conversion character.
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      // Unterminated long name: treat the whole field as unknown.
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to spec, If an implementation does not have info for field
    // type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
7870
7871 /*
7872 * Return number of characters needed to hold the affinity string
7873 * (not including null byte character)
7874 * The resultant string is printed to buffer, which the caller can then
7875 * handle afterwards
7876 */
__kmp_aux_capture_affinity(int gtid,const char * format,kmp_str_buf_t * buffer)7877 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7878 kmp_str_buf_t *buffer) {
7879 const char *parse_ptr;
7880 size_t retval;
7881 const kmp_info_t *th;
7882 kmp_str_buf_t field;
7883
7884 KMP_DEBUG_ASSERT(buffer);
7885 KMP_DEBUG_ASSERT(gtid >= 0);
7886
7887 __kmp_str_buf_init(&field);
7888 __kmp_str_buf_clear(buffer);
7889
7890 th = __kmp_threads[gtid];
7891 retval = 0;
7892
7893 // If format is NULL or zero-length string, then we use
7894 // affinity-format-var ICV
7895 parse_ptr = format;
7896 if (parse_ptr == NULL || *parse_ptr == '\0') {
7897 parse_ptr = __kmp_affinity_format;
7898 }
7899 KMP_DEBUG_ASSERT(parse_ptr);
7900
7901 while (*parse_ptr != '\0') {
7902 // Parse a field
7903 if (*parse_ptr == '%') {
7904 // Put field in the buffer
7905 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7906 __kmp_str_buf_catbuf(buffer, &field);
7907 retval += rc;
7908 } else {
7909 // Put literal character in buffer
7910 __kmp_str_buf_cat(buffer, parse_ptr, 1);
7911 retval++;
7912 parse_ptr++;
7913 }
7914 }
7915 __kmp_str_buf_free(&field);
7916 return retval;
7917 }
7918
7919 // Displays the affinity string to stdout
__kmp_aux_display_affinity(int gtid,const char * format)7920 void __kmp_aux_display_affinity(int gtid, const char *format) {
7921 kmp_str_buf_t buf;
7922 __kmp_str_buf_init(&buf);
7923 __kmp_aux_capture_affinity(gtid, format, &buf);
7924 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7925 __kmp_str_buf_free(&buf);
7926 }
7927
7928 /* ------------------------------------------------------------------------ */
7929
__kmp_aux_set_blocktime(int arg,kmp_info_t * thread,int tid)7930 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7931 int blocktime = arg; /* argument is in milliseconds */
7932 #if KMP_USE_MONITOR
7933 int bt_intervals;
7934 #endif
7935 int bt_set;
7936
7937 __kmp_save_internal_controls(thread);
7938
7939 /* Normalize and set blocktime for the teams */
7940 if (blocktime < KMP_MIN_BLOCKTIME)
7941 blocktime = KMP_MIN_BLOCKTIME;
7942 else if (blocktime > KMP_MAX_BLOCKTIME)
7943 blocktime = KMP_MAX_BLOCKTIME;
7944
7945 set__blocktime_team(thread->th.th_team, tid, blocktime);
7946 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7947
7948 #if KMP_USE_MONITOR
7949 /* Calculate and set blocktime intervals for the teams */
7950 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7951
7952 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7953 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7954 #endif
7955
7956 /* Set whether blocktime has been set to "TRUE" */
7957 bt_set = TRUE;
7958
7959 set__bt_set_team(thread->th.th_team, tid, bt_set);
7960 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7961 #if KMP_USE_MONITOR
7962 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7963 "bt_intervals=%d, monitor_updates=%d\n",
7964 __kmp_gtid_from_tid(tid, thread->th.th_team),
7965 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7966 __kmp_monitor_wakeups));
7967 #else
7968 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7969 __kmp_gtid_from_tid(tid, thread->th.th_team),
7970 thread->th.th_team->t.t_id, tid, blocktime));
7971 #endif
7972 }
7973
__kmp_aux_set_defaults(char const * str,int len)7974 void __kmp_aux_set_defaults(char const *str, int len) {
7975 if (!__kmp_init_serial) {
7976 __kmp_serial_initialize();
7977 }
7978 __kmp_env_initialize(str);
7979
7980 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
7981 __kmp_env_print();
7982 }
7983 } // __kmp_aux_set_defaults
7984
7985 /* ------------------------------------------------------------------------ */
7986 /* internal fast reduction routines */
7987
// Select the reduction method (critical / atomic / tree / empty) for a
// reduction, based on what code paths the compiler generated, the team size,
// and per-architecture/per-OS tuning. Returns the packed method id; may be
// overridden by the KMP_FORCE_REDUCTION setting (__kmp_force_reduction_method).
PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to OpenMP RTL to make a decision on which method to select
  // among generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

// True iff the compiler emitted an atomic-reduction path (flag in the ident).
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
// True iff the compiler supplied the data/function needed for tree reduction.
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic deference) is slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block; // single thread: no synchronization needed

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

// Per-architecture / per-OS tuning of the method choice below.
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8; // use a wider cutoff on MIC
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        // small team: prefer atomic over tree when both are available
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      // tree pays off only for mid-sized reduction payloads
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        // forced method not generated by the compiler: fall back to critical
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        // forced method not generated by the compiler: fall back to critical
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
8138
8139 // this function is for testing set/get/determine reduce method
__kmp_get_reduce_method(void)8140 kmp_int32 __kmp_get_reduce_method(void) {
8141 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8142 }
8143
8144 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8145 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
__kmp_soft_pause()8146 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8147
8148 // Hard pause shuts down the runtime completely. Resume happens naturally when
8149 // OpenMP is used subsequently.
__kmp_hard_pause()8150 void __kmp_hard_pause() {
8151 __kmp_pause_status = kmp_hard_paused;
8152 __kmp_internal_end_thread(-1);
8153 }
8154
8155 // Soft resume sets __kmp_pause_status, and wakes up all threads.
__kmp_resume_if_soft_paused()8156 void __kmp_resume_if_soft_paused() {
8157 if (__kmp_pause_status == kmp_soft_paused) {
8158 __kmp_pause_status = kmp_not_paused;
8159
8160 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8161 kmp_info_t *thread = __kmp_threads[gtid];
8162 if (thread) { // Wake it if sleeping
8163 kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8164 if (fl.is_sleeping())
8165 fl.resume(gtid);
8166 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8167 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8168 } else { // thread holds the lock and may sleep soon
8169 do { // until either the thread sleeps, or we can get the lock
8170 if (fl.is_sleeping()) {
8171 fl.resume(gtid);
8172 break;
8173 } else if (__kmp_try_suspend_mx(thread)) {
8174 __kmp_unlock_suspend_mx(thread);
8175 break;
8176 }
8177 } while (1);
8178 }
8179 }
8180 }
8181 }
8182 }
8183
8184 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8185 // TODO: add warning messages
__kmp_pause_resource(kmp_pause_status_t level)8186 int __kmp_pause_resource(kmp_pause_status_t level) {
8187 if (level == kmp_not_paused) { // requesting resume
8188 if (__kmp_pause_status == kmp_not_paused) {
8189 // error message about runtime not being paused, so can't resume
8190 return 1;
8191 } else {
8192 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8193 __kmp_pause_status == kmp_hard_paused);
8194 __kmp_pause_status = kmp_not_paused;
8195 return 0;
8196 }
8197 } else if (level == kmp_soft_paused) { // requesting soft pause
8198 if (__kmp_pause_status != kmp_not_paused) {
8199 // error message about already being paused
8200 return 1;
8201 } else {
8202 __kmp_soft_pause();
8203 return 0;
8204 }
8205 } else if (level == kmp_hard_paused) { // requesting hard pause
8206 if (__kmp_pause_status != kmp_not_paused) {
8207 // error message about already being paused
8208 return 1;
8209 } else {
8210 __kmp_hard_pause();
8211 return 0;
8212 }
8213 } else {
8214 // error message about invalid level
8215 return 1;
8216 }
8217 }
8218