1 /*
2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22
23 #define MAX_MESSAGE 512
24
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27
28 /*!
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc in source location information
31 * @param flags in for future use (currently ignored)
32 *
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
35 */
__kmpc_begin(ident_t * loc,kmp_int32 flags)36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37 // By default __kmpc_begin() is no-op.
38 char *env;
39 if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40 __kmp_str_match_true(env)) {
41 __kmp_middle_initialize();
42 __kmp_assign_root_init_mask();
43 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44 } else if (__kmp_ignore_mppbeg() == FALSE) {
45 // By default __kmp_ignore_mppbeg() returns TRUE.
46 __kmp_internal_begin();
47 KC_TRACE(10, ("__kmpc_begin: called\n"));
48 }
49 }
50
51 /*!
52 * @ingroup STARTUP_SHUTDOWN
53 * @param loc source location information
54 *
55 * Shutdown the runtime library. This is also optional, and even if called will
56 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57 * zero.
58 */
__kmpc_end(ident_t * loc)59 void __kmpc_end(ident_t *loc) {
60 // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61 // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62 // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63 // returns FALSE and __kmpc_end() will unregister this root (it can cause
64 // library shut down).
65 if (__kmp_ignore_mppend() == FALSE) {
66 KC_TRACE(10, ("__kmpc_end: called\n"));
67 KA_TRACE(30, ("__kmpc_end\n"));
68
69 __kmp_internal_end_thread(-1);
70 }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72 // Normal exit process on Windows does not allow worker threads of the final
73 // parallel region to finish reporting their events, so shutting down the
74 // library here fixes the issue at least for the cases where __kmpc_end() is
75 // placed properly.
76 if (ompt_enabled.enabled)
77 __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85
86 This function can be called in any context.
87
If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
__kmpc_global_thread_num(ident_t * loc)99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100 kmp_int32 gtid = __kmp_entry_gtid();
101
102 KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103
104 return gtid;
105 }
106
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
__kmpc_global_num_threads(ident_t * loc)121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122 KC_TRACE(10,
123 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124
125 return TCR_4(__kmp_all_nth);
126 }
127
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
__kmpc_bound_thread_num(ident_t * loc)134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136 return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
__kmpc_bound_num_threads(ident_t * loc)144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146
147 return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149
150 /*!
151 * @ingroup DEPRECATED
152 * @param loc location description
153 *
154 * This function need not be called. It always returns TRUE.
155 */
__kmpc_ok_to_fork(ident_t * loc)156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158
159 return TRUE;
160
161 #else
162
163 const char *semi2;
164 const char *semi3;
165 int line_no;
166
167 if (__kmp_par_range == 0) {
168 return TRUE;
169 }
170 semi2 = loc->psource;
171 if (semi2 == NULL) {
172 return TRUE;
173 }
174 semi2 = strchr(semi2, ';');
175 if (semi2 == NULL) {
176 return TRUE;
177 }
178 semi2 = strchr(semi2 + 1, ';');
179 if (semi2 == NULL) {
180 return TRUE;
181 }
182 if (__kmp_par_range_filename[0]) {
183 const char *name = semi2 - 1;
184 while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185 name--;
186 }
187 if ((*name == '/') || (*name == ';')) {
188 name++;
189 }
190 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191 return __kmp_par_range < 0;
192 }
193 }
194 semi3 = strchr(semi2 + 1, ';');
195 if (__kmp_par_range_routine[0]) {
196 if ((semi3 != NULL) && (semi3 > semi2) &&
197 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198 return __kmp_par_range < 0;
199 }
200 }
201 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203 return __kmp_par_range > 0;
204 }
205 return __kmp_par_range < 0;
206 }
207 return TRUE;
208
209 #endif /* KMP_DEBUG */
210 }
211
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
__kmpc_in_parallel(ident_t * loc)218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219 return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
// Record the num_threads request for this thread; it is consumed by the next
// fork this thread performs (see the doxygen block above).
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  // Delegate to the runtime-internal implementation.
  __kmp_push_num_threads(loc, global_tid, num_threads);
}
238
// Intentionally a no-op: the num_threads value recorded by
// __kmpc_push_num_threads() is popped automatically by the runtime.
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
  /* the num_threads are automatically popped */
}
243
// Record the proc_bind request for this thread's next parallel region.
// proc_bind arrives as a plain kmp_int32 from compiled code and is cast to
// kmp_proc_bind_t for the internal call.
void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}
251
252 /*!
253 @ingroup PARALLEL
254 @param loc source location information
255 @param argc total number of arguments in the ellipsis
256 @param microtask pointer to callback routine consisting of outlined parallel
257 construct
258 @param ... pointers to shared variables that aren't global
259
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
// Compiler entry point for #pragma omp parallel: gather the variadic
// shared-variable pointers into a va_list, fork the team, run the outlined
// microtask, and join.
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  // Count nested vs. top-level parallel regions separately.
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    // Publish the primary thread's enter frame and return address so OMPT
    // tools can attribute the region to this call site.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
                    kmp_va_addr_of(ap));
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
    );

    va_end(ap);

#if OMPT_SUPPORT
    // Clear the enter frame once the region is over.
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif
  }

#if KMP_STATS_ENABLED
  // Restore the serial timer/state saved above, or pop the nested timer.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
    KMP_SET_THREAD_STATE(previous_state);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
332
333 /*!
334 @ingroup PARALLEL
335 @param loc source location information
336 @param microtask pointer to callback routine consisting of outlined parallel
337 construct
338 @param cond condition for running in parallel
339 @param args struct of pointers to shared variables that aren't global
340
341 Perform a fork only if the condition is true.
342 */
// Conditional fork: fork normally when cond is true, otherwise run the
// microtask on the calling thread inside a serialized parallel region.
void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                         kmp_int32 cond, void *args) {
  int gtid = __kmp_entry_gtid();
  if (cond) {
    // When args is non-NULL it is forwarded as a single shared-variable
    // pointer to the variadic fork path.
    if (args)
      __kmpc_fork_call(loc, argc, microtask, args);
    else
      __kmpc_fork_call(loc, argc, microtask);
  } else {
    __kmpc_serialized_parallel(loc, gtid);

#if OMPT_SUPPORT
    void *exit_frame_ptr;
#endif

    // Invoke the microtask directly: argc=1/&args when args is present,
    // argc=0/nullptr otherwise.
    if (args)
      __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
                             /*npr=*/0,
                             /*argc=*/1, &args
#if OMPT_SUPPORT
                             ,
                             &exit_frame_ptr
#endif
      );
    else
      __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
                             /*npr=*/0,
                             /*argc=*/0,
                             /*args=*/nullptr
#if OMPT_SUPPORT
                             ,
                             &exit_frame_ptr
#endif
      );

    __kmpc_end_serialized_parallel(loc, gtid);
  }
}
381
382 /*!
383 @ingroup PARALLEL
384 @param loc source location information
385 @param global_tid global thread number
386 @param num_teams number of teams requested for the teams construct
387 @param num_threads number of threads per team requested for the teams construct
388
389 Set the number of teams to be used by the teams construct.
390 This call is only required if the teams construct has a `num_teams` clause
391 or a `thread_limit` clause (or both).
392 */
// Record num_teams / per-team num_threads requests for the upcoming teams
// construct.
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  // Delegate to the runtime-internal implementation.
  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}
401
402 /*!
403 @ingroup PARALLEL
404 @param loc source location information
405 @param global_tid global thread number
406 @param thread_limit limit on number of threads which can be created within the
407 current task
408
409 Set the thread_limit for the current task
410 This call is there to support `thread_limit` clause on the `target` construct
411 */
__kmpc_set_thread_limit(ident_t * loc,kmp_int32 global_tid,kmp_int32 thread_limit)412 void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
413 kmp_int32 thread_limit) {
414 __kmp_assert_valid_gtid(global_tid);
415 kmp_info_t *thread = __kmp_threads[global_tid];
416 if (thread_limit > 0)
417 thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
418 }
419
420 /*!
421 @ingroup PARALLEL
422 @param loc source location information
423 @param global_tid global thread number
424 @param num_teams_lb lower bound on number of teams requested for the teams
425 construct
426 @param num_teams_ub upper bound on number of teams requested for the teams
427 construct
428 @param num_threads number of threads per team requested for the teams construct
429
430 Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
432 equal to the upper bound.
433 This call is only required if the teams construct has a `num_teams` clause
434 or a `thread_limit` clause (or both).
435 */
// Record the num_teams lower/upper bounds and per-team thread count for the
// upcoming teams construct (`_51` entry point: takes a bound range instead
// of a single team count).
void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
                              kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
                              kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
                " num_teams_ub=%d num_threads=%d\n",
                global_tid, num_teams_lb, num_teams_ub, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
                          num_threads);
}
446
447 /*!
448 @ingroup PARALLEL
449 @param loc source location information
450 @param argc total number of arguments in the ellipsis
451 @param microtask pointer to callback routine consisting of outlined teams
452 construct
453 @param ... pointers to shared variables that aren't global
454
455 Do the actual fork and call the microtask in the relevant number of threads.
456 */
// Compiler entry point for #pragma omp teams: record teams state on the
// thread, fork via the teams-master wrapper, join, and restore state.
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

#if KMP_STATS_ENABLED
  KMP_COUNT_BLOCK(OMP_TEAMS);
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
  }
#endif

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  // Publish the parent implicit task's enter frame for OMPT tools.
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // Fork with __kmp_teams_master as the wrapped task (the user microtask was
  // saved in th_teams_microtask above).
  __kmp_fork_call(
      loc, gtid, fork_context_intel, argc,
      VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
      VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
  int i = tmp->cg_nthreads--;
  if (i == 1) { // check is we are the last thread in CG (not always the case)
    __kmp_free(tmp);
  }
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  // Clear the teams state recorded at entry.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
#if KMP_STATS_ENABLED
  // Restore the serial timer/state saved above, or pop the nested timer.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
    KMP_SET_THREAD_STATE(previous_state);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
539
540 // I don't think this function should ever have been exported.
541 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
542 // openmp code ever called it, but it's been exported from the RTL for so
543 // long that I'm afraid to remove the definition.
// Thin exported wrapper: invoke the task function for the given gtid.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
545
546 /*!
547 @ingroup PARALLEL
548 @param loc source location information
549 @param global_tid global thread number
550
551 Enter a serialized parallel construct. This interface is used to handle a
552 conditional parallel region, like this,
553 @code
554 #pragma omp parallel if (condition)
555 @endcode
556 when the condition is false.
557 */
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
  __kmp_assert_valid_gtid(global_tid);
#if OMPT_SUPPORT
  // Save the caller's return address for OMPT callbacks issued inside the
  // internal implementation.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
568
569 /*!
570 @ingroup PARALLEL
571 @param loc source location information
572 @param global_tid global thread number
573
574 Leave a serialized parallel construct.
575 */
// Tear down one level of a serialized parallel region: notify OMPT, pop the
// internal-control and dispatch-buffer stacks, and — when the outermost
// serialized level ends — restore the thread's view of the parent team.
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  __kmp_assert_valid_gtid(global_tid);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  kmp_task_team_t *task_team = this_thr->th.th_task_team;
  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
                            task_team->tt.tt_hidden_helper_task_encountered))
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));

  KMP_MB();
  // Sanity checks: we must be inside a serialized region on this thread's
  // serial team, which is never the root team.
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  // Report implicit-task end and parallel end to the tool, then unlink the
  // lightweight task team.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program | ompt_parallel_team,
          OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Restore FP control state saved when the serialized region began.
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    __kmp_pop_current_task_from_thread(this_thr);
#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_parallel_end();
#endif

    // Re-point the thread at the parent team.
    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
#if KMP_AFFINITY_SUPPORTED
    if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
      __kmp_reset_root_init_mask(global_tid);
    }
#endif
  } else {
    // Still nested inside another serialized region: nothing to restore yet.
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  serial_team->t.t_level--;
  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}
725
726 /*!
727 @ingroup SYNCHRONIZATION
728 @param loc source location information.
729
730 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
731 depending on the memory ordering convention obeyed by the compiler
732 even that may not be necessary).
733 */
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MFENCE(); /* Flush all pending memory write invalidates. */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Invoke the OMPT flush callback, if a tool registered one; this happens
  // after the fence above.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
747
748 /* -------------------------------------------------------------------------- */
749 /*!
750 @ingroup SYNCHRONIZATION
751 @param loc source location information
752 @param global_tid thread id.
753
754 Execute a barrier.
755 */
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  // Ensure the runtime is fully initialized and not soft-paused before
  // blocking in a barrier.
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  // Publish the caller's frame and return address to the tool for the
  // duration of the barrier.
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when 'barrier' directive is present or
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 4) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Clear the enter frame once the barrier completes.
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
797
798 /* The BARRIER for a MASTER section is always explicit */
799 /*!
800 @ingroup WORK_SHARING
801 @param loc source location information.
802 @param global_tid global thread number .
803 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
804 */
__kmpc_master(ident_t * loc,kmp_int32 global_tid)805 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
806 int status = 0;
807
808 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
809 __kmp_assert_valid_gtid(global_tid);
810
811 if (!TCR_4(__kmp_init_parallel))
812 __kmp_parallel_initialize();
813
814 __kmp_resume_if_soft_paused();
815
816 if (KMP_MASTER_GTID(global_tid)) {
817 KMP_COUNT_BLOCK(OMP_MASTER);
818 KMP_PUSH_PARTITIONED_TIMER(OMP_master);
819 status = 1;
820 }
821
822 #if OMPT_SUPPORT && OMPT_OPTIONAL
823 if (status) {
824 if (ompt_enabled.ompt_callback_masked) {
825 kmp_info_t *this_thr = __kmp_threads[global_tid];
826 kmp_team_t *team = this_thr->th.th_team;
827
828 int tid = __kmp_tid_from_gtid(global_tid);
829 ompt_callbacks.ompt_callback(ompt_callback_masked)(
830 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
831 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
832 OMPT_GET_RETURN_ADDRESS(0));
833 }
834 }
835 #endif
836
837 if (__kmp_env_consistency_check) {
838 #if KMP_USE_DYNAMIC_LOCK
839 if (status)
840 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
841 else
842 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
843 #else
844 if (status)
845 __kmp_push_sync(global_tid, ct_master, loc, NULL);
846 else
847 __kmp_check_sync(global_tid, ct_master, loc, NULL);
848 #endif
849 }
850
851 return status;
852 }
853
854 /*!
855 @ingroup WORK_SHARING
856 @param loc source location information.
857 @param global_tid global thread number .
858
859 Mark the end of a <tt>master</tt> region. This should only be called by the
860 thread that executes the <tt>master</tt> region.
861 */
__kmpc_end_master(ident_t * loc,kmp_int32 global_tid)862 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
863 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
864 __kmp_assert_valid_gtid(global_tid);
865 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
866 KMP_POP_PARTITIONED_TIMER();
867
868 #if OMPT_SUPPORT && OMPT_OPTIONAL
869 kmp_info_t *this_thr = __kmp_threads[global_tid];
870 kmp_team_t *team = this_thr->th.th_team;
871 if (ompt_enabled.ompt_callback_masked) {
872 int tid = __kmp_tid_from_gtid(global_tid);
873 ompt_callbacks.ompt_callback(ompt_callback_masked)(
874 ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
875 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
876 OMPT_GET_RETURN_ADDRESS(0));
877 }
878 #endif
879
880 if (__kmp_env_consistency_check) {
881 if (KMP_MASTER_GTID(global_tid))
882 __kmp_pop_sync(global_tid, ct_master, loc);
883 }
884 }
885
886 /*!
887 @ingroup WORK_SHARING
888 @param loc source location information.
889 @param global_tid global thread number.
890 @param filter result of evaluating filter clause on thread global_tid, or zero
891 if no filter clause present
892 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
893 */
kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
  int status = 0; // 1 iff this thread executes the masked block
  int tid;
  KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  // Lazy runtime initialization: masked may be the first construct reached.
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  // The thread whose team-local id equals the filter value executes the block.
  tid = __kmp_tid_from_gtid(global_tid);
  if (tid == filter) {
    KMP_COUNT_BLOCK(OMP_MASKED);
    KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Scope-begin callback fires only on the thread selected by the filter.
  if (status) {
    if (ompt_enabled.ompt_callback_masked) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;
      ompt_callbacks.ompt_callback(ompt_callback_masked)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
    // Executing thread pushes a sync frame (popped in __kmpc_end_masked);
    // non-executing threads only validate nesting.
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_masked, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_masked, loc, NULL);
#endif
  }

  return status;
}
941
942 /*!
943 @ingroup WORK_SHARING
944 @param loc source location information.
@param global_tid global thread number.
946
947 Mark the end of a <tt>masked</tt> region. This should only be called by the
948 thread that executes the <tt>masked</tt> region.
949 */
void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);
  // Matches the KMP_PUSH_PARTITIONED_TIMER done in __kmpc_masked.
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Report the end of the masked scope to an attached OMPT tool.
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_masked) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_masked)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    // Pop the ct_masked frame pushed by the executing thread in __kmpc_masked.
    __kmp_pop_sync(global_tid, ct_masked, loc);
  }
}
971
972 /*!
973 @ingroup WORK_SHARING
974 @param loc source location information.
975 @param gtid global thread number.
976
977 Start execution of an <tt>ordered</tt> construct.
978 */
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
  // TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.enabled) {
    team = __kmp_team_from_gtid(gtid);
    // The team's ordered dispatch counter serves as the wait id for the tool.
    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update: thread is about to block waiting for its turn */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
          codeptr_ra);
    }
  }
#endif

  // Dispatch-specific "enter ordered" hook if the schedule installed one;
  // otherwise fall back to the generic ordered-entry protocol.
  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update: turn acquired, back to useful work */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}
1044
1045 /*!
1046 @ingroup WORK_SHARING
1047 @param loc source location information.
1048 @param gtid global thread number.
1049
1050 End execution of an <tt>ordered</tt> construct.
1051 */
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
  // TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  // Dispatch-specific "exit ordered" hook if the schedule installed one;
  // otherwise fall back to the generic ordered-exit protocol.
  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    // Same wait id as used at acquisition: the team's ordered counter.
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
            ->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}
1082
1083 #if KMP_USE_DYNAMIC_LOCK
1084
// Allocate and initialize an indirect (pointer-based) lock for a critical
// section, then race to publish it into *crit. Used when the chosen lock kind
// cannot be encoded inline in the kmp_critical_name word.
static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  // Publish via CAS: exactly one thread wins; losers fall through and use the
  // winner's pointer (asserted non-NULL below).
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}
1113
// Fast-path acquire tas (test-and-set) lock: spin with backoff until the CAS
// from "free" to "busy (owner gtid+1)" succeeds. Yields unconditionally when
// the system is oversubscribed (__kmp_nth above the available-proc count).
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      do {                                                                     \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
        __kmp_spin_backoff(&backoff);                                          \
      } while (                                                                \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }

// Fast-path test tas lock: single non-blocking attempt; rc is set to the
// outcome (non-zero on success).
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }

// Fast-path release tas lock: a release-ordered store of the "free" value.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1154
#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif

// Fast-path acquire futex lock. The poll word encodes (gtid+1)<<1 for the
// owner; bit 0 set means "contended: owner must futex-wake on release".
// A waiter first sets the contended bit via CAS, then sleeps in FUTEX_WAIT
// (re-checking poll_val to avoid a lost-wakeup race).
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }

// Fast-path test futex lock: single non-blocking CAS attempt; rc reports the
// outcome. Note: the owner code (gtid + 1) << 1 is parenthesized explicitly
// to match the acquire path (previously relied on '+' binding tighter than
// '<<' inside the unparenthesized macro argument).
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }

// Fast-path release futex lock: swap in the "free" value; if the contended
// bit was set, wake one sleeping waiter via FUTEX_WAKE.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD_OVERSUB();                                                       \
  }

#endif // KMP_USE_FUTEX
1226
1227 #else // KMP_USE_DYNAMIC_LOCK
1228
// Return the user lock backing a critical section, allocating and publishing
// it on first use (double-checked pattern: plain load, then CAS to install).
// Only used for lock kinds too large to live inline in *crit (ticket,
// queuing, drdpa).
static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of underlying lock. It is the only place where we can guarantee it. There
// are chances the lock will destroyed with no usage, but it is not a
// problem, because this is not real event seen by user but rather setting
// name for object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be reused
// for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      // Reload the winner's pointer; it must be set by now.
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}
1274
1275 #endif // KMP_USE_DYNAMIC_LOCK
1276
1277 /*!
1278 @ingroup WORK_SHARING
1279 @param loc source location information.
1280 @param global_tid global thread number.
1281 @param crit identity of the critical section. This could be a pointer to a lock
1282 associated with the critical section, or some other suitably unique value.
1283
1284 Enter code protected by a `critical` construct.
1285 This function blocks until the executing thread can enter the critical section.
1286 */
__kmpc_critical(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)1287 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1288 kmp_critical_name *crit) {
1289 #if KMP_USE_DYNAMIC_LOCK
1290 #if OMPT_SUPPORT && OMPT_OPTIONAL
1291 OMPT_STORE_RETURN_ADDRESS(global_tid);
1292 #endif // OMPT_SUPPORT
1293 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1294 #else
1295 KMP_COUNT_BLOCK(OMP_CRITICAL);
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297 ompt_state_t prev_state = ompt_state_undefined;
1298 ompt_thread_info_t ti;
1299 #endif
1300 kmp_user_lock_p lck;
1301
1302 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1303 __kmp_assert_valid_gtid(global_tid);
1304
1305 // TODO: add THR_OVHD_STATE
1306
1307 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1308 KMP_CHECK_USER_LOCK_INIT();
1309
1310 if ((__kmp_user_lock_kind == lk_tas) &&
1311 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1312 lck = (kmp_user_lock_p)crit;
1313 }
1314 #if KMP_USE_FUTEX
1315 else if ((__kmp_user_lock_kind == lk_futex) &&
1316 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1317 lck = (kmp_user_lock_p)crit;
1318 }
1319 #endif
1320 else { // ticket, queuing or drdpa
1321 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1322 }
1323
1324 if (__kmp_env_consistency_check)
1325 __kmp_push_sync(global_tid, ct_critical, loc, lck);
1326
1327 // since the critical directive binds to all threads, not just the current
1328 // team we have to check this even if we are in a serialized team.
1329 // also, even if we are the uber thread, we still have to conduct the lock,
1330 // as we have to contend with sibling threads.
1331
1332 #if USE_ITT_BUILD
1333 __kmp_itt_critical_acquiring(lck);
1334 #endif /* USE_ITT_BUILD */
1335 #if OMPT_SUPPORT && OMPT_OPTIONAL
1336 OMPT_STORE_RETURN_ADDRESS(gtid);
1337 void *codeptr_ra = NULL;
1338 if (ompt_enabled.enabled) {
1339 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1340 /* OMPT state update */
1341 prev_state = ti.state;
1342 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1343 ti.state = ompt_state_wait_critical;
1344
1345 /* OMPT event callback */
1346 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1347 if (ompt_enabled.ompt_callback_mutex_acquire) {
1348 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1349 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1350 (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1351 }
1352 }
1353 #endif
1354 // Value of 'crit' should be good for using as a critical_id of the critical
1355 // section directive.
1356 __kmp_acquire_user_lock_with_checks(lck, global_tid);
1357
1358 #if USE_ITT_BUILD
1359 __kmp_itt_critical_acquired(lck);
1360 #endif /* USE_ITT_BUILD */
1361 #if OMPT_SUPPORT && OMPT_OPTIONAL
1362 if (ompt_enabled.enabled) {
1363 /* OMPT state update */
1364 ti.state = prev_state;
1365 ti.wait_id = 0;
1366
1367 /* OMPT event callback */
1368 if (ompt_enabled.ompt_callback_mutex_acquired) {
1369 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1370 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1371 }
1372 }
1373 #endif
1374 KMP_POP_PARTITIONED_TIMER();
1375
1376 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1377 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1378 #endif // KMP_USE_DYNAMIC_LOCK
1379 }
1380
1381 #if KMP_USE_DYNAMIC_LOCK
1382
1383 // Converts the given hint to an internal lock implementation
// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
// KMP_TSX_LOCK falls back to the default user lock sequence when TSX support
// is compiled out.
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic (KMP-specific extensions).
  // RTM-based choices additionally require runtime CPU support.
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // Use RTM lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;

  // No recognized hint: use the configured default.
  return __kmp_user_lock_seq;
}
1426
1427 #if OMPT_SUPPORT && OMPT_OPTIONAL
1428 #if KMP_USE_DYNAMIC_LOCK
// Map a dynamic lock to the OMPT mutex-implementation classification.
// Either a direct user lock pointer or an already-resolved indirect lock may
// be supplied; when user_lock carries a direct tag of 0, the indirect lock is
// looked up from the table.
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      // Not a direct lock: fall through to the indirect-lock lookup below.
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
    case locktag_rtm_spin:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm_queuing:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
1474 #else
1475 // For locks without dynamic binding
__ompt_get_mutex_impl_type()1476 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1477 switch (__kmp_user_lock_kind) {
1478 case lk_tas:
1479 return kmp_mutex_impl_spin;
1480 #if KMP_USE_FUTEX
1481 case lk_futex:
1482 #endif
1483 case lk_ticket:
1484 case lk_queuing:
1485 case lk_drdpa:
1486 return kmp_mutex_impl_queuing;
1487 #if KMP_USE_TSX
1488 case lk_hle:
1489 case lk_rtm_queuing:
1490 case lk_rtm_spin:
1491 case lk_adaptive:
1492 return kmp_mutex_impl_speculative;
1493 #endif
1494 default:
1495 return kmp_mutex_impl_none;
1496 }
1497 }
1498 #endif // KMP_USE_DYNAMIC_LOCK
1499 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1500
1501 /*!
1502 @ingroup WORK_SHARING
1503 @param loc source location information.
1504 @param global_tid global thread number.
1505 @param crit identity of the critical section. This could be a pointer to a lock
1506 associated with the critical section, or some other suitably unique value.
1507 @param hint the lock hint.
1508
1509 Enter code protected by a `critical` construct with a hint. The hint value is
1510 used to suggest a lock implementation. This function blocks until the executing
1511 thread can enter the critical section unless the hint suggests use of
1512 speculative execution and the hardware supports it.
1513 */
__kmpc_critical_with_hint(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit,uint32_t hint)1514 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1515 kmp_critical_name *crit, uint32_t hint) {
1516 KMP_COUNT_BLOCK(OMP_CRITICAL);
1517 kmp_user_lock_p lck;
1518 #if OMPT_SUPPORT && OMPT_OPTIONAL
1519 ompt_state_t prev_state = ompt_state_undefined;
1520 ompt_thread_info_t ti;
1521 // This is the case, if called from __kmpc_critical:
1522 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1523 if (!codeptr)
1524 codeptr = OMPT_GET_RETURN_ADDRESS(0);
1525 #endif
1526
1527 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1528 __kmp_assert_valid_gtid(global_tid);
1529
1530 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1531 // Check if it is initialized.
1532 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1533 kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1534 if (*lk == 0) {
1535 if (KMP_IS_D_LOCK(lockseq)) {
1536 KMP_COMPARE_AND_STORE_ACQ32(
1537 (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0,
1538 KMP_GET_D_TAG(lockseq));
1539 } else {
1540 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1541 }
1542 }
1543 // Branch for accessing the actual lock object and set operation. This
1544 // branching is inevitable since this lock initialization does not follow the
1545 // normal dispatch path (lock table is not used).
1546 if (KMP_EXTRACT_D_TAG(lk) != 0) {
1547 lck = (kmp_user_lock_p)lk;
1548 if (__kmp_env_consistency_check) {
1549 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1550 __kmp_map_hint_to_lock(hint));
1551 }
1552 #if USE_ITT_BUILD
1553 __kmp_itt_critical_acquiring(lck);
1554 #endif
1555 #if OMPT_SUPPORT && OMPT_OPTIONAL
1556 if (ompt_enabled.enabled) {
1557 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1558 /* OMPT state update */
1559 prev_state = ti.state;
1560 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1561 ti.state = ompt_state_wait_critical;
1562
1563 /* OMPT event callback */
1564 if (ompt_enabled.ompt_callback_mutex_acquire) {
1565 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1566 ompt_mutex_critical, (unsigned int)hint,
1567 __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1568 codeptr);
1569 }
1570 }
1571 #endif
1572 #if KMP_USE_INLINED_TAS
1573 if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1574 KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1575 } else
1576 #elif KMP_USE_INLINED_FUTEX
1577 if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1578 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1579 } else
1580 #endif
1581 {
1582 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1583 }
1584 } else {
1585 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1586 lck = ilk->lock;
1587 if (__kmp_env_consistency_check) {
1588 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1589 __kmp_map_hint_to_lock(hint));
1590 }
1591 #if USE_ITT_BUILD
1592 __kmp_itt_critical_acquiring(lck);
1593 #endif
1594 #if OMPT_SUPPORT && OMPT_OPTIONAL
1595 if (ompt_enabled.enabled) {
1596 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1597 /* OMPT state update */
1598 prev_state = ti.state;
1599 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1600 ti.state = ompt_state_wait_critical;
1601
1602 /* OMPT event callback */
1603 if (ompt_enabled.ompt_callback_mutex_acquire) {
1604 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1605 ompt_mutex_critical, (unsigned int)hint,
1606 __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1607 codeptr);
1608 }
1609 }
1610 #endif
1611 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1612 }
1613 KMP_POP_PARTITIONED_TIMER();
1614
1615 #if USE_ITT_BUILD
1616 __kmp_itt_critical_acquired(lck);
1617 #endif /* USE_ITT_BUILD */
1618 #if OMPT_SUPPORT && OMPT_OPTIONAL
1619 if (ompt_enabled.enabled) {
1620 /* OMPT state update */
1621 ti.state = prev_state;
1622 ti.wait_id = 0;
1623
1624 /* OMPT event callback */
1625 if (ompt_enabled.ompt_callback_mutex_acquired) {
1626 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1627 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1628 }
1629 }
1630 #endif
1631
1632 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1633 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1634 } // __kmpc_critical_with_hint
1635
1636 #endif // KMP_USE_DYNAMIC_LOCK
1637
1638 /*!
1639 @ingroup WORK_SHARING
1640 @param loc source location information.
@param global_tid global thread number.
1642 @param crit identity of the critical section. This could be a pointer to a lock
1643 associated with the critical section, or some other suitably unique value.
1644
1645 Leave a critical section, releasing any lock that was held during its execution.
1646 */
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  // A non-zero direct tag means the lock word lives inline in *crit;
  // otherwise *crit holds a pointer to an indirect lock descriptor.
  int locktag = KMP_EXTRACT_D_TAG(crit);
  if (locktag) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
// Inlined fast paths mirror the acquire side in __kmpc_critical_with_hint.
#if KMP_USE_INLINED_TAS
    if (locktag == locktag_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (locktag == locktag_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // Lock-kind dispatch mirrors __kmpc_critical: small kinds inline in *crit,
  // larger kinds via the critical-section pointer table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  // NOTE(review): the address is stored under global_tid but loaded for gtid
  // 0 — looks inconsistent with the store above; verify against the
  // OMPT_STORE/LOAD_RETURN_ADDRESS macro semantics.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
        OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}
1734
1735 /*!
1736 @ingroup SYNCHRONIZATION
1737 @param loc source location information
1738 @param global_tid thread id.
1739 @return one if the thread should execute the master block, zero otherwise
1740
1741 Start execution of a combined barrier and master. The barrier is executed inside
1742 this function.
1743 */
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;
  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // Split barrier (TRUE): the gather phase runs here; the release phase is
  // deferred to __kmpc_end_barrier_master.
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  // __kmp_barrier returns non-zero for exactly one thread; map that to the
  // "execute the master block" result.
  return (status != 0) ? 0 : 1;
}
1778
1779 /*!
1780 @ingroup SYNCHRONIZATION
1781 @param loc source location information
1782 @param global_tid thread id.
1783
1784 Complete the execution of a combined barrier and master. This function should
1785 only be called at the completion of the <tt>master</tt> code. Other threads will
1786 still be waiting at the barrier and this call releases them.
1787 */
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);
  // Run the release phase of the split barrier started in
  // __kmpc_barrier_master, freeing the threads still waiting there.
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
1793
1794 /*!
1795 @ingroup SYNCHRONIZATION
1796 @param loc source location information
1797 @param global_tid thread id.
1798 @return one if the thread should execute the master block, zero otherwise
1799
1800 Start execution of a combined barrier and master(nowait) construct.
1801 The barrier is executed inside this function.
There is no equivalent "end" function, since the barrier is executed before the
master block and the consistency-check pop normally done by __kmpc_end_master
is performed inside this function instead.
1803 */
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;
  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // Full (non-split) barrier: all threads synchronize here, then continue
  // without waiting for the master block to finish (nowait semantics).
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  // Select the master thread for the block.
  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */
      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}
1854
1855 /* The BARRIER for a SINGLE process section is always explicit */
1856 /*!
1857 @ingroup WORK_SHARING
1858 @param loc source location information
1859 @param global_tid global thread number
1860 @return One if this thread should execute the single construct, zero otherwise.
1861
1862 Test whether to execute a <tt>single</tt> construct.
1863 There are no implicit barriers in the two "single" calls, rather the compiler
1864 should introduce an explicit barrier if it is required.
1865 */
1866
// Test whether the calling thread should execute the single block.
// Returns 1 (TRUE) for exactly one thread per encounter, 0 for the rest.
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_assert_valid_gtid(global_tid);
  // rc is TRUE iff this thread won the race to execute the single block.
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // Executor thread: scope_begin here; the matching scope_end is
      // dispatched from __kmpc_end_single.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executor thread: it skips the block and never calls
      // __kmpc_end_single, so begin and end are dispatched back-to-back.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}
1910
1911 /*!
1912 @ingroup WORK_SHARING
1913 @param loc source location information
1914 @param global_tid global thread number
1915
1916 Mark the end of a <tt>single</tt> construct. This function should
1917 only be called by the thread that executed the block of code protected
1918 by the `single` construct.
1919 */
// Mark the end of a single construct; called only by the executor thread.
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_assert_valid_gtid(global_tid);
  __kmp_exit_single(global_tid);
  // Matches the KMP_PUSH_PARTITIONED_TIMER(OMP_single) in __kmpc_single.
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  // scope_end for the executor; scope_begin was raised in __kmpc_single.
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
1939
1940 /*!
1941 @ingroup WORK_SHARING
1942 @param loc Source location
1943 @param global_tid Global thread id
1944
1945 Mark the end of a statically scheduled loop.
1946 */
// Mark the end of a statically scheduled worksharing construct (loop,
// sections or distribute, distinguished by the flags in `loc`).
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER();
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_t ompt_work_type = ompt_work_loop;
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type from the location flags; with no loc (or no
    // recognized flag) fall back to ompt_work_loop set above.
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}
1978
1979 // User routines which take C-style arguments (call by value)
1980 // different from the Fortran equivalent routines
1981
// C linkage entry point for omp_set_num_threads() (by-value argument).
void ompc_set_num_threads(int arg) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_num_threads(arg, __kmp_entry_gtid());
}
1986
ompc_set_dynamic(int flag)1987 void ompc_set_dynamic(int flag) {
1988 kmp_info_t *thread;
1989
1990 /* For the thread-private implementation of the internal controls */
1991 thread = __kmp_entry_thread();
1992
1993 __kmp_save_internal_controls(thread);
1994
1995 set__dynamic(thread, flag ? true : false);
1996 }
1997
ompc_set_nested(int flag)1998 void ompc_set_nested(int flag) {
1999 kmp_info_t *thread;
2000
2001 /* For the thread-private internal controls implementation */
2002 thread = __kmp_entry_thread();
2003
2004 __kmp_save_internal_controls(thread);
2005
2006 set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
2007 }
2008
// C linkage entry point for omp_set_max_active_levels().
void ompc_set_max_active_levels(int max_active_levels) {
  /* TO DO */
  /* we want per-task implementation of this internal control */

  /* For the per-thread internal controls implementation */
  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
}
2016
// C linkage entry point for omp_set_schedule().
void ompc_set_schedule(omp_sched_t kind, int modifier) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
}
2021
// C linkage entry point for omp_get_ancestor_thread_num().
int ompc_get_ancestor_thread_num(int level) {
  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
}
2025
// C linkage entry point for omp_get_team_size().
int ompc_get_team_size(int level) {
  return __kmp_get_team_size(__kmp_entry_gtid(), level);
}
2029
2030 /* OpenMP 5.0 Affinity Format API */
// omp_set_affinity_format(): store a copy of the user's format string,
// truncated to KMP_AFFINITY_FORMAT_SIZE, in __kmp_affinity_format.
void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
                         format, KMP_STRLEN(format) + 1);
}
2038
KMP_EXPAND_NAME(ompc_get_affinity_format)2039 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2040 size_t format_size;
2041 if (!__kmp_init_serial) {
2042 __kmp_serial_initialize();
2043 }
2044 format_size = KMP_STRLEN(__kmp_affinity_format);
2045 if (buffer && size) {
2046 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2047 format_size + 1);
2048 }
2049 return format_size;
2050 }
2051
// omp_display_affinity(): display the calling thread's affinity using the
// given format string via __kmp_aux_display_affinity.
void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
  int gtid;
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  gtid = __kmp_get_gtid();
#if KMP_AFFINITY_SUPPORTED
  // Root thread (team level 0) with the affinity "reset" flag set: restore
  // its initial mask before displaying.
  if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
      __kmp_affinity.flags.reset) {
    __kmp_reset_root_init_mask(gtid);
  }
#endif
  __kmp_aux_display_affinity(gtid, format);
}
2067
// omp_capture_affinity(): render the calling thread's affinity into `buffer`
// (truncating to buf_size) and return the number of characters the full
// rendering requires.
size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
                                              char const *format) {
  int gtid;
  size_t num_required;
  kmp_str_buf_t capture_buf;
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  gtid = __kmp_get_gtid();
#if KMP_AFFINITY_SUPPORTED
  // Root thread (team level 0) with the affinity "reset" flag set: restore
  // its initial mask before capturing.
  if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
      __kmp_affinity.flags.reset) {
    __kmp_reset_root_init_mask(gtid);
  }
#endif
  // Render into a temporary growable buffer, then copy out (truncated).
  __kmp_str_buf_init(&capture_buf);
  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
  if (buffer && buf_size) {
    __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
                           capture_buf.used + 1);
  }
  __kmp_str_buf_free(&capture_buf);
  return num_required;
}
2093
// kmp_set_stacksize(): set worker thread stack size (int variant).
void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
2098
// kmp_set_stacksize_s(): set worker thread stack size (size_t variant).
void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
2103
kmpc_set_blocktime(int arg)2104 void kmpc_set_blocktime(int arg) {
2105 int gtid, tid, bt = arg;
2106 kmp_info_t *thread;
2107
2108 gtid = __kmp_entry_gtid();
2109 tid = __kmp_tid_from_gtid(gtid);
2110 thread = __kmp_thread_from_gtid(gtid);
2111
2112 __kmp_aux_convert_blocktime(&bt);
2113 __kmp_aux_set_blocktime(bt, thread, tid);
2114 }
2115
// kmp_set_library(): select the runtime library mode (serial/turnaround/
// throughput, see enum library_type).
void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}
2120
// kmp_set_defaults(): apply a string of runtime defaults (KMP_* style).
void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}
2125
kmpc_set_disp_num_buffers(int arg)2126 void kmpc_set_disp_num_buffers(int arg) {
2127 // ignore after initialization because some teams have already
2128 // allocated dispatch buffers
2129 if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2130 arg <= KMP_MAX_DISP_NUM_BUFF) {
2131 __kmp_dispatch_num_buffers = arg;
2132 }
2133 }
2134
// kmp_set_affinity_mask_proc(): delegate to the aux implementation; returns
// -1 when affinity is unsupported (stub build or no affinity support).
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}
2146
// kmp_unset_affinity_mask_proc(): delegate to the aux implementation; returns
// -1 when affinity is unsupported (stub build or no affinity support).
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}
2158
// kmp_get_affinity_mask_proc(): delegate to the aux implementation; returns
// -1 when affinity is unsupported (stub build or no affinity support).
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}
2170
2171 /* -------------------------------------------------------------------------- */
2172 /*!
2173 @ingroup THREADPRIVATE
2174 @param loc source location information
2175 @param gtid global thread number
2176 @param cpy_size size of the cpy_data buffer
2177 @param cpy_data pointer to data to be copied
2178 @param cpy_func helper function to call for copying data
2179 @param didit flag variable: 1=single thread; 0=not single thread
2180
2181 __kmpc_copyprivate implements the interface for the private data broadcast
2182 needed for the copyprivate clause associated with a single region in an
2183 OpenMP<sup>*</sup> program (both C and Fortran).
2184 All threads participating in the parallel region call this routine.
2185 One of the threads (called the single thread) should have the <tt>didit</tt>
2186 variable set to 1 and all other threads should have that variable set to 0.
2187 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2188
2189 The OpenMP specification forbids the use of nowait on the single region when a
2190 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2191 barrier internally to avoid race conditions, so the code generation for the
2192 single region should avoid generating a barrier after the call to @ref
2193 __kmpc_copyprivate.
2194
2195 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2196 The <tt>loc</tt> parameter is a pointer to source location information.
2197
2198 Internal implementation: The single thread will first copy its descriptor
2199 address (cpy_data) to a team-private location, then the other threads will each
2200 call the function pointed to by the parameter cpy_func, which carries out the
2201 copy by copying the data using the cpy_data buffer.
2202
2203 The cpy_func routine used for the copy and the contents of the data area defined
2204 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2205 to be done. For instance, the cpy_data buffer can hold the actual data to be
2206 copied or it may hold a list of pointers to the data. The cpy_func routine must
2207 interpret the cpy_data buffer appropriately.
2208
2209 The interface to cpy_func is as follows:
2210 @code
2211 void cpy_func( void *destination, void *source )
2212 @endcode
2213 where void *destination is the cpy_data pointer for the thread being copied to
2214 and void *source is the cpy_data pointer for the thread being copied from.
2215 */
// Broadcast the single thread's data to every thread in the team.
// The thread with didit==1 publishes its cpy_data pointer; after a barrier,
// every other thread invokes cpy_func(its own cpy_data, published pointer)
// to perform the copy. A second barrier prevents the publisher from leaving
// (and invalidating its buffer) before all copies are done.
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;
  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

  KMP_MB();

  // Team-private slot used to publish the single thread's buffer address.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // Barrier 1: ensure the published pointer is visible before anyone copies.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

  // Consider next barrier a user-visible barrier for barrier region boundaries
  // Nesting checks are already handled by the single construct checks
  {
#if OMPT_SUPPORT
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
    // tasks can overwrite the location)
#endif
    // Barrier 2: hold the publisher until every thread finished copying.
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif
  }
}
2274
2275 /* --------------------------------------------------------------------------*/
2276 /*!
2277 @ingroup THREADPRIVATE
2278 @param loc source location information
2279 @param gtid global thread number
2280 @param cpy_data pointer to the data to be saved/copied or 0
2281 @return the saved pointer to the data
2282
2283 __kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate:
2284 __kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so
2285 coming from single), and returns that pointer in all calls (for single thread
2286 it's not needed). This version doesn't do any actual data copying. Data copying
2287 has to be done somewhere else, e.g. inline in the generated code. Due to this,
2288 this function doesn't have any barrier at the end of the function, like
2289 __kmpc_copyprivate does, so generated code needs barrier after copying of all
2290 data was done.
2291 */
// Lightweight copyprivate: publish the single thread's pointer (when
// cpy_data != 0) and return it to every caller after one barrier. No data
// is copied here — the generated code copies and must emit its own trailing
// barrier.
void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));

  KMP_MB();

  // Team-private slot used to publish the single thread's pointer.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following barrier

  if (cpy_data)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // Ensure the published pointer is visible to all threads before returning.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  return *data_ptr;
}
2329
2330 /* -------------------------------------------------------------------------- */
2331
2332 #define INIT_LOCK __kmp_init_user_lock_with_checks
2333 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2334 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2335 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2336 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2337 #define ACQUIRE_NESTED_LOCK_TIMED \
2338 __kmp_acquire_nested_user_lock_with_checks_timed
2339 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2340 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2341 #define TEST_LOCK __kmp_test_user_lock_with_checks
2342 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2343 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2344 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2345
2346 // TODO: Make check abort messages use location info & pass it into
2347 // with_checks routines
2348
2349 #if KMP_USE_DYNAMIC_LOCK
2350
2351 // internal lock initializer
// internal lock initializer: initialize *lock as either a direct lock
// (state stored in the lock word itself) or an indirect lock (real lock
// object found via KMP_LOOKUP_I_LOCK), depending on the sequence.
static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
                                                    kmp_dyna_lockseq_t seq) {
  if (KMP_IS_D_LOCK(seq)) {
    KMP_INIT_D_LOCK(lock, seq);
#if USE_ITT_BUILD
    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
#endif
  } else {
    KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
    __kmp_itt_lock_creating(ilk->lock, loc);
#endif
  }
}
2367
2368 // internal nest lock initializer
// internal nest lock initializer: map the requested flat lock sequence to
// its nested counterpart (falling back to nested queuing) and initialize
// the lock as an indirect lock.
static __forceinline void
__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
                               kmp_dyna_lockseq_t seq) {
#if KMP_USE_TSX
  // Don't have nested lock implementation for speculative locks
  if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
      seq == lockseq_rtm_spin || seq == lockseq_adaptive)
    seq = __kmp_user_lock_seq;
#endif
  switch (seq) {
  case lockseq_tas:
    seq = lockseq_nested_tas;
    break;
#if KMP_USE_FUTEX
  case lockseq_futex:
    seq = lockseq_nested_futex;
    break;
#endif
  case lockseq_ticket:
    seq = lockseq_nested_ticket;
    break;
  case lockseq_queuing:
    seq = lockseq_nested_queuing;
    break;
  case lockseq_drdpa:
    seq = lockseq_nested_drdpa;
    break;
  default:
    // No nested variant for this sequence: use nested queuing.
    seq = lockseq_nested_queuing;
  }
  // Nested sequences are always initialized as indirect locks here.
  KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
  __kmp_itt_lock_creating(ilk->lock, loc);
#endif
}
2405
2406 /* initialize the lock with a hint */
// omp_init_lock_with_hint(): initialize a user lock, selecting the lock
// implementation from the user-supplied hint.
void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
                                uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
  }

  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
}
2429
2430 /* initialize the lock with a hint */
// omp_init_nest_lock_with_hint(): initialize a nestable user lock, selecting
// the lock implementation from the user-supplied hint.
void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                     void **user_lock, uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
  }

  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
}
2453
2454 #endif // KMP_USE_DYNAMIC_LOCK
2455
2456 /* initialize the lock */
// omp_init_lock(): initialize a user lock with the default lock sequence.
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  // No user hint: use the runtime's default lock sequence.
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // If the lock object fits into the user's omp_lock_t storage, use that
  // storage in place; otherwise allocate a lock object separately.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock
2527
2528 /* initialize the lock */
// omp_init_nest_lock(): initialize a nestable user lock with the default
// lock sequence.
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  // No user hint: use the runtime's default lock sequence.
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // If the lock object (poll word + nesting depth) fits into the user's
  // omp_nest_lock_t storage, use that storage in place; otherwise allocate.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_nest_lock
2602
// omp_destroy_lock(): destroy a user lock previously initialized by
// __kmpc_init_lock / __kmpc_init_lock_with_hint.
void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  kmp_user_lock_p lck;
  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
    // Tag 0 means an indirect lock: report the real lock object to ITT.
    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
  } else {
    lck = (kmp_user_lock_p)user_lock;
  }
  __kmp_itt_lock_destroyed(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
#else
  kmp_user_lock_p lck;

  // Recover the lock object: in-place for small TAS/futex locks, otherwise
  // through the user-lock table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */
  DESTROY_LOCK(lck);

  // In-place locks need no free; separately allocated ones are returned to
  // the runtime.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_lock
2674
2675 /* destroy the lock */
// omp_destroy_nest_lock(): destroy a nestable user lock previously
// initialized by __kmpc_init_nest_lock / __kmpc_init_nest_lock_with_hint.
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // Nest locks are indirect locks; look up the real lock object for ITT.
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Recover the lock object: in-place when poll word + nesting depth fit in
  // the user's omp_nest_lock_t storage, otherwise via table lookup.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // In-place locks need no free; separately allocated ones are returned to
  // the runtime.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock
2749
__kmpc_set_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2750 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2751 KMP_COUNT_BLOCK(OMP_set_lock);
2752 #if KMP_USE_DYNAMIC_LOCK
2753 int tag = KMP_EXTRACT_D_TAG(user_lock);
2754 #if USE_ITT_BUILD
2755 __kmp_itt_lock_acquiring(
2756 (kmp_user_lock_p)
2757 user_lock); // itt function will get to the right lock object.
2758 #endif
2759 #if OMPT_SUPPORT && OMPT_OPTIONAL
2760 // This is the case, if called from omp_init_lock_with_hint:
2761 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2762 if (!codeptr)
2763 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2764 if (ompt_enabled.ompt_callback_mutex_acquire) {
2765 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2766 ompt_mutex_lock, omp_lock_hint_none,
2767 __ompt_get_mutex_impl_type(user_lock),
2768 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2769 }
2770 #endif
2771 #if KMP_USE_INLINED_TAS
2772 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2773 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2774 } else
2775 #elif KMP_USE_INLINED_FUTEX
2776 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2777 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2778 } else
2779 #endif
2780 {
2781 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2782 }
2783 #if USE_ITT_BUILD
2784 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2785 #endif
2786 #if OMPT_SUPPORT && OMPT_OPTIONAL
2787 if (ompt_enabled.ompt_callback_mutex_acquired) {
2788 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2789 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2790 }
2791 #endif
2792
2793 #else // KMP_USE_DYNAMIC_LOCK
2794
2795 kmp_user_lock_p lck;
2796
2797 if ((__kmp_user_lock_kind == lk_tas) &&
2798 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2799 lck = (kmp_user_lock_p)user_lock;
2800 }
2801 #if KMP_USE_FUTEX
2802 else if ((__kmp_user_lock_kind == lk_futex) &&
2803 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2804 lck = (kmp_user_lock_p)user_lock;
2805 }
2806 #endif
2807 else {
2808 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2809 }
2810
2811 #if USE_ITT_BUILD
2812 __kmp_itt_lock_acquiring(lck);
2813 #endif /* USE_ITT_BUILD */
2814 #if OMPT_SUPPORT && OMPT_OPTIONAL
2815 // This is the case, if called from omp_init_lock_with_hint:
2816 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2817 if (!codeptr)
2818 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2819 if (ompt_enabled.ompt_callback_mutex_acquire) {
2820 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2821 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2822 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2823 }
2824 #endif
2825
2826 ACQUIRE_LOCK(lck, gtid);
2827
2828 #if USE_ITT_BUILD
2829 __kmp_itt_lock_acquired(lck);
2830 #endif /* USE_ITT_BUILD */
2831
2832 #if OMPT_SUPPORT && OMPT_OPTIONAL
2833 if (ompt_enabled.ompt_callback_mutex_acquired) {
2834 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2835 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2836 }
2837 #endif
2838
2839 #endif // KMP_USE_DYNAMIC_LOCK
2840 }
2841
__kmpc_set_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2842 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2843 #if KMP_USE_DYNAMIC_LOCK
2844
2845 #if USE_ITT_BUILD
2846 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2847 #endif
2848 #if OMPT_SUPPORT && OMPT_OPTIONAL
2849 // This is the case, if called from omp_init_lock_with_hint:
2850 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2851 if (!codeptr)
2852 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2853 if (ompt_enabled.enabled) {
2854 if (ompt_enabled.ompt_callback_mutex_acquire) {
2855 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2856 ompt_mutex_nest_lock, omp_lock_hint_none,
2857 __ompt_get_mutex_impl_type(user_lock),
2858 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2859 }
2860 }
2861 #endif
2862 int acquire_status =
2863 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2864 (void)acquire_status;
2865 #if USE_ITT_BUILD
2866 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2867 #endif
2868
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870 if (ompt_enabled.enabled) {
2871 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2872 if (ompt_enabled.ompt_callback_mutex_acquired) {
2873 // lock_first
2874 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2875 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2876 codeptr);
2877 }
2878 } else {
2879 if (ompt_enabled.ompt_callback_nest_lock) {
2880 // lock_next
2881 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2882 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2883 }
2884 }
2885 }
2886 #endif
2887
2888 #else // KMP_USE_DYNAMIC_LOCK
2889 int acquire_status;
2890 kmp_user_lock_p lck;
2891
2892 if ((__kmp_user_lock_kind == lk_tas) &&
2893 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2894 OMP_NEST_LOCK_T_SIZE)) {
2895 lck = (kmp_user_lock_p)user_lock;
2896 }
2897 #if KMP_USE_FUTEX
2898 else if ((__kmp_user_lock_kind == lk_futex) &&
2899 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2900 OMP_NEST_LOCK_T_SIZE)) {
2901 lck = (kmp_user_lock_p)user_lock;
2902 }
2903 #endif
2904 else {
2905 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2906 }
2907
2908 #if USE_ITT_BUILD
2909 __kmp_itt_lock_acquiring(lck);
2910 #endif /* USE_ITT_BUILD */
2911 #if OMPT_SUPPORT && OMPT_OPTIONAL
2912 // This is the case, if called from omp_init_lock_with_hint:
2913 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2914 if (!codeptr)
2915 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2916 if (ompt_enabled.enabled) {
2917 if (ompt_enabled.ompt_callback_mutex_acquire) {
2918 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2919 ompt_mutex_nest_lock, omp_lock_hint_none,
2920 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2921 codeptr);
2922 }
2923 }
2924 #endif
2925
2926 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2927
2928 #if USE_ITT_BUILD
2929 __kmp_itt_lock_acquired(lck);
2930 #endif /* USE_ITT_BUILD */
2931
2932 #if OMPT_SUPPORT && OMPT_OPTIONAL
2933 if (ompt_enabled.enabled) {
2934 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2935 if (ompt_enabled.ompt_callback_mutex_acquired) {
2936 // lock_first
2937 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2938 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2939 }
2940 } else {
2941 if (ompt_enabled.ompt_callback_nest_lock) {
2942 // lock_next
2943 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2944 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2945 }
2946 }
2947 }
2948 #endif
2949
2950 #endif // KMP_USE_DYNAMIC_LOCK
2951 }
2952
__kmpc_unset_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2953 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2954 #if KMP_USE_DYNAMIC_LOCK
2955
2956 int tag = KMP_EXTRACT_D_TAG(user_lock);
2957 #if USE_ITT_BUILD
2958 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2959 #endif
2960 #if KMP_USE_INLINED_TAS
2961 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2962 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2963 } else
2964 #elif KMP_USE_INLINED_FUTEX
2965 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2966 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2967 } else
2968 #endif
2969 {
2970 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2971 }
2972
2973 #if OMPT_SUPPORT && OMPT_OPTIONAL
2974 // This is the case, if called from omp_init_lock_with_hint:
2975 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2976 if (!codeptr)
2977 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2978 if (ompt_enabled.ompt_callback_mutex_released) {
2979 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2980 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2981 }
2982 #endif
2983
2984 #else // KMP_USE_DYNAMIC_LOCK
2985
2986 kmp_user_lock_p lck;
2987
2988 /* Can't use serial interval since not block structured */
2989 /* release the lock */
2990
2991 if ((__kmp_user_lock_kind == lk_tas) &&
2992 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2993 #if KMP_OS_LINUX && \
2994 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2995 // "fast" path implemented to fix customer performance issue
2996 #if USE_ITT_BUILD
2997 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2998 #endif /* USE_ITT_BUILD */
2999 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
3000 KMP_MB();
3001
3002 #if OMPT_SUPPORT && OMPT_OPTIONAL
3003 // This is the case, if called from omp_init_lock_with_hint:
3004 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3005 if (!codeptr)
3006 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3007 if (ompt_enabled.ompt_callback_mutex_released) {
3008 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3009 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3010 }
3011 #endif
3012
3013 return;
3014 #else
3015 lck = (kmp_user_lock_p)user_lock;
3016 #endif
3017 }
3018 #if KMP_USE_FUTEX
3019 else if ((__kmp_user_lock_kind == lk_futex) &&
3020 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3021 lck = (kmp_user_lock_p)user_lock;
3022 }
3023 #endif
3024 else {
3025 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
3026 }
3027
3028 #if USE_ITT_BUILD
3029 __kmp_itt_lock_releasing(lck);
3030 #endif /* USE_ITT_BUILD */
3031
3032 RELEASE_LOCK(lck, gtid);
3033
3034 #if OMPT_SUPPORT && OMPT_OPTIONAL
3035 // This is the case, if called from omp_init_lock_with_hint:
3036 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3037 if (!codeptr)
3038 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3039 if (ompt_enabled.ompt_callback_mutex_released) {
3040 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3041 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3042 }
3043 #endif
3044
3045 #endif // KMP_USE_DYNAMIC_LOCK
3046 }
3047
3048 /* release the lock */
__kmpc_unset_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)3049 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3050 #if KMP_USE_DYNAMIC_LOCK
3051
3052 #if USE_ITT_BUILD
3053 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3054 #endif
3055 int release_status =
3056 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3057 (void)release_status;
3058
3059 #if OMPT_SUPPORT && OMPT_OPTIONAL
3060 // This is the case, if called from omp_init_lock_with_hint:
3061 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3062 if (!codeptr)
3063 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3064 if (ompt_enabled.enabled) {
3065 if (release_status == KMP_LOCK_RELEASED) {
3066 if (ompt_enabled.ompt_callback_mutex_released) {
3067 // release_lock_last
3068 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3069 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3070 codeptr);
3071 }
3072 } else if (ompt_enabled.ompt_callback_nest_lock) {
3073 // release_lock_prev
3074 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3075 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3076 }
3077 }
3078 #endif
3079
3080 #else // KMP_USE_DYNAMIC_LOCK
3081
3082 kmp_user_lock_p lck;
3083
3084 /* Can't use serial interval since not block structured */
3085
3086 if ((__kmp_user_lock_kind == lk_tas) &&
3087 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3088 OMP_NEST_LOCK_T_SIZE)) {
3089 #if KMP_OS_LINUX && \
3090 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3091 // "fast" path implemented to fix customer performance issue
3092 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3093 #if USE_ITT_BUILD
3094 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3095 #endif /* USE_ITT_BUILD */
3096
3097 #if OMPT_SUPPORT && OMPT_OPTIONAL
3098 int release_status = KMP_LOCK_STILL_HELD;
3099 #endif
3100
3101 if (--(tl->lk.depth_locked) == 0) {
3102 TCW_4(tl->lk.poll, 0);
3103 #if OMPT_SUPPORT && OMPT_OPTIONAL
3104 release_status = KMP_LOCK_RELEASED;
3105 #endif
3106 }
3107 KMP_MB();
3108
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110 // This is the case, if called from omp_init_lock_with_hint:
3111 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3112 if (!codeptr)
3113 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3114 if (ompt_enabled.enabled) {
3115 if (release_status == KMP_LOCK_RELEASED) {
3116 if (ompt_enabled.ompt_callback_mutex_released) {
3117 // release_lock_last
3118 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3119 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3120 }
3121 } else if (ompt_enabled.ompt_callback_nest_lock) {
3122 // release_lock_previous
3123 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3124 ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3125 }
3126 }
3127 #endif
3128
3129 return;
3130 #else
3131 lck = (kmp_user_lock_p)user_lock;
3132 #endif
3133 }
3134 #if KMP_USE_FUTEX
3135 else if ((__kmp_user_lock_kind == lk_futex) &&
3136 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3137 OMP_NEST_LOCK_T_SIZE)) {
3138 lck = (kmp_user_lock_p)user_lock;
3139 }
3140 #endif
3141 else {
3142 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3143 }
3144
3145 #if USE_ITT_BUILD
3146 __kmp_itt_lock_releasing(lck);
3147 #endif /* USE_ITT_BUILD */
3148
3149 int release_status;
3150 release_status = RELEASE_NESTED_LOCK(lck, gtid);
3151 #if OMPT_SUPPORT && OMPT_OPTIONAL
3152 // This is the case, if called from omp_init_lock_with_hint:
3153 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3154 if (!codeptr)
3155 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3156 if (ompt_enabled.enabled) {
3157 if (release_status == KMP_LOCK_RELEASED) {
3158 if (ompt_enabled.ompt_callback_mutex_released) {
3159 // release_lock_last
3160 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3161 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3162 }
3163 } else if (ompt_enabled.ompt_callback_nest_lock) {
3164 // release_lock_previous
3165 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3166 ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3167 }
3168 }
3169 #endif
3170
3171 #endif // KMP_USE_DYNAMIC_LOCK
3172 }
3173
/* try to acquire the lock without blocking (implements omp_test_lock);
   returns FTN_TRUE on success, FTN_FALSE if the lock is already held */
int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_test_lock);

#if KMP_USE_DYNAMIC_LOCK
  int rc;
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_test_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  // Inlined TAS/futex fast paths apply only when consistency checking is
  // off; otherwise dispatch through the direct-lock function table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
  } else
#endif
  {
    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
  if (rc) {
#if USE_ITT_BUILD
    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
    }
#endif
    return FTN_TRUE;
  } else {
#if USE_ITT_BUILD
    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
#endif
    return FTN_FALSE;
  }

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;
  int rc;

  // Select the lock implementation: use the lock object in place if it fits
  // inside omp_lock_t; otherwise look it up in the lock table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  rc = TEST_LOCK(lck, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired(lck);
  } else {
    __kmp_itt_lock_cancelled(lck);
  }
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  return (rc ? FTN_TRUE : FTN_FALSE);

/* Can't use serial interval since not block structured */

#endif // KMP_USE_DYNAMIC_LOCK
}
3281
3282 /* try to acquire the lock */
__kmpc_test_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)3283 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3284 #if KMP_USE_DYNAMIC_LOCK
3285 int rc;
3286 #if USE_ITT_BUILD
3287 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3288 #endif
3289 #if OMPT_SUPPORT && OMPT_OPTIONAL
3290 // This is the case, if called from omp_init_lock_with_hint:
3291 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3292 if (!codeptr)
3293 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3294 if (ompt_enabled.ompt_callback_mutex_acquire) {
3295 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3296 ompt_mutex_test_nest_lock, omp_lock_hint_none,
3297 __ompt_get_mutex_impl_type(user_lock),
3298 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3299 }
3300 #endif
3301 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3302 #if USE_ITT_BUILD
3303 if (rc) {
3304 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3305 } else {
3306 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3307 }
3308 #endif
3309 #if OMPT_SUPPORT && OMPT_OPTIONAL
3310 if (ompt_enabled.enabled && rc) {
3311 if (rc == 1) {
3312 if (ompt_enabled.ompt_callback_mutex_acquired) {
3313 // lock_first
3314 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3315 ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3316 codeptr);
3317 }
3318 } else {
3319 if (ompt_enabled.ompt_callback_nest_lock) {
3320 // lock_next
3321 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3322 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3323 }
3324 }
3325 }
3326 #endif
3327 return rc;
3328
3329 #else // KMP_USE_DYNAMIC_LOCK
3330
3331 kmp_user_lock_p lck;
3332 int rc;
3333
3334 if ((__kmp_user_lock_kind == lk_tas) &&
3335 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3336 OMP_NEST_LOCK_T_SIZE)) {
3337 lck = (kmp_user_lock_p)user_lock;
3338 }
3339 #if KMP_USE_FUTEX
3340 else if ((__kmp_user_lock_kind == lk_futex) &&
3341 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3342 OMP_NEST_LOCK_T_SIZE)) {
3343 lck = (kmp_user_lock_p)user_lock;
3344 }
3345 #endif
3346 else {
3347 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3348 }
3349
3350 #if USE_ITT_BUILD
3351 __kmp_itt_lock_acquiring(lck);
3352 #endif /* USE_ITT_BUILD */
3353
3354 #if OMPT_SUPPORT && OMPT_OPTIONAL
3355 // This is the case, if called from omp_init_lock_with_hint:
3356 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3357 if (!codeptr)
3358 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3359 if (ompt_enabled.enabled) &&
3360 ompt_enabled.ompt_callback_mutex_acquire) {
3361 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3362 ompt_mutex_test_nest_lock, omp_lock_hint_none,
3363 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3364 codeptr);
3365 }
3366 #endif
3367
3368 rc = TEST_NESTED_LOCK(lck, gtid);
3369 #if USE_ITT_BUILD
3370 if (rc) {
3371 __kmp_itt_lock_acquired(lck);
3372 } else {
3373 __kmp_itt_lock_cancelled(lck);
3374 }
3375 #endif /* USE_ITT_BUILD */
3376 #if OMPT_SUPPORT && OMPT_OPTIONAL
3377 if (ompt_enabled.enabled && rc) {
3378 if (rc == 1) {
3379 if (ompt_enabled.ompt_callback_mutex_acquired) {
3380 // lock_first
3381 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3382 ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3383 }
3384 } else {
3385 if (ompt_enabled.ompt_callback_nest_lock) {
3386 // lock_next
3387 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3388 ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3389 }
3390 }
3391 }
3392 #endif
3393 return rc;
3394
3395 /* Can't use serial interval since not block structured */
3396
3397 #endif // KMP_USE_DYNAMIC_LOCK
3398 }
3399
// Interface to fast scalable reduce methods routines

// keep the selected method in a thread local structure for cross-function
// usage: will be used in __kmpc_end_reduce* functions;
// another solution: to re-determine the method one more time in
// __kmpc_end_reduce* functions (new prototype required then)
// AT: which solution is better?

// Record the reduction method chosen for thread 'gtid' so the matching
// __kmpc_end_reduce* call can retrieve it.
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Retrieve the reduction method previously recorded for thread 'gtid'.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// description of the packed_reduction_method variable: look at the macros in
// kmp.h
3415
// used in a critical section reduce block
// Acquire the critical-section lock embedded in (or referenced by) 'crit',
// initializing the lock lazily on first use.
static __forceinline void
__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                          kmp_critical_name *crit) {

  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only)
  // why was it visible in previous implementation?
  // should we keep it visible in new reduce block?
  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  if (*lk == 0) {
    // Lazy initialization: direct locks are tagged in place via CAS (so at
    // most one thread initializes); indirect locks go through the csptr
    // initializer.
    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid,
                                KMP_GET_I_TAG(__kmp_user_lock_seq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock word itself is the lock object.
    lck = (kmp_user_lock_p)lk;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
  } else {
    // Indirect lock: the lock word stores a pointer to the lock object.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we
  // have to use a pointer.
  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
    lck = (kmp_user_lock_p)crit;
  } else {
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }
  KMP_DEBUG_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
}
3479
// used in a critical section reduce block
// Release the critical-section lock acquired by
// __kmp_enter_critical_section_reduce_block.
static __forceinline void
__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                        kmp_critical_name *crit) {

  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    // Direct lock: the lock word itself is the lock object.
    lck = (kmp_user_lock_p)crit;
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
  } else {
    // Indirect lock: the lock word stores a pointer to the lock object.
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we have
  // to use a pointer.
  if (__kmp_base_user_lock_size > 32) {
    lck = *((kmp_user_lock_p *)crit);
    KMP_ASSERT(lck != NULL);
  } else {
    lck = (kmp_user_lock_p)crit;
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmp_end_critical_section_reduce_block
3521
3522 static __forceinline int
__kmp_swap_teams_for_teams_reduction(kmp_info_t * th,kmp_team_t ** team_p,int * task_state)3523 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3524 int *task_state) {
3525 kmp_team_t *team;
3526
3527 // Check if we are inside the teams construct?
3528 if (th->th.th_teams_microtask) {
3529 *team_p = team = th->th.th_team;
3530 if (team->t.t_level == th->th.th_teams_level) {
3531 // This is reduction at teams construct.
3532 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3533 // Let's swap teams temporarily for the reduction.
3534 th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3535 th->th.th_team = team->t.t_parent;
3536 th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3537 th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3538 *task_state = th->th.th_task_state;
3539 th->th.th_task_state = 0;
3540
3541 return 1;
3542 }
3543 }
3544 return 0;
3545 }
3546
// Undo the team swap performed by __kmp_swap_teams_for_teams_reduction,
// putting the thread back into 'team' and restoring its saved task state.
static __forceinline void
__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
  th->th.th_info.ds.ds_tid = 0; // tid was 0 before the swap (asserted there)
  th->th.th_team = team;
  th->th.th_team_nproc = team->t.t_nproc;
  th->th.th_task_team = team->t.t_task_team[task_state];
  __kmp_type_convert(task_state, &(th->th.th_task_state));
}
3556
3557 /* 2.a.i. Reduce Block without a terminating barrier */
3558 /*!
3559 @ingroup SYNCHRONIZATION
3560 @param loc source location information
3561 @param global_tid global thread number
3562 @param num_vars number of items (variables) to be reduced
3563 @param reduce_size size of data in bytes to be reduced
3564 @param reduce_data pointer to data to be reduced
3565 @param reduce_func callback function providing reduction operation on two
3566 operands and returning result of reduction in lhs_data
3567 @param lck pointer to the unique lock data structure
3568 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3569 threads if atomic reduction needed
3570
3571 The nowait version is used for a reduce clause with the nowait argument.
3572 */
3573 kmp_int32
__kmpc_reduce_nowait(ident_t * loc,kmp_int32 global_tid,kmp_int32 num_vars,size_t reduce_size,void * reduce_data,void (* reduce_func)(void * lhs_data,void * rhs_data),kmp_critical_name * lck)3574 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3575 size_t reduce_size, void *reduce_data,
3576 void (*reduce_func)(void *lhs_data, void *rhs_data),
3577 kmp_critical_name *lck) {
3578
3579 KMP_COUNT_BLOCK(REDUCE_nowait);
3580 int retval = 0;
3581 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3582 kmp_info_t *th;
3583 kmp_team_t *team;
3584 int teams_swapped = 0, task_state;
3585 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3586 __kmp_assert_valid_gtid(global_tid);
3587
3588 // why do we need this initialization here at all?
3589 // Reduction clause can not be used as a stand-alone directive.
3590
3591 // do not call __kmp_serial_initialize(), it will be called by
3592 // __kmp_parallel_initialize() if needed
3593 // possible detection of false-positive race by the threadchecker ???
3594 if (!TCR_4(__kmp_init_parallel))
3595 __kmp_parallel_initialize();
3596
3597 __kmp_resume_if_soft_paused();
3598
3599 // check correctness of reduce block nesting
3600 #if KMP_USE_DYNAMIC_LOCK
3601 if (__kmp_env_consistency_check)
3602 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3603 #else
3604 if (__kmp_env_consistency_check)
3605 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3606 #endif
3607
3608 th = __kmp_thread_from_gtid(global_tid);
3609 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3610
3611 // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3612 // the value should be kept in a variable
3613 // the variable should be either a construct-specific or thread-specific
3614 // property, not a team specific property
3615 // (a thread can reach the next reduce block on the next construct, reduce
3616 // method may differ on the next construct)
3617 // an ident_t "loc" parameter could be used as a construct-specific property
3618 // (what if loc == 0?)
3619 // (if both construct-specific and team-specific variables were shared,
3620 // then unness extra syncs should be needed)
3621 // a thread-specific variable is better regarding two issues above (next
3622 // construct and extra syncs)
3623 // a thread-specific "th_local.reduction_method" variable is used currently
3624 // each thread executes 'determine' and 'set' lines (no need to execute by one
3625 // thread, to avoid unness extra syncs)
3626
3627 packed_reduction_method = __kmp_determine_reduction_method(
3628 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3629 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3630
3631 OMPT_REDUCTION_DECL(th, global_tid);
3632 if (packed_reduction_method == critical_reduce_block) {
3633
3634 OMPT_REDUCTION_BEGIN;
3635
3636 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3637 retval = 1;
3638
3639 } else if (packed_reduction_method == empty_reduce_block) {
3640
3641 OMPT_REDUCTION_BEGIN;
3642
3643 // usage: if team size == 1, no synchronization is required ( Intel
3644 // platforms only )
3645 retval = 1;
3646
3647 } else if (packed_reduction_method == atomic_reduce_block) {
3648
3649 retval = 2;
3650
3651 // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3652 // won't be called by the code gen)
3653 // (it's not quite good, because the checking block has been closed by
3654 // this 'pop',
3655 // but atomic operation has not been executed yet, will be executed
3656 // slightly later, literally on next instruction)
3657 if (__kmp_env_consistency_check)
3658 __kmp_pop_sync(global_tid, ct_reduce, loc);
3659
3660 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3661 tree_reduce_block)) {
3662
3663 // AT: performance issue: a real barrier here
3664 // AT: (if primary thread is slow, other threads are blocked here waiting for
3665 // the primary thread to come and release them)
3666 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3667 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3668 // be confusing to a customer)
3669 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3670 // might go faster and be more in line with sense of NOWAIT
3671 // AT: TO DO: do epcc test and compare times
3672
3673 // this barrier should be invisible to a customer and to the threading profile
3674 // tool (it's neither a terminating barrier nor customer's code, it's
3675 // used for an internal purpose)
3676 #if OMPT_SUPPORT
3677 // JP: can this barrier potentially leed to task scheduling?
3678 // JP: as long as there is a barrier in the implementation, OMPT should and
3679 // will provide the barrier events
3680 // so we set-up the necessary frame/return addresses.
3681 ompt_frame_t *ompt_frame;
3682 if (ompt_enabled.enabled) {
3683 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3684 if (ompt_frame->enter_frame.ptr == NULL)
3685 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3686 }
3687 OMPT_STORE_RETURN_ADDRESS(global_tid);
3688 #endif
3689 #if USE_ITT_NOTIFY
3690 __kmp_threads[global_tid]->th.th_ident = loc;
3691 #endif
3692 retval =
3693 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3694 global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3695 retval = (retval != 0) ? (0) : (1);
3696 #if OMPT_SUPPORT && OMPT_OPTIONAL
3697 if (ompt_enabled.enabled) {
3698 ompt_frame->enter_frame = ompt_data_none;
3699 }
3700 #endif
3701
3702 // all other workers except primary thread should do this pop here
3703 // ( none of other workers will get to __kmpc_end_reduce_nowait() )
3704 if (__kmp_env_consistency_check) {
3705 if (retval == 0) {
3706 __kmp_pop_sync(global_tid, ct_reduce, loc);
3707 }
3708 }
3709
3710 } else {
3711
3712 // should never reach this block
3713 KMP_ASSERT(0); // "unexpected method"
3714 }
3715 if (teams_swapped) {
3716 __kmp_restore_swapped_teams(th, team, task_state);
3717 }
3718 KA_TRACE(
3719 10,
3720 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3721 global_tid, packed_reduction_method, retval));
3722
3723 return retval;
3724 }
3725
3726 /*!
3727 @ingroup SYNCHRONIZATION
3728 @param loc source location information
3729 @param global_tid global thread id.
3730 @param lck pointer to the unique lock data structure
3731
3732 Finish the execution of a reduce nowait.
3733 */
__kmpc_end_reduce_nowait(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * lck)3734 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3735 kmp_critical_name *lck) {
3736
3737 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3738
3739 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3740 __kmp_assert_valid_gtid(global_tid);
3741
3742 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3743
3744 OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3745
3746 if (packed_reduction_method == critical_reduce_block) {
3747
3748 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3749 OMPT_REDUCTION_END;
3750
3751 } else if (packed_reduction_method == empty_reduce_block) {
3752
3753 // usage: if team size == 1, no synchronization is required ( on Intel
3754 // platforms only )
3755
3756 OMPT_REDUCTION_END;
3757
3758 } else if (packed_reduction_method == atomic_reduce_block) {
3759
3760 // neither primary thread nor other workers should get here
3761 // (code gen does not generate this call in case 2: atomic reduce block)
3762 // actually it's better to remove this elseif at all;
3763 // after removal this value will checked by the 'else' and will assert
3764
3765 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3766 tree_reduce_block)) {
3767
3768 // only primary thread gets here
3769 // OMPT: tree reduction is annotated in the barrier code
3770
3771 } else {
3772
3773 // should never reach this block
3774 KMP_ASSERT(0); // "unexpected method"
3775 }
3776
3777 if (__kmp_env_consistency_check)
3778 __kmp_pop_sync(global_tid, ct_reduce, loc);
3779
3780 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3781 global_tid, packed_reduction_method));
3782
3783 return;
3784 }
3785
3786 /* 2.a.ii. Reduce Block with a terminating barrier */
3787
3788 /*!
3789 @ingroup SYNCHRONIZATION
3790 @param loc source location information
3791 @param global_tid global thread number
3792 @param num_vars number of items (variables) to be reduced
3793 @param reduce_size size of data in bytes to be reduced
3794 @param reduce_data pointer to data to be reduced
3795 @param reduce_func callback function providing reduction operation on two
3796 operands and returning result of reduction in lhs_data
3797 @param lck pointer to the unique lock data structure
3798 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3799 threads if atomic reduction needed
3800
3801 A blocking reduce that includes an implicit barrier.
3802 */
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  // check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  // Select the reduction method for this construct; the chosen value is
  // stored per-thread and re-read by the matching __kmpc_end_reduce() call.
  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  OMPT_REDUCTION_DECL(th, global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    OMPT_REDUCTION_BEGIN;
    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_BEGIN;
    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    // retval == 2 tells the code gen to perform the reduction atomically;
    // no __kmpc_end_reduce() call will follow for the workers in this case.
    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // this barrier should be visible to a customer and to the threading profile
    // tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    // Set up frame/return addresses so OMPT reports the barrier correctly.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except primary thread should do this pop here
    // (none of other workers except primary will enter __kmpc_end_reduce())
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: primary thread
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));
  return retval;
}
3914
3915 /*!
3916 @ingroup SYNCHRONIZATION
3917 @param loc source location information
3918 @param global_tid global thread id.
3919 @param lck pointer to the unique lock data structure
3920
3921 Finish the execution of a blocking reduce.
3922 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3923 start function.
3924 */
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  // Re-read the method chosen by the matching __kmpc_reduce() call
  // (stored per-thread by __KMP_SET_REDUCTION_METHOD).
  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
  OMPT_REDUCTION_DECL(th, global_tid);

  if (packed_reduction_method == critical_reduce_block) {
    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

    OMPT_REDUCTION_END;

    // TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_END;

    // usage: if team size==1, no synchronization is required (Intel platforms only)

    // TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
    // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only primary thread executes here (primary releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}
4042
4043 #undef __KMP_GET_REDUCTION_METHOD
4044 #undef __KMP_SET_REDUCTION_METHOD
4045
4046 /* end of interface to fast scalable reduce routines */
4047
__kmpc_get_taskid()4048 kmp_uint64 __kmpc_get_taskid() {
4049
4050 kmp_int32 gtid;
4051 kmp_info_t *thread;
4052
4053 gtid = __kmp_get_gtid();
4054 if (gtid < 0) {
4055 return 0;
4056 }
4057 thread = __kmp_thread_from_gtid(gtid);
4058 return thread->th.th_current_task->td_task_id;
4059
4060 } // __kmpc_get_taskid
4061
__kmpc_get_parent_taskid()4062 kmp_uint64 __kmpc_get_parent_taskid() {
4063
4064 kmp_int32 gtid;
4065 kmp_info_t *thread;
4066 kmp_taskdata_t *parent_task;
4067
4068 gtid = __kmp_get_gtid();
4069 if (gtid < 0) {
4070 return 0;
4071 }
4072 thread = __kmp_thread_from_gtid(gtid);
4073 parent_task = thread->th.th_current_task->td_parent;
4074 return (parent_task == NULL ? 0 : parent_task->td_task_id);
4075
4076 } // __kmpc_get_parent_taskid
4077
4078 /*!
4079 @ingroup WORK_SHARING
4080 @param loc source location information.
4081 @param gtid global thread number.
4082 @param num_dims number of associated doacross loops.
4083 @param dims info on loops bounds.
4084
4085 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i=2; i<9; i+=2) lo=2, up=8, st=2.
4088 */
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  __kmp_assert_valid_gtid(gtid);
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer.
  // Layout: [0] = num_dims, [1] = &doacross_num_done, [2..4] = lo/up/st of
  // dim 0, then for each further dim: range_length, lo, up, st (4 entries).
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                 __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size =
        (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
4210
// Block the calling thread until the doacross iteration identified by vec
// (one coordinate per collapsed loop dimension) has been posted via
// __kmpc_doacross_post(). Out-of-bounds sink vectors are ignored.
void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  __kmp_assert_valid_gtid(gtid);
  kmp_int64 shft;
  size_t num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  // (bounds/strides were recorded by __kmpc_doacross_init in th_doacross_info)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = (size_t)pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_dependence_t deps[num_dims];
#endif
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
#if OMPT_SUPPORT && OMPT_OPTIONAL
  deps[0].variable.value = iter_number;
  deps[0].dependence_type = ompt_dependence_type_sink;
#endif
  // Fold the remaining dimensions into a single linear iteration number,
  // using the per-dimension range lengths kept in th_doacross_info.
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    size_t j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
#if OMPT_SUPPORT && OMPT_OPTIONAL
    deps[i].variable.value = iter;
    deps[i].dependence_type = ompt_dependence_type_sink;
#endif
  }
  // Spin until the bit for this iteration is set by the posting thread.
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_dependences) {
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
  }
#endif
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
4322
// Mark the doacross iteration identified by vec as completed, releasing any
// thread blocked on it in __kmpc_doacross_wait().
void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  __kmp_assert_valid_gtid(gtid);
  kmp_int64 shft;
  size_t num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = (size_t)pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_dependence_t deps[num_dims];
#endif
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
#if OMPT_SUPPORT && OMPT_OPTIONAL
  deps[0].variable.value = iter_number;
  deps[0].dependence_type = ompt_dependence_type_source;
#endif
  // Fold the remaining dimensions into a single linear iteration number.
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    size_t j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
#if OMPT_SUPPORT && OMPT_OPTIONAL
    deps[i].variable.value = iter;
    deps[i].dependence_type = ompt_dependence_type_source;
#endif
  }
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_dependences) {
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
  }
#endif
  // Atomically set the completion bit for this iteration; skip the RMW when
  // the bit is already set (another thread may post the same iteration).
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}
4395
// Finalize a doacross loop: release the thread's private bounds buffer and,
// when the last thread arrives, recycle the shared flag array and dispatch
// buffer for reuse by a future loop.
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  // th_doacross_info[1] holds the address of the shared num_done counter
  // (saved by __kmpc_doacross_init so the buffer index is not needed here).
  num_done =
      KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
4431
4432 /* OpenMP 5.1 Memory Management routines */
omp_alloc(size_t size,omp_allocator_handle_t allocator)4433 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4434 return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4435 }
4436
omp_aligned_alloc(size_t align,size_t size,omp_allocator_handle_t allocator)4437 void *omp_aligned_alloc(size_t align, size_t size,
4438 omp_allocator_handle_t allocator) {
4439 return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4440 }
4441
omp_calloc(size_t nmemb,size_t size,omp_allocator_handle_t allocator)4442 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4443 return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4444 }
4445
omp_aligned_calloc(size_t align,size_t nmemb,size_t size,omp_allocator_handle_t allocator)4446 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4447 omp_allocator_handle_t allocator) {
4448 return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4449 }
4450
omp_realloc(void * ptr,size_t size,omp_allocator_handle_t allocator,omp_allocator_handle_t free_allocator)4451 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4452 omp_allocator_handle_t free_allocator) {
4453 return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4454 free_allocator);
4455 }
4456
omp_free(void * ptr,omp_allocator_handle_t allocator)4457 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4458 ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4459 }
4460 /* end of OpenMP 5.1 Memory Management routines */
4461
__kmpc_get_target_offload(void)4462 int __kmpc_get_target_offload(void) {
4463 if (!__kmp_init_serial) {
4464 __kmp_serial_initialize();
4465 }
4466 return __kmp_target_offload;
4467 }
4468
__kmpc_pause_resource(kmp_pause_status_t level)4469 int __kmpc_pause_resource(kmp_pause_status_t level) {
4470 if (!__kmp_init_serial) {
4471 return 1; // Can't pause if runtime is not initialized
4472 }
4473 return __kmp_pause_resource(level);
4474 }
4475
__kmpc_error(ident_t * loc,int severity,const char * message)4476 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4477 if (!__kmp_init_serial)
4478 __kmp_serial_initialize();
4479
4480 KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4481
4482 #if OMPT_SUPPORT
4483 if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4484 ompt_callbacks.ompt_callback(ompt_callback_error)(
4485 (ompt_severity_t)severity, message, KMP_STRLEN(message),
4486 OMPT_GET_RETURN_ADDRESS(0));
4487 }
4488 #endif // OMPT_SUPPORT
4489
4490 char *src_loc;
4491 if (loc && loc->psource) {
4492 kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4493 src_loc =
4494 __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4495 __kmp_str_loc_free(&str_loc);
4496 } else {
4497 src_loc = __kmp_str_format("unknown");
4498 }
4499
4500 if (severity == severity_warning)
4501 KMP_WARNING(UserDirectedWarning, src_loc, message);
4502 else
4503 KMP_FATAL(UserDirectedError, src_loc, message);
4504
4505 __kmp_str_free(&src_loc);
4506 }
4507
4508 // Mark begin of scope directive.
__kmpc_scope(ident_t * loc,kmp_int32 gtid,void * reserved)4509 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4510 // reserved is for extension of scope directive and not used.
4511 #if OMPT_SUPPORT && OMPT_OPTIONAL
4512 if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4513 kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4514 int tid = __kmp_tid_from_gtid(gtid);
4515 ompt_callbacks.ompt_callback(ompt_callback_work)(
4516 ompt_work_scope, ompt_scope_begin,
4517 &(team->t.ompt_team_info.parallel_data),
4518 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4519 OMPT_GET_RETURN_ADDRESS(0));
4520 }
4521 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4522 }
4523
4524 // Mark end of scope directive
__kmpc_end_scope(ident_t * loc,kmp_int32 gtid,void * reserved)4525 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4526 // reserved is for extension of scope directive and not used.
4527 #if OMPT_SUPPORT && OMPT_OPTIONAL
4528 if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4529 kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4530 int tid = __kmp_tid_from_gtid(gtid);
4531 ompt_callbacks.ompt_callback(ompt_callback_work)(
4532 ompt_work_scope, ompt_scope_end,
4533 &(team->t.ompt_team_info.parallel_data),
4534 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4535 OMPT_GET_RETURN_ADDRESS(0));
4536 }
4537 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4538 }
4539
#ifdef KMP_USE_VERSION_SYMBOLS
// For GOMP compatibility there are two versions of each omp_* API.
// One is the plain C symbol and one is the Fortran symbol with an appended
// underscore. When we implement a specific ompc_* version of an omp_*
// function, we want the plain GOMP versioned symbol to alias the ompc_* version
// instead of the Fortran versions in kmp_ftn_entry.h
extern "C" {
// Have to undef these from omp.h so they aren't translated into
// their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
#ifdef omp_set_affinity_format
#undef omp_set_affinity_format
#endif
#ifdef omp_get_affinity_format
#undef omp_get_affinity_format
#endif
#ifdef omp_display_affinity
#undef omp_display_affinity
#endif
#ifdef omp_capture_affinity
#undef omp_capture_affinity
#endif
// Macro arguments: (internal ompc_* symbol, exported omp_* alias,
// API version number, ELF version-node string). These emit versioned
// symbol aliases, so the exact token sequence is load-bearing.
KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
                        "OMP_5.0");
KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
                        "OMP_5.0");
KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
                        "OMP_5.0");
KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
                        "OMP_5.0");
} // extern "C"
#endif
4571