/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#define __KMP_IMP
#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc in source location information
 * @param flags in for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
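 *
 * For illustration, a front end could bracket a program with these calls
 * explicitly (a sketch only; most compilers simply rely on the implicit
 * initialization):
 * @code
 * __kmpc_begin(&loc, 0); // optional explicit initialization
 * // ... code that uses the OpenMP runtime ...
 * __kmpc_end(&loc);      // optional matching shutdown
 * @endcode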
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shut down the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes __kmpc_end()
  // a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shutdown).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // The normal exit process on Windows does not allow worker threads of the
  // final parallel region to finish reporting their events, so shutting down
  // the library here fixes the issue at least for the cases where __kmpc_end()
  // is placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
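
For example (a sketch, not required usage), any thread, even one not created
by OpenMP, may ask for its global index; 'loc' is a compiler-generated source
location and is not otherwise interpreted here:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
@endcode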
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime.

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, since the runtime maintains
underlying threads even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active parallel
construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));

  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}

/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  if (__kmp_par_range == 0) {
    return TRUE;
  }
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  // Guard against a missing third ';' before scanning for the line number.
  if (semi3 != NULL && KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero if
not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  return __kmp_entry_thread()->th.th_root->r.r_active;
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
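
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp parallel num_threads(4)', where 'outlined_fn' is a hypothetical
compiler-generated routine:
@code
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);
@endcode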
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}

void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}

#if OMP_40_ENABLED

void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

#endif /* OMP_40_ENABLED */

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
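
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp parallel' with one shared variable x, where 'outlined' is a
hypothetical compiler-generated routine:
@code
void outlined(kmp_int32 *gtid, kmp_int32 *bound_tid, int *x) { /* body */ }

__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode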
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start the parallel region timer.
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // Maybe saving thr_state would be enough here.
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

#if OMP_40_ENABLED
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
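
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp teams num_teams(4) thread_limit(8)', where 'outlined_teams' is a
hypothetical compiler-generated routine:
@code
__kmpc_push_num_teams(&loc, gtid, 4, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams);
@endcode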
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // Check if __kmpc_push_num_teams was called; set the default number of teams
  // otherwise.
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
#endif /* OMP_40_ENABLED */

// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// OpenMP code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // Reset/clear the task id only after unlinking the task.
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
     values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
#if OMP_50_ENABLED
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
#endif

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO: the code below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
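
For illustration, a bare '#pragma omp flush' is typically (not mandatorily)
lowered to a single call:
@code
__kmpc_flush(&loc);
@endcode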
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since we use volatile instead in the library */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
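
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp barrier':
@code
__kmpc_barrier(&loc, gtid); // gtid obtained from __kmpc_global_thread_num()
@endcode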
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when the 'barrier' directive is present or at the
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
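
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp master':
@code
if (__kmpc_master(&loc, gtid)) {
  /* ... master block ... */
  __kmpc_end_master(&loc, gtid);
}
@endcode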
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
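
For illustration, a sketch of a typical (not mandated) lowering of an
<tt>ordered</tt> region inside a loop chunk:
@code
__kmpc_ordered(&loc, gtid);
/* ... statements executed in iteration order ... */
__kmpc_end_ordered(&loc, gtid);
@endcode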
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
          codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
            ->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}

// Fast-path acquire tas lock
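// (Editorial note on the technique: the macro below implements a
// test-and-test-and-set acquire. It first checks the poll word with a relaxed
// load, then attempts an atomic compare-and-store with acquire semantics;
// while contended it spins with exponential backoff, yielding instead when
// there are more threads than available processors.)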
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      if (TCR_4(__kmp_nth) >                                                   \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
        KMP_YIELD(TRUE);                                                       \
      } else {                                                                 \
        KMP_YIELD_SPIN(spins);                                                 \
      }                                                                        \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      while (                                                                  \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
        __kmp_spin_backoff(&backoff);                                          \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }

// Fast-path test tas lock
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }

// Fast-path release tas lock
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }

#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif

// Fast-path acquire futex lock
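// (Editorial note on the protocol: in the futex-based lock below, the poll
// word holds (gtid + 1) << 1 while owned; the low bit is a "there may be
// waiters" flag. An acquirer that finds the lock busy sets that bit and sleeps
// in FUTEX_WAIT; the releaser sees the bit in the exchanged value and issues
// FUTEX_WAKE.)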
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }

// Fast-path test futex lock
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }

// Fast-path release futex lock
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD(TCR_4(__kmp_nth) >                                               \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
  }

#endif // KMP_USE_FUTEX

#else // KMP_USE_DYNAMIC_LOCK

static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There are chances the lock will be destroyed with no usage, but it is not a
// problem, because this is not a real event seen by the user but rather a way
// of setting a name for the object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be
// reused for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
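
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp critical (name)':
@code
static kmp_critical_name crit_name; // zero-initialized, one per critical name
__kmpc_critical(&loc, gtid, &crit_name);
/* ... protected code ... */
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode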
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // Since the critical directive binds to all threads, not just the current
  // team, we have to check this even if we are in a serialized team.
  // Also, even if we are the uber thread, we still have to acquire the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
    ti.state = ompt_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  // The value of 'crit' should be good to use as a critical_id of the critical
  // section directive.
  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  KMP_POP_PARTITIONED_TIMER();

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}

#if KMP_USE_DYNAMIC_LOCK

// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}

#if OMPT_SUPPORT && OMPT_OPTIONAL
#if KMP_USE_DYNAMIC_LOCK
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
#else
// For locks without dynamic binding
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
#endif // KMP_USE_DYNAMIC_LOCK
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
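
For illustration, a sketch of a typical (not mandated) lowering of
'#pragma omp critical (name) hint(omp_lock_hint_speculative)':
@code
static kmp_critical_name crit_name;
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
/* ... protected code ... */
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode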
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // The value of 'crit' should be good to use as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
        OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
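
For illustration, a sketch of a typical (not mandated) lowering:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  /* ... master block ... */
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode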
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
1636
1637 /*!
1638 @ingroup SYNCHRONIZATION
1639 @param loc source location information
1640 @param global_tid thread id.
1641 @return one if the thread should execute the master block, zero otherwise
1642
1643 Start execution of a combined barrier and master(nowait) construct.
1644 The barrier is executed inside this function.
1645 There is no equivalent "end" function; this routine itself performs the cleanup that __kmpc_end_master would otherwise do, so no end call is needed.
1646 */
1647 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1648 kmp_int32 ret;
1649
1650 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1651
1652 if (!TCR_4(__kmp_init_parallel))
1653 __kmp_parallel_initialize();
1654
1655 if (__kmp_env_consistency_check) {
1656 if (loc == 0) {
1657 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1658 }
1659 __kmp_check_barrier(global_tid, ct_barrier, loc);
1660 }
1661
1662 #if OMPT_SUPPORT
1663 ompt_frame_t *ompt_frame;
1664 if (ompt_enabled.enabled) {
1665 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1666 if (ompt_frame->enter_frame.ptr == NULL)
1667 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1668 OMPT_STORE_RETURN_ADDRESS(global_tid);
1669 }
1670 #endif
1671 #if USE_ITT_NOTIFY
1672 __kmp_threads[global_tid]->th.th_ident = loc;
1673 #endif
1674 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1675 #if OMPT_SUPPORT && OMPT_OPTIONAL
1676 if (ompt_enabled.enabled) {
1677 ompt_frame->enter_frame = ompt_data_none;
1678 }
1679 #endif
1680
1681 ret = __kmpc_master(loc, global_tid);
1682
1683 if (__kmp_env_consistency_check) {
1684 /* there is no __kmpc_end_master called, so the (stats) */
1685 /* actions of __kmpc_end_master are done here instead */
1686
1687 if (global_tid < 0) {
1688 KMP_WARNING(ThreadIdentInvalid);
1689 }
1690 if (ret) {
1691 /* only one thread should do the pop since only */
1692 /* one did the push (see __kmpc_master()) */
1693
1694 __kmp_pop_sync(global_tid, ct_master, loc);
1695 }
1696 }
1697
1698 return (ret);
1699 }
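/* A usage sketch (hypothetical placeholders): unlike __kmpc_barrier_master,
   no "end" call follows; the release and the consistency-check pop are
   handled inside the call itself.
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // ... master-only code, no matching end call ...
}
@endcode
*/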
1700
1701 /* The BARRIER for a SINGLE process section is always explicit */
1702 /*!
1703 @ingroup WORK_SHARING
1704 @param loc source location information
1705 @param global_tid global thread number
1706 @return One if this thread should execute the single construct, zero otherwise.
1707
1708 Test whether to execute a <tt>single</tt> construct.
1709 There are no implicit barriers in the two "single" calls; rather, the
1710 compiler should introduce an explicit barrier if one is required.
1711 */
1712
1713 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1714 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1715
1716 if (rc) {
1717 // We are going to execute the single statement, so we should count it.
1718 KMP_COUNT_BLOCK(OMP_SINGLE);
1719 KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1720 }
1721
1722 #if OMPT_SUPPORT && OMPT_OPTIONAL
1723 kmp_info_t *this_thr = __kmp_threads[global_tid];
1724 kmp_team_t *team = this_thr->th.th_team;
1725 int tid = __kmp_tid_from_gtid(global_tid);
1726
1727 if (ompt_enabled.enabled) {
1728 if (rc) {
1729 if (ompt_enabled.ompt_callback_work) {
1730 ompt_callbacks.ompt_callback(ompt_callback_work)(
1731 ompt_work_single_executor, ompt_scope_begin,
1732 &(team->t.ompt_team_info.parallel_data),
1733 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1734 1, OMPT_GET_RETURN_ADDRESS(0));
1735 }
1736 } else {
1737 if (ompt_enabled.ompt_callback_work) {
1738 ompt_callbacks.ompt_callback(ompt_callback_work)(
1739 ompt_work_single_other, ompt_scope_begin,
1740 &(team->t.ompt_team_info.parallel_data),
1741 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1742 1, OMPT_GET_RETURN_ADDRESS(0));
1743 ompt_callbacks.ompt_callback(ompt_callback_work)(
1744 ompt_work_single_other, ompt_scope_end,
1745 &(team->t.ompt_team_info.parallel_data),
1746 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1747 1, OMPT_GET_RETURN_ADDRESS(0));
1748 }
1749 }
1750 }
1751 #endif
1752
1753 return rc;
1754 }
1755
1756 /*!
1757 @ingroup WORK_SHARING
1758 @param loc source location information
1759 @param global_tid global thread number
1760
1761 Mark the end of a <tt>single</tt> construct. This function should
1762 only be called by the thread that executed the block of code protected
1763 by the `single` construct.
1764 */
1765 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1766 __kmp_exit_single(global_tid);
1767 KMP_POP_PARTITIONED_TIMER();
1768
1769 #if OMPT_SUPPORT && OMPT_OPTIONAL
1770 kmp_info_t *this_thr = __kmp_threads[global_tid];
1771 kmp_team_t *team = this_thr->th.th_team;
1772 int tid = __kmp_tid_from_gtid(global_tid);
1773
1774 if (ompt_enabled.ompt_callback_work) {
1775 ompt_callbacks.ompt_callback(ompt_callback_work)(
1776 ompt_work_single_executor, ompt_scope_end,
1777 &(team->t.ompt_team_info.parallel_data),
1778 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1779 OMPT_GET_RETURN_ADDRESS(0));
1780 }
1781 #endif
1782 }
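/* A usage sketch (hypothetical placeholders): the sequence a compiler might
   emit for a `single` construct without a nowait clause; the trailing
   explicit barrier is emitted by the compiler, as noted above, not by the
   two calls themselves.
@code
if (__kmpc_single(&loc, gtid)) {
  // ... code executed by exactly one thread of the team ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted when nowait is specified
@endcode
*/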
1783
1784 /*!
1785 @ingroup WORK_SHARING
1786 @param loc Source location
1787 @param global_tid Global thread id
1788
1789 Mark the end of a statically scheduled loop.
1790 */
1791 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1792 KMP_POP_PARTITIONED_TIMER();
1793 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1794
1795 #if OMPT_SUPPORT && OMPT_OPTIONAL
1796 if (ompt_enabled.ompt_callback_work) {
1797 ompt_work_t ompt_work_type = ompt_work_loop;
1798 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1799 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1800 // Determine workshare type
1801 if (loc != NULL) {
1802 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1803 ompt_work_type = ompt_work_loop;
1804 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1805 ompt_work_type = ompt_work_sections;
1806 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1807 ompt_work_type = ompt_work_distribute;
1808 } else {
1809 // use default set above.
1810 // a warning about this case is provided in __kmpc_for_static_init
1811 }
1812 KMP_DEBUG_ASSERT(ompt_work_type);
1813 }
1814 ompt_callbacks.ompt_callback(ompt_callback_work)(
1815 ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1816 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1817 }
1818 #endif
1819 if (__kmp_env_consistency_check)
1820 __kmp_pop_workshare(global_tid, ct_pdo, loc);
1821 }
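/* A pairing sketch (hypothetical loop and placeholders): each call to one of
   the __kmpc_for_static_init_* routines is matched by a single call to
   __kmpc_for_static_fini once the thread's chunk has been executed. The _4
   variant shown is the 32-bit signed form.
@code
kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, /*incr=*/1, /*chunk=*/0);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // ... loop body ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode
*/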
1822
1823 // User routines which take C-style arguments (call by value)
1824 // different from the Fortran equivalent routines
1825
1826 void ompc_set_num_threads(int arg) {
1827 // !!!!! TODO: check the per-task binding
1828 __kmp_set_num_threads(arg, __kmp_entry_gtid());
1829 }
1830
1831 void ompc_set_dynamic(int flag) {
1832 kmp_info_t *thread;
1833
1834 /* For the thread-private implementation of the internal controls */
1835 thread = __kmp_entry_thread();
1836
1837 __kmp_save_internal_controls(thread);
1838
1839 set__dynamic(thread, flag ? TRUE : FALSE);
1840 }
1841
1842 void ompc_set_nested(int flag) {
1843 kmp_info_t *thread;
1844
1845 /* For the thread-private internal controls implementation */
1846 thread = __kmp_entry_thread();
1847
1848 __kmp_save_internal_controls(thread);
1849
1850 set__nested(thread, flag ? TRUE : FALSE);
1851 }
1852
1853 void ompc_set_max_active_levels(int max_active_levels) {
1854 /* TO DO */
1855 /* we want per-task implementation of this internal control */
1856
1857 /* For the per-thread internal controls implementation */
1858 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1859 }
1860
1861 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1862 // !!!!! TODO: check the per-task binding
1863 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1864 }
1865
1866 int ompc_get_ancestor_thread_num(int level) {
1867 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1868 }
1869
1870 int ompc_get_team_size(int level) {
1871 return __kmp_get_team_size(__kmp_entry_gtid(), level);
1872 }
1873
1874 #if OMP_50_ENABLED
1875 /* OpenMP 5.0 Affinity Format API */
1876
1877 void ompc_set_affinity_format(char const *format) {
1878 if (!__kmp_init_serial) {
1879 __kmp_serial_initialize();
1880 }
1881 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1882 format, KMP_STRLEN(format) + 1);
1883 }
1884
1885 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1886 size_t format_size;
1887 if (!__kmp_init_serial) {
1888 __kmp_serial_initialize();
1889 }
1890 format_size = KMP_STRLEN(__kmp_affinity_format);
1891 if (buffer && size) {
1892 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1893 format_size + 1);
1894 }
1895 return format_size;
1896 }
1897
1898 void ompc_display_affinity(char const *format) {
1899 int gtid;
1900 if (!TCR_4(__kmp_init_middle)) {
1901 __kmp_middle_initialize();
1902 }
1903 gtid = __kmp_get_gtid();
1904 __kmp_aux_display_affinity(gtid, format);
1905 }
1906
1907 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1908 char const *format) {
1909 int gtid;
1910 size_t num_required;
1911 kmp_str_buf_t capture_buf;
1912 if (!TCR_4(__kmp_init_middle)) {
1913 __kmp_middle_initialize();
1914 }
1915 gtid = __kmp_get_gtid();
1916 __kmp_str_buf_init(&capture_buf);
1917 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1918 if (buffer && buf_size) {
1919 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1920 capture_buf.used + 1);
1921 }
1922 __kmp_str_buf_free(&capture_buf);
1923 return num_required;
1924 }
1925 #endif /* OMP_50_ENABLED */
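/* A usage sketch (hypothetical format string, applicable when OMP_50_ENABLED):
   both query routines above return the number of characters required, so a
   caller can size the buffer with one call and fill it with a second.
@code
size_t needed = ompc_capture_affinity(NULL, 0, "%P: thread %n bound to %A");
char *buf = (char *)malloc(needed + 1);
if (buf) {
  ompc_capture_affinity(buf, needed + 1, "%P: thread %n bound to %A");
  // ... use buf ...
  free(buf);
}
@endcode
*/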
1926
1927 void kmpc_set_stacksize(int arg) {
1928 // __kmp_aux_set_stacksize initializes the library if needed
1929 __kmp_aux_set_stacksize(arg);
1930 }
1931
1932 void kmpc_set_stacksize_s(size_t arg) {
1933 // __kmp_aux_set_stacksize initializes the library if needed
1934 __kmp_aux_set_stacksize(arg);
1935 }
1936
1937 void kmpc_set_blocktime(int arg) {
1938 int gtid, tid;
1939 kmp_info_t *thread;
1940
1941 gtid = __kmp_entry_gtid();
1942 tid = __kmp_tid_from_gtid(gtid);
1943 thread = __kmp_thread_from_gtid(gtid);
1944
1945 __kmp_aux_set_blocktime(arg, thread, tid);
1946 }
1947
1948 void kmpc_set_library(int arg) {
1949 // __kmp_user_set_library initializes the library if needed
1950 __kmp_user_set_library((enum library_type)arg);
1951 }
1952
1953 void kmpc_set_defaults(char const *str) {
1954 // __kmp_aux_set_defaults initializes the library if needed
1955 __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1956 }
1957
1958 void kmpc_set_disp_num_buffers(int arg) {
1959 // ignore after initialization because some teams have already
1960 // allocated dispatch buffers
1961 if (__kmp_init_serial == 0 && arg > 0)
1962 __kmp_dispatch_num_buffers = arg;
1963 }
1964
1965 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1966 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1967 return -1;
1968 #else
1969 if (!TCR_4(__kmp_init_middle)) {
1970 __kmp_middle_initialize();
1971 }
1972 return __kmp_aux_set_affinity_mask_proc(proc, mask);
1973 #endif
1974 }
1975
1976 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1977 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1978 return -1;
1979 #else
1980 if (!TCR_4(__kmp_init_middle)) {
1981 __kmp_middle_initialize();
1982 }
1983 return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1984 #endif
1985 }
1986
1987 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1988 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1989 return -1;
1990 #else
1991 if (!TCR_4(__kmp_init_middle)) {
1992 __kmp_middle_initialize();
1993 }
1994 return __kmp_aux_get_affinity_mask_proc(proc, mask);
1995 #endif
1996 }
1997
1998 /* -------------------------------------------------------------------------- */
1999 /*!
2000 @ingroup THREADPRIVATE
2001 @param loc source location information
2002 @param gtid global thread number
2003 @param cpy_size size of the cpy_data buffer
2004 @param cpy_data pointer to data to be copied
2005 @param cpy_func helper function to call for copying data
2006 @param didit flag variable: 1=single thread; 0=not single thread
2007
2008 __kmpc_copyprivate implements the interface for the private data broadcast
2009 needed for the copyprivate clause associated with a single region in an
2010 OpenMP<sup>*</sup> program (both C and Fortran).
2011 All threads participating in the parallel region call this routine.
2012 One of the threads (called the single thread) should have the <tt>didit</tt>
2013 variable set to 1 and all other threads should have that variable set to 0.
2014 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2015
2016 The OpenMP specification forbids the use of nowait on the single region when a
2017 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2018 barrier internally to avoid race conditions, so the code generation for the
2019 single region should avoid generating a barrier after the call to @ref
2020 __kmpc_copyprivate.
2021
2022 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2023 The <tt>loc</tt> parameter is a pointer to source location information.
2024
2025 Internal implementation: The single thread will first copy its descriptor
2026 address (cpy_data) to a team-private location, then the other threads will each
2027 call the function pointed to by the parameter cpy_func, which carries out the
2028 copy by copying the data using the cpy_data buffer.
2029
2030 The cpy_func routine used for the copy and the contents of the data area defined
2031 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2032 to be done. For instance, the cpy_data buffer can hold the actual data to be
2033 copied or it may hold a list of pointers to the data. The cpy_func routine must
2034 interpret the cpy_data buffer appropriately.
2035
2036 The interface to cpy_func is as follows:
2037 @code
2038 void cpy_func( void *destination, void *source )
2039 @endcode
2040 where void *destination is the cpy_data pointer for the thread being copied to
2041 and void *source is the cpy_data pointer for the thread being copied from.
2042 */
2043 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2044 void *cpy_data, void (*cpy_func)(void *, void *),
2045 kmp_int32 didit) {
2046 void **data_ptr;
2047
2048 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2049
2050 KMP_MB();
2051
2052 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2053
2054 if (__kmp_env_consistency_check) {
2055 if (loc == 0) {
2056 KMP_WARNING(ConstructIdentInvalid);
2057 }
2058 }
2059
2060 // ToDo: Optimize the following two barriers into some kind of split barrier
2061
2062 if (didit)
2063 *data_ptr = cpy_data;
2064
2065 #if OMPT_SUPPORT
2066 ompt_frame_t *ompt_frame;
2067 if (ompt_enabled.enabled) {
2068 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2069 if (ompt_frame->enter_frame.ptr == NULL)
2070 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2071 OMPT_STORE_RETURN_ADDRESS(gtid);
2072 }
2073 #endif
2074 /* This barrier is not a barrier region boundary */
2075 #if USE_ITT_NOTIFY
2076 __kmp_threads[gtid]->th.th_ident = loc;
2077 #endif
2078 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2079
2080 if (!didit)
2081 (*cpy_func)(cpy_data, *data_ptr);
2082
2083 // Consider next barrier a user-visible barrier for barrier region boundaries
2084 // Nesting checks are already handled by the single construct checks
2085
2086 #if OMPT_SUPPORT
2087 if (ompt_enabled.enabled) {
2088 OMPT_STORE_RETURN_ADDRESS(gtid);
2089 }
2090 #endif
2091 #if USE_ITT_NOTIFY
2092 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2093 // tasks can overwrite the location)
2094 #endif
2095 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2096 #if OMPT_SUPPORT && OMPT_OPTIONAL
2097 if (ompt_enabled.enabled) {
2098 ompt_frame->enter_frame = ompt_data_none;
2099 }
2100 #endif
2101 }
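/* A sketch of a matching cpy_func (hypothetical, for illustration): if every
   thread builds cpy_data as a two-entry list of pointers to its private
   copies, the helper copies through the single thread's list into the
   destination thread's targets, matching the interface documented above.
@code
static void my_cpy_func(void *destination, void *source) {
  void **dst = (void **)destination; // this thread's pointer list
  void **src = (void **)source;      // the single thread's pointer list
  *(int *)dst[0] = *(int *)src[0];       // first copyprivate variable
  *(double *)dst[1] = *(double *)src[1]; // second copyprivate variable
}
@endcode
*/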
2102
2103 /* -------------------------------------------------------------------------- */
2104
2105 #define INIT_LOCK __kmp_init_user_lock_with_checks
2106 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2107 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2108 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2109 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2110 #define ACQUIRE_NESTED_LOCK_TIMED \
2111 __kmp_acquire_nested_user_lock_with_checks_timed
2112 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2113 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2114 #define TEST_LOCK __kmp_test_user_lock_with_checks
2115 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2116 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2117 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2118
2119 // TODO: Make check abort messages use location info & pass it into
2120 // with_checks routines
2121
2122 #if KMP_USE_DYNAMIC_LOCK
2123
2124 // internal lock initializer
2125 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2126 kmp_dyna_lockseq_t seq) {
2127 if (KMP_IS_D_LOCK(seq)) {
2128 KMP_INIT_D_LOCK(lock, seq);
2129 #if USE_ITT_BUILD
2130 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2131 #endif
2132 } else {
2133 KMP_INIT_I_LOCK(lock, seq);
2134 #if USE_ITT_BUILD
2135 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2136 __kmp_itt_lock_creating(ilk->lock, loc);
2137 #endif
2138 }
2139 }
2140
2141 // internal nest lock initializer
2142 static __forceinline void
2143 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2144 kmp_dyna_lockseq_t seq) {
2145 #if KMP_USE_TSX
2146 // Don't have nested lock implementation for speculative locks
2147 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2148 seq = __kmp_user_lock_seq;
2149 #endif
2150 switch (seq) {
2151 case lockseq_tas:
2152 seq = lockseq_nested_tas;
2153 break;
2154 #if KMP_USE_FUTEX
2155 case lockseq_futex:
2156 seq = lockseq_nested_futex;
2157 break;
2158 #endif
2159 case lockseq_ticket:
2160 seq = lockseq_nested_ticket;
2161 break;
2162 case lockseq_queuing:
2163 seq = lockseq_nested_queuing;
2164 break;
2165 case lockseq_drdpa:
2166 seq = lockseq_nested_drdpa;
2167 break;
2168 default:
2169 seq = lockseq_nested_queuing;
2170 }
2171 KMP_INIT_I_LOCK(lock, seq);
2172 #if USE_ITT_BUILD
2173 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2174 __kmp_itt_lock_creating(ilk->lock, loc);
2175 #endif
2176 }
2177
2178 /* initialize the lock with a hint */
2179 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2180 uintptr_t hint) {
2181 KMP_DEBUG_ASSERT(__kmp_init_serial);
2182 if (__kmp_env_consistency_check && user_lock == NULL) {
2183 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2184 }
2185
2186 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2187
2188 #if OMPT_SUPPORT && OMPT_OPTIONAL
2189 // This is the case if called from omp_init_lock_with_hint:
2190 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2191 if (!codeptr)
2192 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2193 if (ompt_enabled.ompt_callback_lock_init) {
2194 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2195 ompt_mutex_lock, (omp_lock_hint_t)hint,
2196 __ompt_get_mutex_impl_type(user_lock),
2197 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2198 }
2199 #endif
2200 }
2201
2202 /* initialize the lock with a hint */
2203 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2204 void **user_lock, uintptr_t hint) {
2205 KMP_DEBUG_ASSERT(__kmp_init_serial);
2206 if (__kmp_env_consistency_check && user_lock == NULL) {
2207 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2208 }
2209
2210 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2211
2212 #if OMPT_SUPPORT && OMPT_OPTIONAL
2213 // This is the case if called from omp_init_lock_with_hint:
2214 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2215 if (!codeptr)
2216 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2217 if (ompt_enabled.ompt_callback_lock_init) {
2218 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2219 ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2220 __ompt_get_mutex_impl_type(user_lock),
2221 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2222 }
2223 #endif
2224 }
2225
2226 #endif // KMP_USE_DYNAMIC_LOCK
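/* A usage sketch (assuming the usual omp.h entry points): how user code
   reaches the hinted initializers above when dynamic locks are enabled.
@code
omp_lock_t l;
omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
// ... the user entry point stores its return address and forwards to
// __kmpc_init_lock_with_hint(loc, gtid, (void **)&l, hint) ...
@endcode
*/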
2227
2228 /* initialize the lock */
2229 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2230 #if KMP_USE_DYNAMIC_LOCK
2231
2232 KMP_DEBUG_ASSERT(__kmp_init_serial);
2233 if (__kmp_env_consistency_check && user_lock == NULL) {
2234 KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2235 }
2236 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2237
2238 #if OMPT_SUPPORT && OMPT_OPTIONAL
2239 // This is the case if called from omp_init_lock_with_hint:
2240 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2241 if (!codeptr)
2242 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2243 if (ompt_enabled.ompt_callback_lock_init) {
2244 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2245 ompt_mutex_lock, omp_lock_hint_none,
2246 __ompt_get_mutex_impl_type(user_lock),
2247 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2248 }
2249 #endif
2250
2251 #else // KMP_USE_DYNAMIC_LOCK
2252
2253 static char const *const func = "omp_init_lock";
2254 kmp_user_lock_p lck;
2255 KMP_DEBUG_ASSERT(__kmp_init_serial);
2256
2257 if (__kmp_env_consistency_check) {
2258 if (user_lock == NULL) {
2259 KMP_FATAL(LockIsUninitialized, func);
2260 }
2261 }
2262
2263 KMP_CHECK_USER_LOCK_INIT();
2264
2265 if ((__kmp_user_lock_kind == lk_tas) &&
2266 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2267 lck = (kmp_user_lock_p)user_lock;
2268 }
2269 #if KMP_USE_FUTEX
2270 else if ((__kmp_user_lock_kind == lk_futex) &&
2271 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2272 lck = (kmp_user_lock_p)user_lock;
2273 }
2274 #endif
2275 else {
2276 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2277 }
2278 INIT_LOCK(lck);
2279 __kmp_set_user_lock_location(lck, loc);
2280
2281 #if OMPT_SUPPORT && OMPT_OPTIONAL
2282 // This is the case if called from omp_init_lock_with_hint:
2283 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2284 if (!codeptr)
2285 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2286 if (ompt_enabled.ompt_callback_lock_init) {
2287 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2288 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2289 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2290 }
2291 #endif
2292
2293 #if USE_ITT_BUILD
2294 __kmp_itt_lock_creating(lck);
2295 #endif /* USE_ITT_BUILD */
2296
2297 #endif // KMP_USE_DYNAMIC_LOCK
2298 } // __kmpc_init_lock
2299
2300 /* initialize the lock */
2301 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2302 #if KMP_USE_DYNAMIC_LOCK
2303
2304 KMP_DEBUG_ASSERT(__kmp_init_serial);
2305 if (__kmp_env_consistency_check && user_lock == NULL) {
2306 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2307 }
2308 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2309
2310 #if OMPT_SUPPORT && OMPT_OPTIONAL
2311 // This is the case if called from omp_init_lock_with_hint:
2312 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2313 if (!codeptr)
2314 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2315 if (ompt_enabled.ompt_callback_lock_init) {
2316 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2317 ompt_mutex_nest_lock, omp_lock_hint_none,
2318 __ompt_get_mutex_impl_type(user_lock),
2319 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2320 }
2321 #endif
2322
2323 #else // KMP_USE_DYNAMIC_LOCK
2324
2325 static char const *const func = "omp_init_nest_lock";
2326 kmp_user_lock_p lck;
2327 KMP_DEBUG_ASSERT(__kmp_init_serial);
2328
2329 if (__kmp_env_consistency_check) {
2330 if (user_lock == NULL) {
2331 KMP_FATAL(LockIsUninitialized, func);
2332 }
2333 }
2334
2335 KMP_CHECK_USER_LOCK_INIT();
2336
2337 if ((__kmp_user_lock_kind == lk_tas) &&
2338 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2339 OMP_NEST_LOCK_T_SIZE)) {
2340 lck = (kmp_user_lock_p)user_lock;
2341 }
2342 #if KMP_USE_FUTEX
2343 else if ((__kmp_user_lock_kind == lk_futex) &&
2344 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2345 OMP_NEST_LOCK_T_SIZE)) {
2346 lck = (kmp_user_lock_p)user_lock;
2347 }
2348 #endif
2349 else {
2350 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2351 }
2352
2353 INIT_NESTED_LOCK(lck);
2354 __kmp_set_user_lock_location(lck, loc);
2355
2356 #if OMPT_SUPPORT && OMPT_OPTIONAL
2357 // This is the case if called from omp_init_lock_with_hint:
2358 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2359 if (!codeptr)
2360 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2361 if (ompt_enabled.ompt_callback_lock_init) {
2362 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2363 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2364 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2365 }
2366 #endif
2367
2368 #if USE_ITT_BUILD
2369 __kmp_itt_lock_creating(lck);
2370 #endif /* USE_ITT_BUILD */
2371
2372 #endif // KMP_USE_DYNAMIC_LOCK
2373 } // __kmpc_init_nest_lock
2374
2375 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2376 #if KMP_USE_DYNAMIC_LOCK
2377
2378 #if USE_ITT_BUILD
2379 kmp_user_lock_p lck;
2380 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2381 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2382 } else {
2383 lck = (kmp_user_lock_p)user_lock;
2384 }
2385 __kmp_itt_lock_destroyed(lck);
2386 #endif
2387 #if OMPT_SUPPORT && OMPT_OPTIONAL
2388 // This is the case if called from omp_init_lock_with_hint:
2389 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2390 if (!codeptr)
2391 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2392 if (ompt_enabled.ompt_callback_lock_destroy) {
2393 kmp_user_lock_p lck;
2394 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2395 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2396 } else {
2397 lck = (kmp_user_lock_p)user_lock;
2398 }
2399 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2400 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2401 }
2402 #endif
2403 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2404 #else
2405 kmp_user_lock_p lck;
2406
2407 if ((__kmp_user_lock_kind == lk_tas) &&
2408 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2409 lck = (kmp_user_lock_p)user_lock;
2410 }
2411 #if KMP_USE_FUTEX
2412 else if ((__kmp_user_lock_kind == lk_futex) &&
2413 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2414 lck = (kmp_user_lock_p)user_lock;
2415 }
2416 #endif
2417 else {
2418 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2419 }
2420
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422 // This is the case if called from omp_init_lock_with_hint:
2423 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2424 if (!codeptr)
2425 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2426 if (ompt_enabled.ompt_callback_lock_destroy) {
2427 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2428 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2429 }
2430 #endif
2431
2432 #if USE_ITT_BUILD
2433 __kmp_itt_lock_destroyed(lck);
2434 #endif /* USE_ITT_BUILD */
2435 DESTROY_LOCK(lck);
2436
2437 if ((__kmp_user_lock_kind == lk_tas) &&
2438 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2439 ;
2440 }
2441 #if KMP_USE_FUTEX
2442 else if ((__kmp_user_lock_kind == lk_futex) &&
2443 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2444 ;
2445 }
2446 #endif
2447 else {
2448 __kmp_user_lock_free(user_lock, gtid, lck);
2449 }
2450 #endif // KMP_USE_DYNAMIC_LOCK
2451 } // __kmpc_destroy_lock
2452
2453 /* destroy the lock */
2454 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2455 #if KMP_USE_DYNAMIC_LOCK
2456
2457 #if USE_ITT_BUILD
2458 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2459 __kmp_itt_lock_destroyed(ilk->lock);
2460 #endif
2461 #if OMPT_SUPPORT && OMPT_OPTIONAL
2462 // This is the case if called from omp_init_lock_with_hint:
2463 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2464 if (!codeptr)
2465 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2466 if (ompt_enabled.ompt_callback_lock_destroy) {
2467 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2468 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2469 }
2470 #endif
2471 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2472
2473 #else // KMP_USE_DYNAMIC_LOCK
2474
2475 kmp_user_lock_p lck;
2476
2477 if ((__kmp_user_lock_kind == lk_tas) &&
2478 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2479 OMP_NEST_LOCK_T_SIZE)) {
2480 lck = (kmp_user_lock_p)user_lock;
2481 }
2482 #if KMP_USE_FUTEX
2483 else if ((__kmp_user_lock_kind == lk_futex) &&
2484 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2485 OMP_NEST_LOCK_T_SIZE)) {
2486 lck = (kmp_user_lock_p)user_lock;
2487 }
2488 #endif
2489 else {
2490 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2491 }
2492
2493 #if OMPT_SUPPORT && OMPT_OPTIONAL
2494 // This is the case if called from omp_init_lock_with_hint:
2495 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2496 if (!codeptr)
2497 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2498 if (ompt_enabled.ompt_callback_lock_destroy) {
2499 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2500 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2501 }
2502 #endif
2503
2504 #if USE_ITT_BUILD
2505 __kmp_itt_lock_destroyed(lck);
2506 #endif /* USE_ITT_BUILD */
2507
2508 DESTROY_NESTED_LOCK(lck);
2509
2510 if ((__kmp_user_lock_kind == lk_tas) &&
2511 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2512 OMP_NEST_LOCK_T_SIZE)) {
2513 ;
2514 }
2515 #if KMP_USE_FUTEX
2516 else if ((__kmp_user_lock_kind == lk_futex) &&
2517 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2518 OMP_NEST_LOCK_T_SIZE)) {
2519 ;
2520 }
2521 #endif
2522 else {
2523 __kmp_user_lock_free(user_lock, gtid, lck);
2524 }
2525 #endif // KMP_USE_DYNAMIC_LOCK
2526 } // __kmpc_destroy_nest_lock
2527
2528 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2529 KMP_COUNT_BLOCK(OMP_set_lock);
2530 #if KMP_USE_DYNAMIC_LOCK
2531 int tag = KMP_EXTRACT_D_TAG(user_lock);
2532 #if USE_ITT_BUILD
2533 __kmp_itt_lock_acquiring(
2534 (kmp_user_lock_p)
2535 user_lock); // itt function will get to the right lock object.
2536 #endif
2537 #if OMPT_SUPPORT && OMPT_OPTIONAL
2538 // This is the case if called from omp_init_lock_with_hint:
2539 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2540 if (!codeptr)
2541 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2542 if (ompt_enabled.ompt_callback_mutex_acquire) {
2543 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2544 ompt_mutex_lock, omp_lock_hint_none,
2545 __ompt_get_mutex_impl_type(user_lock),
2546 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2547 }
2548 #endif
2549 #if KMP_USE_INLINED_TAS
2550 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2551 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2552 } else
2553 #elif KMP_USE_INLINED_FUTEX
2554 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2555 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2556 } else
2557 #endif
2558 {
2559 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2560 }
2561 #if USE_ITT_BUILD
2562 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2563 #endif
2564 #if OMPT_SUPPORT && OMPT_OPTIONAL
2565 if (ompt_enabled.ompt_callback_mutex_acquired) {
2566 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2567 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2568 }
2569 #endif
2570
2571 #else // KMP_USE_DYNAMIC_LOCK
2572
2573 kmp_user_lock_p lck;
2574
2575 if ((__kmp_user_lock_kind == lk_tas) &&
2576 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2577 lck = (kmp_user_lock_p)user_lock;
2578 }
2579 #if KMP_USE_FUTEX
2580 else if ((__kmp_user_lock_kind == lk_futex) &&
2581 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2582 lck = (kmp_user_lock_p)user_lock;
2583 }
2584 #endif
2585 else {
2586 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2587 }
2588
2589 #if USE_ITT_BUILD
2590 __kmp_itt_lock_acquiring(lck);
2591 #endif /* USE_ITT_BUILD */
2592 #if OMPT_SUPPORT && OMPT_OPTIONAL
2593 // This is the case if called from omp_init_lock_with_hint:
2594 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2595 if (!codeptr)
2596 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2597 if (ompt_enabled.ompt_callback_mutex_acquire) {
2598 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2599 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2600 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2601 }
2602 #endif
2603
2604 ACQUIRE_LOCK(lck, gtid);
2605
2606 #if USE_ITT_BUILD
2607 __kmp_itt_lock_acquired(lck);
2608 #endif /* USE_ITT_BUILD */
2609
2610 #if OMPT_SUPPORT && OMPT_OPTIONAL
2611 if (ompt_enabled.ompt_callback_mutex_acquired) {
2612 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2613 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2614 }
2615 #endif
2616
2617 #endif // KMP_USE_DYNAMIC_LOCK
2618 }
2619
2620 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2621 #if KMP_USE_DYNAMIC_LOCK
2622
2623 #if USE_ITT_BUILD
2624 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2625 #endif
2626 #if OMPT_SUPPORT && OMPT_OPTIONAL
2627 // This is the case if called from omp_init_lock_with_hint:
2628 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2629 if (!codeptr)
2630 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2631 if (ompt_enabled.enabled) {
2632 if (ompt_enabled.ompt_callback_mutex_acquire) {
2633 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2634 ompt_mutex_nest_lock, omp_lock_hint_none,
2635 __ompt_get_mutex_impl_type(user_lock),
2636 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2637 }
2638 }
2639 #endif
2640 int acquire_status =
2641 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2642 (void) acquire_status;
2643 #if USE_ITT_BUILD
2644 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2645 #endif
2646
2647 #if OMPT_SUPPORT && OMPT_OPTIONAL
2648 if (ompt_enabled.enabled) {
2649 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2650 if (ompt_enabled.ompt_callback_mutex_acquired) {
2651 // lock_first
2652 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2653 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2654 codeptr);
2655 }
2656 } else {
2657 if (ompt_enabled.ompt_callback_nest_lock) {
2658 // lock_next
2659 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2660 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2661 }
2662 }
2663 }
2664 #endif
2665
2666 #else // KMP_USE_DYNAMIC_LOCK
2667 int acquire_status;
2668 kmp_user_lock_p lck;
2669
2670 if ((__kmp_user_lock_kind == lk_tas) &&
2671 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2672 OMP_NEST_LOCK_T_SIZE)) {
2673 lck = (kmp_user_lock_p)user_lock;
2674 }
2675 #if KMP_USE_FUTEX
2676 else if ((__kmp_user_lock_kind == lk_futex) &&
2677 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2678 OMP_NEST_LOCK_T_SIZE)) {
2679 lck = (kmp_user_lock_p)user_lock;
2680 }
2681 #endif
2682 else {
2683 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2684 }
2685
2686 #if USE_ITT_BUILD
2687 __kmp_itt_lock_acquiring(lck);
2688 #endif /* USE_ITT_BUILD */
2689 #if OMPT_SUPPORT && OMPT_OPTIONAL
2690 // This is the case if called from omp_init_lock_with_hint:
2691 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2692 if (!codeptr)
2693 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2694 if (ompt_enabled.enabled) {
2695 if (ompt_enabled.ompt_callback_mutex_acquire) {
2696 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2697 ompt_mutex_nest_lock, omp_lock_hint_none,
2698 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2699 codeptr);
2700 }
2701 }
2702 #endif
2703
2704 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2705
2706 #if USE_ITT_BUILD
2707 __kmp_itt_lock_acquired(lck);
2708 #endif /* USE_ITT_BUILD */
2709
2710 #if OMPT_SUPPORT && OMPT_OPTIONAL
2711 if (ompt_enabled.enabled) {
2712 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2713 if (ompt_enabled.ompt_callback_mutex_acquired) {
2714 // lock_first
2715 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2716 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2717 }
2718 } else {
2719 if (ompt_enabled.ompt_callback_nest_lock) {
2720 // lock_next
2721 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2722 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2723 }
2724 }
2725 }
2726 #endif
2727
2728 #endif // KMP_USE_DYNAMIC_LOCK
2729 }
2730
2731 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2732 #if KMP_USE_DYNAMIC_LOCK
2733
2734 int tag = KMP_EXTRACT_D_TAG(user_lock);
2735 #if USE_ITT_BUILD
2736 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2737 #endif
2738 #if KMP_USE_INLINED_TAS
2739 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2740 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2741 } else
2742 #elif KMP_USE_INLINED_FUTEX
2743 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2744 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2745 } else
2746 #endif
2747 {
2748 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2749 }
2750
2751 #if OMPT_SUPPORT && OMPT_OPTIONAL
2752 // This is the case if called from omp_init_lock_with_hint:
2753 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2754 if (!codeptr)
2755 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2756 if (ompt_enabled.ompt_callback_mutex_released) {
2757 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2758 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2759 }
2760 #endif
2761
2762 #else // KMP_USE_DYNAMIC_LOCK
2763
2764 kmp_user_lock_p lck;
2765
2766 /* Can't use serial interval since not block structured */
2767 /* release the lock */
2768
2769 if ((__kmp_user_lock_kind == lk_tas) &&
2770 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2771 #if KMP_OS_LINUX && \
2772 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2773 // "fast" path implemented to fix customer performance issue
2774 #if USE_ITT_BUILD
2775 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2776 #endif /* USE_ITT_BUILD */
2777 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2778 KMP_MB();
2779
2780 #if OMPT_SUPPORT && OMPT_OPTIONAL
2781 // This is the case if called from omp_init_lock_with_hint:
2782 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2783 if (!codeptr)
2784 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2785 if (ompt_enabled.ompt_callback_mutex_released) {
2786 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2787 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2788 }
2789 #endif
2790
2791 return;
2792 #else
2793 lck = (kmp_user_lock_p)user_lock;
2794 #endif
2795 }
2796 #if KMP_USE_FUTEX
2797 else if ((__kmp_user_lock_kind == lk_futex) &&
2798 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2799 lck = (kmp_user_lock_p)user_lock;
2800 }
2801 #endif
2802 else {
2803 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2804 }
2805
2806 #if USE_ITT_BUILD
2807 __kmp_itt_lock_releasing(lck);
2808 #endif /* USE_ITT_BUILD */
2809
2810 RELEASE_LOCK(lck, gtid);
2811
2812 #if OMPT_SUPPORT && OMPT_OPTIONAL
2813 // This is the case if called from omp_init_lock_with_hint:
2814 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2815 if (!codeptr)
2816 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2817 if (ompt_enabled.ompt_callback_mutex_released) {
2818 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2819 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2820 }
2821 #endif
2822
2823 #endif // KMP_USE_DYNAMIC_LOCK
2824 }
2825
2826 /* release the lock */
2827 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2828 #if KMP_USE_DYNAMIC_LOCK
2829
2830 #if USE_ITT_BUILD
2831 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2832 #endif
2833 int release_status =
2834 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2835 (void) release_status;
2836
2837 #if OMPT_SUPPORT && OMPT_OPTIONAL
2838 // This is the case if called from omp_init_lock_with_hint:
2839 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2840 if (!codeptr)
2841 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2842 if (ompt_enabled.enabled) {
2843 if (release_status == KMP_LOCK_RELEASED) {
2844 if (ompt_enabled.ompt_callback_mutex_released) {
2845 // release_lock_last
2846 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2847 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2848 codeptr);
2849 }
2850 } else if (ompt_enabled.ompt_callback_nest_lock) {
2851 // release_lock_prev
2852 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2853 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2854 }
2855 }
2856 #endif
2857
2858 #else // KMP_USE_DYNAMIC_LOCK
2859
2860 kmp_user_lock_p lck;
2861
2862 /* Can't use serial interval since not block structured */
2863
2864 if ((__kmp_user_lock_kind == lk_tas) &&
2865 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2866 OMP_NEST_LOCK_T_SIZE)) {
2867 #if KMP_OS_LINUX && \
2868 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2869 // "fast" path implemented to fix customer performance issue
2870 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2871 #if USE_ITT_BUILD
2872 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2873 #endif /* USE_ITT_BUILD */
2874
2875 #if OMPT_SUPPORT && OMPT_OPTIONAL
2876 int release_status = KMP_LOCK_STILL_HELD;
2877 #endif
2878
2879 if (--(tl->lk.depth_locked) == 0) {
2880 TCW_4(tl->lk.poll, 0);
2881 #if OMPT_SUPPORT && OMPT_OPTIONAL
2882 release_status = KMP_LOCK_RELEASED;
2883 #endif
2884 }
2885 KMP_MB();
2886
2887 #if OMPT_SUPPORT && OMPT_OPTIONAL
2888 // This is the case if called from omp_init_lock_with_hint:
2889 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2890 if (!codeptr)
2891 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2892 if (ompt_enabled.enabled) {
2893 if (release_status == KMP_LOCK_RELEASED) {
2894 if (ompt_enabled.ompt_callback_mutex_released) {
2895 // release_lock_last
2896 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2897 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2898 }
2899 } else if (ompt_enabled.ompt_callback_nest_lock) {
2900 // release_lock_previous
2901 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2902 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2903 }
2904 }
2905 #endif
2906
2907 return;
2908 #else
2909 lck = (kmp_user_lock_p)user_lock;
2910 #endif
2911 }
2912 #if KMP_USE_FUTEX
2913 else if ((__kmp_user_lock_kind == lk_futex) &&
2914 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2915 OMP_NEST_LOCK_T_SIZE)) {
2916 lck = (kmp_user_lock_p)user_lock;
2917 }
2918 #endif
2919 else {
2920 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2921 }
2922
2923 #if USE_ITT_BUILD
2924 __kmp_itt_lock_releasing(lck);
2925 #endif /* USE_ITT_BUILD */
2926
2927 int release_status;
2928 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2929 #if OMPT_SUPPORT && OMPT_OPTIONAL
2930 // This is the case if called from omp_init_lock_with_hint:
2931 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2932 if (!codeptr)
2933 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2934 if (ompt_enabled.enabled) {
2935 if (release_status == KMP_LOCK_RELEASED) {
2936 if (ompt_enabled.ompt_callback_mutex_released) {
2937 // release_lock_last
2938 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2939 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2940 }
2941 } else if (ompt_enabled.ompt_callback_nest_lock) {
2942 // release_lock_previous
2943 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2944 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2945 }
2946 }
2947 #endif
2948
2949 #endif // KMP_USE_DYNAMIC_LOCK
2950 }
2951
2952 /* try to acquire the lock */
2953 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2954 KMP_COUNT_BLOCK(OMP_test_lock);
2955
2956 #if KMP_USE_DYNAMIC_LOCK
2957 int rc;
2958 int tag = KMP_EXTRACT_D_TAG(user_lock);
2959 #if USE_ITT_BUILD
2960 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2961 #endif
2962 #if OMPT_SUPPORT && OMPT_OPTIONAL
2963 // This is the case if called from omp_init_lock_with_hint:
2964 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2965 if (!codeptr)
2966 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2967 if (ompt_enabled.ompt_callback_mutex_acquire) {
2968 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2969 ompt_mutex_lock, omp_lock_hint_none,
2970 __ompt_get_mutex_impl_type(user_lock),
2971 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2972 }
2973 #endif
2974 #if KMP_USE_INLINED_TAS
2975 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2976 KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2977 } else
2978 #elif KMP_USE_INLINED_FUTEX
2979 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2980 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2981 } else
2982 #endif
2983 {
2984 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2985 }
2986 if (rc) {
2987 #if USE_ITT_BUILD
2988 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2989 #endif
2990 #if OMPT_SUPPORT && OMPT_OPTIONAL
2991 if (ompt_enabled.ompt_callback_mutex_acquired) {
2992 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2993 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2994 }
2995 #endif
2996 return FTN_TRUE;
2997 } else {
2998 #if USE_ITT_BUILD
2999 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3000 #endif
3001 return FTN_FALSE;
3002 }
3003
3004 #else // KMP_USE_DYNAMIC_LOCK
3005
3006 kmp_user_lock_p lck;
3007 int rc;
3008
3009 if ((__kmp_user_lock_kind == lk_tas) &&
3010 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3011 lck = (kmp_user_lock_p)user_lock;
3012 }
3013 #if KMP_USE_FUTEX
3014 else if ((__kmp_user_lock_kind == lk_futex) &&
3015 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3016 lck = (kmp_user_lock_p)user_lock;
3017 }
3018 #endif
3019 else {
3020 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3021 }
3022
3023 #if USE_ITT_BUILD
3024 __kmp_itt_lock_acquiring(lck);
3025 #endif /* USE_ITT_BUILD */
3026 #if OMPT_SUPPORT && OMPT_OPTIONAL
3027 // This is the case if called from omp_init_lock_with_hint:
3028 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3029 if (!codeptr)
3030 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3031 if (ompt_enabled.ompt_callback_mutex_acquire) {
3032 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3033 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3034 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3035 }
3036 #endif
3037
3038 rc = TEST_LOCK(lck, gtid);
3039 #if USE_ITT_BUILD
3040 if (rc) {
3041 __kmp_itt_lock_acquired(lck);
3042 } else {
3043 __kmp_itt_lock_cancelled(lck);
3044 }
3045 #endif /* USE_ITT_BUILD */
3046 #if OMPT_SUPPORT && OMPT_OPTIONAL
3047 if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3048 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3049 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3050 }
3051 #endif
3052
3053 return (rc ? FTN_TRUE : FTN_FALSE);
3054
3055 /* Can't use serial interval since not block structured */
3056
3057 #endif // KMP_USE_DYNAMIC_LOCK
3058 }
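/* A usage sketch (hypothetical placeholders): the non-blocking acquire
   pattern behind omp_test_lock; FTN_TRUE signals that the lock was taken.
@code
if (__kmpc_test_lock(&loc, gtid, (void **)&l)) {
  // ... lock held: do the protected work ...
  __kmpc_unset_lock(&loc, gtid, (void **)&l);
} else {
  // ... lock busy: do something else and retry later ...
}
@endcode
*/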
3059
3060 /* try to acquire the lock */
3061 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3062 #if KMP_USE_DYNAMIC_LOCK
3063 int rc;
3064 #if USE_ITT_BUILD
3065 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3066 #endif
3067 #if OMPT_SUPPORT && OMPT_OPTIONAL
3068 // This is the case if called from omp_init_lock_with_hint:
3069 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3070 if (!codeptr)
3071 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3072 if (ompt_enabled.ompt_callback_mutex_acquire) {
3073 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3074 ompt_mutex_nest_lock, omp_lock_hint_none,
3075 __ompt_get_mutex_impl_type(user_lock),
3076 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3077 }
3078 #endif
3079 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3080 #if USE_ITT_BUILD
3081 if (rc) {
3082 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3083 } else {
3084 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3085 }
3086 #endif
3087 #if OMPT_SUPPORT && OMPT_OPTIONAL
3088 if (ompt_enabled.enabled && rc) {
3089 if (rc == 1) {
3090 if (ompt_enabled.ompt_callback_mutex_acquired) {
3091 // lock_first
3092 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3093 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3094 codeptr);
3095 }
3096 } else {
3097 if (ompt_enabled.ompt_callback_nest_lock) {
3098 // lock_next
3099 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3100 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3101 }
3102 }
3103 }
3104 #endif
3105 return rc;
3106
3107 #else // KMP_USE_DYNAMIC_LOCK
3108
3109 kmp_user_lock_p lck;
3110 int rc;
3111
3112 if ((__kmp_user_lock_kind == lk_tas) &&
3113 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3114 OMP_NEST_LOCK_T_SIZE)) {
3115 lck = (kmp_user_lock_p)user_lock;
3116 }
3117 #if KMP_USE_FUTEX
3118 else if ((__kmp_user_lock_kind == lk_futex) &&
3119 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3120 OMP_NEST_LOCK_T_SIZE)) {
3121 lck = (kmp_user_lock_p)user_lock;
3122 }
3123 #endif
3124 else {
3125 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3126 }
3127
3128 #if USE_ITT_BUILD
3129 __kmp_itt_lock_acquiring(lck);
3130 #endif /* USE_ITT_BUILD */
3131
3132 #if OMPT_SUPPORT && OMPT_OPTIONAL
3133 // This is the case if called from omp_init_lock_with_hint:
3134 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3135 if (!codeptr)
3136 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3137 if (ompt_enabled.enabled &&
3138 ompt_enabled.ompt_callback_mutex_acquire) {
3139 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3140 ompt_mutex_nest_lock, omp_lock_hint_none,
3141 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3142 codeptr);
3143 }
3144 #endif
3145
3146 rc = TEST_NESTED_LOCK(lck, gtid);
3147 #if USE_ITT_BUILD
3148 if (rc) {
3149 __kmp_itt_lock_acquired(lck);
3150 } else {
3151 __kmp_itt_lock_cancelled(lck);
3152 }
3153 #endif /* USE_ITT_BUILD */
3154 #if OMPT_SUPPORT && OMPT_OPTIONAL
3155 if (ompt_enabled.enabled && rc) {
3156 if (rc == 1) {
3157 if (ompt_enabled.ompt_callback_mutex_acquired) {
3158 // lock_first
3159 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3160 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3161 }
3162 } else {
3163 if (ompt_enabled.ompt_callback_nest_lock) {
3164 // lock_next
3165 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3166 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3167 }
3168 }
3169 }
3170 #endif
3171 return rc;
3172
3173 /* Can't use serial interval since not block structured */
3174
3175 #endif // KMP_USE_DYNAMIC_LOCK
3176 }
3177
3178 // Interface to fast scalable reduce methods routines
3179
3180 // keep the selected method in a thread local structure for cross-function
3181 // usage: will be used in __kmpc_end_reduce* functions;
3182 // another solution: to re-determine the method one more time in
3183 // __kmpc_end_reduce* functions (new prototype required then)
3184 // AT: which solution is better?
3185 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3186 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3187
3188 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3189 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3190
3191 // description of the packed_reduction_method variable: look at the macros in
3192 // kmp.h
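/* A usage sketch (illustrative only): how the two macros cooperate across
   the reduce entry points; critical_reduce_block is one of the packed method
   values from kmp.h.
@code
// in __kmpc_reduce_nowait(), once the method has been chosen:
__KMP_SET_REDUCTION_METHOD(gtid, critical_reduce_block);
// later, in __kmpc_end_reduce_nowait():
PACKED_REDUCTION_METHOD_T method = __KMP_GET_REDUCTION_METHOD(gtid);
@endcode
*/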
3193
3194 // used in a critical section reduce block
3195 static __forceinline void
3196 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3197 kmp_critical_name *crit) {
3198
3199 // this lock was visible to a customer and to the threading profile tool as a
3200 // serial overhead span (although it's used for an internal purpose only)
3201 // why was it visible in previous implementation?
3202 // should we keep it visible in new reduce block?
3203 kmp_user_lock_p lck;
3204
3205 #if KMP_USE_DYNAMIC_LOCK
3206
3207 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3208 // Check if it is initialized.
3209 if (*lk == 0) {
3210 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3211 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3212 KMP_GET_D_TAG(__kmp_user_lock_seq));
3213 } else {
3214 __kmp_init_indirect_csptr(crit, loc, global_tid,
3215 KMP_GET_I_TAG(__kmp_user_lock_seq));
3216 }
3217 }
3218 // Branch for accessing the actual lock object and set operation. This
3219 // branching is inevitable since this lock initialization does not follow the
3220 // normal dispatch path (lock table is not used).
3221 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3222 lck = (kmp_user_lock_p)lk;
3223 KMP_DEBUG_ASSERT(lck != NULL);
3224 if (__kmp_env_consistency_check) {
3225 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3226 }
3227 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3228 } else {
3229 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3230 lck = ilk->lock;
3231 KMP_DEBUG_ASSERT(lck != NULL);
3232 if (__kmp_env_consistency_check) {
3233 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3234 }
3235 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3236 }
3237
3238 #else // KMP_USE_DYNAMIC_LOCK
3239
3240 // We know that the fast reduction code is only emitted by Intel compilers
3241 // with 32 byte critical sections. If there isn't enough space, then we
3242 // have to use a pointer.
3243 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3244 lck = (kmp_user_lock_p)crit;
3245 } else {
3246 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3247 }
3248 KMP_DEBUG_ASSERT(lck != NULL);
3249
3250 if (__kmp_env_consistency_check)
3251 __kmp_push_sync(global_tid, ct_critical, loc, lck);
3252
3253 __kmp_acquire_user_lock_with_checks(lck, global_tid);
3254
3255 #endif // KMP_USE_DYNAMIC_LOCK
3256 }
3257
3258 // used in a critical section reduce block
3259 static __forceinline void
3260 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3261 kmp_critical_name *crit) {
3262
3263 kmp_user_lock_p lck;
3264
3265 #if KMP_USE_DYNAMIC_LOCK
3266
3267 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3268 lck = (kmp_user_lock_p)crit;
3269 if (__kmp_env_consistency_check)
3270 __kmp_pop_sync(global_tid, ct_critical, loc);
3271 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3272 } else {
3273 kmp_indirect_lock_t *ilk =
3274 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3275 if (__kmp_env_consistency_check)
3276 __kmp_pop_sync(global_tid, ct_critical, loc);
3277 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3278 }
3279
3280 #else // KMP_USE_DYNAMIC_LOCK
3281
3282 // We know that the fast reduction code is only emitted by Intel compilers
3283 // with 32 byte critical sections. If there isn't enough space, then we have
3284 // to use a pointer.
3285 if (__kmp_base_user_lock_size > 32) {
3286 lck = *((kmp_user_lock_p *)crit);
3287 KMP_ASSERT(lck != NULL);
3288 } else {
3289 lck = (kmp_user_lock_p)crit;
3290 }
3291
3292 if (__kmp_env_consistency_check)
3293 __kmp_pop_sync(global_tid, ct_critical, loc);
3294
3295 __kmp_release_user_lock_with_checks(lck, global_tid);
3296
3297 #endif // KMP_USE_DYNAMIC_LOCK
3298 } // __kmp_end_critical_section_reduce_block
3299
#if OMP_40_ENABLED
static __forceinline int
__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
                                     int *task_state) {
  kmp_team_t *team;

  // Are we inside the teams construct?
  if (th->th.th_teams_microtask) {
    *team_p = team = th->th.th_team;
    if (team->t.t_level == th->th.th_teams_level) {
      // This is a reduction at the teams construct.
      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
      // Let's swap teams temporarily for the reduction.
      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
      th->th.th_team = team->t.t_parent;
      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
      *task_state = th->th.th_task_state;
      th->th.th_task_state = 0;

      return 1;
    }
  }
  return 0;
}

static __forceinline void
__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
  th->th.th_info.ds.ds_tid = 0;
  th->th.th_team = team;
  th->th.th_team_nproc = team->t.t_nproc;
  th->th.th_task_team = team->t.t_task_team[task_state];
  th->th.th_task_state = task_state;
}
#endif
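
// Illustration (not part of the runtime): the swap above is exercised when a
// reduction is attached to the teams construct itself, e.g. user code such as
//
//   #pragma omp teams num_teams(4) reduction(+ : x)
//   x += foo();
//
// Each team's initial thread arrives here with tid 0; the swap temporarily
// makes it look like a member of the parent team for the duration of the
// reduction, so the cross-team reduction can reuse the ordinary
// team-reduction machinery.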

/* 2.a.i. Reduce Block without a terminating barrier */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction is needed

The nowait version is used for a reduce clause with the nowait argument.
*/
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of a false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // the packed_reduction_method value will be reused by the __kmp_end_reduce*
  // functions, so the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team-specific property
  // (a thread can reach the next reduce block on the next construct, and the
  // reduce method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  // (if both construct-specific and team-specific variables were shared,
  // then unnecessary extra syncs would be needed)
  // a thread-specific variable is better regarding the two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (no need to have one
  // thread execute them, which would require unnecessary extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    // (it's not quite good, because the checking block has been closed by
    // this 'pop', but the atomic operation has not been executed yet; it will
    // be executed slightly later, literally on the next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // AT: performance issue: a real barrier here
    // AT: (if master goes slow, other threads are blocked here waiting for the
    // master to come and release them)
    // AT: (it's not what a customer might expect specifying NOWAIT clause)
    // AT: (specifying NOWAIT won't result in improvement of performance, it'll
    // be confusing to a customer)
    // AT: another implementation of *barrier_gather*nowait() (or some other
    // design) might go faster and be more in line with the sense of NOWAIT
    // AT: TO DO: do epcc test and compare times

    // this barrier should be invisible to a customer and to the threading
    // profile tool (it's neither a terminating barrier nor customer's code,
    // it's used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    // so we set-up the necessary frame/return addresses.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    // (none of the other workers will get to __kmpc_end_reduce_nowait())
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}
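
// A minimal sketch (illustrative only, not part of the runtime) of how a
// compiler might lower "#pragma omp for reduction(+ : sum) nowait" onto the
// entry points above. The names example_lck and example_reduce_func, and the
// use of a __sync built-in as a stand-in for an atomic add, are assumptions
// made for this example; real codegen differs in detail.
#if 0
static kmp_critical_name example_lck; // compiler-generated lock storage

static void example_reduce_func(void *lhs_data, void *rhs_data) {
  // combine two partial results; lhs_data receives the reduced value
  *(int *)lhs_data += *(int *)rhs_data;
}

static void example_codegen_nowait(ident_t *loc, kmp_int32 gtid, int *sum,
                                   int priv_sum) {
  switch (__kmpc_reduce_nowait(loc, gtid, /*num_vars=*/1, sizeof(int),
                               &priv_sum, example_reduce_func, &example_lck)) {
  case 1: // critical/empty/tree method: this thread folds its value in
    *sum += priv_sum;
    __kmpc_end_reduce_nowait(loc, gtid, &example_lck);
    break;
  case 2: // atomic method: every thread updates atomically; no end call
    __sync_fetch_and_add(sum, priv_sum); // stand-in for an atomic add
    break;
  default: // 0: tree-method workers; contribution was consumed in the barrier
    break;
  }
}
#endif // end of illustrative sketch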

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
*/
void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
                              kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (on Intel
    // platforms only)

  } else if (packed_reduction_method == atomic_reduce_block) {

    // neither master nor other workers should get here
    // (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely; after
    // removal this value would fall through to the 'else' branch and trigger
    // the assert

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master gets here

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

/* 2.a.ii. Reduce Block with a terminating barrier */

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction is needed

A blocking reduce that includes an implicit barrier.
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of a false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // this barrier should be visible to a customer and to the threading
    // profile tool (it's a terminating barrier on constructs if NOWAIT not
    // specified)
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    // (none of the other workers except master will enter __kmpc_end_reduce())
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}
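
// The blocking form pairs with __kmpc_end_reduce(), which also supplies the
// terminating barrier. A sketch under the same assumptions as the nowait
// example above (illustrative only): note that with the atomic method the
// end call is still required, because the construct's barrier lives there.
#if 0
static void example_codegen_blocking(ident_t *loc, kmp_int32 gtid, int *sum,
                                     int priv_sum) {
  kmp_int32 r = __kmpc_reduce(loc, gtid, /*num_vars=*/1, sizeof(int),
                              &priv_sum, example_reduce_func, &example_lck);
  if (r == 1) { // fold the value in, then end (which includes the barrier)
    *sum += priv_sum;
    __kmpc_end_reduce(loc, gtid, &example_lck);
  } else if (r == 2) { // atomic update, then join the terminating barrier
    __sync_fetch_and_add(sum, priv_sum);
    __kmpc_end_reduce(loc, gtid, &example_lck);
  }
  // r == 0: tree-method workers are released by the master from inside
  // __kmpc_end_reduce() and do not call it themselves.
}
#endif // end of illustrative sketch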

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding
start function.
*/
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid

#if OMP_45_ENABLED
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check that the shared buffer is not occupied by another loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
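
// Worked example (illustrative): for the 2-D nest
//   for (i = 2; i < 9; i += 2)   // lo=2, up=8, st=2 -> 4 iterations
//     for (j = 0; j <= 4; j++)   // lo=0, up=4, st=1 -> 5 iterations
// th_doacross_info is packed as
//   [0]=2 (num_dims), [1]=&sh_buf->doacross_num_done,
//   [2]=2, [3]=8, [4]=2           (lo/up/st of dim 0),
//   [5]=5, [6]=0, [7]=4, [8]=1    (range/lo/up/st of dim 1),
// and trace_count = 4 * 5 = 20, so 20 flag bits are allocated.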

void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
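
// Worked example (illustrative, using the 2-D nest from the init example
// above): vec = {6, 3} gives iter_number = (6 - 2) / 2 = 2 for dim 0, then
// iter_number = 3 + 5 * 2 = 13 after folding in dim 1, i.e. a mixed-radix
// linearization with the kept range lengths as radices; the code above then
// waits on bit 13 % 32 = 13 of flags word 13 / 32 = 0.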

void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
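
// A minimal sketch (illustrative only) of the call sequence a compiler might
// emit for a 1-D doacross loop such as "#pragma omp for ordered(1)" with
// depend(sink: i - 1) / depend(source); the chunk bounds lb/ub and the loop
// body are assumptions made for this example.
#if 0
static void example_doacross(ident_t *loc, int gtid, kmp_int64 lb,
                             kmp_int64 ub) {
  struct kmp_dim dim = {/*lo=*/1, /*up=*/99, /*st=*/1}; // inclusive bounds
  __kmpc_doacross_init(loc, gtid, /*num_dims=*/1, &dim);
  for (kmp_int64 i = lb; i <= ub; ++i) { // this thread's chunk [lb, ub]
    kmp_int64 sink = i - 1; // depend(sink: i - 1)
    // for i == 1 the sink is below lo, so the wait returns immediately
    __kmpc_doacross_wait(loc, gtid, &sink);
    // ... ordered work for iteration i ...
    __kmpc_doacross_post(loc, gtid, &i); // depend(source)
  }
  __kmpc_doacross_fini(loc, gtid);
}
#endif // end of illustrative sketch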
#endif // OMP_45_ENABLED

#if OMP_50_ENABLED
int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}
#endif // OMP_50_ENABLED

// end of file //