1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
 * Shut down the runtime library. This call is also optional, and even if made
 * it will do nothing unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
58  */
59 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
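
As a minimal illustrative sketch (hypothetical call site, not part of the
runtime itself), code linked against the library simply captures the value:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc); // loc: compiler-built ident_t
@endcode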
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
This is not a number that can be determined by any OpenMP standard call, since
the library may be entered from more than one non-OpenMP thread and the value
reflects the total over all such entries. Similarly, because the runtime keeps
underlying threads alive even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads whether
or not they are currently waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
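
As an illustrative sketch of an assumed lowering (names such as @c loc,
@c gtid and @c outlined_body are hypothetical), a compiler might translate
@code
#pragma omp parallel num_threads(4)
{ body(); }
@endcode
into
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
@endcode
The request applies only to the next fork spawned by this thread.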
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
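
As an illustrative sketch of an assumed lowering (the outlined routine and its
name are hypothetical), a construct with one shared variable such as
@code
int x = 0;
#pragma omp parallel shared(x)
{ work(&x); }
@endcode
is typically compiled into an outlined microtask plus a fork call:
@code
void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) { work(x); }
// ... at the construct:
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode
with one trailing pointer per shared variable (hence argc == 1 here).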
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
  // maybe saving thr_state would be enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc source location information
336 @param global_tid global thread number
337 @param num_teams number of teams requested for the teams construct
338 @param num_threads number of threads per team requested for the teams construct
339 
340 Set the number of teams to be used by the teams construct.
341 This call is only required if the teams construct has a `num_teams` clause
342 or a `thread_limit` clause (or both).
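
An illustrative sketch of an assumed lowering (names are hypothetical):
@code
#pragma omp teams num_teams(8) thread_limit(4)
{ team_body(); }
@endcode
becomes, roughly,
@code
__kmpc_push_num_teams(&loc, gtid, 8, 4);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_team_body);
@endcode
where @c gtid was obtained from __kmpc_global_thread_num().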
343 */
344 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
345                            kmp_int32 num_teams, kmp_int32 num_threads) {
346   KA_TRACE(20,
347            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
348             global_tid, num_teams, num_threads));
349   __kmp_assert_valid_gtid(global_tid);
350   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
351 }
352 
353 /*!
354 @ingroup PARALLEL
355 @param loc source location information
356 @param global_tid global thread number
357 @param num_teams_lb lower bound on number of teams requested for the teams
358 construct
359 @param num_teams_ub upper bound on number of teams requested for the teams
360 construct
361 @param num_threads number of threads per team requested for the teams construct
362 
Set the number of teams to be used by the teams construct. The number of
initial teams created will be greater than or equal to the lower bound and
less than or equal to the upper bound.
366 This call is only required if the teams construct has a `num_teams` clause
367 or a `thread_limit` clause (or both).
368 */
369 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
370                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
371                               kmp_int32 num_threads) {
372   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
373                 " num_teams_ub=%d num_threads=%d\n",
374                 global_tid, num_teams_lb, num_teams_ub, num_threads));
375   __kmp_assert_valid_gtid(global_tid);
376   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
377                           num_threads);
378 }
379 
380 /*!
381 @ingroup PARALLEL
382 @param loc  source location information
383 @param argc  total number of arguments in the ellipsis
384 @param microtask  pointer to callback routine consisting of outlined teams
385 construct
386 @param ...  pointers to shared variables that aren't global
387 
388 Do the actual fork and call the microtask in the relevant number of threads.
389 */
390 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
391                        ...) {
392   int gtid = __kmp_entry_gtid();
393   kmp_info_t *this_thr = __kmp_threads[gtid];
394   va_list ap;
395   va_start(ap, microtask);
396 
397 #if KMP_STATS_ENABLED
398   KMP_COUNT_BLOCK(OMP_TEAMS);
399   stats_state_e previous_state = KMP_GET_THREAD_STATE();
400   if (previous_state == stats_state_e::SERIAL_REGION) {
401     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
402   } else {
403     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
404   }
405 #endif
406 
407   // remember teams entry point and nesting level
408   this_thr->th.th_teams_microtask = microtask;
409   this_thr->th.th_teams_level =
410       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
411 
412 #if OMPT_SUPPORT
413   kmp_team_t *parent_team = this_thr->th.th_team;
414   int tid = __kmp_tid_from_gtid(gtid);
415   if (ompt_enabled.enabled) {
416     parent_team->t.t_implicit_task_taskdata[tid]
417         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
418   }
419   OMPT_STORE_RETURN_ADDRESS(gtid);
420 #endif
421 
  // check if __kmpc_push_num_teams was called and set the default number of
  // teams otherwise
424   if (this_thr->th.th_teams_size.nteams == 0) {
425     __kmp_push_num_teams(loc, gtid, 0, 0);
426   }
427   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
428   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
430 
431   __kmp_fork_call(
432       loc, gtid, fork_context_intel, argc,
433       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
434       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
435   __kmp_join_call(loc, gtid
436 #if OMPT_SUPPORT
437                   ,
438                   fork_context_intel
439 #endif
440   );
441 
442   // Pop current CG root off list
443   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
444   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
445   this_thr->th.th_cg_roots = tmp->up;
446   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
447                  " to node %p. cg_nthreads was %d\n",
448                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
449   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
450   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
452     __kmp_free(tmp);
453   }
454   // Restore current task's thread_limit from CG root
455   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
456   this_thr->th.th_current_task->td_icvs.thread_limit =
457       this_thr->th.th_cg_roots->cg_thread_limit;
458 
459   this_thr->th.th_teams_microtask = NULL;
460   this_thr->th.th_teams_level = 0;
461   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
462   va_end(ap);
463 #if KMP_STATS_ENABLED
464   if (previous_state == stats_state_e::SERIAL_REGION) {
465     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
466     KMP_SET_THREAD_STATE(previous_state);
467   } else {
468     KMP_POP_PARTITIONED_TIMER();
469   }
470 #endif // KMP_STATS_ENABLED
471 }
472 
473 // I don't think this function should ever have been exported.
474 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
475 // openmp code ever called it, but it's been exported from the RTL for so
476 // long that I'm afraid to remove the definition.
477 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
478 
479 /*!
480 @ingroup PARALLEL
481 @param loc  source location information
482 @param global_tid  global thread number
483 
484 Enter a serialized parallel construct. This interface is used to handle a
485 conditional parallel region, like this,
486 @code
487 #pragma omp parallel if (condition)
488 @endcode
489 when the condition is false.
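
A rough sketch of the assumed lowering (illustrative only; @c outlined_body
and @c loc are hypothetical names):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
} else {
  kmp_int32 bound_tid = 0;
  __kmpc_serialized_parallel(&loc, gtid);
  outlined_body(&gtid, &bound_tid); // the "team" of one runs inline
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode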
490 */
491 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
492   // The implementation is now in kmp_runtime.cpp so that it can share static
493   // functions with kmp_fork_call since the tasks to be done are similar in
494   // each case.
495   __kmp_assert_valid_gtid(global_tid);
496 #if OMPT_SUPPORT
497   OMPT_STORE_RETURN_ADDRESS(global_tid);
498 #endif
499   __kmp_serialized_parallel(loc, global_tid);
500 }
501 
502 /*!
503 @ingroup PARALLEL
504 @param loc  source location information
505 @param global_tid  global thread number
506 
507 Leave a serialized parallel construct.
508 */
509 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
510   kmp_internal_control_t *top;
511   kmp_info_t *this_thr;
512   kmp_team_t *serial_team;
513 
514   KC_TRACE(10,
515            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
516 
517   /* skip all this code for autopar serialized loops since it results in
518      unacceptable overhead */
519   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
520     return;
521 
522   // Not autopar code
523   __kmp_assert_valid_gtid(global_tid);
524   if (!TCR_4(__kmp_init_parallel))
525     __kmp_parallel_initialize();
526 
527   __kmp_resume_if_soft_paused();
528 
529   this_thr = __kmp_threads[global_tid];
530   serial_team = this_thr->th.th_serial_team;
531 
532   kmp_task_team_t *task_team = this_thr->th.th_task_team;
533   // we need to wait for the proxy tasks before finishing the thread
534   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
535                             task_team->tt.tt_hidden_helper_task_encountered))
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
    // reset/clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   /* pop dispatch buffers stack */
581   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
582   {
583     dispatch_private_info_t *disp_buffer =
584         serial_team->t.t_dispatch->th_disp_buffer;
585     serial_team->t.t_dispatch->th_disp_buffer =
586         serial_team->t.t_dispatch->th_disp_buffer->next;
587     __kmp_free(disp_buffer);
588   }
589   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
590 
591   --serial_team->t.t_serialized;
592   if (serial_team->t.t_serialized == 0) {
593 
594     /* return to the parallel section */
595 
596 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
597     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
598       __kmp_clear_x87_fpu_status_word();
599       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
600       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
601     }
602 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
603 
604     __kmp_pop_current_task_from_thread(this_thr);
605 #if OMPD_SUPPORT
606     if (ompd_state & OMPD_ENABLE_BP)
607       ompd_bp_parallel_end();
608 #endif
609 
610     this_thr->th.th_team = serial_team->t.t_parent;
611     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
612 
613     /* restore values cached in the thread */
614     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
615     this_thr->th.th_team_master =
616         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
617     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
618 
619     /* TODO the below shouldn't need to be adjusted for serialized teams */
620     this_thr->th.th_dispatch =
621         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
622 
623     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
624     this_thr->th.th_current_task->td_flags.executing = 1;
625 
626     if (__kmp_tasking_mode != tskm_immediate_exec) {
627       // Copy the task team from the new child / old parent team to the thread.
628       this_thr->th.th_task_team =
629           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
630       KA_TRACE(20,
631                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
632                 "team %p\n",
633                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
634     }
635   } else {
636     if (__kmp_tasking_mode != tskm_immediate_exec) {
637       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
638                     "depth of serial team %p to %d\n",
639                     global_tid, serial_team, serial_team->t.t_serialized));
640     }
641   }
642 
643   serial_team->t.t_level--;
644   if (__kmp_env_consistency_check)
645     __kmp_pop_parallel(global_tid, NULL);
646 #if OMPT_SUPPORT
647   if (ompt_enabled.enabled)
648     this_thr->th.ompt_thread_info.state =
649         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
650                                            : ompt_state_work_parallel);
651 #endif
652 }
653 
654 /*!
655 @ingroup SYNCHRONIZATION
656 @param loc  source location information.
657 
658 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
659 depending on the memory ordering convention obeyed by the compiler
660 even that may not be necessary).
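
A minimal sketch of the assumed lowering (illustrative only):
@code
#pragma omp flush
@endcode
becomes a single runtime call:
@code
__kmpc_flush(&loc);
@endcode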
661 */
662 void __kmpc_flush(ident_t *loc) {
663   KC_TRACE(10, ("__kmpc_flush: called\n"));
664 
  /* need an explicit __mf() here since the library uses volatile instead */
666   KMP_MB(); /* Flush all pending memory write invalidates.  */
667 
668 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
669 #if KMP_MIC
670 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
671 // We shouldn't need it, though, since the ABI rules require that
672 // * If the compiler generates NGO stores it also generates the fence
673 // * If users hand-code NGO stores they should insert the fence
674 // therefore no incomplete unordered stores should be visible.
675 #else
676   // C74404
677   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
682   if (!__kmp_cpuinfo.initialized) {
683     __kmp_query_cpuid(&__kmp_cpuinfo);
684   }
685   if (!__kmp_cpuinfo.flags.sse2) {
686     // CPU cannot execute SSE2 instructions.
687   } else {
688 #if KMP_COMPILER_ICC || KMP_COMPILER_ICX
689     _mm_mfence();
690 #elif KMP_COMPILER_MSVC
691     MemoryBarrier();
692 #else
693     __sync_synchronize();
694 #endif // KMP_COMPILER_ICC || KMP_COMPILER_ICX
695   }
696 #endif // KMP_MIC
697 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
698        KMP_ARCH_RISCV64)
// Nothing to see here, move along
700 #elif KMP_ARCH_PPC64
701 // Nothing needed here (we have a real MB above).
702 #else
703 #error Unknown or unsupported architecture
704 #endif
705 
706 #if OMPT_SUPPORT && OMPT_OPTIONAL
707   if (ompt_enabled.ompt_callback_flush) {
708     ompt_callbacks.ompt_callback(ompt_callback_flush)(
709         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
710   }
711 #endif
712 }
713 
714 /* -------------------------------------------------------------------------- */
715 /*!
716 @ingroup SYNCHRONIZATION
717 @param loc source location information
718 @param global_tid thread id.
719 
720 Execute a barrier.
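
A minimal sketch of the assumed lowering (illustrative only; @c gtid is the
value returned by __kmpc_global_thread_num()):
@code
#pragma omp barrier
@endcode
becomes
@code
__kmpc_barrier(&loc, gtid);
@endcode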
721 */
722 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
723   KMP_COUNT_BLOCK(OMP_BARRIER);
724   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
725   __kmp_assert_valid_gtid(global_tid);
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729 
730   __kmp_resume_if_soft_paused();
731 
732   if (__kmp_env_consistency_check) {
733     if (loc == 0) {
734       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
735     }
736     __kmp_check_barrier(global_tid, ct_barrier, loc);
737   }
738 
739 #if OMPT_SUPPORT
740   ompt_frame_t *ompt_frame;
741   if (ompt_enabled.enabled) {
742     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
743     if (ompt_frame->enter_frame.ptr == NULL)
744       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
745   }
746   OMPT_STORE_RETURN_ADDRESS(global_tid);
747 #endif
748   __kmp_threads[global_tid]->th.th_ident = loc;
749   // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
755 
756   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
757 #if OMPT_SUPPORT && OMPT_OPTIONAL
758   if (ompt_enabled.enabled) {
759     ompt_frame->enter_frame = ompt_data_none;
760   }
761 #endif
762 }
763 
764 /* The BARRIER for a MASTER section is always explicit   */
765 /*!
766 @ingroup WORK_SHARING
767 @param loc  source location information.
@param global_tid  global thread number.
769 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
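
A rough sketch of the assumed lowering (illustrative only):
@code
if (__kmpc_master(&loc, gtid)) {
  body(); // only the primary thread executes this
  __kmpc_end_master(&loc, gtid);
}
@endcode
No barrier is implied; if one is needed it is emitted explicitly.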
770 */
771 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
772   int status = 0;
773 
774   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
775   __kmp_assert_valid_gtid(global_tid);
776 
777   if (!TCR_4(__kmp_init_parallel))
778     __kmp_parallel_initialize();
779 
780   __kmp_resume_if_soft_paused();
781 
782   if (KMP_MASTER_GTID(global_tid)) {
783     KMP_COUNT_BLOCK(OMP_MASTER);
784     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
785     status = 1;
786   }
787 
788 #if OMPT_SUPPORT && OMPT_OPTIONAL
789   if (status) {
790     if (ompt_enabled.ompt_callback_masked) {
791       kmp_info_t *this_thr = __kmp_threads[global_tid];
792       kmp_team_t *team = this_thr->th.th_team;
793 
794       int tid = __kmp_tid_from_gtid(global_tid);
795       ompt_callbacks.ompt_callback(ompt_callback_masked)(
796           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
797           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
798           OMPT_GET_RETURN_ADDRESS(0));
799     }
800   }
801 #endif
802 
803   if (__kmp_env_consistency_check) {
804 #if KMP_USE_DYNAMIC_LOCK
805     if (status)
806       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
807     else
808       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
809 #else
810     if (status)
811       __kmp_push_sync(global_tid, ct_master, loc, NULL);
812     else
813       __kmp_check_sync(global_tid, ct_master, loc, NULL);
814 #endif
815   }
816 
817   return status;
818 }
819 
820 /*!
821 @ingroup WORK_SHARING
822 @param loc  source location information.
@param global_tid  global thread number.
824 
825 Mark the end of a <tt>master</tt> region. This should only be called by the
826 thread that executes the <tt>master</tt> region.
827 */
828 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
829   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
830   __kmp_assert_valid_gtid(global_tid);
831   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
832   KMP_POP_PARTITIONED_TIMER();
833 
834 #if OMPT_SUPPORT && OMPT_OPTIONAL
835   kmp_info_t *this_thr = __kmp_threads[global_tid];
836   kmp_team_t *team = this_thr->th.th_team;
837   if (ompt_enabled.ompt_callback_masked) {
838     int tid = __kmp_tid_from_gtid(global_tid);
839     ompt_callbacks.ompt_callback(ompt_callback_masked)(
840         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
841         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
842         OMPT_GET_RETURN_ADDRESS(0));
843   }
844 #endif
845 
846   if (__kmp_env_consistency_check) {
847     if (KMP_MASTER_GTID(global_tid))
848       __kmp_pop_sync(global_tid, ct_master, loc);
849   }
850 }
851 
852 /*!
853 @ingroup WORK_SHARING
854 @param loc  source location information.
855 @param global_tid  global thread number.
856 @param filter result of evaluating filter clause on thread global_tid, or zero
857 if no filter clause present
858 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
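
A rough sketch of the assumed lowering of a filtered masked construct
(illustrative only):
@code
// #pragma omp masked filter(2)
if (__kmpc_masked(&loc, gtid, 2)) {
  body(); // only the thread whose team-local id equals the filter runs this
  __kmpc_end_masked(&loc, gtid);
}
@endcode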
859 */
860 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
861   int status = 0;
862   int tid;
863   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
864   __kmp_assert_valid_gtid(global_tid);
865 
866   if (!TCR_4(__kmp_init_parallel))
867     __kmp_parallel_initialize();
868 
869   __kmp_resume_if_soft_paused();
870 
871   tid = __kmp_tid_from_gtid(global_tid);
872   if (tid == filter) {
873     KMP_COUNT_BLOCK(OMP_MASKED);
874     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
875     status = 1;
876   }
877 
878 #if OMPT_SUPPORT && OMPT_OPTIONAL
879   if (status) {
880     if (ompt_enabled.ompt_callback_masked) {
881       kmp_info_t *this_thr = __kmp_threads[global_tid];
882       kmp_team_t *team = this_thr->th.th_team;
883       ompt_callbacks.ompt_callback(ompt_callback_masked)(
884           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
885           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
886           OMPT_GET_RETURN_ADDRESS(0));
887     }
888   }
889 #endif
890 
891   if (__kmp_env_consistency_check) {
892 #if KMP_USE_DYNAMIC_LOCK
893     if (status)
894       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
895     else
896       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
897 #else
898     if (status)
899       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
900     else
901       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
902 #endif
903   }
904 
905   return status;
906 }
907 
908 /*!
909 @ingroup WORK_SHARING
910 @param loc  source location information.
@param global_tid  global thread number.
912 
913 Mark the end of a <tt>masked</tt> region. This should only be called by the
914 thread that executes the <tt>masked</tt> region.
915 */
916 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
917   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
918   __kmp_assert_valid_gtid(global_tid);
919   KMP_POP_PARTITIONED_TIMER();
920 
921 #if OMPT_SUPPORT && OMPT_OPTIONAL
922   kmp_info_t *this_thr = __kmp_threads[global_tid];
923   kmp_team_t *team = this_thr->th.th_team;
924   if (ompt_enabled.ompt_callback_masked) {
925     int tid = __kmp_tid_from_gtid(global_tid);
926     ompt_callbacks.ompt_callback(ompt_callback_masked)(
927         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
928         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
929         OMPT_GET_RETURN_ADDRESS(0));
930   }
931 #endif
932 
933   if (__kmp_env_consistency_check) {
934     __kmp_pop_sync(global_tid, ct_masked, loc);
935   }
936 }
937 
938 /*!
939 @ingroup WORK_SHARING
940 @param loc  source location information.
941 @param gtid  global thread number.
942 
943 Start execution of an <tt>ordered</tt> construct.
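
A rough sketch of the assumed lowering (illustrative only):
@code
// inside an ordered loop iteration:
__kmpc_ordered(&loc, gtid);
body(); // executed in iteration order
__kmpc_end_ordered(&loc, gtid);
@endcode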
944 */
945 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
946   int cid = 0;
947   kmp_info_t *th;
948   KMP_DEBUG_ASSERT(__kmp_init_serial);
949 
950   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
951   __kmp_assert_valid_gtid(gtid);
952 
953   if (!TCR_4(__kmp_init_parallel))
954     __kmp_parallel_initialize();
955 
956   __kmp_resume_if_soft_paused();
957 
958 #if USE_ITT_BUILD
959   __kmp_itt_ordered_prep(gtid);
960 // TODO: ordered_wait_id
961 #endif /* USE_ITT_BUILD */
962 
963   th = __kmp_threads[gtid];
964 
965 #if OMPT_SUPPORT && OMPT_OPTIONAL
966   kmp_team_t *team;
967   ompt_wait_id_t lck;
968   void *codeptr_ra;
969   OMPT_STORE_RETURN_ADDRESS(gtid);
970   if (ompt_enabled.enabled) {
971     team = __kmp_team_from_gtid(gtid);
972     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
973     /* OMPT state update */
974     th->th.ompt_thread_info.wait_id = lck;
975     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
976 
977     /* OMPT event callback */
978     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
979     if (ompt_enabled.ompt_callback_mutex_acquire) {
980       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
981           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
982           codeptr_ra);
983     }
984   }
985 #endif
986 
987   if (th->th.th_dispatch->th_deo_fcn != 0)
988     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
989   else
990     __kmp_parallel_deo(&gtid, &cid, loc);
991 
992 #if OMPT_SUPPORT && OMPT_OPTIONAL
993   if (ompt_enabled.enabled) {
994     /* OMPT state update */
995     th->th.ompt_thread_info.state = ompt_state_work_parallel;
996     th->th.ompt_thread_info.wait_id = 0;
997 
998     /* OMPT event callback */
999     if (ompt_enabled.ompt_callback_mutex_acquired) {
1000       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1001           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1002     }
1003   }
1004 #endif
1005 
1006 #if USE_ITT_BUILD
1007   __kmp_itt_ordered_start(gtid);
1008 #endif /* USE_ITT_BUILD */
1009 }
1010 
1011 /*!
1012 @ingroup WORK_SHARING
1013 @param loc  source location information.
1014 @param gtid  global thread number.
1015 
1016 End execution of an <tt>ordered</tt> construct.
1017 */
1018 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1019   int cid = 0;
1020   kmp_info_t *th;
1021 
1022   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1023   __kmp_assert_valid_gtid(gtid);
1024 
1025 #if USE_ITT_BUILD
1026   __kmp_itt_ordered_end(gtid);
1027 // TODO: ordered_wait_id
1028 #endif /* USE_ITT_BUILD */
1029 
1030   th = __kmp_threads[gtid];
1031 
1032   if (th->th.th_dispatch->th_dxo_fcn != 0)
1033     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1034   else
1035     __kmp_parallel_dxo(&gtid, &cid, loc);
1036 
1037 #if OMPT_SUPPORT && OMPT_OPTIONAL
1038   OMPT_STORE_RETURN_ADDRESS(gtid);
1039   if (ompt_enabled.ompt_callback_mutex_released) {
1040     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1041         ompt_mutex_ordered,
1042         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1043             ->t.t_ordered.dt.t_value,
1044         OMPT_LOAD_RETURN_ADDRESS(gtid));
1045   }
1046 #endif
1047 }
1048 
1049 #if KMP_USE_DYNAMIC_LOCK
1050 
1051 static __forceinline void
1052 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1053                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1054   // Pointer to the allocated indirect lock is written to crit, while indexing
1055   // is ignored.
1056   void *idx;
1057   kmp_indirect_lock_t **lck;
1058   lck = (kmp_indirect_lock_t **)crit;
1059   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1060   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1061   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1062   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1063   KA_TRACE(20,
1064            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1065 #if USE_ITT_BUILD
1066   __kmp_itt_critical_creating(ilk->lock, loc);
1067 #endif
1068   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1069   if (status == 0) {
1070 #if USE_ITT_BUILD
1071     __kmp_itt_critical_destroyed(ilk->lock);
1072 #endif
1073     // We don't really need to destroy the unclaimed lock here since it will be
1074     // cleaned up at program exit.
1075     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1076   }
1077   KMP_DEBUG_ASSERT(*lck != NULL);
1078 }
1079 
1080 // Fast-path acquire tas lock
1081 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1082   {                                                                            \
1083     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1084     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1085     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1086     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1087         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1088       kmp_uint32 spins;                                                        \
1089       KMP_FSYNC_PREPARE(l);                                                    \
1090       KMP_INIT_YIELD(spins);                                                   \
1091       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1092       do {                                                                     \
1093         if (TCR_4(__kmp_nth) >                                                 \
1094             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1095           KMP_YIELD(TRUE);                                                     \
1096         } else {                                                               \
1097           KMP_YIELD_SPIN(spins);                                               \
1098         }                                                                      \
1099         __kmp_spin_backoff(&backoff);                                          \
1100       } while (                                                                \
1101           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1102           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1103     }                                                                          \
1104     KMP_FSYNC_ACQUIRED(l);                                                     \
1105   }
1106 
1107 // Fast-path test tas lock
1108 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1109   {                                                                            \
1110     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1111     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1112     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1113     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1114          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1115   }
1116 
1117 // Fast-path release tas lock
1118 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1119   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1120 
1121 #if KMP_USE_FUTEX
1122 
1123 #include <sys/syscall.h>
1124 #include <unistd.h>
1125 #ifndef FUTEX_WAIT
1126 #define FUTEX_WAIT 0
1127 #endif
1128 #ifndef FUTEX_WAKE
1129 #define FUTEX_WAKE 1
1130 #endif
1131 
1132 // Fast-path acquire futex lock
1133 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1134   {                                                                            \
1135     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1136     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1137     KMP_MB();                                                                  \
1138     KMP_FSYNC_PREPARE(ftx);                                                    \
1139     kmp_int32 poll_val;                                                        \
1140     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1141                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1142                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1143       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1144       if (!cond) {                                                             \
1145         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1146                                          poll_val |                            \
1147                                              KMP_LOCK_BUSY(1, futex))) {       \
1148           continue;                                                            \
1149         }                                                                      \
1150         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1151       }                                                                        \
1152       kmp_int32 rc;                                                            \
1153       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1154                         NULL, NULL, 0)) != 0) {                                \
1155         continue;                                                              \
1156       }                                                                        \
1157       gtid_code |= 1;                                                          \
1158     }                                                                          \
1159     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1160   }
1161 
1162 // Fast-path test futex lock
1163 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1164   {                                                                            \
1165     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1166     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1167                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1168       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1169       rc = TRUE;                                                               \
1170     } else {                                                                   \
1171       rc = FALSE;                                                              \
1172     }                                                                          \
1173   }
1174 
1175 // Fast-path release futex lock
1176 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1177   {                                                                            \
1178     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1179     KMP_MB();                                                                  \
1180     KMP_FSYNC_RELEASING(ftx);                                                  \
1181     kmp_int32 poll_val =                                                       \
1182         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1183     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1184       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1185               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1186     }                                                                          \
1187     KMP_MB();                                                                  \
1188     KMP_YIELD_OVERSUB();                                                       \
1189   }
1190 
1191 #endif // KMP_USE_FUTEX
1192 
1193 #else // KMP_USE_DYNAMIC_LOCK
1194 
1195 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1196                                                       ident_t const *loc,
1197                                                       kmp_int32 gtid) {
1198   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1199 
1200   // Because of the double-check, the following load doesn't need to be volatile
1201   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1202 
1203   if (lck == NULL) {
1204     void *idx;
1205 
1206     // Allocate & initialize the lock.
1207     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1208     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1209     __kmp_init_user_lock_with_checks(lck);
1210     __kmp_set_user_lock_location(lck, loc);
1211 #if USE_ITT_BUILD
1212     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user
// but rather sets a name for the object (lock). See more details in
// kmp_itt.h.
1218 #endif /* USE_ITT_BUILD */
1219 
1220     // Use a cmpxchg instruction to slam the start of the critical section with
1221     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1222     // and use the lock that the other thread allocated.
1223     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1224 
1225     if (status == 0) {
1226 // Deallocate the lock and reload the value.
1227 #if USE_ITT_BUILD
1228       __kmp_itt_critical_destroyed(lck);
1229 // Let ITT know the lock is destroyed and the same memory location may be reused
1230 // for another purpose.
1231 #endif /* USE_ITT_BUILD */
1232       __kmp_destroy_user_lock_with_checks(lck);
1233       __kmp_user_lock_free(&idx, gtid, lck);
1234       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1235       KMP_DEBUG_ASSERT(lck != NULL);
1236     }
1237   }
1238   return lck;
1239 }
1240 
1241 #endif // KMP_USE_DYNAMIC_LOCK
1242 
1243 /*!
1244 @ingroup WORK_SHARING
1245 @param loc  source location information.
1246 @param global_tid  global thread number.
1247 @param crit identity of the critical section. This could be a pointer to a lock
1248 associated with the critical section, or some other suitably unique value.
1249 
1250 Enter code protected by a `critical` construct.
1251 This function blocks until the executing thread can enter the critical section.
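
A rough sketch of the assumed lowering (illustrative only; the compiler
typically emits one static, zero-initialized kmp_critical_name per named
critical section, here called @c crit_name_lock):
@code
static kmp_critical_name crit_name_lock;
// ... at the construct:
__kmpc_critical(&loc, gtid, &crit_name_lock);
body();
__kmpc_end_critical(&loc, gtid, &crit_name_lock);
@endcode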
1252 */
1253 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1254                      kmp_critical_name *crit) {
1255 #if KMP_USE_DYNAMIC_LOCK
1256 #if OMPT_SUPPORT && OMPT_OPTIONAL
1257   OMPT_STORE_RETURN_ADDRESS(global_tid);
1258 #endif // OMPT_SUPPORT
1259   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1260 #else
1261   KMP_COUNT_BLOCK(OMP_CRITICAL);
1262 #if OMPT_SUPPORT && OMPT_OPTIONAL
1263   ompt_state_t prev_state = ompt_state_undefined;
1264   ompt_thread_info_t ti;
1265 #endif
1266   kmp_user_lock_p lck;
1267 
1268   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1269   __kmp_assert_valid_gtid(global_tid);
1270 
1271   // TODO: add THR_OVHD_STATE
1272 
1273   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1274   KMP_CHECK_USER_LOCK_INIT();
1275 
1276   if ((__kmp_user_lock_kind == lk_tas) &&
1277       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1278     lck = (kmp_user_lock_p)crit;
1279   }
1280 #if KMP_USE_FUTEX
1281   else if ((__kmp_user_lock_kind == lk_futex) &&
1282            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1283     lck = (kmp_user_lock_p)crit;
1284   }
1285 #endif
1286   else { // ticket, queuing or drdpa
1287     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1288   }
1289 
1290   if (__kmp_env_consistency_check)
1291     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1292 
    // since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1297 
1298 #if USE_ITT_BUILD
1299   __kmp_itt_critical_acquiring(lck);
1300 #endif /* USE_ITT_BUILD */
1301 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1303   void *codeptr_ra = NULL;
1304   if (ompt_enabled.enabled) {
1305     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1306     /* OMPT state update */
1307     prev_state = ti.state;
1308     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1309     ti.state = ompt_state_wait_critical;
1310 
1311     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1313     if (ompt_enabled.ompt_callback_mutex_acquire) {
1314       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1315           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1316           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1317     }
1318   }
1319 #endif
  // The value of 'crit' should be usable as a critical_id for the critical
  // section directive.
1322   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1323 
1324 #if USE_ITT_BUILD
1325   __kmp_itt_critical_acquired(lck);
1326 #endif /* USE_ITT_BUILD */
1327 #if OMPT_SUPPORT && OMPT_OPTIONAL
1328   if (ompt_enabled.enabled) {
1329     /* OMPT state update */
1330     ti.state = prev_state;
1331     ti.wait_id = 0;
1332 
1333     /* OMPT event callback */
1334     if (ompt_enabled.ompt_callback_mutex_acquired) {
1335       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1336           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1337     }
1338   }
1339 #endif
1340   KMP_POP_PARTITIONED_TIMER();
1341 
1342   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1343   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1344 #endif // KMP_USE_DYNAMIC_LOCK
1345 }
1346 
1347 #if KMP_USE_DYNAMIC_LOCK
1348 
1349 // Converts the given hint to an internal lock implementation
1350 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1351 #if KMP_USE_TSX
1352 #define KMP_TSX_LOCK(seq) lockseq_##seq
1353 #else
1354 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1355 #endif
1356 
1357 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1358 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1359 #else
1360 #define KMP_CPUINFO_RTM 0
1361 #endif
1362 
1363   // Hints that do not require further logic
1364   if (hint & kmp_lock_hint_hle)
1365     return KMP_TSX_LOCK(hle);
1366   if (hint & kmp_lock_hint_rtm)
1367     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1368   if (hint & kmp_lock_hint_adaptive)
1369     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1370 
1371   // Rule out conflicting hints first by returning the default lock
1372   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1373     return __kmp_user_lock_seq;
1374   if ((hint & omp_lock_hint_speculative) &&
1375       (hint & omp_lock_hint_nonspeculative))
1376     return __kmp_user_lock_seq;
1377 
1378   // Do not even consider speculation when it appears to be contended
1379   if (hint & omp_lock_hint_contended)
1380     return lockseq_queuing;
1381 
1382   // Uncontended lock without speculation
1383   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1384     return lockseq_tas;
1385 
1386   // Use RTM lock for speculation
1387   if (hint & omp_lock_hint_speculative)
1388     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1389 
1390   return __kmp_user_lock_seq;
1391 }
1392 
1393 #if OMPT_SUPPORT && OMPT_OPTIONAL
1394 #if KMP_USE_DYNAMIC_LOCK
1395 static kmp_mutex_impl_t
1396 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1397   if (user_lock) {
1398     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1399     case 0:
1400       break;
1401 #if KMP_USE_FUTEX
1402     case locktag_futex:
1403       return kmp_mutex_impl_queuing;
1404 #endif
1405     case locktag_tas:
1406       return kmp_mutex_impl_spin;
1407 #if KMP_USE_TSX
1408     case locktag_hle:
1409     case locktag_rtm_spin:
1410       return kmp_mutex_impl_speculative;
1411 #endif
1412     default:
1413       return kmp_mutex_impl_none;
1414     }
1415     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1416   }
1417   KMP_ASSERT(ilock);
1418   switch (ilock->type) {
1419 #if KMP_USE_TSX
1420   case locktag_adaptive:
1421   case locktag_rtm_queuing:
1422     return kmp_mutex_impl_speculative;
1423 #endif
1424   case locktag_nested_tas:
1425     return kmp_mutex_impl_spin;
1426 #if KMP_USE_FUTEX
1427   case locktag_nested_futex:
1428 #endif
1429   case locktag_ticket:
1430   case locktag_queuing:
1431   case locktag_drdpa:
1432   case locktag_nested_ticket:
1433   case locktag_nested_queuing:
1434   case locktag_nested_drdpa:
1435     return kmp_mutex_impl_queuing;
1436   default:
1437     return kmp_mutex_impl_none;
1438   }
1439 }
1440 #else
1441 // For locks without dynamic binding
1442 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1443   switch (__kmp_user_lock_kind) {
1444   case lk_tas:
1445     return kmp_mutex_impl_spin;
1446 #if KMP_USE_FUTEX
1447   case lk_futex:
1448 #endif
1449   case lk_ticket:
1450   case lk_queuing:
1451   case lk_drdpa:
1452     return kmp_mutex_impl_queuing;
1453 #if KMP_USE_TSX
1454   case lk_hle:
1455   case lk_rtm_queuing:
1456   case lk_rtm_spin:
1457   case lk_adaptive:
1458     return kmp_mutex_impl_speculative;
1459 #endif
1460   default:
1461     return kmp_mutex_impl_none;
1462   }
1463 }
1464 #endif // KMP_USE_DYNAMIC_LOCK
1465 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1466 
1467 /*!
1468 @ingroup WORK_SHARING
1469 @param loc  source location information.
1470 @param global_tid  global thread number.
1471 @param crit identity of the critical section. This could be a pointer to a lock
1472 associated with the critical section, or some other suitably unique value.
1473 @param hint the lock hint.
1474 
1475 Enter code protected by a `critical` construct with a hint. The hint value is
1476 used to suggest a lock implementation. This function blocks until the executing
1477 thread can enter the critical section unless the hint suggests use of
1478 speculative execution and the hardware supports it.
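
As an illustration only (not a literal code-generation contract), a hinted
critical construct could be lowered along these lines, where the ident_t and
the name of the lock variable are hypothetical:
@code
static kmp_critical_name crit_lck = {0}; // zero-filled, one per critical name
__kmpc_critical_with_hint(&loc, gtid, &crit_lck, omp_lock_hint_speculative);
// ... code protected by the critical construct ...
__kmpc_end_critical(&loc, gtid, &crit_lck);
@endcode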
1479 */
1480 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1481                                kmp_critical_name *crit, uint32_t hint) {
1482   KMP_COUNT_BLOCK(OMP_CRITICAL);
1483   kmp_user_lock_p lck;
1484 #if OMPT_SUPPORT && OMPT_OPTIONAL
1485   ompt_state_t prev_state = ompt_state_undefined;
1486   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1488   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1489   if (!codeptr)
1490     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1491 #endif
1492 
1493   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1494   __kmp_assert_valid_gtid(global_tid);
1495 
1496   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1497   // Check if it is initialized.
1498   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1499   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1500   if (*lk == 0) {
1501     if (KMP_IS_D_LOCK(lockseq)) {
1502       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1503                                   KMP_GET_D_TAG(lockseq));
1504     } else {
1505       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1506     }
1507   }
1508   // Branch for accessing the actual lock object and set operation. This
1509   // branching is inevitable since this lock initialization does not follow the
1510   // normal dispatch path (lock table is not used).
1511   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1512     lck = (kmp_user_lock_p)lk;
1513     if (__kmp_env_consistency_check) {
1514       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1515                       __kmp_map_hint_to_lock(hint));
1516     }
1517 #if USE_ITT_BUILD
1518     __kmp_itt_critical_acquiring(lck);
1519 #endif
1520 #if OMPT_SUPPORT && OMPT_OPTIONAL
1521     if (ompt_enabled.enabled) {
1522       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1523       /* OMPT state update */
1524       prev_state = ti.state;
1525       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1526       ti.state = ompt_state_wait_critical;
1527 
1528       /* OMPT event callback */
1529       if (ompt_enabled.ompt_callback_mutex_acquire) {
1530         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1531             ompt_mutex_critical, (unsigned int)hint,
1532             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1533             codeptr);
1534       }
1535     }
1536 #endif
1537 #if KMP_USE_INLINED_TAS
1538     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1539       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1540     } else
1541 #elif KMP_USE_INLINED_FUTEX
1542     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1543       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1544     } else
1545 #endif
1546     {
1547       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1548     }
1549   } else {
1550     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1551     lck = ilk->lock;
1552     if (__kmp_env_consistency_check) {
1553       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1554                       __kmp_map_hint_to_lock(hint));
1555     }
1556 #if USE_ITT_BUILD
1557     __kmp_itt_critical_acquiring(lck);
1558 #endif
1559 #if OMPT_SUPPORT && OMPT_OPTIONAL
1560     if (ompt_enabled.enabled) {
1561       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1562       /* OMPT state update */
1563       prev_state = ti.state;
1564       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1565       ti.state = ompt_state_wait_critical;
1566 
1567       /* OMPT event callback */
1568       if (ompt_enabled.ompt_callback_mutex_acquire) {
1569         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1570             ompt_mutex_critical, (unsigned int)hint,
1571             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1572             codeptr);
1573       }
1574     }
1575 #endif
1576     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1577   }
1578   KMP_POP_PARTITIONED_TIMER();
1579 
1580 #if USE_ITT_BUILD
1581   __kmp_itt_critical_acquired(lck);
1582 #endif /* USE_ITT_BUILD */
1583 #if OMPT_SUPPORT && OMPT_OPTIONAL
1584   if (ompt_enabled.enabled) {
1585     /* OMPT state update */
1586     ti.state = prev_state;
1587     ti.wait_id = 0;
1588 
1589     /* OMPT event callback */
1590     if (ompt_enabled.ompt_callback_mutex_acquired) {
1591       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1592           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1593     }
1594   }
1595 #endif
1596 
1597   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1598   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1599 } // __kmpc_critical_with_hint
1600 
1601 #endif // KMP_USE_DYNAMIC_LOCK
1602 
1603 /*!
1604 @ingroup WORK_SHARING
1605 @param loc  source location information.
@param global_tid  global thread number.
1607 @param crit identity of the critical section. This could be a pointer to a lock
1608 associated with the critical section, or some other suitably unique value.
1609 
1610 Leave a critical section, releasing any lock that was held during its execution.
1611 */
1612 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1613                          kmp_critical_name *crit) {
1614   kmp_user_lock_p lck;
1615 
1616   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1617 
1618 #if KMP_USE_DYNAMIC_LOCK
1619   int locktag = KMP_EXTRACT_D_TAG(crit);
1620   if (locktag) {
1621     lck = (kmp_user_lock_p)crit;
1622     KMP_ASSERT(lck != NULL);
1623     if (__kmp_env_consistency_check) {
1624       __kmp_pop_sync(global_tid, ct_critical, loc);
1625     }
1626 #if USE_ITT_BUILD
1627     __kmp_itt_critical_releasing(lck);
1628 #endif
1629 #if KMP_USE_INLINED_TAS
1630     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1631       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1632     } else
1633 #elif KMP_USE_INLINED_FUTEX
1634     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1635       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1636     } else
1637 #endif
1638     {
1639       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1640     }
1641   } else {
1642     kmp_indirect_lock_t *ilk =
1643         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1644     KMP_ASSERT(ilk != NULL);
1645     lck = ilk->lock;
1646     if (__kmp_env_consistency_check) {
1647       __kmp_pop_sync(global_tid, ct_critical, loc);
1648     }
1649 #if USE_ITT_BUILD
1650     __kmp_itt_critical_releasing(lck);
1651 #endif
1652     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1653   }
1654 
1655 #else // KMP_USE_DYNAMIC_LOCK
1656 
1657   if ((__kmp_user_lock_kind == lk_tas) &&
1658       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1659     lck = (kmp_user_lock_p)crit;
1660   }
1661 #if KMP_USE_FUTEX
1662   else if ((__kmp_user_lock_kind == lk_futex) &&
1663            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1664     lck = (kmp_user_lock_p)crit;
1665   }
1666 #endif
1667   else { // ticket, queuing or drdpa
1668     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1669   }
1670 
1671   KMP_ASSERT(lck != NULL);
1672 
1673   if (__kmp_env_consistency_check)
1674     __kmp_pop_sync(global_tid, ct_critical, loc);
1675 
1676 #if USE_ITT_BUILD
1677   __kmp_itt_critical_releasing(lck);
1678 #endif /* USE_ITT_BUILD */
1679   // Value of 'crit' should be good for using as a critical_id of the critical
1680   // section directive.
1681   __kmp_release_user_lock_with_checks(lck, global_tid);
1682 
1683 #endif // KMP_USE_DYNAMIC_LOCK
1684 
1685 #if OMPT_SUPPORT && OMPT_OPTIONAL
1686   /* OMPT release event triggers after lock is released; place here to trigger
1687    * for all #if branches */
1688   OMPT_STORE_RETURN_ADDRESS(global_tid);
1689   if (ompt_enabled.ompt_callback_mutex_released) {
1690     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1691         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1692         OMPT_LOAD_RETURN_ADDRESS(0));
1693   }
1694 #endif
1695 
1696   KMP_POP_PARTITIONED_TIMER();
1697   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1698 }
1699 
1700 /*!
1701 @ingroup SYNCHRONIZATION
1702 @param loc source location information
1703 @param global_tid thread id.
1704 @return one if the thread should execute the master block, zero otherwise
1705 
1706 Start execution of a combined barrier and master. The barrier is executed inside
1707 this function.
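
For illustration, the intended pairing with @ref __kmpc_end_barrier_master
(loc and gtid are assumed to be set up by the caller):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... code executed only by the chosen thread ...
  __kmpc_end_barrier_master(&loc, gtid); // releases the waiting threads
}
@endcode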
1708 */
1709 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1710   int status;
1711   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1712   __kmp_assert_valid_gtid(global_tid);
1713 
1714   if (!TCR_4(__kmp_init_parallel))
1715     __kmp_parallel_initialize();
1716 
1717   __kmp_resume_if_soft_paused();
1718 
1719   if (__kmp_env_consistency_check)
1720     __kmp_check_barrier(global_tid, ct_barrier, loc);
1721 
1722 #if OMPT_SUPPORT
1723   ompt_frame_t *ompt_frame;
1724   if (ompt_enabled.enabled) {
1725     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1726     if (ompt_frame->enter_frame.ptr == NULL)
1727       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1728   }
1729   OMPT_STORE_RETURN_ADDRESS(global_tid);
1730 #endif
1731 #if USE_ITT_NOTIFY
1732   __kmp_threads[global_tid]->th.th_ident = loc;
1733 #endif
1734   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1735 #if OMPT_SUPPORT && OMPT_OPTIONAL
1736   if (ompt_enabled.enabled) {
1737     ompt_frame->enter_frame = ompt_data_none;
1738   }
1739 #endif
1740 
1741   return (status != 0) ? 0 : 1;
1742 }
1743 
1744 /*!
1745 @ingroup SYNCHRONIZATION
1746 @param loc source location information
1747 @param global_tid thread id.
1748 
1749 Complete the execution of a combined barrier and master. This function should
1750 only be called at the completion of the <tt>master</tt> code. Other threads will
1751 still be waiting at the barrier and this call releases them.
1752 */
1753 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1754   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1755   __kmp_assert_valid_gtid(global_tid);
1756   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1757 }
1758 
1759 /*!
1760 @ingroup SYNCHRONIZATION
1761 @param loc source location information
1762 @param global_tid thread id.
1763 @return one if the thread should execute the master block, zero otherwise
1764 
1765 Start execution of a combined barrier and master(nowait) construct.
1766 The barrier is executed inside this function.
There is no equivalent "end" function, since the barrier is complete by the
time this function returns and the other threads do not wait for the chosen
thread to finish its block.
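
For illustration (loc and gtid assumed to be set up by the caller):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // ... code executed only by the chosen thread; no "end" call is emitted ...
}
@endcode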
1768 */
1769 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1770   kmp_int32 ret;
1771   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1772   __kmp_assert_valid_gtid(global_tid);
1773 
1774   if (!TCR_4(__kmp_init_parallel))
1775     __kmp_parallel_initialize();
1776 
1777   __kmp_resume_if_soft_paused();
1778 
1779   if (__kmp_env_consistency_check) {
1780     if (loc == 0) {
1781       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1782     }
1783     __kmp_check_barrier(global_tid, ct_barrier, loc);
1784   }
1785 
1786 #if OMPT_SUPPORT
1787   ompt_frame_t *ompt_frame;
1788   if (ompt_enabled.enabled) {
1789     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1790     if (ompt_frame->enter_frame.ptr == NULL)
1791       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1792   }
1793   OMPT_STORE_RETURN_ADDRESS(global_tid);
1794 #endif
1795 #if USE_ITT_NOTIFY
1796   __kmp_threads[global_tid]->th.th_ident = loc;
1797 #endif
1798   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1799 #if OMPT_SUPPORT && OMPT_OPTIONAL
1800   if (ompt_enabled.enabled) {
1801     ompt_frame->enter_frame = ompt_data_none;
1802   }
1803 #endif
1804 
1805   ret = __kmpc_master(loc, global_tid);
1806 
1807   if (__kmp_env_consistency_check) {
    /* __kmpc_end_master is not called for this construct, so the bookkeeping
       actions of __kmpc_end_master are performed here. */
1810     if (ret) {
1811       /* only one thread should do the pop since only */
1812       /* one did the push (see __kmpc_master())       */
1813       __kmp_pop_sync(global_tid, ct_master, loc);
1814     }
1815   }
1816 
1817   return (ret);
1818 }
1819 
1820 /* The BARRIER for a SINGLE process section is always explicit   */
1821 /*!
1822 @ingroup WORK_SHARING
1823 @param loc  source location information
1824 @param global_tid  global thread number
1825 @return One if this thread should execute the single construct, zero otherwise.
1826 
1827 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
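
For illustration, a possible lowering, including the explicit barrier the
compiler emits when the construct has no nowait clause:
@code
if (__kmpc_single(&loc, gtid)) {
  // ... body of the single region ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted if 'nowait' was specified
@endcode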
1830 */
1831 
1832 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1833   __kmp_assert_valid_gtid(global_tid);
1834   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1835 
1836   if (rc) {
1837     // We are going to execute the single statement, so we should count it.
1838     KMP_COUNT_BLOCK(OMP_SINGLE);
1839     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1840   }
1841 
1842 #if OMPT_SUPPORT && OMPT_OPTIONAL
1843   kmp_info_t *this_thr = __kmp_threads[global_tid];
1844   kmp_team_t *team = this_thr->th.th_team;
1845   int tid = __kmp_tid_from_gtid(global_tid);
1846 
1847   if (ompt_enabled.enabled) {
1848     if (rc) {
1849       if (ompt_enabled.ompt_callback_work) {
1850         ompt_callbacks.ompt_callback(ompt_callback_work)(
1851             ompt_work_single_executor, ompt_scope_begin,
1852             &(team->t.ompt_team_info.parallel_data),
1853             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1854             1, OMPT_GET_RETURN_ADDRESS(0));
1855       }
1856     } else {
1857       if (ompt_enabled.ompt_callback_work) {
1858         ompt_callbacks.ompt_callback(ompt_callback_work)(
1859             ompt_work_single_other, ompt_scope_begin,
1860             &(team->t.ompt_team_info.parallel_data),
1861             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1862             1, OMPT_GET_RETURN_ADDRESS(0));
1863         ompt_callbacks.ompt_callback(ompt_callback_work)(
1864             ompt_work_single_other, ompt_scope_end,
1865             &(team->t.ompt_team_info.parallel_data),
1866             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1867             1, OMPT_GET_RETURN_ADDRESS(0));
1868       }
1869     }
1870   }
1871 #endif
1872 
1873   return rc;
1874 }
1875 
1876 /*!
1877 @ingroup WORK_SHARING
1878 @param loc  source location information
1879 @param global_tid  global thread number
1880 
1881 Mark the end of a <tt>single</tt> construct.  This function should
1882 only be called by the thread that executed the block of code protected
1883 by the `single` construct.
1884 */
1885 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1886   __kmp_assert_valid_gtid(global_tid);
1887   __kmp_exit_single(global_tid);
1888   KMP_POP_PARTITIONED_TIMER();
1889 
1890 #if OMPT_SUPPORT && OMPT_OPTIONAL
1891   kmp_info_t *this_thr = __kmp_threads[global_tid];
1892   kmp_team_t *team = this_thr->th.th_team;
1893   int tid = __kmp_tid_from_gtid(global_tid);
1894 
1895   if (ompt_enabled.ompt_callback_work) {
1896     ompt_callbacks.ompt_callback(ompt_callback_work)(
1897         ompt_work_single_executor, ompt_scope_end,
1898         &(team->t.ompt_team_info.parallel_data),
1899         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1900         OMPT_GET_RETURN_ADDRESS(0));
1901   }
1902 #endif
1903 }
1904 
1905 /*!
1906 @ingroup WORK_SHARING
1907 @param loc Source location
1908 @param global_tid Global thread id
1909 
1910 Mark the end of a statically scheduled loop.
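
For illustration, the expected pairing with the corresponding init call; the
schedule and bounds arguments below are only an example:
@code
kmp_int32 last, lb = 0, ub = 99, st = 1; // a 100-iteration loop
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lb, &ub, &st,
                         1, 1);
for (kmp_int32 i = lb; i <= ub; ++i) {
  // ... loop body ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode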
1911 */
1912 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1913   KMP_POP_PARTITIONED_TIMER();
1914   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1915 
1916 #if OMPT_SUPPORT && OMPT_OPTIONAL
1917   if (ompt_enabled.ompt_callback_work) {
1918     ompt_work_t ompt_work_type = ompt_work_loop;
1919     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1920     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1921     // Determine workshare type
1922     if (loc != NULL) {
1923       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1924         ompt_work_type = ompt_work_loop;
1925       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1926         ompt_work_type = ompt_work_sections;
1927       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1928         ompt_work_type = ompt_work_distribute;
1929       } else {
1930         // use default set above.
1931         // a warning about this case is provided in __kmpc_for_static_init
1932       }
1933       KMP_DEBUG_ASSERT(ompt_work_type);
1934     }
1935     ompt_callbacks.ompt_callback(ompt_callback_work)(
1936         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1937         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1938   }
1939 #endif
1940   if (__kmp_env_consistency_check)
1941     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1942 }
1943 
// User routines which take C-style arguments (call by value), as opposed to
// the Fortran equivalent routines
1946 
1947 void ompc_set_num_threads(int arg) {
1948   // !!!!! TODO: check the per-task binding
1949   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1950 }
1951 
1952 void ompc_set_dynamic(int flag) {
1953   kmp_info_t *thread;
1954 
1955   /* For the thread-private implementation of the internal controls */
1956   thread = __kmp_entry_thread();
1957 
1958   __kmp_save_internal_controls(thread);
1959 
1960   set__dynamic(thread, flag ? true : false);
1961 }
1962 
1963 void ompc_set_nested(int flag) {
1964   kmp_info_t *thread;
1965 
1966   /* For the thread-private internal controls implementation */
1967   thread = __kmp_entry_thread();
1968 
1969   __kmp_save_internal_controls(thread);
1970 
1971   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1972 }
1973 
1974 void ompc_set_max_active_levels(int max_active_levels) {
1975   /* TO DO */
1976   /* we want per-task implementation of this internal control */
1977 
1978   /* For the per-thread internal controls implementation */
1979   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1980 }
1981 
1982 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1983   // !!!!! TODO: check the per-task binding
1984   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1985 }
1986 
1987 int ompc_get_ancestor_thread_num(int level) {
1988   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1989 }
1990 
1991 int ompc_get_team_size(int level) {
1992   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1993 }
1994 
1995 /* OpenMP 5.0 Affinity Format API */
1996 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
1997   if (!__kmp_init_serial) {
1998     __kmp_serial_initialize();
1999   }
2000   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2001                          format, KMP_STRLEN(format) + 1);
2002 }
2003 
2004 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2005   size_t format_size;
2006   if (!__kmp_init_serial) {
2007     __kmp_serial_initialize();
2008   }
2009   format_size = KMP_STRLEN(__kmp_affinity_format);
2010   if (buffer && size) {
2011     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2012                            format_size + 1);
2013   }
2014   return format_size;
2015 }
2016 
2017 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2018   int gtid;
2019   if (!TCR_4(__kmp_init_middle)) {
2020     __kmp_middle_initialize();
2021   }
2022   __kmp_assign_root_init_mask();
2023   gtid = __kmp_get_gtid();
2024   __kmp_aux_display_affinity(gtid, format);
2025 }
2026 
2027 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2028                                               char const *format) {
2029   int gtid;
2030   size_t num_required;
2031   kmp_str_buf_t capture_buf;
2032   if (!TCR_4(__kmp_init_middle)) {
2033     __kmp_middle_initialize();
2034   }
2035   __kmp_assign_root_init_mask();
2036   gtid = __kmp_get_gtid();
2037   __kmp_str_buf_init(&capture_buf);
2038   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2039   if (buffer && buf_size) {
2040     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2041                            capture_buf.used + 1);
2042   }
2043   __kmp_str_buf_free(&capture_buf);
2044   return num_required;
2045 }
2046 
2047 void kmpc_set_stacksize(int arg) {
2048   // __kmp_aux_set_stacksize initializes the library if needed
2049   __kmp_aux_set_stacksize(arg);
2050 }
2051 
2052 void kmpc_set_stacksize_s(size_t arg) {
2053   // __kmp_aux_set_stacksize initializes the library if needed
2054   __kmp_aux_set_stacksize(arg);
2055 }
2056 
2057 void kmpc_set_blocktime(int arg) {
2058   int gtid, tid;
2059   kmp_info_t *thread;
2060 
2061   gtid = __kmp_entry_gtid();
2062   tid = __kmp_tid_from_gtid(gtid);
2063   thread = __kmp_thread_from_gtid(gtid);
2064 
2065   __kmp_aux_set_blocktime(arg, thread, tid);
2066 }
2067 
2068 void kmpc_set_library(int arg) {
2069   // __kmp_user_set_library initializes the library if needed
2070   __kmp_user_set_library((enum library_type)arg);
2071 }
2072 
2073 void kmpc_set_defaults(char const *str) {
2074   // __kmp_aux_set_defaults initializes the library if needed
2075   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2076 }
2077 
2078 void kmpc_set_disp_num_buffers(int arg) {
2079   // ignore after initialization because some teams have already
2080   // allocated dispatch buffers
2081   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2082       arg <= KMP_MAX_DISP_NUM_BUFF) {
2083     __kmp_dispatch_num_buffers = arg;
2084   }
2085 }
2086 
2087 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2088 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2089   return -1;
2090 #else
2091   if (!TCR_4(__kmp_init_middle)) {
2092     __kmp_middle_initialize();
2093   }
2094   __kmp_assign_root_init_mask();
2095   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2096 #endif
2097 }
2098 
2099 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2100 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2101   return -1;
2102 #else
2103   if (!TCR_4(__kmp_init_middle)) {
2104     __kmp_middle_initialize();
2105   }
2106   __kmp_assign_root_init_mask();
2107   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2108 #endif
2109 }
2110 
2111 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2112 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2113   return -1;
2114 #else
2115   if (!TCR_4(__kmp_init_middle)) {
2116     __kmp_middle_initialize();
2117   }
2118   __kmp_assign_root_init_mask();
2119   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2120 #endif
2121 }
2122 
2123 /* -------------------------------------------------------------------------- */
2124 /*!
2125 @ingroup THREADPRIVATE
2126 @param loc       source location information
2127 @param gtid      global thread number
2128 @param cpy_size  size of the cpy_data buffer
2129 @param cpy_data  pointer to data to be copied
2130 @param cpy_func  helper function to call for copying data
2131 @param didit     flag variable: 1=single thread; 0=not single thread
2132 
2133 __kmpc_copyprivate implements the interface for the private data broadcast
2134 needed for the copyprivate clause associated with a single region in an
2135 OpenMP<sup>*</sup> program (both C and Fortran).
2136 All threads participating in the parallel region call this routine.
2137 One of the threads (called the single thread) should have the <tt>didit</tt>
2138 variable set to 1 and all other threads should have that variable set to 0.
2139 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2140 
2141 The OpenMP specification forbids the use of nowait on the single region when a
2142 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2143 barrier internally to avoid race conditions, so the code generation for the
2144 single region should avoid generating a barrier after the call to @ref
2145 __kmpc_copyprivate.
2146 
2147 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2148 The <tt>loc</tt> parameter is a pointer to source location information.
2149 
Internal implementation: the single thread first publishes its descriptor
address (cpy_data) in a team-private location; each of the other threads then
calls the function pointed to by cpy_func with its own cpy_data buffer as the
destination and the single thread's buffer as the source, which performs the
actual copy.
2154 
2155 The cpy_func routine used for the copy and the contents of the data area defined
2156 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2157 to be done. For instance, the cpy_data buffer can hold the actual data to be
2158 copied or it may hold a list of pointers to the data. The cpy_func routine must
2159 interpret the cpy_data buffer appropriately.
2160 
2161 The interface to cpy_func is as follows:
2162 @code
2163 void cpy_func( void *destination, void *source )
2164 @endcode
2165 where void *destination is the cpy_data pointer for the thread being copied to
2166 and void *source is the cpy_data pointer for the thread being copied from.
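
For illustration only, a hypothetical lowering of a <tt>single</tt> construct
with a <tt>copyprivate(x)</tt> clause (the struct, helper and variable names
below are invented for this sketch):
@code
typedef struct { int *xp; } cpy_t;  // list of pointers to the copyprivate vars
static void cpy_helper(void *dst, void *src) {
  *((cpy_t *)dst)->xp = *((cpy_t *)src)->xp;
}
// ... at the call site, executed by every thread of the team:
cpy_t cpy = {&x};                   // each thread registers its own x
kmp_int32 didit = __kmpc_single(&loc, gtid);
if (didit) {
  x = compute_value();              // body of the single region (invented)
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(cpy), &cpy, cpy_helper, didit);
@endcode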
2167 */
2168 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2169                         void *cpy_data, void (*cpy_func)(void *, void *),
2170                         kmp_int32 didit) {
2171   void **data_ptr;
2172   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2173   __kmp_assert_valid_gtid(gtid);
2174 
2175   KMP_MB();
2176 
2177   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2178 
2179   if (__kmp_env_consistency_check) {
2180     if (loc == 0) {
2181       KMP_WARNING(ConstructIdentInvalid);
2182     }
2183   }
2184 
2185   // ToDo: Optimize the following two barriers into some kind of split barrier
2186 
2187   if (didit)
2188     *data_ptr = cpy_data;
2189 
2190 #if OMPT_SUPPORT
2191   ompt_frame_t *ompt_frame;
2192   if (ompt_enabled.enabled) {
2193     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2194     if (ompt_frame->enter_frame.ptr == NULL)
2195       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2196   }
2197   OMPT_STORE_RETURN_ADDRESS(gtid);
2198 #endif
2199 /* This barrier is not a barrier region boundary */
2200 #if USE_ITT_NOTIFY
2201   __kmp_threads[gtid]->th.th_ident = loc;
2202 #endif
2203   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2204 
2205   if (!didit)
2206     (*cpy_func)(cpy_data, *data_ptr);
2207 
2208   // Consider next barrier a user-visible barrier for barrier region boundaries
2209   // Nesting checks are already handled by the single construct checks
2210   {
2211 #if OMPT_SUPPORT
2212     OMPT_STORE_RETURN_ADDRESS(gtid);
2213 #endif
2214 #if USE_ITT_NOTIFY
    // TODO: check if it is needed (e.g. tasks can overwrite the location)
    __kmp_threads[gtid]->th.th_ident = loc;
2217 #endif
2218     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2219 #if OMPT_SUPPORT && OMPT_OPTIONAL
2220     if (ompt_enabled.enabled) {
2221       ompt_frame->enter_frame = ompt_data_none;
2222     }
2223 #endif
2224   }
2225 }
2226 
2227 /* --------------------------------------------------------------------------*/
2228 /*!
2229 @ingroup THREADPRIVATE
2230 @param loc       source location information
2231 @param gtid      global thread number
2232 @param cpy_data  pointer to the data to be saved/copied or 0
2233 @return          the saved pointer to the data
2234 
__kmpc_copyprivate_light is a lighter-weight version of __kmpc_copyprivate: it
only saves the pointer it is given (when that pointer is non-zero, i.e. when
called from the thread that executed the single region) and returns the saved
pointer to every caller (the single thread does not need the return value).
This version does not copy any data itself; the copy has to be done elsewhere,
e.g. inline in the generated code. It does execute one barrier internally, so
that the saved pointer is visible to all threads before it is returned, but,
unlike __kmpc_copyprivate, it does not end with a barrier after the copy, so
the generated code must emit a barrier once all the data copying is done.
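
An illustrative sketch of the expected code shape (the variable names are
hypothetical and the trailing barrier must be emitted by the generated code):
@code
void *src = __kmpc_copyprivate_light(&loc, gtid, didit ? &x : NULL);
if (!didit)
  x = *(int *)src;          // inline copy done by the generated code
__kmpc_barrier(&loc, gtid); // required once all copying is finished
@endcode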
2243 */
2244 void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2245   void **data_ptr;
2246 
2247   KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2248 
2249   KMP_MB();
2250 
2251   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2252 
2253   if (__kmp_env_consistency_check) {
2254     if (loc == 0) {
2255       KMP_WARNING(ConstructIdentInvalid);
2256     }
2257   }
2258 
2259   // ToDo: Optimize the following barrier
2260 
2261   if (cpy_data)
2262     *data_ptr = cpy_data;
2263 
2264 #if OMPT_SUPPORT
2265   ompt_frame_t *ompt_frame;
2266   if (ompt_enabled.enabled) {
2267     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2268     if (ompt_frame->enter_frame.ptr == NULL)
2269       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2270     OMPT_STORE_RETURN_ADDRESS(gtid);
2271   }
2272 #endif
2273 /* This barrier is not a barrier region boundary */
2274 #if USE_ITT_NOTIFY
2275   __kmp_threads[gtid]->th.th_ident = loc;
2276 #endif
2277   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2278 
2279   return *data_ptr;
2280 }
2281 
2282 /* -------------------------------------------------------------------------- */
2283 
2284 #define INIT_LOCK __kmp_init_user_lock_with_checks
2285 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2286 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2287 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2288 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2289 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2290   __kmp_acquire_nested_user_lock_with_checks_timed
2291 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2292 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2293 #define TEST_LOCK __kmp_test_user_lock_with_checks
2294 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2295 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2296 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2297 
2298 // TODO: Make check abort messages use location info & pass it into
2299 // with_checks routines
2300 
2301 #if KMP_USE_DYNAMIC_LOCK
2302 
2303 // internal lock initializer
2304 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2305                                                     kmp_dyna_lockseq_t seq) {
2306   if (KMP_IS_D_LOCK(seq)) {
2307     KMP_INIT_D_LOCK(lock, seq);
2308 #if USE_ITT_BUILD
2309     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2310 #endif
2311   } else {
2312     KMP_INIT_I_LOCK(lock, seq);
2313 #if USE_ITT_BUILD
2314     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2315     __kmp_itt_lock_creating(ilk->lock, loc);
2316 #endif
2317   }
2318 }
2319 
2320 // internal nest lock initializer
2321 static __forceinline void
2322 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2323                                kmp_dyna_lockseq_t seq) {
2324 #if KMP_USE_TSX
2325   // Don't have nested lock implementation for speculative locks
2326   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2327       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2328     seq = __kmp_user_lock_seq;
2329 #endif
2330   switch (seq) {
2331   case lockseq_tas:
2332     seq = lockseq_nested_tas;
2333     break;
2334 #if KMP_USE_FUTEX
2335   case lockseq_futex:
2336     seq = lockseq_nested_futex;
2337     break;
2338 #endif
2339   case lockseq_ticket:
2340     seq = lockseq_nested_ticket;
2341     break;
2342   case lockseq_queuing:
2343     seq = lockseq_nested_queuing;
2344     break;
2345   case lockseq_drdpa:
2346     seq = lockseq_nested_drdpa;
2347     break;
2348   default:
2349     seq = lockseq_nested_queuing;
2350   }
2351   KMP_INIT_I_LOCK(lock, seq);
2352 #if USE_ITT_BUILD
2353   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2354   __kmp_itt_lock_creating(ilk->lock, loc);
2355 #endif
2356 }
2357 
2358 /* initialize the lock with a hint */
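/* Illustrative note: the user-visible omp_init_lock_with_hint() entry funnels
   into this routine, so hypothetical user code such as
     omp_lock_t l;
     omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
   arrives here with user_lock pointing at 'l'; the hint is translated into a
   concrete lock implementation by __kmp_map_hint_to_lock(). */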
2359 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2360                                 uintptr_t hint) {
2361   KMP_DEBUG_ASSERT(__kmp_init_serial);
2362   if (__kmp_env_consistency_check && user_lock == NULL) {
2363     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2364   }
2365 
2366   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2367 
2368 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2370   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2371   if (!codeptr)
2372     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2373   if (ompt_enabled.ompt_callback_lock_init) {
2374     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2375         ompt_mutex_lock, (omp_lock_hint_t)hint,
2376         __ompt_get_mutex_impl_type(user_lock),
2377         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2378   }
2379 #endif
2380 }
2381 
/* initialize the nested lock with a hint */
2383 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2384                                      void **user_lock, uintptr_t hint) {
2385   KMP_DEBUG_ASSERT(__kmp_init_serial);
2386   if (__kmp_env_consistency_check && user_lock == NULL) {
2387     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2388   }
2389 
2390   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2391 
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_nest_lock_with_hint:
2394   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2395   if (!codeptr)
2396     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2397   if (ompt_enabled.ompt_callback_lock_init) {
2398     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2399         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2400         __ompt_get_mutex_impl_type(user_lock),
2401         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2402   }
2403 #endif
2404 }
2405 
2406 #endif // KMP_USE_DYNAMIC_LOCK
2407 
2408 /* initialize the lock */
2409 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2410 #if KMP_USE_DYNAMIC_LOCK
2411 
2412   KMP_DEBUG_ASSERT(__kmp_init_serial);
2413   if (__kmp_env_consistency_check && user_lock == NULL) {
2414     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2415   }
2416   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2417 
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock:
2420   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2421   if (!codeptr)
2422     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2423   if (ompt_enabled.ompt_callback_lock_init) {
2424     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2425         ompt_mutex_lock, omp_lock_hint_none,
2426         __ompt_get_mutex_impl_type(user_lock),
2427         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2428   }
2429 #endif
2430 
2431 #else // KMP_USE_DYNAMIC_LOCK
2432 
2433   static char const *const func = "omp_init_lock";
2434   kmp_user_lock_p lck;
2435   KMP_DEBUG_ASSERT(__kmp_init_serial);
2436 
2437   if (__kmp_env_consistency_check) {
2438     if (user_lock == NULL) {
2439       KMP_FATAL(LockIsUninitialized, func);
2440     }
2441   }
2442 
2443   KMP_CHECK_USER_LOCK_INIT();
2444 
2445   if ((__kmp_user_lock_kind == lk_tas) &&
2446       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2447     lck = (kmp_user_lock_p)user_lock;
2448   }
2449 #if KMP_USE_FUTEX
2450   else if ((__kmp_user_lock_kind == lk_futex) &&
2451            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2452     lck = (kmp_user_lock_p)user_lock;
2453   }
2454 #endif
2455   else {
2456     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2457   }
2458   INIT_LOCK(lck);
2459   __kmp_set_user_lock_location(lck, loc);
2460 
2461 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock:
2463   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2464   if (!codeptr)
2465     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2466   if (ompt_enabled.ompt_callback_lock_init) {
2467     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2468         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2469         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2470   }
2471 #endif
2472 
2473 #if USE_ITT_BUILD
2474   __kmp_itt_lock_creating(lck);
2475 #endif /* USE_ITT_BUILD */
2476 
2477 #endif // KMP_USE_DYNAMIC_LOCK
2478 } // __kmpc_init_lock
2479 
/* initialize the nested lock */
2481 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2482 #if KMP_USE_DYNAMIC_LOCK
2483 
2484   KMP_DEBUG_ASSERT(__kmp_init_serial);
2485   if (__kmp_env_consistency_check && user_lock == NULL) {
2486     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2487   }
2488   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2489 
2490 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_nest_lock:
2492   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2493   if (!codeptr)
2494     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2495   if (ompt_enabled.ompt_callback_lock_init) {
2496     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2497         ompt_mutex_nest_lock, omp_lock_hint_none,
2498         __ompt_get_mutex_impl_type(user_lock),
2499         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2500   }
2501 #endif
2502 
2503 #else // KMP_USE_DYNAMIC_LOCK
2504 
2505   static char const *const func = "omp_init_nest_lock";
2506   kmp_user_lock_p lck;
2507   KMP_DEBUG_ASSERT(__kmp_init_serial);
2508 
2509   if (__kmp_env_consistency_check) {
2510     if (user_lock == NULL) {
2511       KMP_FATAL(LockIsUninitialized, func);
2512     }
2513   }
2514 
2515   KMP_CHECK_USER_LOCK_INIT();
2516 
2517   if ((__kmp_user_lock_kind == lk_tas) &&
2518       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2519        OMP_NEST_LOCK_T_SIZE)) {
2520     lck = (kmp_user_lock_p)user_lock;
2521   }
2522 #if KMP_USE_FUTEX
2523   else if ((__kmp_user_lock_kind == lk_futex) &&
2524            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2525             OMP_NEST_LOCK_T_SIZE)) {
2526     lck = (kmp_user_lock_p)user_lock;
2527   }
2528 #endif
2529   else {
2530     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2531   }
2532 
2533   INIT_NESTED_LOCK(lck);
2534   __kmp_set_user_lock_location(lck, loc);
2535 
2536 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_nest_lock:
2538   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2539   if (!codeptr)
2540     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2541   if (ompt_enabled.ompt_callback_lock_init) {
2542     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2543         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2544         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2545   }
2546 #endif
2547 
2548 #if USE_ITT_BUILD
2549   __kmp_itt_lock_creating(lck);
2550 #endif /* USE_ITT_BUILD */
2551 
2552 #endif // KMP_USE_DYNAMIC_LOCK
2553 } // __kmpc_init_nest_lock
2554 
2555 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2556 #if KMP_USE_DYNAMIC_LOCK
2557 
2558 #if USE_ITT_BUILD
2559   kmp_user_lock_p lck;
2560   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2561     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2562   } else {
2563     lck = (kmp_user_lock_p)user_lock;
2564   }
2565   __kmp_itt_lock_destroyed(lck);
2566 #endif
2567 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_destroy_lock:
2569   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2570   if (!codeptr)
2571     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2572   if (ompt_enabled.ompt_callback_lock_destroy) {
2573     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2574         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2575   }
2576 #endif
2577   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2578 #else
2579   kmp_user_lock_p lck;
2580 
2581   if ((__kmp_user_lock_kind == lk_tas) &&
2582       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2583     lck = (kmp_user_lock_p)user_lock;
2584   }
2585 #if KMP_USE_FUTEX
2586   else if ((__kmp_user_lock_kind == lk_futex) &&
2587            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2588     lck = (kmp_user_lock_p)user_lock;
2589   }
2590 #endif
2591   else {
2592     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2593   }
2594 
2595 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_destroy_lock:
2597   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2598   if (!codeptr)
2599     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2600   if (ompt_enabled.ompt_callback_lock_destroy) {
2601     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2602         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2603   }
2604 #endif
2605 
2606 #if USE_ITT_BUILD
2607   __kmp_itt_lock_destroyed(lck);
2608 #endif /* USE_ITT_BUILD */
2609   DESTROY_LOCK(lck);
2610 
2611   if ((__kmp_user_lock_kind == lk_tas) &&
2612       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2613     ;
2614   }
2615 #if KMP_USE_FUTEX
2616   else if ((__kmp_user_lock_kind == lk_futex) &&
2617            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2618     ;
2619   }
2620 #endif
2621   else {
2622     __kmp_user_lock_free(user_lock, gtid, lck);
2623   }
2624 #endif // KMP_USE_DYNAMIC_LOCK
2625 } // __kmpc_destroy_lock
2626 
/* destroy the nested lock */
2628 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2629 #if KMP_USE_DYNAMIC_LOCK
2630 
2631 #if USE_ITT_BUILD
2632   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2633   __kmp_itt_lock_destroyed(ilk->lock);
2634 #endif
2635 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_destroy_nest_lock:
2637   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2638   if (!codeptr)
2639     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2640   if (ompt_enabled.ompt_callback_lock_destroy) {
2641     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2642         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2643   }
2644 #endif
2645   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2646 
2647 #else // KMP_USE_DYNAMIC_LOCK
2648 
2649   kmp_user_lock_p lck;
2650 
2651   if ((__kmp_user_lock_kind == lk_tas) &&
2652       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2653        OMP_NEST_LOCK_T_SIZE)) {
2654     lck = (kmp_user_lock_p)user_lock;
2655   }
2656 #if KMP_USE_FUTEX
2657   else if ((__kmp_user_lock_kind == lk_futex) &&
2658            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2659             OMP_NEST_LOCK_T_SIZE)) {
2660     lck = (kmp_user_lock_p)user_lock;
2661   }
2662 #endif
2663   else {
2664     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2665   }
2666 
2667 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_destroy_nest_lock:
2669   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2670   if (!codeptr)
2671     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2672   if (ompt_enabled.ompt_callback_lock_destroy) {
2673     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2674         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2675   }
2676 #endif
2677 
2678 #if USE_ITT_BUILD
2679   __kmp_itt_lock_destroyed(lck);
2680 #endif /* USE_ITT_BUILD */
2681 
2682   DESTROY_NESTED_LOCK(lck);
2683 
2684   if ((__kmp_user_lock_kind == lk_tas) &&
2685       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2686        OMP_NEST_LOCK_T_SIZE)) {
2687     ;
2688   }
2689 #if KMP_USE_FUTEX
2690   else if ((__kmp_user_lock_kind == lk_futex) &&
2691            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2692             OMP_NEST_LOCK_T_SIZE)) {
2693     ;
2694   }
2695 #endif
2696   else {
2697     __kmp_user_lock_free(user_lock, gtid, lck);
2698   }
2699 #endif // KMP_USE_DYNAMIC_LOCK
2700 } // __kmpc_destroy_nest_lock
2701 
2702 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2703   KMP_COUNT_BLOCK(OMP_set_lock);
2704 #if KMP_USE_DYNAMIC_LOCK
2705   int tag = KMP_EXTRACT_D_TAG(user_lock);
2706 #if USE_ITT_BUILD
2707   __kmp_itt_lock_acquiring(
2708       (kmp_user_lock_p)
2709           user_lock); // itt function will get to the right lock object.
2710 #endif
2711 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_set_lock:
2713   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2714   if (!codeptr)
2715     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2716   if (ompt_enabled.ompt_callback_mutex_acquire) {
2717     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2718         ompt_mutex_lock, omp_lock_hint_none,
2719         __ompt_get_mutex_impl_type(user_lock),
2720         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2721   }
2722 #endif
2723 #if KMP_USE_INLINED_TAS
2724   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2725     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2726   } else
2727 #elif KMP_USE_INLINED_FUTEX
2728   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2729     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2730   } else
2731 #endif
2732   {
2733     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2734   }
2735 #if USE_ITT_BUILD
2736   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2737 #endif
2738 #if OMPT_SUPPORT && OMPT_OPTIONAL
2739   if (ompt_enabled.ompt_callback_mutex_acquired) {
2740     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2741         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2742   }
2743 #endif
2744 
2745 #else // KMP_USE_DYNAMIC_LOCK
2746 
2747   kmp_user_lock_p lck;
2748 
2749   if ((__kmp_user_lock_kind == lk_tas) &&
2750       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2751     lck = (kmp_user_lock_p)user_lock;
2752   }
2753 #if KMP_USE_FUTEX
2754   else if ((__kmp_user_lock_kind == lk_futex) &&
2755            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2756     lck = (kmp_user_lock_p)user_lock;
2757   }
2758 #endif
2759   else {
2760     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2761   }
2762 
2763 #if USE_ITT_BUILD
2764   __kmp_itt_lock_acquiring(lck);
2765 #endif /* USE_ITT_BUILD */
2766 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_set_lock:
2768   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2769   if (!codeptr)
2770     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2771   if (ompt_enabled.ompt_callback_mutex_acquire) {
2772     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2773         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2774         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2775   }
2776 #endif
2777 
2778   ACQUIRE_LOCK(lck, gtid);
2779 
2780 #if USE_ITT_BUILD
2781   __kmp_itt_lock_acquired(lck);
2782 #endif /* USE_ITT_BUILD */
2783 
2784 #if OMPT_SUPPORT && OMPT_OPTIONAL
2785   if (ompt_enabled.ompt_callback_mutex_acquired) {
2786     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2787         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2788   }
2789 #endif
2790 
2791 #endif // KMP_USE_DYNAMIC_LOCK
2792 }
2793 
2794 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2795 #if KMP_USE_DYNAMIC_LOCK
2796 
2797 #if USE_ITT_BUILD
2798   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2799 #endif
2800 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_set_nest_lock:
2802   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2803   if (!codeptr)
2804     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2805   if (ompt_enabled.enabled) {
2806     if (ompt_enabled.ompt_callback_mutex_acquire) {
2807       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2808           ompt_mutex_nest_lock, omp_lock_hint_none,
2809           __ompt_get_mutex_impl_type(user_lock),
2810           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2811     }
2812   }
2813 #endif
2814   int acquire_status =
2815       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2816   (void)acquire_status;
2817 #if USE_ITT_BUILD
2818   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2819 #endif
2820 
2821 #if OMPT_SUPPORT && OMPT_OPTIONAL
2822   if (ompt_enabled.enabled) {
2823     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2824       if (ompt_enabled.ompt_callback_mutex_acquired) {
2825         // lock_first
2826         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2827             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2828             codeptr);
2829       }
2830     } else {
2831       if (ompt_enabled.ompt_callback_nest_lock) {
2832         // lock_next
2833         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2834             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2835       }
2836     }
2837   }
2838 #endif
2839 
2840 #else // KMP_USE_DYNAMIC_LOCK
2841   int acquire_status;
2842   kmp_user_lock_p lck;
2843 
2844   if ((__kmp_user_lock_kind == lk_tas) &&
2845       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2846        OMP_NEST_LOCK_T_SIZE)) {
2847     lck = (kmp_user_lock_p)user_lock;
2848   }
2849 #if KMP_USE_FUTEX
2850   else if ((__kmp_user_lock_kind == lk_futex) &&
2851            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2852             OMP_NEST_LOCK_T_SIZE)) {
2853     lck = (kmp_user_lock_p)user_lock;
2854   }
2855 #endif
2856   else {
2857     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2858   }
2859 
2860 #if USE_ITT_BUILD
2861   __kmp_itt_lock_acquiring(lck);
2862 #endif /* USE_ITT_BUILD */
2863 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_set_nest_lock:
2865   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2866   if (!codeptr)
2867     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2868   if (ompt_enabled.enabled) {
2869     if (ompt_enabled.ompt_callback_mutex_acquire) {
2870       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2871           ompt_mutex_nest_lock, omp_lock_hint_none,
2872           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2873           codeptr);
2874     }
2875   }
2876 #endif
2877 
2878   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2879 
2880 #if USE_ITT_BUILD
2881   __kmp_itt_lock_acquired(lck);
2882 #endif /* USE_ITT_BUILD */
2883 
2884 #if OMPT_SUPPORT && OMPT_OPTIONAL
2885   if (ompt_enabled.enabled) {
2886     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2887       if (ompt_enabled.ompt_callback_mutex_acquired) {
2888         // lock_first
2889         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2890             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2891       }
2892     } else {
2893       if (ompt_enabled.ompt_callback_nest_lock) {
2894         // lock_next
2895         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2896             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2897       }
2898     }
2899   }
2900 #endif
2901 
2902 #endif // KMP_USE_DYNAMIC_LOCK
2903 }
2904 
2905 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2906 #if KMP_USE_DYNAMIC_LOCK
2907 
2908   int tag = KMP_EXTRACT_D_TAG(user_lock);
2909 #if USE_ITT_BUILD
2910   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2911 #endif
2912 #if KMP_USE_INLINED_TAS
2913   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2914     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2915   } else
2916 #elif KMP_USE_INLINED_FUTEX
2917   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2918     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2919   } else
2920 #endif
2921   {
2922     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2923   }
2924 
2925 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_unset_lock:
2927   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2928   if (!codeptr)
2929     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2930   if (ompt_enabled.ompt_callback_mutex_released) {
2931     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2932         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2933   }
2934 #endif
2935 
2936 #else // KMP_USE_DYNAMIC_LOCK
2937 
2938   kmp_user_lock_p lck;
2939 
2940   /* Can't use serial interval since not block structured */
2941   /* release the lock */
2942 
2943   if ((__kmp_user_lock_kind == lk_tas) &&
2944       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2945 #if KMP_OS_LINUX &&                                                            \
2946     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2947 // "fast" path implemented to fix customer performance issue
2948 #if USE_ITT_BUILD
2949     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2950 #endif /* USE_ITT_BUILD */
2951     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2952     KMP_MB();
2953 
2954 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case if called from omp_unset_lock:
2956     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2957     if (!codeptr)
2958       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2959     if (ompt_enabled.ompt_callback_mutex_released) {
2960       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2962     }
2963 #endif
2964 
2965     return;
2966 #else
2967     lck = (kmp_user_lock_p)user_lock;
2968 #endif
2969   }
2970 #if KMP_USE_FUTEX
2971   else if ((__kmp_user_lock_kind == lk_futex) &&
2972            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2973     lck = (kmp_user_lock_p)user_lock;
2974   }
2975 #endif
2976   else {
2977     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2978   }
2979 
2980 #if USE_ITT_BUILD
2981   __kmp_itt_lock_releasing(lck);
2982 #endif /* USE_ITT_BUILD */
2983 
2984   RELEASE_LOCK(lck, gtid);
2985 
2986 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_unset_lock:
2988   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2989   if (!codeptr)
2990     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2991   if (ompt_enabled.ompt_callback_mutex_released) {
2992     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2993         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2994   }
2995 #endif
2996 
2997 #endif // KMP_USE_DYNAMIC_LOCK
2998 }
2999 
/* release the nested lock */
3001 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3002 #if KMP_USE_DYNAMIC_LOCK
3003 
3004 #if USE_ITT_BUILD
3005   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3006 #endif
3007   int release_status =
3008       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3009   (void)release_status;
3010 
3011 #if OMPT_SUPPORT && OMPT_OPTIONAL
3012   // This is the case, if called from omp_init_lock_with_hint:
3013   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3014   if (!codeptr)
3015     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3016   if (ompt_enabled.enabled) {
3017     if (release_status == KMP_LOCK_RELEASED) {
3018       if (ompt_enabled.ompt_callback_mutex_released) {
3019         // release_lock_last
3020         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3021             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3022             codeptr);
3023       }
3024     } else if (ompt_enabled.ompt_callback_nest_lock) {
3025       // release_lock_prev
3026       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3027           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3028     }
3029   }
3030 #endif
3031 
3032 #else // KMP_USE_DYNAMIC_LOCK
3033 
3034   kmp_user_lock_p lck;
3035 
3036   /* Can't use serial interval since not block structured */
3037 
3038   if ((__kmp_user_lock_kind == lk_tas) &&
3039       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3040        OMP_NEST_LOCK_T_SIZE)) {
3041 #if KMP_OS_LINUX &&                                                            \
3042     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3043     // "fast" path implemented to fix customer performance issue
3044     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3045 #if USE_ITT_BUILD
3046     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3047 #endif /* USE_ITT_BUILD */
3048 
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
3050     int release_status = KMP_LOCK_STILL_HELD;
3051 #endif
3052 
3053     if (--(tl->lk.depth_locked) == 0) {
3054       TCW_4(tl->lk.poll, 0);
3055 #if OMPT_SUPPORT && OMPT_OPTIONAL
3056       release_status = KMP_LOCK_RELEASED;
3057 #endif
3058     }
3059     KMP_MB();
3060 
3061 #if OMPT_SUPPORT && OMPT_OPTIONAL
3062     // This is the case, if called from omp_init_lock_with_hint:
3063     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3064     if (!codeptr)
3065       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3066     if (ompt_enabled.enabled) {
3067       if (release_status == KMP_LOCK_RELEASED) {
3068         if (ompt_enabled.ompt_callback_mutex_released) {
3069           // release_lock_last
3070           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3071               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3072         }
3073       } else if (ompt_enabled.ompt_callback_nest_lock) {
3074         // release_lock_previous
3075         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3077       }
3078     }
3079 #endif
3080 
3081     return;
3082 #else
3083     lck = (kmp_user_lock_p)user_lock;
3084 #endif
3085   }
3086 #if KMP_USE_FUTEX
3087   else if ((__kmp_user_lock_kind == lk_futex) &&
3088            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3089             OMP_NEST_LOCK_T_SIZE)) {
3090     lck = (kmp_user_lock_p)user_lock;
3091   }
3092 #endif
3093   else {
3094     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3095   }
3096 
3097 #if USE_ITT_BUILD
3098   __kmp_itt_lock_releasing(lck);
3099 #endif /* USE_ITT_BUILD */
3100 
3101   int release_status;
3102   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3103 #if OMPT_SUPPORT && OMPT_OPTIONAL
3104   // This is the case, if called from omp_init_lock_with_hint:
3105   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3106   if (!codeptr)
3107     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3108   if (ompt_enabled.enabled) {
3109     if (release_status == KMP_LOCK_RELEASED) {
3110       if (ompt_enabled.ompt_callback_mutex_released) {
3111         // release_lock_last
3112         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3113             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3114       }
3115     } else if (ompt_enabled.ompt_callback_nest_lock) {
3116       // release_lock_previous
3117       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3119     }
3120   }
3121 #endif
3122 
3123 #endif // KMP_USE_DYNAMIC_LOCK
3124 }
3125 
3126 /* try to acquire the lock */
3127 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3128   KMP_COUNT_BLOCK(OMP_test_lock);
3129 
3130 #if KMP_USE_DYNAMIC_LOCK
3131   int rc;
3132   int tag = KMP_EXTRACT_D_TAG(user_lock);
3133 #if USE_ITT_BUILD
3134   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3135 #endif
3136 #if OMPT_SUPPORT && OMPT_OPTIONAL
3137   // This is the case, if called from omp_init_lock_with_hint:
3138   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3139   if (!codeptr)
3140     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3141   if (ompt_enabled.ompt_callback_mutex_acquire) {
3142     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3143         ompt_mutex_lock, omp_lock_hint_none,
3144         __ompt_get_mutex_impl_type(user_lock),
3145         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3146   }
3147 #endif
3148 #if KMP_USE_INLINED_TAS
3149   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3150     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3151   } else
3152 #elif KMP_USE_INLINED_FUTEX
3153   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3154     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3155   } else
3156 #endif
3157   {
3158     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3159   }
3160   if (rc) {
3161 #if USE_ITT_BUILD
3162     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3163 #endif
3164 #if OMPT_SUPPORT && OMPT_OPTIONAL
3165     if (ompt_enabled.ompt_callback_mutex_acquired) {
3166       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3167           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3168     }
3169 #endif
3170     return FTN_TRUE;
3171   } else {
3172 #if USE_ITT_BUILD
3173     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3174 #endif
3175     return FTN_FALSE;
3176   }
3177 
3178 #else // KMP_USE_DYNAMIC_LOCK
3179 
3180   kmp_user_lock_p lck;
3181   int rc;
3182 
3183   if ((__kmp_user_lock_kind == lk_tas) &&
3184       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3185     lck = (kmp_user_lock_p)user_lock;
3186   }
3187 #if KMP_USE_FUTEX
3188   else if ((__kmp_user_lock_kind == lk_futex) &&
3189            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3190     lck = (kmp_user_lock_p)user_lock;
3191   }
3192 #endif
3193   else {
3194     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3195   }
3196 
3197 #if USE_ITT_BUILD
3198   __kmp_itt_lock_acquiring(lck);
3199 #endif /* USE_ITT_BUILD */
3200 #if OMPT_SUPPORT && OMPT_OPTIONAL
3201   // This is the case, if called from omp_init_lock_with_hint:
3202   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3203   if (!codeptr)
3204     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3205   if (ompt_enabled.ompt_callback_mutex_acquire) {
3206     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3207         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3208         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3209   }
3210 #endif
3211 
3212   rc = TEST_LOCK(lck, gtid);
3213 #if USE_ITT_BUILD
3214   if (rc) {
3215     __kmp_itt_lock_acquired(lck);
3216   } else {
3217     __kmp_itt_lock_cancelled(lck);
3218   }
3219 #endif /* USE_ITT_BUILD */
3220 #if OMPT_SUPPORT && OMPT_OPTIONAL
3221   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3222     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3223         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3224   }
3225 #endif
3226 
3227   return (rc ? FTN_TRUE : FTN_FALSE);
3228 
3229   /* Can't use serial interval since not block structured */
3230 
3231 #endif // KMP_USE_DYNAMIC_LOCK
3232 }
3233 
3234 /* try to acquire the lock */
3235 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3236 #if KMP_USE_DYNAMIC_LOCK
3237   int rc;
3238 #if USE_ITT_BUILD
3239   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3240 #endif
3241 #if OMPT_SUPPORT && OMPT_OPTIONAL
3242   // This is the case, if called from omp_init_lock_with_hint:
3243   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3244   if (!codeptr)
3245     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3246   if (ompt_enabled.ompt_callback_mutex_acquire) {
3247     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3248         ompt_mutex_nest_lock, omp_lock_hint_none,
3249         __ompt_get_mutex_impl_type(user_lock),
3250         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3251   }
3252 #endif
3253   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3254 #if USE_ITT_BUILD
3255   if (rc) {
3256     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3257   } else {
3258     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3259   }
3260 #endif
3261 #if OMPT_SUPPORT && OMPT_OPTIONAL
3262   if (ompt_enabled.enabled && rc) {
3263     if (rc == 1) {
3264       if (ompt_enabled.ompt_callback_mutex_acquired) {
3265         // lock_first
3266         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3267             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3268             codeptr);
3269       }
3270     } else {
3271       if (ompt_enabled.ompt_callback_nest_lock) {
3272         // lock_next
3273         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3274             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3275       }
3276     }
3277   }
3278 #endif
3279   return rc;
3280 
3281 #else // KMP_USE_DYNAMIC_LOCK
3282 
3283   kmp_user_lock_p lck;
3284   int rc;
3285 
3286   if ((__kmp_user_lock_kind == lk_tas) &&
3287       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3288        OMP_NEST_LOCK_T_SIZE)) {
3289     lck = (kmp_user_lock_p)user_lock;
3290   }
3291 #if KMP_USE_FUTEX
3292   else if ((__kmp_user_lock_kind == lk_futex) &&
3293            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3294             OMP_NEST_LOCK_T_SIZE)) {
3295     lck = (kmp_user_lock_p)user_lock;
3296   }
3297 #endif
3298   else {
3299     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3300   }
3301 
3302 #if USE_ITT_BUILD
3303   __kmp_itt_lock_acquiring(lck);
3304 #endif /* USE_ITT_BUILD */
3305 
3306 #if OMPT_SUPPORT && OMPT_OPTIONAL
3307   // This is the case, if called from omp_init_lock_with_hint:
3308   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3309   if (!codeptr)
3310     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
        codeptr);
  }
3318 #endif
3319 
3320   rc = TEST_NESTED_LOCK(lck, gtid);
3321 #if USE_ITT_BUILD
3322   if (rc) {
3323     __kmp_itt_lock_acquired(lck);
3324   } else {
3325     __kmp_itt_lock_cancelled(lck);
3326   }
3327 #endif /* USE_ITT_BUILD */
3328 #if OMPT_SUPPORT && OMPT_OPTIONAL
3329   if (ompt_enabled.enabled && rc) {
3330     if (rc == 1) {
3331       if (ompt_enabled.ompt_callback_mutex_acquired) {
3332         // lock_first
3333         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3334             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3335       }
3336     } else {
3337       if (ompt_enabled.ompt_callback_nest_lock) {
3338         // lock_next
3339         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3341       }
3342     }
3343   }
3344 #endif
3345   return rc;
3346 
3347   /* Can't use serial interval since not block structured */
3348 
3349 #endif // KMP_USE_DYNAMIC_LOCK
3350 }
3351 
3352 // Interface to fast scalable reduce methods routines
3353 
3354 // keep the selected method in a thread local structure for cross-function
3355 // usage: will be used in __kmpc_end_reduce* functions;
3356 // another solution: to re-determine the method one more time in
3357 // __kmpc_end_reduce* functions (new prototype required then)
3358 // AT: which solution is better?
3359 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3360   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3361 
3362 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3363   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3364 
3365 // description of the packed_reduction_method variable: look at the macros in
3366 // kmp.h
3367 
3368 // used in a critical section reduce block
3369 static __forceinline void
3370 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3371                                           kmp_critical_name *crit) {
3372 
3373   // this lock was visible to a customer and to the threading profile tool as a
3374   // serial overhead span (although it's used for an internal purpose only)
3375   //            why was it visible in previous implementation?
3376   //            should we keep it visible in new reduce block?
3377   kmp_user_lock_p lck;
3378 
3379 #if KMP_USE_DYNAMIC_LOCK
3380 
3381   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3382   // Check if it is initialized.
3383   if (*lk == 0) {
3384     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3385       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3386                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3387     } else {
3388       __kmp_init_indirect_csptr(crit, loc, global_tid,
3389                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3390     }
3391   }
3392   // Branch for accessing the actual lock object and set operation. This
3393   // branching is inevitable since this lock initialization does not follow the
3394   // normal dispatch path (lock table is not used).
3395   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3396     lck = (kmp_user_lock_p)lk;
3397     KMP_DEBUG_ASSERT(lck != NULL);
3398     if (__kmp_env_consistency_check) {
3399       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3400     }
3401     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3402   } else {
3403     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3404     lck = ilk->lock;
3405     KMP_DEBUG_ASSERT(lck != NULL);
3406     if (__kmp_env_consistency_check) {
3407       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3408     }
3409     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3410   }
3411 
3412 #else // KMP_USE_DYNAMIC_LOCK
3413 
3414   // We know that the fast reduction code is only emitted by Intel compilers
3415   // with 32 byte critical sections. If there isn't enough space, then we
3416   // have to use a pointer.
3417   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3418     lck = (kmp_user_lock_p)crit;
3419   } else {
3420     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3421   }
3422   KMP_DEBUG_ASSERT(lck != NULL);
3423 
3424   if (__kmp_env_consistency_check)
3425     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3426 
3427   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3428 
3429 #endif // KMP_USE_DYNAMIC_LOCK
3430 }
3431 
3432 // used in a critical section reduce block
3433 static __forceinline void
3434 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3435                                         kmp_critical_name *crit) {
3436 
3437   kmp_user_lock_p lck;
3438 
3439 #if KMP_USE_DYNAMIC_LOCK
3440 
3441   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3442     lck = (kmp_user_lock_p)crit;
3443     if (__kmp_env_consistency_check)
3444       __kmp_pop_sync(global_tid, ct_critical, loc);
3445     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3446   } else {
3447     kmp_indirect_lock_t *ilk =
3448         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3449     if (__kmp_env_consistency_check)
3450       __kmp_pop_sync(global_tid, ct_critical, loc);
3451     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3452   }
3453 
3454 #else // KMP_USE_DYNAMIC_LOCK
3455 
3456   // We know that the fast reduction code is only emitted by Intel compilers
3457   // with 32 byte critical sections. If there isn't enough space, then we have
3458   // to use a pointer.
3459   if (__kmp_base_user_lock_size > 32) {
3460     lck = *((kmp_user_lock_p *)crit);
3461     KMP_ASSERT(lck != NULL);
3462   } else {
3463     lck = (kmp_user_lock_p)crit;
3464   }
3465 
3466   if (__kmp_env_consistency_check)
3467     __kmp_pop_sync(global_tid, ct_critical, loc);
3468 
3469   __kmp_release_user_lock_with_checks(lck, global_tid);
3470 
3471 #endif // KMP_USE_DYNAMIC_LOCK
3472 } // __kmp_end_critical_section_reduce_block
3473 
3474 static __forceinline int
3475 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3476                                      int *task_state) {
3477   kmp_team_t *team;
3478 
3479   // Check if we are inside the teams construct?
3480   if (th->th.th_teams_microtask) {
3481     *team_p = team = th->th.th_team;
3482     if (team->t.t_level == th->th.th_teams_level) {
3483       // This is reduction at teams construct.
3484       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3485       // Let's swap teams temporarily for the reduction.
3486       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3487       th->th.th_team = team->t.t_parent;
3488       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3489       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3490       *task_state = th->th.th_task_state;
3491       th->th.th_task_state = 0;
3492 
3493       return 1;
3494     }
3495   }
3496   return 0;
3497 }
3498 
3499 static __forceinline void
3500 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3501   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3502   th->th.th_info.ds.ds_tid = 0;
3503   th->th.th_team = team;
3504   th->th.th_team_nproc = team->t.t_nproc;
3505   th->th.th_task_team = team->t.t_task_team[task_state];
3506   __kmp_type_convert(task_state, &(th->th.th_task_state));
3507 }
3508 
3509 /* 2.a.i. Reduce Block without a terminating barrier */
3510 /*!
3511 @ingroup SYNCHRONIZATION
3512 @param loc source location information
3513 @param global_tid global thread number
3514 @param num_vars number of items (variables) to be reduced
3515 @param reduce_size size of data in bytes to be reduced
3516 @param reduce_data pointer to data to be reduced
3517 @param reduce_func callback function providing reduction operation on two
3518 operands and returning result of reduction in lhs_data
3519 @param lck pointer to the unique lock data structure
3520 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3521 threads if atomic reduction needed
3522 
3523 The nowait version is used for a reduce clause with the nowait argument.
3524 */
3525 kmp_int32
3526 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3527                      size_t reduce_size, void *reduce_data,
3528                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3529                      kmp_critical_name *lck) {
3530 
3531   KMP_COUNT_BLOCK(REDUCE_nowait);
3532   int retval = 0;
3533   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3534   kmp_info_t *th;
3535   kmp_team_t *team;
3536   int teams_swapped = 0, task_state;
3537   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3538   __kmp_assert_valid_gtid(global_tid);
3539 
3540   // why do we need this initialization here at all?
3541   // Reduction clause can not be used as a stand-alone directive.
3542 
3543   // do not call __kmp_serial_initialize(), it will be called by
3544   // __kmp_parallel_initialize() if needed
3545   // possible detection of false-positive race by the threadchecker ???
3546   if (!TCR_4(__kmp_init_parallel))
3547     __kmp_parallel_initialize();
3548 
3549   __kmp_resume_if_soft_paused();
3550 
3551 // check correctness of reduce block nesting
3552 #if KMP_USE_DYNAMIC_LOCK
3553   if (__kmp_env_consistency_check)
3554     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3555 #else
3556   if (__kmp_env_consistency_check)
3557     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3558 #endif
3559 
3560   th = __kmp_thread_from_gtid(global_tid);
3561   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3562 
  // the packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so the value must be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team-specific property
3567   //     (a thread can reach the next reduce block on the next construct, reduce
3568   //     method may differ on the next construct)
3569   // an ident_t "loc" parameter could be used as a construct-specific property
3570   // (what if loc == 0?)
3571   //     (if both construct-specific and team-specific variables were shared,
3572   //     then unness extra syncs should be needed)
3573   // a thread-specific variable is better regarding two issues above (next
3574   // construct and extra syncs)
3575   // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (restricting this to
  // one thread would only add unnecessary extra synchronization)
3578 
3579   packed_reduction_method = __kmp_determine_reduction_method(
3580       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3581   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3582 
3583   OMPT_REDUCTION_DECL(th, global_tid);
3584   if (packed_reduction_method == critical_reduce_block) {
3585 
3586     OMPT_REDUCTION_BEGIN;
3587 
3588     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3589     retval = 1;
3590 
3591   } else if (packed_reduction_method == empty_reduce_block) {
3592 
3593     OMPT_REDUCTION_BEGIN;
3594 
3595     // usage: if team size == 1, no synchronization is required ( Intel
3596     // platforms only )
3597     retval = 1;
3598 
3599   } else if (packed_reduction_method == atomic_reduce_block) {
3600 
3601     retval = 2;
3602 
3603     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3604     // won't be called by the code gen)
3605     //     (it's not quite good, because the checking block has been closed by
3606     //     this 'pop',
3607     //      but atomic operation has not been executed yet, will be executed
3608     //      slightly later, literally on next instruction)
3609     if (__kmp_env_consistency_check)
3610       __kmp_pop_sync(global_tid, ct_reduce, loc);
3611 
3612   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3613                                    tree_reduce_block)) {
3614 
// AT: performance issue: a real barrier here
// AT: (if the primary thread is slow, other threads are blocked here waiting
//      for the primary thread to come and release them)
// AT: (it's not what a customer might expect when specifying NOWAIT)
// AT: (specifying NOWAIT won't improve performance, which will be confusing
//      to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
//     might go faster and be more in line with the spirit of NOWAIT
// AT: TO DO: run the EPCC benchmark and compare times
3624 
3625 // this barrier should be invisible to a customer and to the threading profile
3626 // tool (it's neither a terminating barrier nor customer's code, it's
3627 // used for an internal purpose)
3628 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3630     // JP: as long as there is a barrier in the implementation, OMPT should and
3631     // will provide the barrier events
3632     //         so we set-up the necessary frame/return addresses.
3633     ompt_frame_t *ompt_frame;
3634     if (ompt_enabled.enabled) {
3635       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3636       if (ompt_frame->enter_frame.ptr == NULL)
3637         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3638     }
3639     OMPT_STORE_RETURN_ADDRESS(global_tid);
3640 #endif
3641 #if USE_ITT_NOTIFY
3642     __kmp_threads[global_tid]->th.th_ident = loc;
3643 #endif
3644     retval =
3645         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3646                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3647     retval = (retval != 0) ? (0) : (1);
3648 #if OMPT_SUPPORT && OMPT_OPTIONAL
3649     if (ompt_enabled.enabled) {
3650       ompt_frame->enter_frame = ompt_data_none;
3651     }
3652 #endif
3653 
    // all other workers except the primary thread should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3656     if (__kmp_env_consistency_check) {
3657       if (retval == 0) {
3658         __kmp_pop_sync(global_tid, ct_reduce, loc);
3659       }
3660     }
3661 
3662   } else {
3663 
3664     // should never reach this block
3665     KMP_ASSERT(0); // "unexpected method"
3666   }
3667   if (teams_swapped) {
3668     __kmp_restore_swapped_teams(th, team, task_state);
3669   }
3670   KA_TRACE(
3671       10,
3672       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3673        global_tid, packed_reduction_method, retval));
3674 
3675   return retval;
3676 }
3677 
3678 /*!
3679 @ingroup SYNCHRONIZATION
3680 @param loc source location information
3681 @param global_tid global thread id.
3682 @param lck pointer to the unique lock data structure
3683 
3684 Finish the execution of a reduce nowait.
3685 */
3686 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3687                               kmp_critical_name *lck) {
3688 
3689   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3690 
3691   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3692   __kmp_assert_valid_gtid(global_tid);
3693 
3694   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3695 
3696   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3697 
3698   if (packed_reduction_method == critical_reduce_block) {
3699 
3700     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3701     OMPT_REDUCTION_END;
3702 
3703   } else if (packed_reduction_method == empty_reduce_block) {
3704 
3705     // usage: if team size == 1, no synchronization is required ( on Intel
3706     // platforms only )
3707 
3708     OMPT_REDUCTION_END;
3709 
3710   } else if (packed_reduction_method == atomic_reduce_block) {
3711 
    // neither the primary thread nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would fall through to the 'else' and assert
3716 
3717   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3718                                    tree_reduce_block)) {
3719 
3720     // only primary thread gets here
3721     // OMPT: tree reduction is annotated in the barrier code
3722 
3723   } else {
3724 
3725     // should never reach this block
3726     KMP_ASSERT(0); // "unexpected method"
3727   }
3728 
3729   if (__kmp_env_consistency_check)
3730     __kmp_pop_sync(global_tid, ct_reduce, loc);
3731 
3732   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3733                 global_tid, packed_reduction_method));
3734 
3735   return;
3736 }
3737 
3738 /* 2.a.ii. Reduce Block with a terminating barrier */
3739 
3740 /*!
3741 @ingroup SYNCHRONIZATION
3742 @param loc source location information
3743 @param global_tid global thread number
3744 @param num_vars number of items (variables) to be reduced
3745 @param reduce_size size of data in bytes to be reduced
3746 @param reduce_data pointer to data to be reduced
3747 @param reduce_func callback function providing reduction operation on two
3748 operands and returning result of reduction in lhs_data
3749 @param lck pointer to the unique lock data structure
3750 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3751 threads if atomic reduction needed
3752 
3753 A blocking reduce that includes an implicit barrier.
3754 */
3755 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3756                         size_t reduce_size, void *reduce_data,
3757                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3758                         kmp_critical_name *lck) {
3759   KMP_COUNT_BLOCK(REDUCE_wait);
3760   int retval = 0;
3761   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3762   kmp_info_t *th;
3763   kmp_team_t *team;
3764   int teams_swapped = 0, task_state;
3765 
3766   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3767   __kmp_assert_valid_gtid(global_tid);
3768 
3769   // why do we need this initialization here at all?
3770   // Reduction clause can not be a stand-alone directive.
3771 
3772   // do not call __kmp_serial_initialize(), it will be called by
3773   // __kmp_parallel_initialize() if needed
3774   // possible detection of false-positive race by the threadchecker ???
3775   if (!TCR_4(__kmp_init_parallel))
3776     __kmp_parallel_initialize();
3777 
3778   __kmp_resume_if_soft_paused();
3779 
3780 // check correctness of reduce block nesting
3781 #if KMP_USE_DYNAMIC_LOCK
3782   if (__kmp_env_consistency_check)
3783     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3784 #else
3785   if (__kmp_env_consistency_check)
3786     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3787 #endif
3788 
3789   th = __kmp_thread_from_gtid(global_tid);
3790   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3791 
3792   packed_reduction_method = __kmp_determine_reduction_method(
3793       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3794   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3795 
3796   OMPT_REDUCTION_DECL(th, global_tid);
3797 
3798   if (packed_reduction_method == critical_reduce_block) {
3799 
3800     OMPT_REDUCTION_BEGIN;
3801     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3802     retval = 1;
3803 
3804   } else if (packed_reduction_method == empty_reduce_block) {
3805 
3806     OMPT_REDUCTION_BEGIN;
3807     // usage: if team size == 1, no synchronization is required ( Intel
3808     // platforms only )
3809     retval = 1;
3810 
3811   } else if (packed_reduction_method == atomic_reduce_block) {
3812 
3813     retval = 2;
3814 
3815   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3816                                    tree_reduce_block)) {
3817 
3818 // case tree_reduce_block:
3819 // this barrier should be visible to a customer and to the threading profile
3820 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3821 #if OMPT_SUPPORT
3822     ompt_frame_t *ompt_frame;
3823     if (ompt_enabled.enabled) {
3824       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3825       if (ompt_frame->enter_frame.ptr == NULL)
3826         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3827     }
3828     OMPT_STORE_RETURN_ADDRESS(global_tid);
3829 #endif
3830 #if USE_ITT_NOTIFY
3831     __kmp_threads[global_tid]->th.th_ident =
3832         loc; // needed for correct notification of frames
3833 #endif
3834     retval =
3835         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3836                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3837     retval = (retval != 0) ? (0) : (1);
3838 #if OMPT_SUPPORT && OMPT_OPTIONAL
3839     if (ompt_enabled.enabled) {
3840       ompt_frame->enter_frame = ompt_data_none;
3841     }
3842 #endif
3843 
    // all other workers except the primary thread should do this pop here
    // (no worker other than the primary will enter __kmpc_end_reduce())
3846     if (__kmp_env_consistency_check) {
3847       if (retval == 0) { // 0: all other workers; 1: primary thread
3848         __kmp_pop_sync(global_tid, ct_reduce, loc);
3849       }
3850     }
3851 
3852   } else {
3853 
3854     // should never reach this block
3855     KMP_ASSERT(0); // "unexpected method"
3856   }
3857   if (teams_swapped) {
3858     __kmp_restore_swapped_teams(th, team, task_state);
3859   }
3860 
3861   KA_TRACE(10,
3862            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3863             global_tid, packed_reduction_method, retval));
3864   return retval;
3865 }
3866 
3867 /*!
3868 @ingroup SYNCHRONIZATION
3869 @param loc source location information
3870 @param global_tid global thread id.
3871 @param lck pointer to the unique lock data structure
3872 
3873 Finish the execution of a blocking reduce.
3874 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3875 start function.
3876 */
3877 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3878                        kmp_critical_name *lck) {
3879 
3880   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3881   kmp_info_t *th;
3882   kmp_team_t *team;
3883   int teams_swapped = 0, task_state;
3884 
3885   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3886   __kmp_assert_valid_gtid(global_tid);
3887 
3888   th = __kmp_thread_from_gtid(global_tid);
3889   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3890 
3891   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3892 
3893   // this barrier should be visible to a customer and to the threading profile
3894   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3895   OMPT_REDUCTION_DECL(th, global_tid);
3896 
3897   if (packed_reduction_method == critical_reduce_block) {
3898     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3899 
3900     OMPT_REDUCTION_END;
3901 
3902 // TODO: implicit barrier: should be exposed
3903 #if OMPT_SUPPORT
3904     ompt_frame_t *ompt_frame;
3905     if (ompt_enabled.enabled) {
3906       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3907       if (ompt_frame->enter_frame.ptr == NULL)
3908         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3909     }
3910     OMPT_STORE_RETURN_ADDRESS(global_tid);
3911 #endif
3912 #if USE_ITT_NOTIFY
3913     __kmp_threads[global_tid]->th.th_ident = loc;
3914 #endif
3915     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3916 #if OMPT_SUPPORT && OMPT_OPTIONAL
3917     if (ompt_enabled.enabled) {
3918       ompt_frame->enter_frame = ompt_data_none;
3919     }
3920 #endif
3921 
3922   } else if (packed_reduction_method == empty_reduce_block) {
3923 
3924     OMPT_REDUCTION_END;
3925 
3926 // usage: if team size==1, no synchronization is required (Intel platforms only)
3927 
3928 // TODO: implicit barrier: should be exposed
3929 #if OMPT_SUPPORT
3930     ompt_frame_t *ompt_frame;
3931     if (ompt_enabled.enabled) {
3932       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3933       if (ompt_frame->enter_frame.ptr == NULL)
3934         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3935     }
3936     OMPT_STORE_RETURN_ADDRESS(global_tid);
3937 #endif
3938 #if USE_ITT_NOTIFY
3939     __kmp_threads[global_tid]->th.th_ident = loc;
3940 #endif
3941     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3942 #if OMPT_SUPPORT && OMPT_OPTIONAL
3943     if (ompt_enabled.enabled) {
3944       ompt_frame->enter_frame = ompt_data_none;
3945     }
3946 #endif
3947 
3948   } else if (packed_reduction_method == atomic_reduce_block) {
3949 
3950 #if OMPT_SUPPORT
3951     ompt_frame_t *ompt_frame;
3952     if (ompt_enabled.enabled) {
3953       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3954       if (ompt_frame->enter_frame.ptr == NULL)
3955         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3956     }
3957     OMPT_STORE_RETURN_ADDRESS(global_tid);
3958 #endif
3959 // TODO: implicit barrier: should be exposed
3960 #if USE_ITT_NOTIFY
3961     __kmp_threads[global_tid]->th.th_ident = loc;
3962 #endif
3963     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3964 #if OMPT_SUPPORT && OMPT_OPTIONAL
3965     if (ompt_enabled.enabled) {
3966       ompt_frame->enter_frame = ompt_data_none;
3967     }
3968 #endif
3969 
3970   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3971                                    tree_reduce_block)) {
3972 
3973     // only primary thread executes here (primary releases all other workers)
3974     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3975                             global_tid);
3976 
3977   } else {
3978 
3979     // should never reach this block
3980     KMP_ASSERT(0); // "unexpected method"
3981   }
3982   if (teams_swapped) {
3983     __kmp_restore_swapped_teams(th, team, task_state);
3984   }
3985 
3986   if (__kmp_env_consistency_check)
3987     __kmp_pop_sync(global_tid, ct_reduce, loc);
3988 
3989   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3990                 global_tid, packed_reduction_method));
3991 
3992   return;
3993 }
3994 
3995 #undef __KMP_GET_REDUCTION_METHOD
3996 #undef __KMP_SET_REDUCTION_METHOD
3997 
3998 /* end of interface to fast scalable reduce routines */
3999 
4000 kmp_uint64 __kmpc_get_taskid() {
4001 
4002   kmp_int32 gtid;
4003   kmp_info_t *thread;
4004 
4005   gtid = __kmp_get_gtid();
4006   if (gtid < 0) {
4007     return 0;
4008   }
4009   thread = __kmp_thread_from_gtid(gtid);
4010   return thread->th.th_current_task->td_task_id;
4011 
4012 } // __kmpc_get_taskid
4013 
4014 kmp_uint64 __kmpc_get_parent_taskid() {
4015 
4016   kmp_int32 gtid;
4017   kmp_info_t *thread;
4018   kmp_taskdata_t *parent_task;
4019 
4020   gtid = __kmp_get_gtid();
4021   if (gtid < 0) {
4022     return 0;
4023   }
4024   thread = __kmp_thread_from_gtid(gtid);
4025   parent_task = thread->th.th_current_task->td_parent;
4026   return (parent_task == NULL ? 0 : parent_task->td_task_id);
4027 
4028 } // __kmpc_get_parent_taskid
4029 
4030 /*!
4031 @ingroup WORK_SHARING
4032 @param loc  source location information.
4033 @param gtid  global thread number.
4034 @param num_dims  number of associated doacross loops.
4035 @param dims  info on loops bounds.
4036 
4037 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
4039 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
4040 */
4041 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4042                           const struct kmp_dim *dims) {
4043   __kmp_assert_valid_gtid(gtid);
4044   int j, idx;
4045   kmp_int64 last, trace_count;
4046   kmp_info_t *th = __kmp_threads[gtid];
4047   kmp_team_t *team = th->th.th_team;
4048   kmp_uint32 *flags;
4049   kmp_disp_t *pr_buf = th->th.th_dispatch;
4050   dispatch_shared_info_t *sh_buf;
4051 
4052   KA_TRACE(
4053       20,
4054       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4055        gtid, num_dims, !team->t.t_serialized));
4056   KMP_DEBUG_ASSERT(dims != NULL);
4057   KMP_DEBUG_ASSERT(num_dims > 0);
4058 
4059   if (team->t.t_serialized) {
4060     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4061     return; // no dependencies if team is serialized
4062   }
4063   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4064   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4065   // the next loop
4066   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4067 
4068   // Save bounds info into allocated private buffer
4069   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4070   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4071       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4072   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4073   pr_buf->th_doacross_info[0] =
4074       (kmp_int64)num_dims; // first element is number of dimensions
4075   // Save also address of num_done in order to access it later without knowing
4076   // the buffer index
4077   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4078   pr_buf->th_doacross_info[2] = dims[0].lo;
4079   pr_buf->th_doacross_info[3] = dims[0].up;
4080   pr_buf->th_doacross_info[4] = dims[0].st;
4081   last = 5;
4082   for (j = 1; j < num_dims; ++j) {
4083     kmp_int64
4084         range_length; // To keep ranges of all dimensions but the first dims[0]
4085     if (dims[j].st == 1) { // most common case
4086       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4087       range_length = dims[j].up - dims[j].lo + 1;
4088     } else {
4089       if (dims[j].st > 0) {
4090         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4091         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4092       } else { // negative increment
4093         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4094         range_length =
4095             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4096       }
4097     }
4098     pr_buf->th_doacross_info[last++] = range_length;
4099     pr_buf->th_doacross_info[last++] = dims[j].lo;
4100     pr_buf->th_doacross_info[last++] = dims[j].up;
4101     pr_buf->th_doacross_info[last++] = dims[j].st;
4102   }
4103 
4104   // Compute total trip count.
4105   // Start with range of dims[0] which we don't need to keep in the buffer.
4106   if (dims[0].st == 1) { // most common case
4107     trace_count = dims[0].up - dims[0].lo + 1;
4108   } else if (dims[0].st > 0) {
4109     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4110     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4111   } else { // negative increment
4112     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4113     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4114   }
4115   for (j = 1; j < num_dims; ++j) {
4116     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4117   }
4118   KMP_DEBUG_ASSERT(trace_count > 0);
4119 
4120   // Check if shared buffer is not occupied by other loop (idx -
4121   // __kmp_dispatch_num_buffers)
4122   if (idx != sh_buf->doacross_buf_idx) {
4123     // Shared buffer is occupied, wait for it to be free
4124     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4125                  __kmp_eq_4, NULL);
4126   }
4127 #if KMP_32_BIT_ARCH
4128   // Check if we are the first thread. After the CAS the first thread gets 0,
4129   // others get 1 if initialization is in progress, allocated pointer otherwise.
4130   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4131   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4132       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4133 #else
4134   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4135       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4136 #endif
4137   if (flags == NULL) {
4138     // we are the first thread, allocate the array of flags
4139     size_t size =
4140         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4141     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4142     KMP_MB();
4143     sh_buf->doacross_flags = flags;
4144   } else if (flags == (kmp_uint32 *)1) {
4145 #if KMP_32_BIT_ARCH
4146     // initialization is still in progress, need to wait
4147     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4148 #else
4149     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4150 #endif
4151       KMP_YIELD(TRUE);
4152     KMP_MB();
4153   } else {
4154     KMP_MB();
4155   }
4156   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4157   pr_buf->th_doacross_flags =
4158       sh_buf->doacross_flags; // save private copy in order to not
4159   // touch shared buffer on each iteration
4160   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4161 }
4162 
4163 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4164   __kmp_assert_valid_gtid(gtid);
4165   kmp_int64 shft;
4166   size_t num_dims, i;
4167   kmp_uint32 flag;
4168   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4169   kmp_info_t *th = __kmp_threads[gtid];
4170   kmp_team_t *team = th->th.th_team;
4171   kmp_disp_t *pr_buf;
4172   kmp_int64 lo, up, st;
4173 
4174   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4175   if (team->t.t_serialized) {
4176     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4177     return; // no dependencies if team is serialized
4178   }
4179 
4180   // calculate sequential iteration number and check out-of-bounds condition
4181   pr_buf = th->th.th_dispatch;
4182   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4183   num_dims = (size_t)pr_buf->th_doacross_info[0];
4184   lo = pr_buf->th_doacross_info[2];
4185   up = pr_buf->th_doacross_info[3];
4186   st = pr_buf->th_doacross_info[4];
4187 #if OMPT_SUPPORT && OMPT_OPTIONAL
4188   ompt_dependence_t deps[num_dims];
4189 #endif
4190   if (st == 1) { // most common case
4191     if (vec[0] < lo || vec[0] > up) {
4192       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4193                     "bounds [%lld,%lld]\n",
4194                     gtid, vec[0], lo, up));
4195       return;
4196     }
4197     iter_number = vec[0] - lo;
4198   } else if (st > 0) {
4199     if (vec[0] < lo || vec[0] > up) {
4200       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4201                     "bounds [%lld,%lld]\n",
4202                     gtid, vec[0], lo, up));
4203       return;
4204     }
4205     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4206   } else { // negative increment
4207     if (vec[0] > lo || vec[0] < up) {
4208       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4209                     "bounds [%lld,%lld]\n",
4210                     gtid, vec[0], lo, up));
4211       return;
4212     }
4213     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4214   }
4215 #if OMPT_SUPPORT && OMPT_OPTIONAL
4216   deps[0].variable.value = iter_number;
4217   deps[0].dependence_type = ompt_dependence_type_sink;
4218 #endif
4219   for (i = 1; i < num_dims; ++i) {
4220     kmp_int64 iter, ln;
4221     size_t j = i * 4;
4222     ln = pr_buf->th_doacross_info[j + 1];
4223     lo = pr_buf->th_doacross_info[j + 2];
4224     up = pr_buf->th_doacross_info[j + 3];
4225     st = pr_buf->th_doacross_info[j + 4];
4226     if (st == 1) {
4227       if (vec[i] < lo || vec[i] > up) {
4228         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4229                       "bounds [%lld,%lld]\n",
4230                       gtid, vec[i], lo, up));
4231         return;
4232       }
4233       iter = vec[i] - lo;
4234     } else if (st > 0) {
4235       if (vec[i] < lo || vec[i] > up) {
4236         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4237                       "bounds [%lld,%lld]\n",
4238                       gtid, vec[i], lo, up));
4239         return;
4240       }
4241       iter = (kmp_uint64)(vec[i] - lo) / st;
4242     } else { // st < 0
4243       if (vec[i] > lo || vec[i] < up) {
4244         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4245                       "bounds [%lld,%lld]\n",
4246                       gtid, vec[i], lo, up));
4247         return;
4248       }
4249       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4250     }
4251     iter_number = iter + ln * iter_number;
4252 #if OMPT_SUPPORT && OMPT_OPTIONAL
4253     deps[i].variable.value = iter;
4254     deps[i].dependence_type = ompt_dependence_type_sink;
4255 #endif
4256   }
4257   shft = iter_number % 32; // use 32-bit granularity
4258   iter_number >>= 5; // divided by 32
4259   flag = 1 << shft;
4260   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4261     KMP_YIELD(TRUE);
4262   }
4263   KMP_MB();
4264 #if OMPT_SUPPORT && OMPT_OPTIONAL
4265   if (ompt_enabled.ompt_callback_dependences) {
4266     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4267         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4268   }
4269 #endif
4270   KA_TRACE(20,
4271            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4272             gtid, (iter_number << 5) + shft));
4273 }
4274 
4275 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4276   __kmp_assert_valid_gtid(gtid);
4277   kmp_int64 shft;
4278   size_t num_dims, i;
4279   kmp_uint32 flag;
4280   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4281   kmp_info_t *th = __kmp_threads[gtid];
4282   kmp_team_t *team = th->th.th_team;
4283   kmp_disp_t *pr_buf;
4284   kmp_int64 lo, st;
4285 
4286   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4287   if (team->t.t_serialized) {
4288     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4289     return; // no dependencies if team is serialized
4290   }
4291 
4292   // calculate sequential iteration number (same as in "wait" but no
4293   // out-of-bounds checks)
4294   pr_buf = th->th.th_dispatch;
4295   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4296   num_dims = (size_t)pr_buf->th_doacross_info[0];
4297   lo = pr_buf->th_doacross_info[2];
4298   st = pr_buf->th_doacross_info[4];
4299 #if OMPT_SUPPORT && OMPT_OPTIONAL
4300   ompt_dependence_t deps[num_dims];
4301 #endif
4302   if (st == 1) { // most common case
4303     iter_number = vec[0] - lo;
4304   } else if (st > 0) {
4305     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4306   } else { // negative increment
4307     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4308   }
4309 #if OMPT_SUPPORT && OMPT_OPTIONAL
4310   deps[0].variable.value = iter_number;
4311   deps[0].dependence_type = ompt_dependence_type_source;
4312 #endif
4313   for (i = 1; i < num_dims; ++i) {
4314     kmp_int64 iter, ln;
4315     size_t j = i * 4;
4316     ln = pr_buf->th_doacross_info[j + 1];
4317     lo = pr_buf->th_doacross_info[j + 2];
4318     st = pr_buf->th_doacross_info[j + 4];
4319     if (st == 1) {
4320       iter = vec[i] - lo;
4321     } else if (st > 0) {
4322       iter = (kmp_uint64)(vec[i] - lo) / st;
4323     } else { // st < 0
4324       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4325     }
4326     iter_number = iter + ln * iter_number;
4327 #if OMPT_SUPPORT && OMPT_OPTIONAL
4328     deps[i].variable.value = iter;
4329     deps[i].dependence_type = ompt_dependence_type_source;
4330 #endif
4331   }
4332 #if OMPT_SUPPORT && OMPT_OPTIONAL
4333   if (ompt_enabled.ompt_callback_dependences) {
4334     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4335         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4336   }
4337 #endif
4338   shft = iter_number % 32; // use 32-bit granularity
4339   iter_number >>= 5; // divided by 32
4340   flag = 1 << shft;
4341   KMP_MB();
4342   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4343     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4344   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4345                 (iter_number << 5) + shft));
4346 }
4347 
4348 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4349   __kmp_assert_valid_gtid(gtid);
4350   kmp_int32 num_done;
4351   kmp_info_t *th = __kmp_threads[gtid];
4352   kmp_team_t *team = th->th.th_team;
4353   kmp_disp_t *pr_buf = th->th.th_dispatch;
4354 
4355   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4356   if (team->t.t_serialized) {
4357     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4358     return; // nothing to do
4359   }
4360   num_done =
4361       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4362   if (num_done == th->th.th_team_nproc) {
4363     // we are the last thread, need to free shared resources
4364     int idx = pr_buf->th_doacross_buf_idx - 1;
4365     dispatch_shared_info_t *sh_buf =
4366         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4367     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4368                      (kmp_int64)&sh_buf->doacross_num_done);
4369     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4370     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4371     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4372     sh_buf->doacross_flags = NULL;
4373     sh_buf->doacross_num_done = 0;
4374     sh_buf->doacross_buf_idx +=
4375         __kmp_dispatch_num_buffers; // free buffer for future re-use
4376   }
4377   // free private resources (need to keep buffer index forever)
4378   pr_buf->th_doacross_flags = NULL;
4379   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4380   pr_buf->th_doacross_info = NULL;
4381   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4382 }
4383 
4384 /* OpenMP 5.1 Memory Management routines */
4385 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4386   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4387 }
4388 
4389 void *omp_aligned_alloc(size_t align, size_t size,
4390                         omp_allocator_handle_t allocator) {
4391   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4392 }
4393 
4394 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4395   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4396 }
4397 
4398 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4399                          omp_allocator_handle_t allocator) {
4400   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4401 }
4402 
4403 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4404                   omp_allocator_handle_t free_allocator) {
4405   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4406                        free_allocator);
4407 }
4408 
4409 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4410   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4411 }
4412 /* end of OpenMP 5.1 Memory Management routines */
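
// Minimal usage sketch for the thin wrappers above, using a predefined
// allocator handle from omp.h (the array size n is illustrative):
//
//   double *a = (double *)omp_alloc(n * sizeof(double), omp_default_mem_alloc);
//   double *b = (double *)omp_aligned_alloc(64, n * sizeof(double),
//                                           omp_default_mem_alloc);
//   /* ... use a and b ... */
//   omp_free(a, omp_default_mem_alloc);
//   omp_free(b, omp_default_mem_alloc);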
4413 
4414 int __kmpc_get_target_offload(void) {
4415   if (!__kmp_init_serial) {
4416     __kmp_serial_initialize();
4417   }
4418   return __kmp_target_offload;
4419 }
4420 
4421 int __kmpc_pause_resource(kmp_pause_status_t level) {
4422   if (!__kmp_init_serial) {
4423     return 1; // Can't pause if runtime is not initialized
4424   }
4425   return __kmp_pause_resource(level);
4426 }
4427 
4428 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4429   if (!__kmp_init_serial)
4430     __kmp_serial_initialize();
4431 
4432   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4433 
4434 #if OMPT_SUPPORT
4435   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4436     ompt_callbacks.ompt_callback(ompt_callback_error)(
4437         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4438         OMPT_GET_RETURN_ADDRESS(0));
4439   }
4440 #endif // OMPT_SUPPORT
4441 
4442   char *src_loc;
4443   if (loc && loc->psource) {
4444     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4445     src_loc =
        __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4447     __kmp_str_loc_free(&str_loc);
4448   } else {
4449     src_loc = __kmp_str_format("unknown");
4450   }
4451 
4452   if (severity == severity_warning)
4453     KMP_WARNING(UserDirectedWarning, src_loc, message);
4454   else
4455     KMP_FATAL(UserDirectedError, src_loc, message);
4456 
4457   __kmp_str_free(&src_loc);
4458 }
4459 
4460 // Mark begin of scope directive.
4461 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4462 // reserved is for extension of scope directive and not used.
4463 #if OMPT_SUPPORT && OMPT_OPTIONAL
4464   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4465     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4466     int tid = __kmp_tid_from_gtid(gtid);
4467     ompt_callbacks.ompt_callback(ompt_callback_work)(
4468         ompt_work_scope, ompt_scope_begin,
4469         &(team->t.ompt_team_info.parallel_data),
4470         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4471         OMPT_GET_RETURN_ADDRESS(0));
4472   }
4473 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4474 }
4475 
4476 // Mark end of scope directive
4477 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4478 // reserved is for extension of scope directive and not used.
4479 #if OMPT_SUPPORT && OMPT_OPTIONAL
4480   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4481     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4482     int tid = __kmp_tid_from_gtid(gtid);
4483     ompt_callbacks.ompt_callback(ompt_callback_work)(
4484         ompt_work_scope, ompt_scope_end,
4485         &(team->t.ompt_team_info.parallel_data),
4486         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4487         OMPT_GET_RETURN_ADDRESS(0));
4488   }
4489 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4490 }
4491 
4492 #ifdef KMP_USE_VERSION_SYMBOLS
4493 // For GOMP compatibility there are two versions of each omp_* API.
4494 // One is the plain C symbol and one is the Fortran symbol with an appended
4495 // underscore. When we implement a specific ompc_* version of an omp_*
4496 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4497 // instead of the Fortran versions in kmp_ftn_entry.h
4498 extern "C" {
4499 // Have to undef these from omp.h so they aren't translated into
4500 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4501 #ifdef omp_set_affinity_format
4502 #undef omp_set_affinity_format
4503 #endif
4504 #ifdef omp_get_affinity_format
4505 #undef omp_get_affinity_format
4506 #endif
4507 #ifdef omp_display_affinity
4508 #undef omp_display_affinity
4509 #endif
4510 #ifdef omp_capture_affinity
4511 #undef omp_capture_affinity
4512 #endif
4513 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4514                         "OMP_5.0");
4515 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4516                         "OMP_5.0");
4517 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4518                         "OMP_5.0");
4519 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4520                         "OMP_5.0");
4521 } // extern "C"
4522 #endif
4523