1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant within any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   int monotonicity;
76   // default to monotonic
77   monotonicity = SCHEDULE_MONOTONIC;
78   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79     monotonicity = SCHEDULE_NONMONOTONIC;
80   else if (SCHEDULE_HAS_MONOTONIC(schedule))
81     monotonicity = SCHEDULE_MONOTONIC;
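  // E.g., a loop with schedule(nonmonotonic: dynamic, chunk) typically
  // reaches the runtime as (kmp_sch_dynamic_chunked |
  // kmp_sch_modifier_nonmonotonic) and yields SCHEDULE_NONMONOTONIC; a plain
  // kmp_sch_dynamic_chunked has no modifier bits and stays monotonic.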
82   return monotonicity;
83 }
84 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always when hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when hierarchical scheduling is used.
// loc is the source file location of the corresponding loop construct;
// gtid is the global thread id.
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                    dispatch_private_info_template<T> *pr,
98                                    enum sched_type schedule, T lb, T ub,
99                                    typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101                                    kmp_uint64 *cur_chunk,
102 #endif
103                                    typename traits_t<T>::signed_t chunk,
104                                    T nproc, T tid) {
105   typedef typename traits_t<T>::unsigned_t UT;
106   typedef typename traits_t<T>::floating_t DBL;
107 
108   int active;
109   T tc;
110   kmp_info_t *th;
111   kmp_team_t *team;
112   int monotonicity;
113   bool use_hier;
114 
115 #ifdef KMP_DEBUG
116   typedef typename traits_t<T>::signed_t ST;
117   {
118     char *buff;
119     // create format specifiers before the debug output
120     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                             traits_t<T>::spec, traits_t<T>::spec,
124                             traits_t<ST>::spec, traits_t<ST>::spec,
125                             traits_t<T>::spec, traits_t<T>::spec);
126     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127     __kmp_str_free(&buff);
128   }
129 #endif
130   /* setup data */
131   th = __kmp_threads[gtid];
132   team = th->th.th_team;
133   active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136   int itt_need_metadata_reporting =
137       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139       team->t.t_active_level == 1;
140 #endif
141 
142 #if KMP_USE_HIER_SCHED
143   use_hier = pr->flags.use_hier;
144 #else
145   use_hier = false;
146 #endif
147 
148   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149   monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151 
152   /* Pick up the nomerge/ordered bits from the scheduling type */
153   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154     pr->flags.nomerge = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157   } else {
158     pr->flags.nomerge = FALSE;
159   }
160   pr->type_size = traits_t<T>::type_size; // remember the size of variables
161   if (kmp_ord_lower & schedule) {
162     pr->flags.ordered = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165   } else {
166     pr->flags.ordered = FALSE;
167   }
168   // Ordered overrides nonmonotonic
169   if (pr->flags.ordered) {
170     monotonicity = SCHEDULE_MONOTONIC;
171   }
172 
173   if (schedule == kmp_sch_static) {
174     schedule = __kmp_static;
175   } else {
176     if (schedule == kmp_sch_runtime) {
177       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178       // not specified)
179       schedule = team->t.t_sched.r_sched_type;
180       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Refine the schedule if needed (the global controls are already
      // differentiated appropriately)
184       if (schedule == kmp_sch_guided_chunked) {
185         schedule = __kmp_guided;
186       } else if (schedule == kmp_sch_static) {
187         schedule = __kmp_static;
188       }
189       // Use the chunk size specified by OMP_SCHEDULE (or default if not
190       // specified)
191       chunk = team->t.t_sched.chunk;
192 #if USE_ITT_BUILD
193       if (cur_chunk)
194         *cur_chunk = chunk;
195 #endif
196 #ifdef KMP_DEBUG
197       {
198         char *buff;
199         // create format specifiers before the debug output
200         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201                                 "schedule:%%d chunk:%%%s\n",
202                                 traits_t<ST>::spec);
203         KD_TRACE(10, (buff, gtid, schedule, chunk));
204         __kmp_str_free(&buff);
205       }
206 #endif
207     } else {
208       if (schedule == kmp_sch_guided_chunked) {
209         schedule = __kmp_guided;
210       }
211       if (chunk <= 0) {
212         chunk = KMP_DEFAULT_CHUNK;
213       }
214     }
215 
216     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
218       schedule = __kmp_auto;
219 #ifdef KMP_DEBUG
220       {
221         char *buff;
222         // create format specifiers before the debug output
223         buff = __kmp_str_format(
224             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225             "schedule:%%d chunk:%%%s\n",
226             traits_t<ST>::spec);
227         KD_TRACE(10, (buff, gtid, schedule, chunk));
228         __kmp_str_free(&buff);
229       }
230 #endif
231     }
232 #if KMP_STATIC_STEAL_ENABLED
233     // map nonmonotonic:dynamic to static steal
234     if (schedule == kmp_sch_dynamic_chunked) {
235       if (monotonicity == SCHEDULE_NONMONOTONIC)
236         schedule = kmp_sch_static_steal;
237     }
238 #endif
239     /* guided analytical not safe for too many threads */
240     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241       schedule = kmp_sch_guided_iterative_chunked;
242       KMP_WARNING(DispatchManyThreads);
243     }
244     if (schedule == kmp_sch_runtime_simd) {
245       // compiler provides simd_width in the chunk parameter
246       schedule = team->t.t_sched.r_sched_type;
247       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Refine the schedule if needed (the global controls are already
      // differentiated appropriately)
251       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252           schedule == __kmp_static) {
253         schedule = kmp_sch_static_balanced_chunked;
254       } else {
255         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256           schedule = kmp_sch_guided_simd;
257         }
258         chunk = team->t.t_sched.chunk * chunk;
259       }
260 #if USE_ITT_BUILD
261       if (cur_chunk)
262         *cur_chunk = chunk;
263 #endif
264 #ifdef KMP_DEBUG
265       {
266         char *buff;
267         // create format specifiers before the debug output
268         buff = __kmp_str_format(
269             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270             " chunk:%%%s\n",
271             traits_t<ST>::spec);
272         KD_TRACE(10, (buff, gtid, schedule, chunk));
273         __kmp_str_free(&buff);
274       }
275 #endif
276     }
277     pr->u.p.parm1 = chunk;
278   }
279   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280               "unknown scheduling type");
281 
282   pr->u.p.count = 0;
283 
284   if (__kmp_env_consistency_check) {
285     if (st == 0) {
286       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288     }
289   }
290   // compute trip count
291   if (st == 1) { // most common case
292     if (ub >= lb) {
293       tc = ub - lb + 1;
294     } else { // ub < lb
295       tc = 0; // zero-trip
296     }
297   } else if (st < 0) {
298     if (lb >= ub) {
299       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300       // where the division needs to be unsigned regardless of the result type
301       tc = (UT)(lb - ub) / (-st) + 1;
302     } else { // lb < ub
303       tc = 0; // zero-trip
304     }
305   } else { // st > 0
306     if (ub >= lb) {
307       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(ub - lb) / st + 1;
310     } else { // ub < lb
311       tc = 0; // zero-trip
312     }
313   }
314 
315 #if KMP_STATS_ENABLED
316   if (KMP_MASTER_GTID(gtid)) {
317     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318   }
319 #endif
320 
321   pr->u.p.lb = lb;
322   pr->u.p.ub = ub;
323   pr->u.p.st = st;
324   pr->u.p.tc = tc;
325 
326 #if KMP_OS_WINDOWS
327   pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
329 
  /* NOTE: only the active parallel region(s) have active ordered sections */
331 
332   if (active) {
333     if (pr->flags.ordered) {
334       pr->ordered_bumped = 0;
335       pr->u.p.ordered_lower = 1;
336       pr->u.p.ordered_upper = 0;
337     }
338   }
339 
340   switch (schedule) {
341 #if (KMP_STATIC_STEAL_ENABLED)
342   case kmp_sch_static_steal: {
343     T ntc, init;
344 
345     KD_TRACE(100,
346              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347               gtid));
348 
349     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350     if (nproc > 1 && ntc >= nproc) {
351       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352       T id = tid;
353       T small_chunk, extras;
354 
355       small_chunk = ntc / nproc;
356       extras = ntc % nproc;
357 
358       init = id * small_chunk + (id < extras ? id : extras);
359       pr->u.p.count = init;
360       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
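      // E.g., tc=100, chunk=10 (ntc=10 chunks), nproc=4: small_chunk=2,
      // extras=2, so the initial chunk ranges are T0:[0,3), T1:[3,6),
      // T2:[6,8), T3:[8,10); count is the next chunk index to execute and
      // ub is the exclusive upper bound of this thread's chunks.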
361 
362       pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, capped at nproc.
366       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368       pr->u.p.st = st;
369       if (traits_t<T>::type_size > 4) {
370         // AC: TODO: check if 16-byte CAS available and use it to
371         // improve performance (probably wait for explicit request
372         // before spending time on this).
373         // For now use dynamically allocated per-thread lock,
374         // free memory in __kmp_dispatch_next when status==0.
375         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
376         th->th.th_dispatch->th_steal_lock =
377             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
379       }
380       break;
381     } else {
382       /* too few chunks: switching to kmp_sch_dynamic_chunked */
383       schedule = kmp_sch_dynamic_chunked;
384       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385                      "kmp_sch_dynamic_chunked\n",
386                       gtid));
387       if (pr->u.p.parm1 <= 0)
388         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389       break;
390     } // if
391   } // case
392 #endif
393   case kmp_sch_static_balanced: {
394     T init, limit;
395 
396     KD_TRACE(
397         100,
398         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399          gtid));
400 
401     if (nproc > 1) {
402       T id = tid;
403 
404       if (tc < nproc) {
405         if (id < tc) {
406           init = id;
407           limit = id;
408           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409         } else {
410           pr->u.p.count = 1; /* means no more chunks to execute */
411           pr->u.p.parm1 = FALSE;
412           break;
413         }
414       } else {
415         T small_chunk = tc / nproc;
416         T extras = tc % nproc;
417         init = id * small_chunk + (id < extras ? id : extras);
418         limit = init + small_chunk - (id < extras ? 0 : 1);
419         pr->u.p.parm1 = (id == nproc - 1);
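        // E.g., tc=10, nproc=4: small_chunk=2, extras=2, giving iteration
        // ranges T0:0-2, T1:3-5, T2:6-7, T3:8-9 (the first 'extras' threads
        // get one extra iteration).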
420       }
421     } else {
422       if (tc > 0) {
423         init = 0;
424         limit = tc - 1;
425         pr->u.p.parm1 = TRUE;
426       } else {
427         // zero trip count
428         pr->u.p.count = 1; /* means no more chunks to execute */
429         pr->u.p.parm1 = FALSE;
430         break;
431       }
432     }
433 #if USE_ITT_BUILD
434     // Calculate chunk for metadata report
435     if (itt_need_metadata_reporting)
436       if (cur_chunk)
437         *cur_chunk = limit - init + 1;
438 #endif
439     if (st == 1) {
440       pr->u.p.lb = lb + init;
441       pr->u.p.ub = lb + limit;
442     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined one
444       T ub_tmp = lb + limit * st;
445       pr->u.p.lb = lb + init * st;
446       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447       // it exactly
448       if (st > 0) {
449         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450       } else {
451         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452       }
453     }
454     if (pr->flags.ordered) {
455       pr->u.p.ordered_lower = init;
456       pr->u.p.ordered_upper = limit;
457     }
458     break;
459   } // case
460   case kmp_sch_static_balanced_chunked: {
461     // similar to balanced, but chunk adjusted to multiple of simd width
462     T nth = nproc;
463     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464                    " -> falling-through to static_greedy\n",
465                    gtid));
466     schedule = kmp_sch_static_greedy;
467     if (nth > 1)
468       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469     else
470       pr->u.p.parm1 = tc;
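    // E.g., tc=100, nth=4, chunk (simd width) = 8: ceil(100/4) = 25 is
    // rounded up to 32, a multiple of the simd width (the mask assumes chunk
    // is a power of two).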
471     break;
472   } // case
473   case kmp_sch_guided_simd:
474   case kmp_sch_guided_iterative_chunked: {
475     KD_TRACE(
476         100,
477         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478          " case\n",
479          gtid));
480 
481     if (nproc > 1) {
482       if ((2L * chunk + 1) * nproc >= tc) {
483         /* chunk size too large, switch to dynamic */
484         schedule = kmp_sch_dynamic_chunked;
485       } else {
        // when remaining iters become less than parm2, switch to dynamic
487         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488         *(double *)&pr->u.p.parm3 =
489             guided_flt_param / nproc; // may occupy parm3 and parm4
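        // E.g., nproc=8, chunk=16: (2*16+1)*8 = 264 < tc keeps the guided
        // path; parm2 (the dynamic fallback threshold in remaining
        // iterations) becomes guided_int_param*8*17, and parm3 holds the
        // fraction of the remaining iterations each grab takes
        // (guided_flt_param/8).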
490       }
491     } else {
492       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493                      "kmp_sch_static_greedy\n",
494                      gtid));
495       schedule = kmp_sch_static_greedy;
496       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497       KD_TRACE(
498           100,
499           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500            gtid));
501       pr->u.p.parm1 = tc;
502     } // if
503   } // case
504   break;
505   case kmp_sch_guided_analytical_chunked: {
506     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507                    "kmp_sch_guided_analytical_chunked case\n",
508                    gtid));
509 
510     if (nproc > 1) {
511       if ((2L * chunk + 1) * nproc >= tc) {
512         /* chunk size too large, switch to dynamic */
513         schedule = kmp_sch_dynamic_chunked;
514       } else {
515         /* commonly used term: (2 nproc - 1)/(2 nproc) */
516         DBL x;
517 
518 #if KMP_USE_X87CONTROL
519         /* Linux* OS already has 64-bit computation by default for long double,
520            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522            instead of the default 53-bit. Even though long double doesn't work
523            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524            expected to impact the correctness of the algorithm, but this has not
525            been mathematically proven. */
526         // save original FPCW and set precision to 64-bit, as
527         // Windows* OS on IA-32 architecture defaults to 53-bit
528         unsigned int oldFpcw = _control87(0, 0);
529         _control87(_PC_64, _MCW_PC); // 0,0x30000
530 #endif
531         /* value used for comparison in solver for cross-over point */
532         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533 
534         /* crossover point--chunk indexes equal to or greater than
535            this point switch to dynamic-style scheduling */
536         UT cross;
537 
538         /* commonly used term: (2 nproc - 1)/(2 nproc) */
539         x = (long double)1.0 - (long double)0.5 / nproc;
540 
541 #ifdef KMP_DEBUG
542         { // test natural alignment
543           struct _test_a {
544             char a;
545             union {
546               char b;
547               DBL d;
548             };
549           } t;
550           ptrdiff_t natural_alignment =
551               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553           // long)natural_alignment );
554           KMP_DEBUG_ASSERT(
555               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556         }
557 #endif // KMP_DEBUG
558 
559         /* save the term in thread private dispatch structure */
560         *(DBL *)&pr->u.p.parm3 = x;
561 
562         /* solve for the crossover point to the nearest integer i for which C_i
563            <= chunk */
564         {
565           UT left, right, mid;
566           long double p;
567 
568           /* estimate initial upper and lower bound */
569 
570           /* doesn't matter what value right is as long as it is positive, but
571              it affects performance of the solver */
572           right = 229;
573           p = __kmp_pow<UT>(x, right);
574           if (p > target) {
575             do {
576               p *= p;
577               right <<= 1;
578             } while (p > target && right < (1 << 27));
579             /* lower bound is previous (failed) estimate of upper bound */
580             left = right >> 1;
581           } else {
582             left = 0;
583           }
584 
585           /* bisection root-finding method */
586           while (left + 1 < right) {
587             mid = (left + right) / 2;
588             if (__kmp_pow<UT>(x, mid) > target) {
589               left = mid;
590             } else {
591               right = mid;
592             }
593           } // while
594           cross = right;
595         }
596         /* assert sanity of computed crossover point */
597         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598                    __kmp_pow<UT>(x, cross) <= target);
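        // E.g., nproc=2 (x=0.75), chunk=1, tc=100: target = 3*2/100 = 0.06;
        // 0.75^9 ~ 0.075 > 0.06 and 0.75^10 ~ 0.056 <= 0.06, so cross = 10:
        // chunk indexes below 10 use guided sizes, later ones fall back to
        // dynamic-style chunks of size 'chunk'.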
599 
600         /* save the crossover point in thread private dispatch structure */
601         pr->u.p.parm2 = cross;
602 
603 // C75803
604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606 #else
607 #define GUIDED_ANALYTICAL_WORKAROUND (x)
608 #endif
609         /* dynamic-style scheduling offset */
610         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612                         cross * chunk;
613 #if KMP_USE_X87CONTROL
614         // restore FPCW
615         _control87(oldFpcw, _MCW_PC);
616 #endif
617       } // if
618     } else {
619       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620                      "kmp_sch_static_greedy\n",
621                      gtid));
622       schedule = kmp_sch_static_greedy;
623       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624       pr->u.p.parm1 = tc;
625     } // if
626   } // case
627   break;
628   case kmp_sch_static_greedy:
629     KD_TRACE(
630         100,
631         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632          gtid));
633     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634     break;
635   case kmp_sch_static_chunked:
636   case kmp_sch_dynamic_chunked:
637     if (pr->u.p.parm1 <= 0) {
638       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639     }
640     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642                    gtid));
643     break;
644   case kmp_sch_trapezoidal: {
645     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646 
647     T parm1, parm2, parm3, parm4;
648     KD_TRACE(100,
649              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650               gtid));
651 
652     parm1 = chunk;
653 
654     /* F : size of the first cycle */
655     parm2 = (tc / (2 * nproc));
656 
657     if (parm2 < 1) {
658       parm2 = 1;
659     }
660 
661     /* L : size of the last cycle.  Make sure the last cycle is not larger
662        than the first cycle. */
663     if (parm1 < 1) {
664       parm1 = 1;
665     } else if (parm1 > parm2) {
666       parm1 = parm2;
667     }
668 
669     /* N : number of cycles */
670     parm3 = (parm2 + parm1);
671     parm3 = (2 * tc + parm3 - 1) / parm3;
672 
673     if (parm3 < 2) {
674       parm3 = 2;
675     }
676 
677     /* sigma : decreasing incr of the trapezoid */
678     parm4 = (parm3 - 1);
679     parm4 = (parm2 - parm1) / parm4;
680 
681     // pointless check, because parm4 >= 0 always
682     // if ( parm4 < 0 ) {
683     //    parm4 = 0;
684     //}
685 
686     pr->u.p.parm1 = parm1;
687     pr->u.p.parm2 = parm2;
688     pr->u.p.parm3 = parm3;
689     pr->u.p.parm4 = parm4;
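    // E.g., tc=100, nproc=2, chunk=5: F=parm2=25, L=parm1=5,
    // N=parm3=(200+29)/30=7 chunks, sigma=parm4=(25-5)/6=3, so the chunk
    // sizes are 25,22,19,16,13,10,7, whose sum (112) covers the 100
    // iterations.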
690   } // case
691   break;
692 
693   default: {
694     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695                 KMP_HNT(GetNewerLibrary), // Hint
696                 __kmp_msg_null // Variadic argument list terminator
697                 );
698   } break;
699   } // switch
700   pr->schedule = schedule;
701 }
702 
703 #if KMP_USE_HIER_SCHED
704 template <typename T>
705 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706                                              typename traits_t<T>::signed_t st);
707 template <>
708 inline void
709 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710                                             kmp_int32 ub, kmp_int32 st) {
711   __kmp_dispatch_init_hierarchy<kmp_int32>(
712       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714 }
715 template <>
716 inline void
717 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718                                              kmp_uint32 ub, kmp_int32 st) {
719   __kmp_dispatch_init_hierarchy<kmp_uint32>(
720       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
725 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726                                             kmp_int64 ub, kmp_int64 st) {
727   __kmp_dispatch_init_hierarchy<kmp_int64>(
728       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
733 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734                                              kmp_uint64 ub, kmp_int64 st) {
735   __kmp_dispatch_init_hierarchy<kmp_uint64>(
736       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739 
740 // free all the hierarchy scheduling memory associated with the team
741 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743   for (int i = 0; i < num_disp_buff; ++i) {
744     // type does not matter here so use kmp_int32
745     auto sh =
746         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747             &team->t.t_disp_buffer[i]);
748     if (sh->hier) {
749       sh->hier->deallocate();
750       __kmp_free(sh->hier);
751     }
752   }
753 }
754 #endif
755 
756 // UT - unsigned flavor of T, ST - signed flavor of T,
757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758 template <typename T>
759 static void
760 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761                     T ub, typename traits_t<T>::signed_t st,
762                     typename traits_t<T>::signed_t chunk, int push_ws) {
763   typedef typename traits_t<T>::unsigned_t UT;
764 
765   int active;
766   kmp_info_t *th;
767   kmp_team_t *team;
768   kmp_uint32 my_buffer_index;
769   dispatch_private_info_template<T> *pr;
770   dispatch_shared_info_template<T> volatile *sh;
771 
772   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773                    sizeof(dispatch_private_info));
774   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775                    sizeof(dispatch_shared_info));
776 
777   if (!TCR_4(__kmp_init_parallel))
778     __kmp_parallel_initialize();
779 
780   __kmp_resume_if_soft_paused();
781 
782 #if INCLUDE_SSC_MARKS
783   SSC_MARK_DISPATCH_INIT();
784 #endif
785 #ifdef KMP_DEBUG
786   typedef typename traits_t<T>::signed_t ST;
787   {
788     char *buff;
789     // create format specifiers before the debug output
790     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792                             traits_t<ST>::spec, traits_t<T>::spec,
793                             traits_t<T>::spec, traits_t<ST>::spec);
794     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795     __kmp_str_free(&buff);
796   }
797 #endif
798   /* setup data */
799   th = __kmp_threads[gtid];
800   team = th->th.th_team;
801   active = !team->t.t_serialized;
802   th->th.th_ident = loc;
803 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
807   if (schedule == __kmp_static) {
808     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
809   } else {
810     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
811   }
812 
813 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
817   bool ordered;
818   enum sched_type my_sched = schedule;
819   my_buffer_index = th->th.th_dispatch->th_disp_index;
820   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821       &th->th.th_dispatch
822            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
825     my_sched =
826         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827   ordered = (kmp_ord_lower & my_sched);
828   if (pr->flags.use_hier) {
829     if (ordered) {
830       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
831                      "Disabling hierarchical scheduling.\n",
832                      gtid));
833       pr->flags.use_hier = FALSE;
834     }
835   }
836   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837     // Don't use hierarchical for ordered parallel loops and don't
838     // use the runtime hierarchy if one was specified in the program
839     if (!ordered && !pr->flags.use_hier)
840       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
841   }
842 #endif // KMP_USE_HIER_SCHED
843 
844 #if USE_ITT_BUILD
845   kmp_uint64 cur_chunk = chunk;
846   int itt_need_metadata_reporting =
847       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849       team->t.t_active_level == 1;
850 #endif
851   if (!active) {
852     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
854   } else {
855     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
857 
858     my_buffer_index = th->th.th_dispatch->th_disp_index++;
859 
860     /* What happens when number of threads changes, need to resize buffer? */
861     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862         &th->th.th_dispatch
863              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
867                   my_buffer_index));
868   }
869 
870   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
871 #if USE_ITT_BUILD
872                                 &cur_chunk,
873 #endif
874                                 chunk, (T)th->th.th_team_nproc,
875                                 (T)th->th.th_info.ds.ds_tid);
876   if (active) {
877     if (pr->flags.ordered == 0) {
878       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
880     } else {
881       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
883     }
884   }
885 
886   if (active) {
    /* This buffer is free to use once sh->buffer_index reaches
     * my_buffer_index */
889 
890     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891                    "sh->buffer_index:%d\n",
892                    gtid, my_buffer_index, sh->buffer_index));
893     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
897     KMP_MB(); /* is this necessary? */
898     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899                    "sh->buffer_index:%d\n",
900                    gtid, my_buffer_index, sh->buffer_index));
901 
902     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903     th->th.th_dispatch->th_dispatch_sh_current =
904         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
905 #if USE_ITT_BUILD
906     if (pr->flags.ordered) {
907       __kmp_itt_ordered_init(gtid);
908     }
909     // Report loop metadata
910     if (itt_need_metadata_reporting) {
911       // Only report metadata by master of active team at level 1
912       kmp_uint64 schedtype = 0;
913       switch (schedule) {
914       case kmp_sch_static_chunked:
915       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
916         break;
917       case kmp_sch_static_greedy:
918         cur_chunk = pr->u.p.parm1;
919         break;
920       case kmp_sch_dynamic_chunked:
921         schedtype = 1;
922         break;
923       case kmp_sch_guided_iterative_chunked:
924       case kmp_sch_guided_analytical_chunked:
925       case kmp_sch_guided_simd:
926         schedtype = 2;
927         break;
928       default:
929         // Should we put this case under "static"?
930         // case kmp_sch_static_steal:
931         schedtype = 3;
932         break;
933       }
934       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
935     }
936 #if KMP_USE_HIER_SCHED
937     if (pr->flags.use_hier) {
938       pr->u.p.count = 0;
939       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
940     }
#endif // KMP_USE_HIER_SCHED
942 #endif /* USE_ITT_BUILD */
943   }
944 
945 #ifdef KMP_DEBUG
946   {
947     char *buff;
948     // create format specifiers before the debug output
949     buff = __kmp_str_format(
950         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
951         "lb:%%%s ub:%%%s"
952         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962     __kmp_str_free(&buff);
963   }
964 #endif
965 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, there would still be a bad case, such as
  // toggling between 0 and 1 rather than a program-lifetime increment. So a
  // dedicated variable is required; 'static_steal_counter' is used.
971   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This flag indicates that other threads may steal from this thread
    // from now on.
975     volatile T *p = &pr->u.p.static_steal_counter;
976     *p = *p + 1;
977   }
978 #endif // ( KMP_STATIC_STEAL_ENABLED )
979 
980 #if OMPT_SUPPORT && OMPT_OPTIONAL
981   if (ompt_enabled.ompt_callback_work) {
982     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984     ompt_callbacks.ompt_callback(ompt_callback_work)(
985         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
987   }
988 #endif
989   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
990 }
991 
992 /* For ordered loops, either __kmp_dispatch_finish() should be called after
993  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994  * every chunk of iterations.  If the ordered section(s) were not executed
995  * for this iteration (or every iteration in this chunk), we need to set the
996  * ordered iteration counters so that the next thread can proceed. */
997 template <typename UT>
998 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999   typedef typename traits_t<UT>::signed_t ST;
1000   kmp_info_t *th = __kmp_threads[gtid];
1001 
1002   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003   if (!th->th.th_team->t.t_serialized) {
1004 
1005     dispatch_private_info_template<UT> *pr =
1006         reinterpret_cast<dispatch_private_info_template<UT> *>(
1007             th->th.th_dispatch->th_dispatch_pr_current);
1008     dispatch_shared_info_template<UT> volatile *sh =
1009         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010             th->th.th_dispatch->th_dispatch_sh_current);
1011     KMP_DEBUG_ASSERT(pr);
1012     KMP_DEBUG_ASSERT(sh);
1013     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1015 
1016     if (pr->ordered_bumped) {
1017       KD_TRACE(
1018           1000,
1019           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1020            gtid));
1021       pr->ordered_bumped = 0;
1022     } else {
1023       UT lower = pr->u.p.ordered_lower;
1024 
1025 #ifdef KMP_DEBUG
1026       {
1027         char *buff;
1028         // create format specifiers before the debug output
1029         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030                                 "ordered_iteration:%%%s lower:%%%s\n",
1031                                 traits_t<UT>::spec, traits_t<UT>::spec);
1032         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033         __kmp_str_free(&buff);
1034       }
1035 #endif
1036 
1037       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1039       KMP_MB(); /* is this necessary? */
1040 #ifdef KMP_DEBUG
1041       {
1042         char *buff;
1043         // create format specifiers before the debug output
1044         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045                                 "ordered_iteration:%%%s lower:%%%s\n",
1046                                 traits_t<UT>::spec, traits_t<UT>::spec);
1047         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048         __kmp_str_free(&buff);
1049       }
1050 #endif
1051 
1052       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1053     } // if
1054   } // if
1055   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1056 }
1057 
1058 #ifdef KMP_GOMP_COMPAT
1059 
1060 template <typename UT>
1061 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062   typedef typename traits_t<UT>::signed_t ST;
1063   kmp_info_t *th = __kmp_threads[gtid];
1064 
1065   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066   if (!th->th.th_team->t.t_serialized) {
1067     //        int cid;
1068     dispatch_private_info_template<UT> *pr =
1069         reinterpret_cast<dispatch_private_info_template<UT> *>(
1070             th->th.th_dispatch->th_dispatch_pr_current);
1071     dispatch_shared_info_template<UT> volatile *sh =
1072         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073             th->th.th_dispatch->th_dispatch_sh_current);
1074     KMP_DEBUG_ASSERT(pr);
1075     KMP_DEBUG_ASSERT(sh);
1076     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1078 
1079     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080     UT lower = pr->u.p.ordered_lower;
1081     UT upper = pr->u.p.ordered_upper;
1082     UT inc = upper - lower + 1;
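    // inc is the number of ordered iterations in this chunk; any that were
    // not bumped individually are added to the shared counter below.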
1083 
1084     if (pr->ordered_bumped == inc) {
1085       KD_TRACE(
1086           1000,
1087           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1088            gtid));
1089       pr->ordered_bumped = 0;
1090     } else {
1091       inc -= pr->ordered_bumped;
1092 
1093 #ifdef KMP_DEBUG
1094       {
1095         char *buff;
1096         // create format specifiers before the debug output
1097         buff = __kmp_str_format(
1098             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102         __kmp_str_free(&buff);
1103       }
1104 #endif
1105 
1106       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1108 
1109       KMP_MB(); /* is this necessary? */
1110       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111                       "ordered_bumped to zero\n",
1112                       gtid));
1113       pr->ordered_bumped = 0;
1114 //!!!!! TODO check if the inc should be unsigned, or signed???
1115 #ifdef KMP_DEBUG
1116       {
1117         char *buff;
1118         // create format specifiers before the debug output
1119         buff = __kmp_str_format(
1120             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123             traits_t<UT>::spec);
1124         KD_TRACE(1000,
1125                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126         __kmp_str_free(&buff);
1127       }
1128 #endif
1129 
1130       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1131     }
1132     //        }
1133   }
1134   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1135 }
1136 
1137 #endif /* KMP_GOMP_COMPAT */
1138 
1139 template <typename T>
1140 int __kmp_dispatch_next_algorithm(int gtid,
1141                                   dispatch_private_info_template<T> *pr,
1142                                   dispatch_shared_info_template<T> volatile *sh,
1143                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1144                                   typename traits_t<T>::signed_t *p_st, T nproc,
1145                                   T tid) {
1146   typedef typename traits_t<T>::unsigned_t UT;
1147   typedef typename traits_t<T>::signed_t ST;
1148   typedef typename traits_t<T>::floating_t DBL;
1149   int status = 0;
1150   kmp_int32 last = 0;
1151   T start;
1152   ST incr;
1153   UT limit, trip, init;
1154   kmp_info_t *th = __kmp_threads[gtid];
1155   kmp_team_t *team = th->th.th_team;
1156 
1157   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159   KMP_DEBUG_ASSERT(pr);
1160   KMP_DEBUG_ASSERT(sh);
1161   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1162 #ifdef KMP_DEBUG
1163   {
1164     char *buff;
1165     // create format specifiers before the debug output
1166     buff =
1167         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168                          "sh:%%p nproc:%%%s tid:%%%s\n",
1169                          traits_t<T>::spec, traits_t<T>::spec);
1170     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171     __kmp_str_free(&buff);
1172   }
1173 #endif
1174 
1175   // zero trip count
1176   if (pr->u.p.tc == 0) {
1177     KD_TRACE(10,
1178              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1179               "zero status:%d\n",
1180               gtid, status));
1181     return 0;
1182   }
1183 
1184   switch (pr->schedule) {
1185 #if (KMP_STATIC_STEAL_ENABLED)
1186   case kmp_sch_static_steal: {
1187     T chunk = pr->u.p.parm1;
1188 
1189     KD_TRACE(100,
1190              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1191               gtid));
1192 
1193     trip = pr->u.p.tc - 1;
1194 
1195     if (traits_t<T>::type_size > 4) {
1196       // use lock for 8-byte and CAS for 4-byte induction
1197       // variable. TODO (optional): check and use 16-byte CAS
1198       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1199       KMP_DEBUG_ASSERT(lck != NULL);
1200       if (pr->u.p.count < (UT)pr->u.p.ub) {
1201         __kmp_acquire_lock(lck, gtid);
1202         // try to get own chunk of iterations
1203         init = (pr->u.p.count)++;
1204         status = (init < (UT)pr->u.p.ub);
1205         __kmp_release_lock(lck, gtid);
1206       } else {
1207         status = 0; // no own chunks
1208       }
1209       if (!status) { // try to steal
1210         kmp_info_t **other_threads = team->t.t_threads;
1211         int while_limit = pr->u.p.parm3;
1212         int while_index = 0;
        // TODO: the victim-search algorithm should be cleaned up and measured
1215         while ((!status) && (while_limit != ++while_index)) {
1216           T remaining;
1217           T victimIdx = pr->u.p.parm4;
1218           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1219           dispatch_private_info_template<T> *victim =
1220               reinterpret_cast<dispatch_private_info_template<T> *>(
1221                   other_threads[victimIdx]
1222                       ->th.th_dispatch->th_dispatch_pr_current);
1223           while ((victim == NULL || victim == pr ||
1224                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1225                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1226                  oldVictimIdx != victimIdx) {
1227             victimIdx = (victimIdx + 1) % nproc;
1228             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1229                 other_threads[victimIdx]
1230                     ->th.th_dispatch->th_dispatch_pr_current);
1231           }
1232           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1233                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1234             continue; // try once more (nproc attempts in total)
1235             // no victim is ready yet to participate in stealing
1236             // because all victims are still in kmp_init_dispatch
1237           }
1238           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1239             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1240             continue; // not enough chunks to steal, goto next victim
1241           }
1242 
1243           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1244           KMP_ASSERT(lck != NULL);
1245           __kmp_acquire_lock(lck, gtid);
1246           limit = victim->u.p.ub; // keep initial ub
1247           if (victim->u.p.count >= limit ||
1248               (remaining = limit - victim->u.p.count) < 2) {
1249             __kmp_release_lock(lck, gtid);
1250             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1251             continue; // not enough chunks to steal
1252           }
          // stealing succeeded; reduce victim's ub by 1/4 of the undone
          // chunks, or by 1
1255           if (remaining > 3) {
1256             // steal 1/4 of remaining
1257             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1258             init = (victim->u.p.ub -= (remaining >> 2));
1259           } else {
1260             // steal 1 chunk of 2 or 3 remaining
1261             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1262             init = (victim->u.p.ub -= 1);
1263           }
1264           __kmp_release_lock(lck, gtid);
1265 
1266           KMP_DEBUG_ASSERT(init + 1 <= limit);
1267           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1268           status = 1;
1269           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk
1271           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1272           pr->u.p.count = init + 1;
1273           pr->u.p.ub = limit;
1274           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1275         } // while (search for victim)
1276       } // if (try to find victim and steal)
1277     } else {
1278       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1279       typedef union {
1280         struct {
1281           UT count;
1282           T ub;
1283         } p;
1284         kmp_int64 b;
1285       } union_i4;
      // All updates to 'count' and 'ub' must be performed together
      // atomically.
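      // E.g., for a 4-byte T the (count, ub) pair occupies a single 64-bit
      // word, so one 8-byte CAS can bump count and/or shrink ub without a
      // lock.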
1288       {
1289         union_i4 vold, vnew;
1290         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1291         vnew = vold;
1292         vnew.p.count++;
1293         while (!KMP_COMPARE_AND_STORE_ACQ64(
1294             (volatile kmp_int64 *)&pr->u.p.count,
1295             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1296             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1297           KMP_CPU_PAUSE();
1298           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1299           vnew = vold;
1300           vnew.p.count++;
1301         }
1302         vnew = vold;
1303         init = vnew.p.count;
1304         status = (init < (UT)vnew.p.ub);
1305       }
1306 
1307       if (!status) {
1308         kmp_info_t **other_threads = team->t.t_threads;
1309         int while_limit = pr->u.p.parm3;
1310         int while_index = 0;
1311 
        // TODO: the victim-search algorithm should be cleaned up and measured
1314         while ((!status) && (while_limit != ++while_index)) {
1315           union_i4 vold, vnew;
1316           kmp_int32 remaining;
1317           T victimIdx = pr->u.p.parm4;
1318           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1319           dispatch_private_info_template<T> *victim =
1320               reinterpret_cast<dispatch_private_info_template<T> *>(
1321                   other_threads[victimIdx]
1322                       ->th.th_dispatch->th_dispatch_pr_current);
1323           while ((victim == NULL || victim == pr ||
1324                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1325                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1326                  oldVictimIdx != victimIdx) {
1327             victimIdx = (victimIdx + 1) % nproc;
1328             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1329                 other_threads[victimIdx]
1330                     ->th.th_dispatch->th_dispatch_pr_current);
1331           }
1332           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1333                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1334             continue; // try once more (nproc attempts in total)
1335             // no victim is ready yet to participate in stealing
1336             // because all victims are still in kmp_init_dispatch
1337           }
1338           pr->u.p.parm4 = victimIdx; // new victim found
1339           while (1) { // CAS loop if victim has enough chunks to steal
1340             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1341             vnew = vold;
1342 
1343             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1344             if (vnew.p.count >= (UT)vnew.p.ub ||
1345                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1346               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1347               break; // not enough chunks to steal, goto next victim
1348             }
1349             if (remaining > 3) {
1350               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1351             } else {
1352               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1353             }
1354             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1355             // TODO: Should this be acquire or release?
1356             if (KMP_COMPARE_AND_STORE_ACQ64(
1357                     (volatile kmp_int64 *)&victim->u.p.count,
1358                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1359                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1361               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1362                                         vold.p.ub - vnew.p.ub);
1363               status = 1;
1364               while_index = 0;
1365               // now update own count and ub
1366               init = vnew.p.ub;
1367               vold.p.count = init + 1;
1368 #if KMP_ARCH_X86
1369               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1370 #else
1371               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1372 #endif
1373               break;
1374             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1376           } // while (try to steal from particular victim)
1377         } // while (search for victim)
1378       } // if (try to find victim and steal)
1379     } // if (4-byte induction variable)
1380     if (!status) {
1381       *p_lb = 0;
1382       *p_ub = 0;
1383       if (p_st != NULL)
1384         *p_st = 0;
1385     } else {
1386       start = pr->u.p.parm2;
1387       init *= chunk;
1388       limit = chunk + init - 1;
1389       incr = pr->u.p.st;
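      // E.g., chunk=10, st=1 and owned/stolen chunk index init=3: this hands
      // out logical iterations 30..39, which are mapped onto the user's
      // bounds below via start (the original lb saved in parm2) and incr.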
1390       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1391 
1392       KMP_DEBUG_ASSERT(init <= trip);
1393       if ((last = (limit >= trip)) != 0)
1394         limit = trip;
1395       if (p_st != NULL)
1396         *p_st = incr;
1397 
1398       if (incr == 1) {
1399         *p_lb = start + init;
1400         *p_ub = start + limit;
1401       } else {
1402         *p_lb = start + init * incr;
1403         *p_ub = start + limit * incr;
1404       }
1405 
1406       if (pr->flags.ordered) {
1407         pr->u.p.ordered_lower = init;
1408         pr->u.p.ordered_upper = limit;
1409       } // if
1410     } // if
1411     break;
1412   } // case
1413 #endif // ( KMP_STATIC_STEAL_ENABLED )
1414   case kmp_sch_static_balanced: {
1415     KD_TRACE(
1416         10,
1417         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1418          gtid));
1419     /* check if thread has any iteration to do */
1420     if ((status = !pr->u.p.count) != 0) {
1421       pr->u.p.count = 1;
1422       *p_lb = pr->u.p.lb;
1423       *p_ub = pr->u.p.ub;
1424       last = pr->u.p.parm1;
1425       if (p_st != NULL)
1426         *p_st = pr->u.p.st;
1427     } else { /* no iterations to do */
1428       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1429     }
1430   } // case
1431   break;
1432   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1433                                  merged here */
1434   case kmp_sch_static_chunked: {
1435     T parm1;
1436 
1437     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1438                    "kmp_sch_static_[affinity|chunked] case\n",
1439                    gtid));
1440     parm1 = pr->u.p.parm1;
1441 
1442     trip = pr->u.p.tc - 1;
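    // chunks of parm1 iterations are dealt out round-robin: thread tid takes
    // chunk indices tid, tid + nproc, tid + 2 * nproc, ... (count advances by
    // nproc per call below)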
1443     init = parm1 * (pr->u.p.count + tid);
1444 
1445     if ((status = (init <= trip)) != 0) {
1446       start = pr->u.p.lb;
1447       incr = pr->u.p.st;
1448       limit = parm1 + init - 1;
1449 
1450       if ((last = (limit >= trip)) != 0)
1451         limit = trip;
1452 
1453       if (p_st != NULL)
1454         *p_st = incr;
1455 
1456       pr->u.p.count += nproc;
1457 
1458       if (incr == 1) {
1459         *p_lb = start + init;
1460         *p_ub = start + limit;
1461       } else {
1462         *p_lb = start + init * incr;
1463         *p_ub = start + limit * incr;
1464       }
1465 
1466       if (pr->flags.ordered) {
1467         pr->u.p.ordered_lower = init;
1468         pr->u.p.ordered_upper = limit;
1469       } // if
1470     } // if
1471   } // case
1472   break;
1473 
1474   case kmp_sch_dynamic_chunked: {
1475     T chunk = pr->u.p.parm1;
1476 
1477     KD_TRACE(
1478         100,
1479         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1480          gtid));
1481 
1482     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
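    // sh->u.s.iteration is a shared chunk counter: the atomic
    // fetch-and-increment above claims the next chunk index for this thread,
    // which is converted into an iteration range below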
1483     trip = pr->u.p.tc - 1;
1484 
1485     if ((status = (init <= trip)) == 0) {
1486       *p_lb = 0;
1487       *p_ub = 0;
1488       if (p_st != NULL)
1489         *p_st = 0;
1490     } else {
1491       start = pr->u.p.lb;
1492       limit = chunk + init - 1;
1493       incr = pr->u.p.st;
1494 
1495       if ((last = (limit >= trip)) != 0)
1496         limit = trip;
1497 
1498       if (p_st != NULL)
1499         *p_st = incr;
1500 
1501       if (incr == 1) {
1502         *p_lb = start + init;
1503         *p_ub = start + limit;
1504       } else {
1505         *p_lb = start + init * incr;
1506         *p_ub = start + limit * incr;
1507       }
1508 
1509       if (pr->flags.ordered) {
1510         pr->u.p.ordered_lower = init;
1511         pr->u.p.ordered_upper = limit;
1512       } // if
1513     } // if
1514   } // case
1515   break;
1516 
1517   case kmp_sch_guided_iterative_chunked: {
1518     T chunkspec = pr->u.p.parm1;
1519     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1520                    "iterative case\n",
1521                    gtid));
1522     trip = pr->u.p.tc;
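    // Two regimes: while plenty of work remains, carve off roughly
    // remaining/(K*nproc) iterations via CAS (parm3 caches 1/(K*nproc) as a
    // double); once remaining drops below parm2 (~ K*nproc*(chunk+1), K=2 by
    // default), fall back to plain dynamic chunks of chunkspec iterations.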
1523     // Start atomic part of calculations
1524     while (1) {
1525       ST remaining; // signed, because can be < 0
1526       init = sh->u.s.iteration; // shared value
1527       remaining = trip - init;
1528       if (remaining <= 0) { // AC: need to compare with 0 first
1529         // nothing to do, don't try atomic op
1530         status = 0;
1531         break;
1532       }
1533       if ((T)remaining <
1534           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
1536         // atomically increment iterations, get old value
1537         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1538                                  (ST)chunkspec);
1539         remaining = trip - init;
1540         if (remaining <= 0) {
1541           status = 0; // all iterations got by other threads
1542         } else {
1543           // got some iterations to work on
1544           status = 1;
1545           if ((T)remaining > chunkspec) {
1546             limit = init + chunkspec - 1;
1547           } else {
1548             last = 1; // the last chunk
1549             limit = init + remaining - 1;
1550           } // if
1551         } // if
1552         break;
1553       } // if
1554       limit = init +
1555               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1556       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1557                                (ST)init, (ST)limit)) {
1558         // CAS was successful, chunk obtained
1559         status = 1;
1560         --limit;
1561         break;
1562       } // if
1563     } // while
1564     if (status != 0) {
1565       start = pr->u.p.lb;
1566       incr = pr->u.p.st;
1567       if (p_st != NULL)
1568         *p_st = incr;
1569       *p_lb = start + init * incr;
1570       *p_ub = start + limit * incr;
1571       if (pr->flags.ordered) {
1572         pr->u.p.ordered_lower = init;
1573         pr->u.p.ordered_upper = limit;
1574       } // if
1575     } else {
1576       *p_lb = 0;
1577       *p_ub = 0;
1578       if (p_st != NULL)
1579         *p_st = 0;
1580     } // if
1581   } // case
1582   break;
1583 
1584   case kmp_sch_guided_simd: {
    // same as the iterative case, but the current chunk is adjusted to be a
    // multiple of the given chunk
1587     T chunk = pr->u.p.parm1;
1588     KD_TRACE(100,
1589              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1590               gtid));
1591     trip = pr->u.p.tc;
1592     // Start atomic part of calculations
1593     while (1) {
1594       ST remaining; // signed, because can be < 0
1595       init = sh->u.s.iteration; // shared value
1596       remaining = trip - init;
1597       if (remaining <= 0) { // AC: need to compare with 0 first
1598         status = 0; // nothing to do, don't try atomic op
1599         break;
1600       }
1601       KMP_DEBUG_ASSERT(init % chunk == 0);
1602       // compare with K*nproc*(chunk+1), K=2 by default
1603       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
1605         // atomically increment iterations, get old value
1606         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1607                                  (ST)chunk);
1608         remaining = trip - init;
1609         if (remaining <= 0) {
1610           status = 0; // all iterations got by other threads
1611         } else {
1612           // got some iterations to work on
1613           status = 1;
1614           if ((T)remaining > chunk) {
1615             limit = init + chunk - 1;
1616           } else {
1617             last = 1; // the last chunk
1618             limit = init + remaining - 1;
1619           } // if
1620         } // if
1621         break;
1622       } // if
1623       // divide by K*nproc
1624       UT span = remaining * (*(double *)&pr->u.p.parm3);
1625       UT rem = span % chunk;
1626       if (rem) // adjust so that span%chunk == 0
1627         span += chunk - rem;
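      // e.g. (illustrative) span = 13 with chunk = 8: rem = 5, so span is
      // rounded up to 16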
1628       limit = init + span;
1629       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1630                                (ST)init, (ST)limit)) {
1631         // CAS was successful, chunk obtained
1632         status = 1;
1633         --limit;
1634         break;
1635       } // if
1636     } // while
1637     if (status != 0) {
1638       start = pr->u.p.lb;
1639       incr = pr->u.p.st;
1640       if (p_st != NULL)
1641         *p_st = incr;
1642       *p_lb = start + init * incr;
1643       *p_ub = start + limit * incr;
1644       if (pr->flags.ordered) {
1645         pr->u.p.ordered_lower = init;
1646         pr->u.p.ordered_upper = limit;
1647       } // if
1648     } else {
1649       *p_lb = 0;
1650       *p_ub = 0;
1651       if (p_st != NULL)
1652         *p_st = 0;
1653     } // if
1654   } // case
1655   break;
1656 
1657   case kmp_sch_guided_analytical_chunked: {
1658     T chunkspec = pr->u.p.parm1;
1659     UT chunkIdx;
1660 #if KMP_USE_X87CONTROL
1661     /* for storing original FPCW value for Windows* OS on
1662        IA-32 architecture 8-byte version */
1663     unsigned int oldFpcw;
1664     unsigned int fpcwSet = 0;
1665 #endif
1666     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1667                    "kmp_sch_guided_analytical_chunked case\n",
1668                    gtid));
1669 
1670     trip = pr->u.p.tc;
1671 
1672     KMP_DEBUG_ASSERT(nproc > 1);
1673     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
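    // parm2 is the cross-over chunk index computed at init time: chunk
    // indices below it get analytically computed, exponentially shrinking
    // chunks via __kmp_dispatch_guided_remaining(); from parm2 onward the
    // schedule degenerates to dynamic-style chunks of chunkspec iterations.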
1674 
1675     while (1) { /* this while loop is a safeguard against unexpected zero
1676                    chunk sizes */
1677       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1678       if (chunkIdx >= (UT)pr->u.p.parm2) {
1679         --trip;
1680         /* use dynamic-style scheduling */
1681         init = chunkIdx * chunkspec + pr->u.p.count;
1682         /* need to verify init > 0 in case of overflow in the above
1683          * calculation */
1684         if ((status = (init > 0 && init <= trip)) != 0) {
1685           limit = init + chunkspec - 1;
1686 
1687           if ((last = (limit >= trip)) != 0)
1688             limit = trip;
1689         }
1690         break;
1691       } else {
/* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise cause init != 0 for chunkIdx == 0. */
1697 #if KMP_USE_X87CONTROL
1698         /* If we haven't already done so, save original
1699            FPCW and set precision to 64-bit, as Windows* OS
1700            on IA-32 architecture defaults to 53-bit */
1701         if (!fpcwSet) {
1702           oldFpcw = _control87(0, 0);
1703           _control87(_PC_64, _MCW_PC);
1704           fpcwSet = 0x30000;
1705         }
1706 #endif
1707         if (chunkIdx) {
1708           init = __kmp_dispatch_guided_remaining<T>(
1709               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1710           KMP_DEBUG_ASSERT(init);
1711           init = trip - init;
1712         } else
1713           init = 0;
1714         limit = trip - __kmp_dispatch_guided_remaining<T>(
1715                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1716         KMP_ASSERT(init <= limit);
1717         if (init < limit) {
1718           KMP_DEBUG_ASSERT(limit <= trip);
1719           --limit;
1720           status = 1;
1721           break;
1722         } // if
1723       } // if
1724     } // while (1)
1725 #if KMP_USE_X87CONTROL
1726     /* restore FPCW if necessary
1727        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1728     */
1729     if (fpcwSet && (oldFpcw & fpcwSet))
1730       _control87(oldFpcw, _MCW_PC);
1731 #endif
1732     if (status != 0) {
1733       start = pr->u.p.lb;
1734       incr = pr->u.p.st;
1735       if (p_st != NULL)
1736         *p_st = incr;
1737       *p_lb = start + init * incr;
1738       *p_ub = start + limit * incr;
1739       if (pr->flags.ordered) {
1740         pr->u.p.ordered_lower = init;
1741         pr->u.p.ordered_upper = limit;
1742       }
1743     } else {
1744       *p_lb = 0;
1745       *p_ub = 0;
1746       if (p_st != NULL)
1747         *p_st = 0;
1748     }
1749   } // case
1750   break;
1751 
1752   case kmp_sch_trapezoidal: {
1753     UT index;
1754     T parm2 = pr->u.p.parm2;
1755     T parm3 = pr->u.p.parm3;
1756     T parm4 = pr->u.p.parm4;
1757     KD_TRACE(100,
1758              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1759               gtid));
1760 
1761     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1762 
1763     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
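    // init is the prefix sum of the sizes of the chunks already handed out;
    // chunk k has parm2 - k * parm4 iterations.  E.g. (illustrative)
    // parm2 = 10, parm4 = 2, index = 3: chunks of 10, 8 and 6 iterations were
    // already handed out, so init = 24 and this chunk covers iterations 24..27.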
1764     trip = pr->u.p.tc - 1;
1765 
1766     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1767       *p_lb = 0;
1768       *p_ub = 0;
1769       if (p_st != NULL)
1770         *p_st = 0;
1771     } else {
1772       start = pr->u.p.lb;
1773       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1774       incr = pr->u.p.st;
1775 
1776       if ((last = (limit >= trip)) != 0)
1777         limit = trip;
1778 
1779       if (p_st != NULL)
1780         *p_st = incr;
1781 
1782       if (incr == 1) {
1783         *p_lb = start + init;
1784         *p_ub = start + limit;
1785       } else {
1786         *p_lb = start + init * incr;
1787         *p_ub = start + limit * incr;
1788       }
1789 
1790       if (pr->flags.ordered) {
1791         pr->u.p.ordered_lower = init;
1792         pr->u.p.ordered_upper = limit;
1793       } // if
1794     } // if
1795   } // case
1796   break;
1797   default: {
1798     status = 0; // to avoid complaints on uninitialized variable use
1799     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1800                 KMP_HNT(GetNewerLibrary), // Hint
1801                 __kmp_msg_null // Variadic argument list terminator
1802                 );
1803   } break;
1804   } // switch
1805   if (p_last)
1806     *p_last = last;
1807 #ifdef KMP_DEBUG
1808   if (pr->flags.ordered) {
1809     char *buff;
1810     // create format specifiers before the debug output
1811     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1812                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1813                             traits_t<UT>::spec, traits_t<UT>::spec);
1814     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1815     __kmp_str_free(&buff);
1816   }
1817   {
1818     char *buff;
1819     // create format specifiers before the debug output
1820     buff = __kmp_str_format(
1821         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1822         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1823         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1824     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1825     __kmp_str_free(&buff);
1826   }
1827 #endif
1828   return status;
1829 }
1830 
1831 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1832    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1833    is not called. */
1834 #if OMPT_SUPPORT && OMPT_OPTIONAL
1835 #define OMPT_LOOP_END                                                          \
1836   if (status == 0) {                                                           \
1837     if (ompt_enabled.ompt_callback_work) {                                     \
1838       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1839       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1840       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1841           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1842           &(task_info->task_data), 0, codeptr);                                \
1843     }                                                                          \
1844   }
1845 // TODO: implement count
1846 #else
1847 #define OMPT_LOOP_END // no-op
1848 #endif
1849 
1850 #if KMP_STATS_ENABLED
1851 #define KMP_STATS_LOOP_END                                                     \
1852   {                                                                            \
1853     kmp_int64 u, l, t, i;                                                      \
1854     l = (kmp_int64)(*p_lb);                                                    \
1855     u = (kmp_int64)(*p_ub);                                                    \
1856     i = (kmp_int64)(pr->u.p.st);                                               \
1857     if (status == 0) {                                                         \
1858       t = 0;                                                                   \
1859       KMP_POP_PARTITIONED_TIMER();                                             \
1860     } else if (i == 1) {                                                       \
1861       if (u >= l)                                                              \
1862         t = u - l + 1;                                                         \
1863       else                                                                     \
1864         t = 0;                                                                 \
1865     } else if (i < 0) {                                                        \
1866       if (l >= u)                                                              \
1867         t = (l - u) / (-i) + 1;                                                \
1868       else                                                                     \
1869         t = 0;                                                                 \
1870     } else {                                                                   \
1871       if (u >= l)                                                              \
1872         t = (u - l) / i + 1;                                                   \
1873       else                                                                     \
1874         t = 0;                                                                 \
1875     }                                                                          \
1876     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1877   }
1878 #else
1879 #define KMP_STATS_LOOP_END /* Nothing */
1880 #endif
1881 
1882 template <typename T>
1883 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1884                                T *p_lb, T *p_ub,
1885                                typename traits_t<T>::signed_t *p_st
1886 #if OMPT_SUPPORT && OMPT_OPTIONAL
1887                                ,
1888                                void *codeptr
1889 #endif
1890                                ) {
1891 
1892   typedef typename traits_t<T>::unsigned_t UT;
1893   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used,
  // it costs more than a compile-time choice to use static scheduling would.)
1898   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1899 
1900   int status;
1901   dispatch_private_info_template<T> *pr;
1902   kmp_info_t *th = __kmp_threads[gtid];
1903   kmp_team_t *team = th->th.th_team;
1904 
1905   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1906   KD_TRACE(
1907       1000,
1908       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1909        gtid, p_lb, p_ub, p_st, p_last));
1910 
1911   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1913     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1914         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1915     KMP_DEBUG_ASSERT(pr);
1916 
1917     if ((status = (pr->u.p.tc != 0)) == 0) {
1918       *p_lb = 0;
1919       *p_ub = 0;
1920       //            if ( p_last != NULL )
1921       //                *p_last = 0;
1922       if (p_st != NULL)
1923         *p_st = 0;
1924       if (__kmp_env_consistency_check) {
1925         if (pr->pushed_ws != ct_none) {
1926           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1927         }
1928       }
1929     } else if (pr->flags.nomerge) {
1930       kmp_int32 last;
1931       T start;
1932       UT limit, trip, init;
1933       ST incr;
1934       T chunk = pr->u.p.parm1;
1935 
1936       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1937                      gtid));
1938 
1939       init = chunk * pr->u.p.count++;
1940       trip = pr->u.p.tc - 1;
1941 
1942       if ((status = (init <= trip)) == 0) {
1943         *p_lb = 0;
1944         *p_ub = 0;
1945         //                if ( p_last != NULL )
1946         //                    *p_last = 0;
1947         if (p_st != NULL)
1948           *p_st = 0;
1949         if (__kmp_env_consistency_check) {
1950           if (pr->pushed_ws != ct_none) {
1951             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1952           }
1953         }
1954       } else {
1955         start = pr->u.p.lb;
1956         limit = chunk + init - 1;
1957         incr = pr->u.p.st;
1958 
1959         if ((last = (limit >= trip)) != 0) {
1960           limit = trip;
1961 #if KMP_OS_WINDOWS
1962           pr->u.p.last_upper = pr->u.p.ub;
1963 #endif /* KMP_OS_WINDOWS */
1964         }
1965         if (p_last != NULL)
1966           *p_last = last;
1967         if (p_st != NULL)
1968           *p_st = incr;
1969         if (incr == 1) {
1970           *p_lb = start + init;
1971           *p_ub = start + limit;
1972         } else {
1973           *p_lb = start + init * incr;
1974           *p_ub = start + limit * incr;
1975         }
1976 
1977         if (pr->flags.ordered) {
1978           pr->u.p.ordered_lower = init;
1979           pr->u.p.ordered_upper = limit;
1980 #ifdef KMP_DEBUG
1981           {
1982             char *buff;
1983             // create format specifiers before the debug output
1984             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1985                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1986                                     traits_t<UT>::spec, traits_t<UT>::spec);
1987             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1988                             pr->u.p.ordered_upper));
1989             __kmp_str_free(&buff);
1990           }
1991 #endif
1992         } // if
1993       } // if
1994     } else {
1995       pr->u.p.tc = 0;
1996       *p_lb = pr->u.p.lb;
1997       *p_ub = pr->u.p.ub;
1998 #if KMP_OS_WINDOWS
1999       pr->u.p.last_upper = *p_ub;
2000 #endif /* KMP_OS_WINDOWS */
2001       if (p_last != NULL)
2002         *p_last = TRUE;
2003       if (p_st != NULL)
2004         *p_st = pr->u.p.st;
2005     } // if
2006 #ifdef KMP_DEBUG
2007     {
2008       char *buff;
2009       // create format specifiers before the debug output
2010       buff = __kmp_str_format(
2011           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2012           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2013           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2014       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2015       __kmp_str_free(&buff);
2016     }
2017 #endif
2018 #if INCLUDE_SSC_MARKS
2019     SSC_MARK_DISPATCH_NEXT();
2020 #endif
2021     OMPT_LOOP_END;
2022     KMP_STATS_LOOP_END;
2023     return status;
2024   } else {
2025     kmp_int32 last = 0;
2026     dispatch_shared_info_template<T> volatile *sh;
2027 
2028     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2029                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2030 
2031     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2032         th->th.th_dispatch->th_dispatch_pr_current);
2033     KMP_DEBUG_ASSERT(pr);
2034     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2035         th->th.th_dispatch->th_dispatch_sh_current);
2036     KMP_DEBUG_ASSERT(sh);
2037 
2038 #if KMP_USE_HIER_SCHED
2039     if (pr->flags.use_hier)
2040       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2041     else
2042 #endif // KMP_USE_HIER_SCHED
2043       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2044                                                 p_st, th->th.th_team_nproc,
2045                                                 th->th.th_info.ds.ds_tid);
2046     // status == 0: no more iterations to execute
2047     if (status == 0) {
2048       UT num_done;
2049 
2050       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2051 #ifdef KMP_DEBUG
2052       {
2053         char *buff;
2054         // create format specifiers before the debug output
2055         buff = __kmp_str_format(
2056             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2057             traits_t<UT>::spec);
2058         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2059         __kmp_str_free(&buff);
2060       }
2061 #endif
2062 
2063 #if KMP_USE_HIER_SCHED
2064       pr->flags.use_hier = FALSE;
2065 #endif
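      // test_then_inc returned the pre-increment value of num_done, so the
      // thread that observes nproc - 1 here is the last of the nproc threads
      // to finish this loop and recycles the shared dispatch buffer below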
2066       if ((ST)num_done == th->th.th_team_nproc - 1) {
2067 #if (KMP_STATIC_STEAL_ENABLED)
2068         if (pr->schedule == kmp_sch_static_steal &&
2069             traits_t<T>::type_size > 4) {
2070           int i;
2071           kmp_info_t **other_threads = team->t.t_threads;
2072           // loop complete, safe to destroy locks used for stealing
2073           for (i = 0; i < th->th.th_team_nproc; ++i) {
2074             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2075             KMP_ASSERT(lck != NULL);
2076             __kmp_destroy_lock(lck);
2077             __kmp_free(lck);
2078             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2079           }
2080         }
2081 #endif
2082         /* NOTE: release this buffer to be reused */
2083 
2084         KMP_MB(); /* Flush all pending memory write invalidates.  */
2085 
2086         sh->u.s.num_done = 0;
2087         sh->u.s.iteration = 0;
2088 
2089         /* TODO replace with general release procedure? */
2090         if (pr->flags.ordered) {
2091           sh->u.s.ordered_iteration = 0;
2092         }
2093 
2094         KMP_MB(); /* Flush all pending memory write invalidates.  */
2095 
2096         sh->buffer_index += __kmp_dispatch_num_buffers;
2097         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2098                        gtid, sh->buffer_index));
2099 
2100         KMP_MB(); /* Flush all pending memory write invalidates.  */
2101 
2102       } // if
2103       if (__kmp_env_consistency_check) {
2104         if (pr->pushed_ws != ct_none) {
2105           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2106         }
2107       }
2108 
2109       th->th.th_dispatch->th_deo_fcn = NULL;
2110       th->th.th_dispatch->th_dxo_fcn = NULL;
2111       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2112       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2113     } // if (status == 0)
2114 #if KMP_OS_WINDOWS
2115     else if (last) {
2116       pr->u.p.last_upper = pr->u.p.ub;
2117     }
2118 #endif /* KMP_OS_WINDOWS */
2119     if (p_last != NULL && status != 0)
2120       *p_last = last;
2121   } // if
2122 
2123 #ifdef KMP_DEBUG
2124   {
2125     char *buff;
2126     // create format specifiers before the debug output
2127     buff = __kmp_str_format(
2128         "__kmp_dispatch_next: T#%%d normal case: "
2129         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2130         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2131     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2132                   (p_last ? *p_last : 0), status));
2133     __kmp_str_free(&buff);
2134   }
2135 #endif
2136 #if INCLUDE_SSC_MARKS
2137   SSC_MARK_DISPATCH_NEXT();
2138 #endif
2139   OMPT_LOOP_END;
2140   KMP_STATS_LOOP_END;
2141   return status;
2142 }
2143 
2144 template <typename T>
2145 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2146                                   kmp_int32 *plastiter, T *plower, T *pupper,
2147                                   typename traits_t<T>::signed_t incr) {
2148   typedef typename traits_t<T>::unsigned_t UT;
2149   kmp_uint32 team_id;
2150   kmp_uint32 nteams;
2151   UT trip_count;
2152   kmp_team_t *team;
2153   kmp_info_t *th;
2154 
2155   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2156   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2157 #ifdef KMP_DEBUG
2158   typedef typename traits_t<T>::signed_t ST;
2159   {
2160     char *buff;
2161     // create format specifiers before the debug output
2162     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2163                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2164                             traits_t<T>::spec, traits_t<T>::spec,
2165                             traits_t<ST>::spec, traits_t<T>::spec);
2166     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2167     __kmp_str_free(&buff);
2168   }
2169 #endif
2170 
2171   if (__kmp_env_consistency_check) {
2172     if (incr == 0) {
2173       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2174                             loc);
2175     }
2176     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2177       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2179       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2180       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2181       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2182       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2183       // Compiler does not check the following illegal loops:
2184       //   for(i=0;i<10;i+=incr) // where incr<0
2185       //   for(i=10;i>0;i-=incr) // where incr<0
2186       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2187     }
2188   }
2189   th = __kmp_threads[gtid];
2190   team = th->th.th_team;
2191   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2192   nteams = th->th.th_teams_size.nteams;
2193   team_id = team->t.t_master_tid;
2194   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2195 
2196   // compute global trip count
2197   if (incr == 1) {
2198     trip_count = *pupper - *plower + 1;
2199   } else if (incr == -1) {
2200     trip_count = *plower - *pupper + 1;
2201   } else if (incr > 0) {
2202     // upper-lower can exceed the limit of signed type
2203     trip_count = (UT)(*pupper - *plower) / incr + 1;
2204   } else {
2205     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2206   }
2207 
2208   if (trip_count <= nteams) {
2209     KMP_DEBUG_ASSERT(
2210         __kmp_static == kmp_sch_static_greedy ||
2211         __kmp_static ==
2212             kmp_sch_static_balanced); // Unknown static scheduling type.
2213     // only some teams get single iteration, others get nothing
2214     if (team_id < trip_count) {
2215       *pupper = *plower = *plower + team_id * incr;
2216     } else {
2217       *plower = *pupper + incr; // zero-trip loop
2218     }
2219     if (plastiter != NULL)
2220       *plastiter = (team_id == trip_count - 1);
2221   } else {
2222     if (__kmp_static == kmp_sch_static_balanced) {
2223       UT chunk = trip_count / nteams;
2224       UT extras = trip_count % nteams;
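      // e.g. (illustrative) trip_count = 10, nteams = 4: chunk = 2,
      // extras = 2, so teams 0 and 1 get 3 iterations each and teams 2 and 3
      // get 2 each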
2225       *plower +=
2226           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2227       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2228       if (plastiter != NULL)
2229         *plastiter = (team_id == nteams - 1);
2230     } else {
2231       T chunk_inc_count =
2232           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
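      // e.g. (illustrative) trip_count = 10, nteams = 4: every team is given
      // ceil(10/4) = 3 iterations; the last team's range runs past the loop
      // bounds and is clipped below, leaving it a single iteration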
2233       T upper = *pupper;
2234       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2235       // Unknown static scheduling type.
2236       *plower += team_id * chunk_inc_count;
2237       *pupper = *plower + chunk_inc_count - incr;
2238       // Check/correct bounds if needed
2239       if (incr > 0) {
2240         if (*pupper < *plower)
2241           *pupper = traits_t<T>::max_value;
2242         if (plastiter != NULL)
2243           *plastiter = *plower <= upper && *pupper > upper - incr;
2244         if (*pupper > upper)
2245           *pupper = upper; // tracker C73258
2246       } else {
2247         if (*pupper > *plower)
2248           *pupper = traits_t<T>::min_value;
2249         if (plastiter != NULL)
2250           *plastiter = *plower >= upper && *pupper < upper - incr;
2251         if (*pupper < upper)
2252           *pupper = upper; // tracker C73258
2253       }
2254     }
2255   }
2256 }
2257 
2258 //-----------------------------------------------------------------------------
2259 // Dispatch routines
2260 //    Transfer call to template< type T >
2261 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2262 //                         T lb, T ub, ST st, ST chunk )
2263 extern "C" {
2264 
2265 /*!
2266 @ingroup WORK_SHARING
2267 @{
2268 @param loc Source location
2269 @param gtid Global thread id
2270 @param schedule Schedule type
2271 @param lb  Lower bound
2272 @param ub  Upper bound
2273 @param st  Step (or increment if you prefer)
2274 @param chunk The chunk size to block with
2275 
2276 This function prepares the runtime to start a dynamically scheduled for loop,
2277 saving the loop arguments.
2278 These functions are all identical apart from the types of the arguments.
2279 */
2280 
2281 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2282                             enum sched_type schedule, kmp_int32 lb,
2283                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2284   KMP_DEBUG_ASSERT(__kmp_init_serial);
2285 #if OMPT_SUPPORT && OMPT_OPTIONAL
2286   OMPT_STORE_RETURN_ADDRESS(gtid);
2287 #endif
2288   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2289 }
2290 /*!
2291 See @ref __kmpc_dispatch_init_4
2292 */
2293 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2294                              enum sched_type schedule, kmp_uint32 lb,
2295                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2296   KMP_DEBUG_ASSERT(__kmp_init_serial);
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298   OMPT_STORE_RETURN_ADDRESS(gtid);
2299 #endif
2300   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2301 }
2302 
2303 /*!
2304 See @ref __kmpc_dispatch_init_4
2305 */
2306 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2307                             enum sched_type schedule, kmp_int64 lb,
2308                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2309   KMP_DEBUG_ASSERT(__kmp_init_serial);
2310 #if OMPT_SUPPORT && OMPT_OPTIONAL
2311   OMPT_STORE_RETURN_ADDRESS(gtid);
2312 #endif
2313   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2314 }
2315 
2316 /*!
2317 See @ref __kmpc_dispatch_init_4
2318 */
2319 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2320                              enum sched_type schedule, kmp_uint64 lb,
2321                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2322   KMP_DEBUG_ASSERT(__kmp_init_serial);
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324   OMPT_STORE_RETURN_ADDRESS(gtid);
2325 #endif
2326   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2327 }
2328 
2329 /*!
2330 See @ref __kmpc_dispatch_init_4
2331 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team iteration
space.
2335 
2336 These functions are all identical apart from the types of the arguments.
2337 */
2338 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2339                                  enum sched_type schedule, kmp_int32 *p_last,
2340                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2341                                  kmp_int32 chunk) {
2342   KMP_DEBUG_ASSERT(__kmp_init_serial);
2343 #if OMPT_SUPPORT && OMPT_OPTIONAL
2344   OMPT_STORE_RETURN_ADDRESS(gtid);
2345 #endif
2346   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2347   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2348 }
2349 
2350 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2351                                   enum sched_type schedule, kmp_int32 *p_last,
2352                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2353                                   kmp_int32 chunk) {
2354   KMP_DEBUG_ASSERT(__kmp_init_serial);
2355 #if OMPT_SUPPORT && OMPT_OPTIONAL
2356   OMPT_STORE_RETURN_ADDRESS(gtid);
2357 #endif
2358   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2359   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2360 }
2361 
2362 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2363                                  enum sched_type schedule, kmp_int32 *p_last,
2364                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2365                                  kmp_int64 chunk) {
2366   KMP_DEBUG_ASSERT(__kmp_init_serial);
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368   OMPT_STORE_RETURN_ADDRESS(gtid);
2369 #endif
2370   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2371   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2372 }
2373 
2374 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2375                                   enum sched_type schedule, kmp_int32 *p_last,
2376                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2377                                   kmp_int64 chunk) {
2378   KMP_DEBUG_ASSERT(__kmp_init_serial);
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380   OMPT_STORE_RETURN_ADDRESS(gtid);
2381 #endif
2382   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2383   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2384 }
2385 
2386 /*!
2387 @param loc Source code location
2388 @param gtid Global thread id
2389 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2390 otherwise
2391 @param p_lb   Pointer to the lower bound for the next chunk of work
2392 @param p_ub   Pointer to the upper bound for the next chunk of work
2393 @param p_st   Pointer to the stride for the next chunk of work
2394 @return one if there is work to be done, zero otherwise
2395 
2396 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
2398 */
2399 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2400                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2401 #if OMPT_SUPPORT && OMPT_OPTIONAL
2402   OMPT_STORE_RETURN_ADDRESS(gtid);
2403 #endif
2404   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2406                                         ,
2407                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2408 #endif
2409                                             );
2410 }
2411 
2412 /*!
2413 See @ref __kmpc_dispatch_next_4
2414 */
2415 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2416                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2417                             kmp_int32 *p_st) {
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419   OMPT_STORE_RETURN_ADDRESS(gtid);
2420 #endif
2421   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2422 #if OMPT_SUPPORT && OMPT_OPTIONAL
2423                                          ,
2424                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2425 #endif
2426                                              );
2427 }
2428 
2429 /*!
2430 See @ref __kmpc_dispatch_next_4
2431 */
2432 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435   OMPT_STORE_RETURN_ADDRESS(gtid);
2436 #endif
2437   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2438 #if OMPT_SUPPORT && OMPT_OPTIONAL
2439                                         ,
2440                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2441 #endif
2442                                             );
2443 }
2444 
2445 /*!
2446 See @ref __kmpc_dispatch_next_4
2447 */
2448 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2449                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2450                             kmp_int64 *p_st) {
2451 #if OMPT_SUPPORT && OMPT_OPTIONAL
2452   OMPT_STORE_RETURN_ADDRESS(gtid);
2453 #endif
2454   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2455 #if OMPT_SUPPORT && OMPT_OPTIONAL
2456                                          ,
2457                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2458 #endif
2459                                              );
2460 }
2461 
2462 /*!
2463 @param loc Source code location
2464 @param gtid Global thread id
2465 
2466 Mark the end of a dynamic loop.
2467 */
2468 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2469   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2470 }
2471 
2472 /*!
2473 See @ref __kmpc_dispatch_fini_4
2474 */
2475 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2476   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2477 }
2478 
2479 /*!
2480 See @ref __kmpc_dispatch_fini_4
2481 */
2482 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2483   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2484 }
2485 
2486 /*!
2487 See @ref __kmpc_dispatch_fini_4
2488 */
2489 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2490   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2491 }
2492 /*! @} */
2493 
2494 //-----------------------------------------------------------------------------
2495 // Non-template routines from kmp_dispatch.cpp used in other sources
2496 
2497 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2498   return value == checker;
2499 }
2500 
2501 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2502   return value != checker;
2503 }
2504 
2505 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2506   return value < checker;
2507 }
2508 
2509 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2510   return value >= checker;
2511 }
2512 
2513 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2514   return value <= checker;
2515 }
2516 
2517 kmp_uint32
2518 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2519              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2520              void *obj // Higher-level synchronization object, or NULL.
2521              ) {
2522   // note: we may not belong to a team at this point
2523   volatile kmp_uint32 *spin = spinner;
2524   kmp_uint32 check = checker;
2525   kmp_uint32 spins;
2526   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2527   kmp_uint32 r;
2528 
2529   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2530   KMP_INIT_YIELD(spins);
2531   // main wait spin loop
2532   while (!f(r = TCR_4(*spin), check)) {
2533     KMP_FSYNC_SPIN_PREPARE(obj);
2534     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2535        split. It causes problems with infinite recursion because of exit lock */
2536     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2537         __kmp_abort_thread(); */
2538     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2539   }
2540   KMP_FSYNC_SPIN_ACQUIRED(obj);
2541   return r;
2542 }
2543 
2544 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2545                       kmp_uint32 (*pred)(void *, kmp_uint32),
2546                       void *obj // Higher-level synchronization object, or NULL.
2547                       ) {
2548   // note: we may not belong to a team at this point
2549   void *spin = spinner;
2550   kmp_uint32 check = checker;
2551   kmp_uint32 spins;
2552   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2553 
2554   KMP_FSYNC_SPIN_INIT(obj, spin);
2555   KMP_INIT_YIELD(spins);
2556   // main wait spin loop
2557   while (!f(spin, check)) {
2558     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
2560     /* pause is in the following code */
2561     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2562   }
2563   KMP_FSYNC_SPIN_ACQUIRED(obj);
2564 }
2565 
2566 } // extern "C"
2567 
2568 #ifdef KMP_GOMP_COMPAT
2569 
2570 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2571                                enum sched_type schedule, kmp_int32 lb,
2572                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2573                                int push_ws) {
2574   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2575                                  push_ws);
2576 }
2577 
2578 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2579                                 enum sched_type schedule, kmp_uint32 lb,
2580                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2581                                 int push_ws) {
2582   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2583                                   push_ws);
2584 }
2585 
2586 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2587                                enum sched_type schedule, kmp_int64 lb,
2588                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2589                                int push_ws) {
2590   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2591                                  push_ws);
2592 }
2593 
2594 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2595                                 enum sched_type schedule, kmp_uint64 lb,
2596                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2597                                 int push_ws) {
2598   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2599                                   push_ws);
2600 }
2601 
2602 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2603   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2604 }
2605 
2606 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2607   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2608 }
2609 
2610 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2611   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2612 }
2613 
2614 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2615   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2616 }
2617 
2618 #endif /* KMP_GOMP_COMPAT */
2619 
2620 /* ------------------------------------------------------------------------ */
2621