1 /*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 /* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16 * it may change values between parallel regions. __kmp_max_nth
17 * is the largest value __kmp_nth may take, 1 is the smallest.
18 */
19
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41
__kmp_dispatch_deo_error(int * gtid_ref,int * cid_ref,ident_t * loc_ref)42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56 }
57 }
58 }
59
__kmp_dispatch_dxo_error(int * gtid_ref,int * cid_ref,ident_t * loc_ref)60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69 }
70
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
__kmp_get_monotonicity(ident_t * loc,enum sched_type schedule,bool use_hier=false)72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // Nonmonotonic as default for dynamic schedule when no modifier is specified
76 int monotonicity = SCHEDULE_NONMONOTONIC;
77
78 // Let default be monotonic for executables
79 // compiled with OpenMP* 4.5 or less compilers
80 if (loc != NULL && loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier || __kmp_force_monotonic)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91 }
92
93 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal points
static inline float __kmp_round_2decimal_val(float num) {
  // Scale to hundredths, round half up via truncation, then scale back down.
  int scaled = (int)(num * 100 + 0.5);
  return (float)scaled / 100;
}
// Round to the nearest integer, half away from zero: shift by +/-0.5 (the
// arithmetic is done in double, as in the original expression) then truncate.
static inline int __kmp_get_round_val(float num) {
  double shifted = (num < 0) ? (num - 0.5) : (num + 0.5);
  return (int)shifted;
}
101 #endif
102
// Compute this thread's starting chunk index (init) and per-thread chunk
// partitioning (small_chunk, extras, p_extra) for a static-steal loop,
// mirroring the owner-side setup in __kmp_dispatch_init_algorithm's
// kmp_sch_static_steal case. nchunks is the loop's total chunk count and id
// is this thread's tid within the nproc participating threads.
template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    // Hybrid CPU: chunks were split unevenly between performance-core
    // (p-core) and efficiency-core (e-core) threads; recompute this thread's
    // slice according to its core type.
    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t type =
        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
    T pchunks = pr->u.p.pchunks;
    T echunks = nchunks - pchunks;
    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
    T num_procs_with_ecore = nproc - num_procs_with_pcore;
    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
    T big_chunk =
        pchunks / num_procs_with_pcore; // chunks per thread with p-core
    small_chunk =
        echunks / num_procs_with_ecore; // chunks per thread with e-core

    extras =
        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

    // p_extra is the per-thread bonus a p-core thread gets over an e-core one
    p_extra = (big_chunk - small_chunk);

    if (type == KMP_HW_CORE_TYPE_CORE) {
      if (id < first_thread_with_ecore) {
        // p-core thread before the first e-core thread: every lower id also
        // received the p-core bonus.
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        // p-core thread after the e-core block: subtract the e-core threads
        // that did not receive the bonus.
        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
               (id < extras ? id : extras);
      }
    } else {
      if (id == first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        // e-core thread: only the first_thread_with_ecore p-core threads
        // before it carry the bonus.
        init = id * small_chunk + first_thread_with_ecore * p_extra +
               (id < extras ? id : extras);
      }
    }
    // Only p-core threads keep the per-thread bonus going forward.
    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
    return;
  }
#endif

  // Homogeneous case: divide chunks evenly; the first 'extras' threads get
  // one additional chunk each.
  small_chunk = nchunks / nproc; // chunks per thread
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
}
156
157 #if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
  // possible state changes:
  // 0 -> 1 owner only, sync
  // 0 -> 3 thief only, sync
  // 1 -> 2 owner only, async
  // 2 -> 3 owner only, async
  // 3 -> 2 owner only, async
  // 3 -> 0 last thread finishing the loop, async
  // ("sync" transitions race and use compare-and-swap, e.g. the CAS in the
  //  kmp_sch_static_steal init below; "async" ones use plain atomic stores)
};
171 #endif
172
173 // Initialize a dispatch_private_info_template<T> buffer for a particular
174 // type of schedule,chunk. The loop description is found in lb (lower bound),
175 // ub (upper bound), and st (stride). nproc is the number of threads relevant
176 // to the scheduling (often the number of threads in a team, but not always if
177 // hierarchical scheduling is used). tid is the id of the thread calling
178 // the function within the group of nproc threads. It will have a value
179 // between 0 and nproc - 1. This is often just the thread id within a team, but
180 // is not necessarily the case when using hierarchical scheduling.
181 // loc is the source file location of the corresponding loop
182 // gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
        monotonicity = SCHEDULE_MONOTONIC;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        // scale the user chunk (a simd width here) by the runtime chunk
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    // total number of chunks, rounded up
    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // Iterations are divided in a 60/40 skewed distribution among CORE and
      // ATOM processors for hybrid systems
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          // Count p-core/e-core threads in the team and locate the first
          // thread bound to an e-core; bail out of the hybrid path on any
          // unrecognized core type.
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier =
              (float)1 /
              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
          float p_multiplier = multiplier * e_multiplier;
          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
            e_ntc =
                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
          else
            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);

          // Use regular static steal if not enough chunks for skewed
          // distribution
          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
                                       e_ntc >= num_procs_with_ecore)
                            ? true
                            : false);
        } else {
          use_hybrid = false;
        }
      }
      pr->flags.use_hybrid = use_hybrid;
      pr->u.p.pchunks = p_ntc;
      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

      if (use_hybrid) {
        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
        T big_chunk = p_ntc / num_procs_with_pcore;
        small_chunk = e_ntc / num_procs_with_ecore;

        extras =
            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);

        p_extra = (big_chunk - small_chunk);

        if (core_type == KMP_HW_CORE_TYPE_CORE) {
          if (id < first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
                   (id < extras ? id : extras);
          }
        } else {
          if (id == first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + first_thread_with_ecore * p_extra +
                   (id < extras ? id : extras);
          }
        }
        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
      } else
#endif
      {
        // homogeneous split: first 'extras' threads get one extra chunk
        small_chunk = ntc / nproc;
        extras = ntc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        p_extra = 0;
      }
      pr->u.p.count = init;
      if (claimed) { // are we succeeded in claiming own buffer?
        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
        // Other threads will inspect steal_flag when searching for a victim.
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // other thread has stolen whole our range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark there is no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing which is
      // nproc (just a heuristics, could be optimized later on).
      pr->u.p.parm3 = nproc;
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        // fewer iterations than threads: one iteration per low-id thread
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        KMP_ASSERT(tc > 0);
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            // repeated squaring doubles the exponent each step until the
            // bracket contains the crossover
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (tc == 0)
      break;
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // Store the total number of chunks to prevent integer overflow during
    // bounds calculations in the get next chunk routine.
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle. Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
}
904
905 #if KMP_USE_HIER_SCHED
// Hierarchical-scheduling entry points for initializing a loop from the
// global hierarchy description (__kmp_hier_scheds): one specialization per
// supported index type. The 32-bit specializations pass the small_chunks
// array, the 64-bit specializations the large_chunks array.
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
941
942 // free all the hierarchy scheduling memory associated with the team
__kmp_dispatch_free_hierarchies(kmp_team_t * team)943 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
944 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945 for (int i = 0; i < num_disp_buff; ++i) {
946 // type does not matter here so use kmp_int32
947 auto sh =
948 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949 &team->t.t_disp_buffer[i]);
950 if (sh->hier) {
951 sh->hier->deallocate();
952 __kmp_free(sh->hier);
953 }
954 }
955 }
956 #endif
957
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
//
// Top-level initialization of a dynamically scheduled worksharing loop for
// the calling thread: selects the thread-private and team-shared dispatch
// buffers, delegates schedule-specific setup to
// __kmp_dispatch_init_algorithm(), and installs the ordered enter/exit
// handlers for this loop instance.
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  // The typed templates must overlay the untyped dispatch structures exactly,
  // since they are reinterpret_cast from the same storage.
  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized; // serialized regions reuse the top buffer
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  // NOTE: pr is probed here via the *current* th_disp_index (not yet
  // incremented) so the flags of the buffer about to be used are inspected.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  // Strip modifiers, then map "nomerge" schedule kinds back onto the plain
  // schedule range so the ordered bit can be tested uniformly.
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  // Metadata is only reported by the primary thread of a level-1 active
  // team when frame mode 3 is selected and the tool registered the hook.
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
    // If the shared buffer is still owned by an earlier loop instance
    // (buffer ring wrapped), wait until it is released.
    if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
                     " sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
      __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                             __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
      // Note: KMP_WAIT() cannot be used there: buffer index and
      // my_buffer_index are *always* 32-bit integers.
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                     "sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
    }
  }

  // Schedule-specific setup: fills pr (count, lb/ub/st, tc, parm1..4, flags).
  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    // Install ordered enter/exit handlers: the error versions trap misuse of
    // "ordered" in loops that were not declared ordered.
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by primary thread of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    // With hierarchical scheduling the hierarchy owns iteration hand-out, so
    // neutralize the flat-dispatch bookkeeping in pr.
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Notify an attached OMPT tool that a loop worksharing region begins.
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
1173
/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
// Completes one ordered iteration: if the iteration's ordered section already
// bumped the shared counter (ordered_bumped), just clear the flag; otherwise
// wait for our turn and advance sh->u.s.ordered_iteration ourselves so the
// next thread in iteration order is released.
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // Serialized regions need no cross-thread ordering, hence the guard.

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      // The ordered construct inside this iteration already advanced the
      // shared counter; only reset the local flag.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Wait until the shared ordered counter reaches this iteration's turn.
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Release the next iteration in order.
      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
1240
1241 #ifdef KMP_GOMP_COMPAT
1242
// Chunk-granularity variant of __kmp_dispatch_finish() (GOMP compatibility):
// retires a whole chunk [ordered_lower, ordered_upper] at once. If the
// chunk's ordered sections already bumped the counter for every iteration
// (ordered_bumped == chunk size), just reset the flag; otherwise wait for
// the chunk's turn and add the remaining, un-bumped iteration count to the
// shared ordered counter so following threads can proceed.
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // Serialized regions need no cross-thread ordering, hence the guard.
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1; // number of iterations in this chunk

    if (pr->ordered_bumped == inc) {
      // Every iteration of the chunk already advanced the shared counter.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      // Only credit the iterations whose ordered sections did not run.
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Wait until the shared ordered counter reaches this chunk's turn.
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Release all remaining iterations of the chunk in one atomic add.
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
1318
1319 #endif /* KMP_GOMP_COMPAT */
1320
1321 template <typename T>
__kmp_dispatch_next_algorithm(int gtid,dispatch_private_info_template<T> * pr,dispatch_shared_info_template<T> volatile * sh,kmp_int32 * p_last,T * p_lb,T * p_ub,typename traits_t<T>::signed_t * p_st,T nproc,T tid)1322 int __kmp_dispatch_next_algorithm(int gtid,
1323 dispatch_private_info_template<T> *pr,
1324 dispatch_shared_info_template<T> volatile *sh,
1325 kmp_int32 *p_last, T *p_lb, T *p_ub,
1326 typename traits_t<T>::signed_t *p_st, T nproc,
1327 T tid) {
1328 typedef typename traits_t<T>::unsigned_t UT;
1329 typedef typename traits_t<T>::signed_t ST;
1330 typedef typename traits_t<T>::floating_t DBL;
1331 int status = 0;
1332 bool last = false;
1333 T start;
1334 ST incr;
1335 UT limit, trip, init;
1336 kmp_info_t *th = __kmp_threads[gtid];
1337 kmp_team_t *team = th->th.th_team;
1338
1339 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1340 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1341 KMP_DEBUG_ASSERT(pr);
1342 KMP_DEBUG_ASSERT(sh);
1343 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1344 #ifdef KMP_DEBUG
1345 {
1346 char *buff;
1347 // create format specifiers before the debug output
1348 buff =
1349 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1350 "sh:%%p nproc:%%%s tid:%%%s\n",
1351 traits_t<T>::spec, traits_t<T>::spec);
1352 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1353 __kmp_str_free(&buff);
1354 }
1355 #endif
1356
1357 // zero trip count
1358 if (pr->u.p.tc == 0) {
1359 KD_TRACE(10,
1360 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1361 "zero status:%d\n",
1362 gtid, status));
1363 return 0;
1364 }
1365
1366 switch (pr->schedule) {
1367 #if KMP_STATIC_STEAL_ENABLED
1368 case kmp_sch_static_steal: {
1369 T chunk = pr->u.p.parm1;
1370 UT nchunks = pr->u.p.parm2;
1371 KD_TRACE(100,
1372 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1373 gtid));
1374
1375 trip = pr->u.p.tc - 1;
1376
1377 if (traits_t<T>::type_size > 4) {
1378 // use lock for 8-byte induction variable.
1379 // TODO (optional): check presence and use 16-byte CAS
1380 kmp_lock_t *lck = pr->u.p.steal_lock;
1381 KMP_DEBUG_ASSERT(lck != NULL);
1382 if (pr->u.p.count < (UT)pr->u.p.ub) {
1383 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1384 __kmp_acquire_lock(lck, gtid);
1385 // try to get own chunk of iterations
1386 init = (pr->u.p.count)++;
1387 status = (init < (UT)pr->u.p.ub);
1388 __kmp_release_lock(lck, gtid);
1389 } else {
1390 status = 0; // no own chunks
1391 }
1392 if (!status) { // try to steal
1393 kmp_lock_t *lckv; // victim buffer's lock
1394 T while_limit = pr->u.p.parm3;
1395 T while_index = 0;
1396 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1397 __kmp_dispatch_num_buffers; // current loop index
1398 // note: victim thread can potentially execute another loop
1399 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1400 while ((!status) && (while_limit != ++while_index)) {
1401 dispatch_private_info_template<T> *v;
1402 T remaining;
1403 T victimId = pr->u.p.parm4;
1404 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1405 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1406 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1407 KMP_DEBUG_ASSERT(v);
1408 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1409 oldVictimId != victimId) {
1410 victimId = (victimId + 1) % nproc;
1411 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1412 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1413 KMP_DEBUG_ASSERT(v);
1414 }
1415 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1416 continue; // try once more (nproc attempts in total)
1417 }
1418 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1419 kmp_uint32 old = UNUSED;
1420 // try to steal whole range from inactive victim
1421 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1422 if (status) {
1423 // initialize self buffer with victim's whole range of chunks
1424 T id = victimId;
1425 T small_chunk = 0, extras = 0, p_extra = 0;
1426 __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1427 init, small_chunk, extras,
1428 p_extra);
1429 __kmp_acquire_lock(lck, gtid);
1430 pr->u.p.count = init + 1; // exclude one we execute immediately
1431 pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1432 __kmp_release_lock(lck, gtid);
1433 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1434 // no need to reinitialize other thread invariants: lb, st, etc.
1435 #ifdef KMP_DEBUG
1436 {
1437 char *buff;
1438 // create format specifiers before the debug output
1439 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1440 "stolen chunks from T#%%d, "
1441 "count:%%%s ub:%%%s\n",
1442 traits_t<UT>::spec, traits_t<T>::spec);
1443 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1444 __kmp_str_free(&buff);
1445 }
1446 #endif
1447 // activate non-empty buffer and let others steal from us
1448 if (pr->u.p.count < (UT)pr->u.p.ub)
1449 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1450 break;
1451 }
1452 }
1453 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1454 v->u.p.count >= (UT)v->u.p.ub) {
1455 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1456 continue; // no chunks to steal, try next victim
1457 }
1458 lckv = v->u.p.steal_lock;
1459 KMP_ASSERT(lckv != NULL);
1460 __kmp_acquire_lock(lckv, gtid);
1461 limit = v->u.p.ub; // keep initial ub
1462 if (v->u.p.count >= limit) {
1463 __kmp_release_lock(lckv, gtid);
1464 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1465 continue; // no chunks to steal, try next victim
1466 }
1467
1468 // stealing succeded, reduce victim's ub by 1/4 of undone chunks
1469 // TODO: is this heuristics good enough??
1470 remaining = limit - v->u.p.count;
1471 if (remaining > 7) {
1472 // steal 1/4 of remaining
1473 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1474 init = (v->u.p.ub -= (remaining >> 2));
1475 } else {
1476 // steal 1 chunk of 1..7 remaining
1477 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1478 init = (v->u.p.ub -= 1);
1479 }
1480 __kmp_release_lock(lckv, gtid);
1481 #ifdef KMP_DEBUG
1482 {
1483 char *buff;
1484 // create format specifiers before the debug output
1485 buff = __kmp_str_format(
1486 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1487 "count:%%%s ub:%%%s\n",
1488 traits_t<UT>::spec, traits_t<UT>::spec);
1489 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1490 __kmp_str_free(&buff);
1491 }
1492 #endif
1493 KMP_DEBUG_ASSERT(init + 1 <= limit);
1494 pr->u.p.parm4 = victimId; // remember victim to steal from
1495 status = 1;
1496 // now update own count and ub with stolen range excluding init chunk
1497 __kmp_acquire_lock(lck, gtid);
1498 pr->u.p.count = init + 1;
1499 pr->u.p.ub = limit;
1500 __kmp_release_lock(lck, gtid);
1501 // activate non-empty buffer and let others steal from us
1502 if (init + 1 < limit)
1503 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1504 } // while (search for victim)
1505 } // if (try to find victim and steal)
1506 } else {
1507 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1508 // as all operations on pair (count, ub) must be done atomically
1509 typedef union {
1510 struct {
1511 UT count;
1512 T ub;
1513 } p;
1514 kmp_int64 b;
1515 } union_i4;
1516 union_i4 vold, vnew;
1517 if (pr->u.p.count < (UT)pr->u.p.ub) {
1518 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1519 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1520 vnew.b = vold.b;
1521 vnew.p.count++; // get chunk from head of self range
1522 while (!KMP_COMPARE_AND_STORE_REL64(
1523 (volatile kmp_int64 *)&pr->u.p.count,
1524 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1525 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1526 KMP_CPU_PAUSE();
1527 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1528 vnew.b = vold.b;
1529 vnew.p.count++;
1530 }
1531 init = vold.p.count;
1532 status = (init < (UT)vold.p.ub);
1533 } else {
1534 status = 0; // no own chunks
1535 }
1536 if (!status) { // try to steal
1537 T while_limit = pr->u.p.parm3;
1538 T while_index = 0;
1539 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1540 __kmp_dispatch_num_buffers; // current loop index
1541 // note: victim thread can potentially execute another loop
1542 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1543 while ((!status) && (while_limit != ++while_index)) {
1544 dispatch_private_info_template<T> *v;
1545 T remaining;
1546 T victimId = pr->u.p.parm4;
1547 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1548 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1549 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1550 KMP_DEBUG_ASSERT(v);
1551 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1552 oldVictimId != victimId) {
1553 victimId = (victimId + 1) % nproc;
1554 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1555 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1556 KMP_DEBUG_ASSERT(v);
1557 }
1558 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1559 continue; // try once more (nproc attempts in total)
1560 }
1561 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1562 kmp_uint32 old = UNUSED;
1563 // try to steal whole range from inactive victim
1564 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1565 if (status) {
1566 // initialize self buffer with victim's whole range of chunks
1567 T id = victimId;
1568 T small_chunk = 0, extras = 0, p_extra = 0;
1569 __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1570 init, small_chunk, extras,
1571 p_extra);
1572 vnew.p.count = init + 1;
1573 vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1574 // write pair (count, ub) at once atomically
1575 #if KMP_ARCH_X86
1576 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1577 #else
1578 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1579 #endif
1580 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1581 // no need to initialize other thread invariants: lb, st, etc.
1582 #ifdef KMP_DEBUG
1583 {
1584 char *buff;
1585 // create format specifiers before the debug output
1586 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1587 "stolen chunks from T#%%d, "
1588 "count:%%%s ub:%%%s\n",
1589 traits_t<UT>::spec, traits_t<T>::spec);
1590 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1591 __kmp_str_free(&buff);
1592 }
1593 #endif
1594 // activate non-empty buffer and let others steal from us
1595 if (pr->u.p.count < (UT)pr->u.p.ub)
1596 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1597 break;
1598 }
1599 }
1600 while (1) { // CAS loop with check if victim still has enough chunks
1601 // many threads may be stealing concurrently from same victim
1602 vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1603 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1604 vold.p.count >= (UT)vold.p.ub) {
1605 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1606 break; // no chunks to steal, try next victim
1607 }
1608 vnew.b = vold.b;
1609 remaining = vold.p.ub - vold.p.count;
1610 // try to steal 1/4 of remaining
1611 // TODO: is this heuristics good enough??
1612 if (remaining > 7) {
1613 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1614 } else {
1615 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1616 }
1617 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1618 if (KMP_COMPARE_AND_STORE_REL64(
1619 (volatile kmp_int64 *)&v->u.p.count,
1620 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1621 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1622 // stealing succedded
1623 #ifdef KMP_DEBUG
1624 {
1625 char *buff;
1626 // create format specifiers before the debug output
1627 buff = __kmp_str_format(
1628 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1629 "count:%%%s ub:%%%s\n",
1630 traits_t<T>::spec, traits_t<T>::spec);
1631 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1632 __kmp_str_free(&buff);
1633 }
1634 #endif
1635 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1636 vold.p.ub - vnew.p.ub);
1637 status = 1;
1638 pr->u.p.parm4 = victimId; // keep victim id
1639 // now update own count and ub
1640 init = vnew.p.ub;
1641 vold.p.count = init + 1;
1642 #if KMP_ARCH_X86
1643 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1644 #else
1645 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1646 #endif
1647 // activate non-empty buffer and let others steal from us
1648 if (vold.p.count < (UT)vold.p.ub)
1649 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1650 break;
1651 } // if (check CAS result)
1652 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1653 } // while (try to steal from particular victim)
1654 } // while (search for victim)
1655 } // if (try to find victim and steal)
1656 } // if (4-byte induction variable)
1657 if (!status) {
1658 *p_lb = 0;
1659 *p_ub = 0;
1660 if (p_st != NULL)
1661 *p_st = 0;
1662 } else {
1663 start = pr->u.p.lb;
1664 init *= chunk;
1665 limit = chunk + init - 1;
1666 incr = pr->u.p.st;
1667 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1668
1669 KMP_DEBUG_ASSERT(init <= trip);
1670 // keep track of done chunks for possible early exit from stealing
1671 // TODO: count executed chunks locally with rare update of shared location
1672 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1673 if ((last = (limit >= trip)) != 0)
1674 limit = trip;
1675 if (p_st != NULL)
1676 *p_st = incr;
1677
1678 if (incr == 1) {
1679 *p_lb = start + init;
1680 *p_ub = start + limit;
1681 } else {
1682 *p_lb = start + init * incr;
1683 *p_ub = start + limit * incr;
1684 }
1685 } // if
1686 break;
1687 } // case
1688 #endif // KMP_STATIC_STEAL_ENABLED
1689 case kmp_sch_static_balanced: {
1690 KD_TRACE(
1691 10,
1692 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1693 gtid));
1694 /* check if thread has any iteration to do */
1695 if ((status = !pr->u.p.count) != 0) {
1696 pr->u.p.count = 1;
1697 *p_lb = pr->u.p.lb;
1698 *p_ub = pr->u.p.ub;
1699 last = (pr->u.p.parm1 != 0);
1700 if (p_st != NULL)
1701 *p_st = pr->u.p.st;
1702 } else { /* no iterations to do */
1703 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1704 }
1705 } // case
1706 break;
1707 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1708 merged here */
1709 case kmp_sch_static_chunked: {
1710 T parm1;
1711
1712 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1713 "kmp_sch_static_[affinity|chunked] case\n",
1714 gtid));
1715 parm1 = pr->u.p.parm1;
1716
1717 trip = pr->u.p.tc - 1;
1718 init = parm1 * (pr->u.p.count + tid);
1719
1720 if ((status = (init <= trip)) != 0) {
1721 start = pr->u.p.lb;
1722 incr = pr->u.p.st;
1723 limit = parm1 + init - 1;
1724
1725 if ((last = (limit >= trip)) != 0)
1726 limit = trip;
1727
1728 if (p_st != NULL)
1729 *p_st = incr;
1730
1731 pr->u.p.count += nproc;
1732
1733 if (incr == 1) {
1734 *p_lb = start + init;
1735 *p_ub = start + limit;
1736 } else {
1737 *p_lb = start + init * incr;
1738 *p_ub = start + limit * incr;
1739 }
1740
1741 if (pr->flags.ordered) {
1742 pr->u.p.ordered_lower = init;
1743 pr->u.p.ordered_upper = limit;
1744 } // if
1745 } // if
1746 } // case
1747 break;
1748
1749 case kmp_sch_dynamic_chunked: {
1750 UT chunk_number;
1751 UT chunk_size = pr->u.p.parm1;
1752 UT nchunks = pr->u.p.parm2;
1753
1754 KD_TRACE(
1755 100,
1756 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1757 gtid));
1758
1759 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1760 status = (chunk_number < nchunks);
1761 if (!status) {
1762 *p_lb = 0;
1763 *p_ub = 0;
1764 if (p_st != NULL)
1765 *p_st = 0;
1766 } else {
1767 init = chunk_size * chunk_number;
1768 trip = pr->u.p.tc - 1;
1769 start = pr->u.p.lb;
1770 incr = pr->u.p.st;
1771
1772 if ((last = (trip - init < (UT)chunk_size)))
1773 limit = trip;
1774 else
1775 limit = chunk_size + init - 1;
1776
1777 if (p_st != NULL)
1778 *p_st = incr;
1779
1780 if (incr == 1) {
1781 *p_lb = start + init;
1782 *p_ub = start + limit;
1783 } else {
1784 *p_lb = start + init * incr;
1785 *p_ub = start + limit * incr;
1786 }
1787
1788 if (pr->flags.ordered) {
1789 pr->u.p.ordered_lower = init;
1790 pr->u.p.ordered_upper = limit;
1791 } // if
1792 } // if
1793 } // case
1794 break;
1795
1796 case kmp_sch_guided_iterative_chunked: {
1797 T chunkspec = pr->u.p.parm1;
1798 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1799 "iterative case\n",
1800 gtid));
1801 trip = pr->u.p.tc;
1802 // Start atomic part of calculations
1803 while (1) {
1804 ST remaining; // signed, because can be < 0
1805 init = sh->u.s.iteration; // shared value
1806 remaining = trip - init;
1807 if (remaining <= 0) { // AC: need to compare with 0 first
1808 // nothing to do, don't try atomic op
1809 status = 0;
1810 break;
1811 }
1812 if ((T)remaining <
1813 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1814 // use dynamic-style schedule
1815 // atomically increment iterations, get old value
1816 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1817 (ST)chunkspec);
1818 remaining = trip - init;
1819 if (remaining <= 0) {
1820 status = 0; // all iterations got by other threads
1821 } else {
1822 // got some iterations to work on
1823 status = 1;
1824 if ((T)remaining > chunkspec) {
1825 limit = init + chunkspec - 1;
1826 } else {
1827 last = true; // the last chunk
1828 limit = init + remaining - 1;
1829 } // if
1830 } // if
1831 break;
1832 } // if
1833 limit = init + (UT)((double)remaining *
1834 *(double *)&pr->u.p.parm3); // divide by K*nproc
1835 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1836 (ST)init, (ST)limit)) {
1837 // CAS was successful, chunk obtained
1838 status = 1;
1839 --limit;
1840 break;
1841 } // if
1842 } // while
1843 if (status != 0) {
1844 start = pr->u.p.lb;
1845 incr = pr->u.p.st;
1846 if (p_st != NULL)
1847 *p_st = incr;
1848 *p_lb = start + init * incr;
1849 *p_ub = start + limit * incr;
1850 if (pr->flags.ordered) {
1851 pr->u.p.ordered_lower = init;
1852 pr->u.p.ordered_upper = limit;
1853 } // if
1854 } else {
1855 *p_lb = 0;
1856 *p_ub = 0;
1857 if (p_st != NULL)
1858 *p_st = 0;
1859 } // if
1860 } // case
1861 break;
1862
1863 case kmp_sch_guided_simd: {
1864 // same as iterative but curr-chunk adjusted to be multiple of given
1865 // chunk
1866 T chunk = pr->u.p.parm1;
1867 KD_TRACE(100,
1868 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1869 gtid));
1870 trip = pr->u.p.tc;
1871 // Start atomic part of calculations
1872 while (1) {
1873 ST remaining; // signed, because can be < 0
1874 init = sh->u.s.iteration; // shared value
1875 remaining = trip - init;
1876 if (remaining <= 0) { // AC: need to compare with 0 first
1877 status = 0; // nothing to do, don't try atomic op
1878 break;
1879 }
1880 KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1881 // compare with K*nproc*(chunk+1), K=2 by default
1882 if ((T)remaining < pr->u.p.parm2) {
1883 // use dynamic-style schedule
1884 // atomically increment iterations, get old value
1885 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1886 (ST)chunk);
1887 remaining = trip - init;
1888 if (remaining <= 0) {
1889 status = 0; // all iterations got by other threads
1890 } else {
1891 // got some iterations to work on
1892 status = 1;
1893 if ((T)remaining > chunk) {
1894 limit = init + chunk - 1;
1895 } else {
1896 last = true; // the last chunk
1897 limit = init + remaining - 1;
1898 } // if
1899 } // if
1900 break;
1901 } // if
1902 // divide by K*nproc
1903 UT span;
1904 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1905 &span);
1906 UT rem = span % chunk;
1907 if (rem) // adjust so that span%chunk == 0
1908 span += chunk - rem;
1909 limit = init + span;
1910 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1911 (ST)init, (ST)limit)) {
1912 // CAS was successful, chunk obtained
1913 status = 1;
1914 --limit;
1915 break;
1916 } // if
1917 } // while
1918 if (status != 0) {
1919 start = pr->u.p.lb;
1920 incr = pr->u.p.st;
1921 if (p_st != NULL)
1922 *p_st = incr;
1923 *p_lb = start + init * incr;
1924 *p_ub = start + limit * incr;
1925 if (pr->flags.ordered) {
1926 pr->u.p.ordered_lower = init;
1927 pr->u.p.ordered_upper = limit;
1928 } // if
1929 } else {
1930 *p_lb = 0;
1931 *p_ub = 0;
1932 if (p_st != NULL)
1933 *p_st = 0;
1934 } // if
1935 } // case
1936 break;
1937
1938 case kmp_sch_guided_analytical_chunked: {
1939 T chunkspec = pr->u.p.parm1;
1940 UT chunkIdx;
1941 #if KMP_USE_X87CONTROL
1942 /* for storing original FPCW value for Windows* OS on
1943 IA-32 architecture 8-byte version */
1944 unsigned int oldFpcw;
1945 unsigned int fpcwSet = 0;
1946 #endif
1947 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1948 "kmp_sch_guided_analytical_chunked case\n",
1949 gtid));
1950
1951 trip = pr->u.p.tc;
1952
1953 KMP_DEBUG_ASSERT(nproc > 1);
1954 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1955
1956 while (1) { /* this while loop is a safeguard against unexpected zero
1957 chunk sizes */
1958 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1959 if (chunkIdx >= (UT)pr->u.p.parm2) {
1960 --trip;
1961 /* use dynamic-style scheduling */
1962 init = chunkIdx * chunkspec + pr->u.p.count;
1963 /* need to verify init > 0 in case of overflow in the above
1964 * calculation */
1965 if ((status = (init > 0 && init <= trip)) != 0) {
1966 limit = init + chunkspec - 1;
1967
1968 if ((last = (limit >= trip)) != 0)
1969 limit = trip;
1970 }
1971 break;
1972 } else {
1973 /* use exponential-style scheduling */
1974 /* The following check is to workaround the lack of long double precision on
1975 Windows* OS.
1976 This check works around the possible effect that init != 0 for chunkIdx == 0.
1977 */
1978 #if KMP_USE_X87CONTROL
1979 /* If we haven't already done so, save original
1980 FPCW and set precision to 64-bit, as Windows* OS
1981 on IA-32 architecture defaults to 53-bit */
1982 if (!fpcwSet) {
1983 oldFpcw = _control87(0, 0);
1984 _control87(_PC_64, _MCW_PC);
1985 fpcwSet = 0x30000;
1986 }
1987 #endif
1988 if (chunkIdx) {
1989 init = __kmp_dispatch_guided_remaining<T>(
1990 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1991 KMP_DEBUG_ASSERT(init);
1992 init = trip - init;
1993 } else
1994 init = 0;
1995 limit = trip - __kmp_dispatch_guided_remaining<T>(
1996 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1997 KMP_ASSERT(init <= limit);
1998 if (init < limit) {
1999 KMP_DEBUG_ASSERT(limit <= trip);
2000 --limit;
2001 status = 1;
2002 break;
2003 } // if
2004 } // if
2005 } // while (1)
2006 #if KMP_USE_X87CONTROL
2007 /* restore FPCW if necessary
2008 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2009 */
2010 if (fpcwSet && (oldFpcw & fpcwSet))
2011 _control87(oldFpcw, _MCW_PC);
2012 #endif
2013 if (status != 0) {
2014 start = pr->u.p.lb;
2015 incr = pr->u.p.st;
2016 if (p_st != NULL)
2017 *p_st = incr;
2018 *p_lb = start + init * incr;
2019 *p_ub = start + limit * incr;
2020 if (pr->flags.ordered) {
2021 pr->u.p.ordered_lower = init;
2022 pr->u.p.ordered_upper = limit;
2023 }
2024 } else {
2025 *p_lb = 0;
2026 *p_ub = 0;
2027 if (p_st != NULL)
2028 *p_st = 0;
2029 }
2030 } // case
2031 break;
2032
2033 case kmp_sch_trapezoidal: {
2034 UT index;
2035 T parm2 = pr->u.p.parm2;
2036 T parm3 = pr->u.p.parm3;
2037 T parm4 = pr->u.p.parm4;
2038 KD_TRACE(100,
2039 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2040 gtid));
2041
2042 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2043
2044 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2045 trip = pr->u.p.tc - 1;
2046
2047 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2048 *p_lb = 0;
2049 *p_ub = 0;
2050 if (p_st != NULL)
2051 *p_st = 0;
2052 } else {
2053 start = pr->u.p.lb;
2054 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2055 incr = pr->u.p.st;
2056
2057 if ((last = (limit >= trip)) != 0)
2058 limit = trip;
2059
2060 if (p_st != NULL)
2061 *p_st = incr;
2062
2063 if (incr == 1) {
2064 *p_lb = start + init;
2065 *p_ub = start + limit;
2066 } else {
2067 *p_lb = start + init * incr;
2068 *p_ub = start + limit * incr;
2069 }
2070
2071 if (pr->flags.ordered) {
2072 pr->u.p.ordered_lower = init;
2073 pr->u.p.ordered_upper = limit;
2074 } // if
2075 } // if
2076 } // case
2077 break;
2078 default: {
2079 status = 0; // to avoid complaints on uninitialized variable use
2080 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2081 KMP_HNT(GetNewerLibrary), // Hint
2082 __kmp_msg_null // Variadic argument list terminator
2083 );
2084 } break;
2085 } // switch
2086 if (p_last)
2087 *p_last = last;
2088 #ifdef KMP_DEBUG
2089 if (pr->flags.ordered) {
2090 char *buff;
2091 // create format specifiers before the debug output
2092 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2093 "ordered_lower:%%%s ordered_upper:%%%s\n",
2094 traits_t<UT>::spec, traits_t<UT>::spec);
2095 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2096 __kmp_str_free(&buff);
2097 }
2098 {
2099 char *buff;
2100 // create format specifiers before the debug output
2101 buff = __kmp_str_format(
2102 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2103 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2104 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2105 KMP_DEBUG_ASSERT(p_last);
2106 KMP_DEBUG_ASSERT(p_st);
2107 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2108 __kmp_str_free(&buff);
2109 }
2110 #endif
2111 return status;
2112 }
2113
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
// Reports ompt_scope_end of the loop workshare via ompt_callback_work.
// Expects `status` and `codeptr` to be in scope at the expansion site.
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
// Reports the chunk [lb, ub] (stride st) just obtained by this thread via
// ompt_callback_dispatch. Fires only when a chunk was actually obtained
// (status != 0); the chunk descriptor lives on the stack only for the call.
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  if (ompt_enabled.ompt_callback_dispatch && status) {                         \
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
    ompt_dispatch_chunk_t chunk;                                               \
    ompt_data_t instance = ompt_data_none;                                     \
    OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
    instance.ptr = &chunk;                                                     \
    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
        &(team_info->parallel_data), &(task_info->task_data),                  \
        ompt_dispatch_ws_loop_chunk, instance);                                \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif
2145
#if KMP_STATS_ENABLED
// Accounts the iteration count of the chunk just handed out ([*p_lb, *p_ub]
// with stride pr->u.p.st) to the OMP_loop_dynamic_iterations counter.
// When the loop is exhausted (status == 0) it records 0 and pops the
// partitioned timer. Expects `status`, `p_lb`, `p_ub` and `pr` in scope.
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
2177
// Get the next chunk of loop iterations for this thread. Returns nonzero iff
// a chunk was obtained; on success *p_lb/*p_ub (and *p_st when non-NULL)
// describe the chunk, and *p_last is set when it is the last chunk of the
// loop. The serialized-team case is handled inline; otherwise the work is
// delegated to __kmp_dispatch_next_algorithm(), and when the loop is
// complete this routine also releases the shared dispatch buffer for reuse.
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    // Serialized team: private info is kept on this thread's own disp buffer
    // stack; no shared buffer is involved.
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      // No iterations left (trip count already consumed / zero).
      *p_lb = 0;
      *p_ub = 0;
      //            if ( p_last != NULL )
      //                *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      // nomerge: hand out one chunk at a time even though serialized
      // (dynamic-chunked style using the thread-private chunk counter).
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        //            if ( p_last != NULL )
        //                *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip; // clamp final (possibly partial) chunk
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      // Not nomerge: hand the whole remaining range out as one chunk and
      // mark the trip count consumed so the next call returns status 0.
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    // Active parallel region: use the current private/shared dispatch info
    // installed by __kmp_dispatch_init.
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      // This thread is done; count finished threads so the last one can
      // release the shared buffer for reuse.
      ST num_done;
      num_done = test_then_inc<ST>(&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if (num_done == th->th.th_team_nproc - 1) {
        // Last thread to finish the loop performs the cleanup.
#if KMP_STATIC_STEAL_ENABLED
        if (pr->schedule == kmp_sch_static_steal) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &team->t.t_dispatch[i].th_disp_buffer[idx]);
            KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
            KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
            if (traits_t<T>::type_size > 4) {
              // destroy locks used for stealing
              kmp_lock_t *lck = buf->u.p.steal_lock;
              KMP_ASSERT(lck != NULL);
              __kmp_destroy_lock(lck);
              __kmp_free(lck);
              buf->u.p.steal_lock = NULL;
            }
          }
        }
#endif
        /* NOTE: release shared buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        // Advancing buffer_index is what releases waiters in
        // __kmp_dispatch_init / __kmpc_sections_init.
        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      // Detach this thread from the (now finished) dispatch state.
      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
2448
2449 /*!
2450 @ingroup WORK_SHARING
2451 @param loc source location information
@param gtid global thread number
2453 @return Zero if the parallel region is not active and this thread should execute
2454 all sections, non-zero otherwise.
2455
2456 Beginning of sections construct.
2457 There are no implicit barriers in the "sections" calls, rather the compiler
2458 should introduce an explicit barrier if it is required.
2459
2460 This implementation is based on __kmp_dispatch_init, using same constructs for
2461 shared data (we can't have sections nested directly in omp for loop, there
2462 should be a parallel region in between)
2463 */
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));

  if (active) {
    // Setup sections in the same way as dynamic scheduled loops.
    // We need one shared data: which section is to execute next.
    // (in case parallel is not active, all sections will be executed on the
    // same thread)
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    // reuse shared data structures from dynamic sched loops:
    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

    // Route ordered entry/exit through the error handlers (ordered is not
    // applicable inside sections; see __kmp_dispatch_deo_error above).
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    // Wait until the previous user of this buffer has bumped buffer_index,
    // i.e. the buffer has been released for reuse.
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current =
        nullptr; // sections construct doesn't need private data
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return active;
}
2537
2538 /*!
2539 @ingroup WORK_SHARING
2540 @param loc source location information
@param gtid global thread number
2542 @param numberOfSections number of sections in the 'sections' construct
2543 @return unsigned [from 0 to n) - number (id) of the section to execute next on
2544 this thread. n (or any other number not in range) - nothing to execute on this
2545 thread
2546 */
2547
__kmpc_next_section(ident_t * loc,kmp_int32 gtid,kmp_int32 numberOfSections)2548 kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2549 kmp_int32 numberOfSections) {
2550
2551 KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2552
2553 kmp_info_t *th = __kmp_threads[gtid];
2554 #ifdef KMP_DEBUG
2555 kmp_team_t *team = th->th.th_team;
2556 #endif
2557
2558 KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2559 numberOfSections));
2560
2561 // For serialized case we should not call this function:
2562 KMP_DEBUG_ASSERT(!team->t.t_serialized);
2563
2564 dispatch_shared_info_template<kmp_int32> volatile *sh;
2565
2566 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2567 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2568
2569 KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2570 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2571 th->th.th_dispatch->th_dispatch_sh_current);
2572 KMP_DEBUG_ASSERT(sh);
2573
2574 kmp_int32 sectionIndex = 0;
2575 bool moreSectionsToExecute = true;
2576
2577 // Find section to execute:
2578 sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2579 if (sectionIndex >= numberOfSections) {
2580 moreSectionsToExecute = false;
2581 }
2582
2583 // status == 0: no more sections to execute;
2584 // OMPTODO: __kmpc_end_sections could be bypassed?
2585 if (!moreSectionsToExecute) {
2586 kmp_int32 num_done;
2587
2588 num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2589
2590 if (num_done == th->th.th_team_nproc - 1) {
2591 /* NOTE: release this buffer to be reused */
2592
2593 KMP_MB(); /* Flush all pending memory write invalidates. */
2594
2595 sh->u.s.num_done = 0;
2596 sh->u.s.iteration = 0;
2597
2598 KMP_MB(); /* Flush all pending memory write invalidates. */
2599
2600 sh->buffer_index += __kmp_dispatch_num_buffers;
2601 KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2602 sh->buffer_index));
2603
2604 KMP_MB(); /* Flush all pending memory write invalidates. */
2605
2606 } // if
2607
2608 th->th.th_dispatch->th_deo_fcn = NULL;
2609 th->th.th_dispatch->th_dxo_fcn = NULL;
2610 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2611 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2612
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
2614 if (ompt_enabled.ompt_callback_dispatch) {
2615 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2616 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2617 ompt_data_t instance = ompt_data_none;
2618 instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2619 ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2620 &(team_info->parallel_data), &(task_info->task_data),
2621 ompt_dispatch_section, instance);
2622 }
2623 #endif
2624 }
2625
2626 return sectionIndex;
2627 }
2628
2629 /*!
2630 @ingroup WORK_SHARING
2631 @param loc source location information
@param gtid global thread number
2633
2634 End of "sections" construct.
2635 Don't need to wait here: barrier is added separately when needed.
2636 */
__kmpc_end_sections(ident_t * loc,kmp_int32 gtid)2637 void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2638
2639 kmp_info_t *th = __kmp_threads[gtid];
2640 int active = !th->th.th_team->t.t_serialized;
2641
2642 KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2643
2644 if (!active) {
2645 // In active case call finalization is done in __kmpc_next_section
2646 #if OMPT_SUPPORT && OMPT_OPTIONAL
2647 if (ompt_enabled.ompt_callback_work) {
2648 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2649 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2650 ompt_callbacks.ompt_callback(ompt_callback_work)(
2651 ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2652 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2653 }
2654 #endif
2655 }
2656
2657 KMP_POP_PARTITIONED_TIMER();
2658 KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2659 }
2660
// Compute this team's share of a distribute construct: narrows the global
// iteration space [*plower, *pupper] (step incr) to the portion owned by
// the calling team in its league. *plastiter is set iff this team's portion
// contains the last iteration of the global space.
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0     - compile-time check
      //   for(i=10;i<0;--i) // incr < 0     - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      // balanced: first `extras` teams get chunk+1 iterations, rest get chunk
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      // greedy: every team gets ceil(trip_count/nteams) iterations; later
      // teams may fall partially or fully past the end and are clamped below
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value; // arithmetic wrapped: saturate
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value; // arithmetic wrapped: saturate
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}
2775
2776 //-----------------------------------------------------------------------------
2777 // Dispatch routines
2778 // Transfer call to template< type T >
2779 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2780 // T lb, T ub, ST st, ST chunk )
2781 extern "C" {
2782
2783 /*!
2784 @ingroup WORK_SHARING
2785 @{
2786 @param loc Source location
2787 @param gtid Global thread id
2788 @param schedule Schedule type
2789 @param lb Lower bound
2790 @param ub Upper bound
2791 @param st Step (or increment if you prefer)
2792 @param chunk The chunk size to block with
2793
2794 This function prepares the runtime to start a dynamically scheduled for loop,
2795 saving the loop arguments.
2796 These functions are all identical apart from the types of the arguments.
2797 */
2798
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // record the caller's address so OMPT callbacks can report a codeptr
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // signed 32-bit instantiation; trailing 'true' presumably selects the
  // workshare-push behavior — confirm against __kmp_dispatch_init's signature
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2808 /*!
2809 See @ref __kmpc_dispatch_init_4
2810 */
// Same as __kmpc_dispatch_init_4 but for unsigned 32-bit iteration variables
// (stride and chunk stay signed).
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  // Serial part of the runtime must already be initialized.
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2820
2821 /*!
2822 See @ref __kmpc_dispatch_init_4
2823 */
// Same as __kmpc_dispatch_init_4 but for signed 64-bit iteration variables.
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  // Serial part of the runtime must already be initialized.
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2833
2834 /*!
2835 See @ref __kmpc_dispatch_init_4
2836 */
// Same as __kmpc_dispatch_init_4 but for unsigned 64-bit iteration variables
// (stride and chunk stay signed).
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  // Serial part of the runtime must already be initialized.
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2846
2847 /*!
2848 See @ref __kmpc_dispatch_init_4
2849
2850 Difference from __kmpc_dispatch_init set of functions is these functions
2851 are called for composite distribute parallel for construct. Thus before
2852 regular iterations dispatching we need to calc per-team iteration space.
2853
2854 These functions are all identical apart from the types of the arguments.
2855 */
// Dispatch init for composite "distribute parallel for" with signed 32-bit
// iteration variables: first narrows [lb, ub] to this team's share of the
// distribute iteration space, then starts regular dynamic dispatch on it.
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Computes the per-team bounds in place (lb/ub are passed by address) and
  // sets *p_last if this team owns the last iteration.
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2867
// Same as __kmpc_dist_dispatch_init_4 but for unsigned 32-bit iteration
// variables.
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Narrow [lb, ub] to this team's share, then dispatch within it.
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2879
// Same as __kmpc_dist_dispatch_init_4 but for signed 64-bit iteration
// variables.
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Narrow [lb, ub] to this team's share, then dispatch within it.
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2891
// Same as __kmpc_dist_dispatch_init_4 but for unsigned 64-bit iteration
// variables.
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture the caller's return address for OMPT tool callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Narrow [lb, ub] to this team's share, then dispatch within it.
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
2903
2904 /*!
2905 @param loc Source code location
2906 @param gtid Global thread id
2907 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2908 otherwise
2909 @param p_lb Pointer to the lower bound for the next chunk of work
2910 @param p_ub Pointer to the upper bound for the next chunk of work
2911 @param p_st Pointer to the stride for the next chunk of work
2912 @return one if there is work to be done, zero otherwise
2913
2914 Get the next dynamically allocated chunk of work for this thread.
2915 If there is no more work, then the lb,ub and stride need not be modified.
2916 */
// Fetch the next chunk of a dynamically scheduled loop (signed 32-bit).
// Returns nonzero and fills *p_lb/*p_ub/*p_st while work remains; returns
// zero when the loop is exhausted (outputs then left untouched by contract).
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Store the caller's address, then reload it below to hand to the template
  // so OMPT callbacks report this call site.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
2929
2930 /*!
2931 See @ref __kmpc_dispatch_next_4
2932 */
// Same as __kmpc_dispatch_next_4 but for unsigned 32-bit iteration variables.
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Store the caller's address, then reload it below for OMPT callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
2946
2947 /*!
2948 See @ref __kmpc_dispatch_next_4
2949 */
// Same as __kmpc_dispatch_next_4 but for signed 64-bit iteration variables.
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Store the caller's address, then reload it below for OMPT callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
2962
2963 /*!
2964 See @ref __kmpc_dispatch_next_4
2965 */
// Same as __kmpc_dispatch_next_4 but for unsigned 64-bit iteration variables.
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Store the caller's address, then reload it below for OMPT callbacks.
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
2979
2980 /*!
2981 @param loc Source code location
2982 @param gtid Global thread id
2983
2984 Mark the end of a dynamic loop.
2985 */
// Mark the end of a dynamic loop (signed 32-bit variant). The finish
// bookkeeping is instantiated on the unsigned type for both signednesses.
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}
2989
2990 /*!
2991 See @ref __kmpc_dispatch_fini_4
2992 */
// Mark the end of a dynamic loop (signed 64-bit variant). The finish
// bookkeeping is instantiated on the unsigned type for both signednesses.
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
2996
2997 /*!
2998 See @ref __kmpc_dispatch_fini_4
2999 */
// Mark the end of a dynamic loop (unsigned 32-bit variant).
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}
3003
3004 /*!
3005 See @ref __kmpc_dispatch_fini_4
3006 */
// Mark the end of a dynamic loop (unsigned 64-bit variant).
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
3010 /*! @} */
3011
3012 //-----------------------------------------------------------------------------
3013 // Non-template routines from kmp_dispatch.cpp used in other sources
3014
__kmp_eq_4(kmp_uint32 value,kmp_uint32 checker)3015 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
3016 return value == checker;
3017 }
3018
__kmp_neq_4(kmp_uint32 value,kmp_uint32 checker)3019 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
3020 return value != checker;
3021 }
3022
__kmp_lt_4(kmp_uint32 value,kmp_uint32 checker)3023 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
3024 return value < checker;
3025 }
3026
__kmp_ge_4(kmp_uint32 value,kmp_uint32 checker)3027 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
3028 return value >= checker;
3029 }
3030
__kmp_le_4(kmp_uint32 value,kmp_uint32 checker)3031 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
3032 return value <= checker;
3033 }
3034
3035 kmp_uint32
__kmp_wait_4(volatile kmp_uint32 * spinner,kmp_uint32 checker,kmp_uint32 (* pred)(kmp_uint32,kmp_uint32),void * obj)3036 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3037 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3038 void *obj // Higher-level synchronization object, or NULL.
3039 ) {
3040 // note: we may not belong to a team at this point
3041 volatile kmp_uint32 *spin = spinner;
3042 kmp_uint32 check = checker;
3043 kmp_uint32 spins;
3044 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3045 kmp_uint32 r;
3046 kmp_uint64 time;
3047
3048 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3049 KMP_INIT_YIELD(spins);
3050 KMP_INIT_BACKOFF(time);
3051 // main wait spin loop
3052 while (!f(r = TCR_4(*spin), check)) {
3053 KMP_FSYNC_SPIN_PREPARE(obj);
3054 /* GEH - remove this since it was accidentally introduced when kmp_wait was
3055 split. It causes problems with infinite recursion because of exit lock */
3056 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3057 __kmp_abort_thread(); */
3058 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3059 }
3060 KMP_FSYNC_SPIN_ACQUIRED(obj);
3061 return r;
3062 }
3063
__kmp_wait_4_ptr(void * spinner,kmp_uint32 checker,kmp_uint32 (* pred)(void *,kmp_uint32),void * obj)3064 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3065 kmp_uint32 (*pred)(void *, kmp_uint32),
3066 void *obj // Higher-level synchronization object, or NULL.
3067 ) {
3068 // note: we may not belong to a team at this point
3069 void *spin = spinner;
3070 kmp_uint32 check = checker;
3071 kmp_uint32 spins;
3072 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3073 kmp_uint64 time;
3074
3075 KMP_FSYNC_SPIN_INIT(obj, spin);
3076 KMP_INIT_YIELD(spins);
3077 KMP_INIT_BACKOFF(time);
3078 // main wait spin loop
3079 while (!f(spin, check)) {
3080 KMP_FSYNC_SPIN_PREPARE(obj);
3081 /* if we have waited a bit, or are noversubscribed, yield */
3082 /* pause is in the following code */
3083 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3084 }
3085 KMP_FSYNC_SPIN_ACQUIRED(obj);
3086 }
3087
3088 } // extern "C"
3089
3090 #ifdef KMP_GOMP_COMPAT
3091
// GOMP-compat dispatch init (signed 32-bit): like __kmpc_dispatch_init_4 but
// lets the caller control workshare consistency pushing via push_ws.
void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}
3099
// GOMP-compat dispatch init (unsigned 32-bit); see __kmp_aux_dispatch_init_4.
void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}
3107
// GOMP-compat dispatch init (signed 64-bit); see __kmp_aux_dispatch_init_4.
void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}
3115
// GOMP-compat dispatch init (unsigned 64-bit); see __kmp_aux_dispatch_init_4.
void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}
3123
// GOMP-compat per-chunk finish (32-bit; unsigned instantiation serves both
// signednesses).
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}
3127
// GOMP-compat per-chunk finish (64-bit; unsigned instantiation serves both
// signednesses).
void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}
3131
// GOMP-compat per-chunk finish (unsigned 32-bit).
void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}
3135
// GOMP-compat per-chunk finish (unsigned 64-bit).
void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}
3139
3140 #endif /* KMP_GOMP_COMPAT */
3141
3142 /* ------------------------------------------------------------------------ */
3143