1 /*
2  * kmp_collapse.cpp -- loop collapse feature
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_error.h"
15 #include "kmp_i18n.h"
16 #include "kmp_itt.h"
17 #include "kmp_stats.h"
18 #include "kmp_str.h"
19 #include "kmp_collapse.h"
20 
21 #if OMPT_SUPPORT
22 #include "ompt-specific.h"
23 #endif
24 
25 // OMPTODO: different style of comments (see kmp_sched)
26 // OMPTODO: OMPT/OMPD
27 
28 // avoid inadevertently using a library based abs
29 template <typename T> T __kmp_abs(const T val) {
30   return (val < 0) ? -val : val;
31 }
32 kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
33 kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
34 
35 //----------------------------------------------------------------------------
36 // Common functions for working with rectangular and non-rectangular loops
37 //----------------------------------------------------------------------------
38 
39 template <typename T> int __kmp_sign(T val) {
40   return (T(0) < val) - (val < T(0));
41 }
42 
43 template <typename T> class CollapseAllocator {
44   typedef T *pT;
45 
46 private:
47   static const size_t allocaSize = 32; // size limit for stack allocations
48                                        // (8 bytes x 4 nested loops)
49   char stackAlloc[allocaSize];
50   static constexpr size_t maxElemCount = allocaSize / sizeof(T);
51   pT pTAlloc;
52 
53 public:
54   CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
55     if (n > maxElemCount) {
56       pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
57     }
58   }
59   ~CollapseAllocator() {
60     if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
61       __kmp_free(pTAlloc);
62     }
63   }
64   T &operator[](int index) { return pTAlloc[index]; }
65   operator const pT() { return pTAlloc; }
66 };
67 
68 //----------Loop canonicalization---------------------------------------------
69 
70 // For loop nest (any shape):
71 // convert != to < or >;
72 // switch from using < or > to <= or >=.
73 // "bounds" array has to be allocated per thread.
74 // All other internal functions will work only with canonicalized loops.
75 template <typename T>
76 void kmp_canonicalize_one_loop_XX(
77     ident_t *loc,
78     /*in/out*/ bounds_infoXX_template<T> *bounds) {
79 
80   if (__kmp_env_consistency_check) {
81     if (bounds->step == 0) {
82       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
83                             loc);
84     }
85   }
86 
87   if (bounds->comparison == comparison_t::comp_not_eq) {
88     // We can convert this to < or >, depends on the sign of the step:
89     if (bounds->step > 0) {
90       bounds->comparison = comparison_t::comp_less;
91     } else {
92       bounds->comparison = comparison_t::comp_greater;
93     }
94   }
95 
96   if (bounds->comparison == comparison_t::comp_less) {
97     // Note: ub0 can be unsigned. Should be Ok to hit overflow here,
98     // because ub0 + ub1*j should be still positive (otherwise loop was not
99     // well formed)
100     bounds->ub0 -= 1;
101     bounds->comparison = comparison_t::comp_less_or_eq;
102   } else if (bounds->comparison == comparison_t::comp_greater) {
103     bounds->ub0 += 1;
104     bounds->comparison = comparison_t::comp_greater_or_eq;
105   }
106 }
107 
108 // Canonicalize loop nest. original_bounds_nest is an array of length n.
109 void kmp_canonicalize_loop_nest(ident_t *loc,
110                                 /*in/out*/ bounds_info_t *original_bounds_nest,
111                                 kmp_index_t n) {
112 
113   for (kmp_index_t ind = 0; ind < n; ++ind) {
114     auto bounds = &(original_bounds_nest[ind]);
115 
116     switch (bounds->loop_type) {
117     case loop_type_t::loop_type_int32:
118       kmp_canonicalize_one_loop_XX<kmp_int32>(
119           loc,
120           /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
121       break;
122     case loop_type_t::loop_type_uint32:
123       kmp_canonicalize_one_loop_XX<kmp_uint32>(
124           loc,
125           /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
126       break;
127     case loop_type_t::loop_type_int64:
128       kmp_canonicalize_one_loop_XX<kmp_int64>(
129           loc,
130           /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
131       break;
132     case loop_type_t::loop_type_uint64:
133       kmp_canonicalize_one_loop_XX<kmp_uint64>(
134           loc,
135           /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
136       break;
137     default:
138       KMP_ASSERT(false);
139     }
140   }
141 }
142 
143 //----------Calculating trip count on one level-------------------------------
144 
145 // Calculate trip count on this loop level.
146 // We do this either for a rectangular loop nest,
147 // or after an adjustment bringing the loops to a parallelepiped shape.
148 // This number should not depend on the value of outer IV
149 // even if the formular has lb1 and ub1.
150 // Note: for non-rectangular loops don't use span for this, it's too big.
151 
152 template <typename T>
153 kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
154     /*in/out*/ bounds_infoXX_template<T> *bounds) {
155 
156   if (bounds->comparison == comparison_t::comp_less_or_eq) {
157     if (bounds->ub0 < bounds->lb0) {
158       // Note: after this we don't need to calculate inner loops,
159       // but that should be an edge case:
160       bounds->trip_count = 0;
161     } else {
162       // ub - lb may exceed signed type range; we need to cast to
163       // kmp_loop_nest_iv_t anyway
164       bounds->trip_count =
165           static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
166               __kmp_abs(bounds->step) +
167           1;
168     }
169   } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
170     if (bounds->lb0 < bounds->ub0) {
171       // Note: after this we don't need to calculate inner loops,
172       // but that should be an edge case:
173       bounds->trip_count = 0;
174     } else {
175       // lb - ub may exceed signed type range; we need to cast to
176       // kmp_loop_nest_iv_t anyway
177       bounds->trip_count =
178           static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
179               __kmp_abs(bounds->step) +
180           1;
181     }
182   } else {
183     KMP_ASSERT(false);
184   }
185   return bounds->trip_count;
186 }
187 
188 // Calculate trip count on this loop level.
189 kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
190 
191   kmp_loop_nest_iv_t trip_count = 0;
192 
193   switch (bounds->loop_type) {
194   case loop_type_t::loop_type_int32:
195     trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
196         /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
197     break;
198   case loop_type_t::loop_type_uint32:
199     trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
200         /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
201     break;
202   case loop_type_t::loop_type_int64:
203     trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
204         /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
205     break;
206   case loop_type_t::loop_type_uint64:
207     trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
208         /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
209     break;
210   default:
211     KMP_ASSERT(false);
212   }
213 
214   return trip_count;
215 }
216 
217 //----------Trim original iv according to its type----------------------------
218 
219 // Trim original iv according to its type.
220 // Return kmp_uint64 value which can be easily used in all internal calculations
221 // And can be statically cast back to original type in user code.
222 kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
223   kmp_uint64 res = 0;
224 
225   switch (loop_iv_type) {
226   case loop_type_t::loop_type_int8:
227     res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
228     break;
229   case loop_type_t::loop_type_uint8:
230     res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
231     break;
232   case loop_type_t::loop_type_int16:
233     res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
234     break;
235   case loop_type_t::loop_type_uint16:
236     res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
237     break;
238   case loop_type_t::loop_type_int32:
239     res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
240     break;
241   case loop_type_t::loop_type_uint32:
242     res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
243     break;
244   case loop_type_t::loop_type_int64:
245     res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
246     break;
247   case loop_type_t::loop_type_uint64:
248     res = static_cast<kmp_uint64>(original_iv);
249     break;
250   default:
251     KMP_ASSERT(false);
252   }
253 
254   return res;
255 }
256 
257 //----------Compare two IVs (remember they have a type)-----------------------
258 
259 bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
260                 kmp_uint64 original_iv2) {
261   bool res = false;
262 
263   switch (loop_iv_type) {
264   case loop_type_t::loop_type_int8:
265     res = static_cast<kmp_int8>(original_iv1) ==
266           static_cast<kmp_int8>(original_iv2);
267     break;
268   case loop_type_t::loop_type_uint8:
269     res = static_cast<kmp_uint8>(original_iv1) ==
270           static_cast<kmp_uint8>(original_iv2);
271     break;
272   case loop_type_t::loop_type_int16:
273     res = static_cast<kmp_int16>(original_iv1) ==
274           static_cast<kmp_int16>(original_iv2);
275     break;
276   case loop_type_t::loop_type_uint16:
277     res = static_cast<kmp_uint16>(original_iv1) ==
278           static_cast<kmp_uint16>(original_iv2);
279     break;
280   case loop_type_t::loop_type_int32:
281     res = static_cast<kmp_int32>(original_iv1) ==
282           static_cast<kmp_int32>(original_iv2);
283     break;
284   case loop_type_t::loop_type_uint32:
285     res = static_cast<kmp_uint32>(original_iv1) ==
286           static_cast<kmp_uint32>(original_iv2);
287     break;
288   case loop_type_t::loop_type_int64:
289     res = static_cast<kmp_int64>(original_iv1) ==
290           static_cast<kmp_int64>(original_iv2);
291     break;
292   case loop_type_t::loop_type_uint64:
293     res = static_cast<kmp_uint64>(original_iv1) ==
294           static_cast<kmp_uint64>(original_iv2);
295     break;
296   default:
297     KMP_ASSERT(false);
298   }
299 
300   return res;
301 }
302 
303 //----------Calculate original iv on one level--------------------------------
304 
305 // Return true if the point fits into upper bounds on this level,
306 // false otherwise
307 template <typename T>
308 bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
309                                  const kmp_point_t original_ivs,
310                                  kmp_index_t ind) {
311 
312   T iv = static_cast<T>(original_ivs[ind]);
313   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
314 
315   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
316        (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
317       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
318        (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
319     // The calculated point is outside of loop upper boundary:
320     return false;
321   }
322 
323   return true;
324 }
325 
326 // Calculate one iv corresponding to iteration on the level ind.
327 // Return true if it fits into lower-upper bounds on this level
328 // (if not, we need to re-calculate)
329 template <typename T>
330 bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
331                         /*in/out*/ kmp_point_t original_ivs,
332                         const kmp_iterations_t iterations, kmp_index_t ind,
333                         bool start_with_lower_bound, bool checkBounds) {
334 
335   kmp_uint64 temp = 0;
336   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
337 
338   if (start_with_lower_bound) {
339     // we moved to the next iteration on one of outer loops, should start
340     // with the lower bound here:
341     temp = bounds->lb0 + bounds->lb1 * outer_iv;
342   } else {
343     auto iteration = iterations[ind];
344     temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
345   }
346 
347   // Now trim original iv according to its type:
348   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
349 
350   if (checkBounds) {
351     return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
352   } else {
353     return true;
354   }
355 }
356 
357 bool kmp_calc_one_iv(const bounds_info_t *bounds,
358                      /*in/out*/ kmp_point_t original_ivs,
359                      const kmp_iterations_t iterations, kmp_index_t ind,
360                      bool start_with_lower_bound, bool checkBounds) {
361 
362   switch (bounds->loop_type) {
363   case loop_type_t::loop_type_int32:
364     return kmp_calc_one_iv_XX<kmp_int32>(
365         (bounds_infoXX_template<kmp_int32> *)(bounds),
366         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
367         checkBounds);
368     break;
369   case loop_type_t::loop_type_uint32:
370     return kmp_calc_one_iv_XX<kmp_uint32>(
371         (bounds_infoXX_template<kmp_uint32> *)(bounds),
372         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
373         checkBounds);
374     break;
375   case loop_type_t::loop_type_int64:
376     return kmp_calc_one_iv_XX<kmp_int64>(
377         (bounds_infoXX_template<kmp_int64> *)(bounds),
378         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
379         checkBounds);
380     break;
381   case loop_type_t::loop_type_uint64:
382     return kmp_calc_one_iv_XX<kmp_uint64>(
383         (bounds_infoXX_template<kmp_uint64> *)(bounds),
384         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
385         checkBounds);
386     break;
387   default:
388     KMP_ASSERT(false);
389     return false;
390   }
391 }
392 
393 //----------Calculate original iv on one level for rectangular loop nest------
394 
395 // Calculate one iv corresponding to iteration on the level ind.
396 // Return true if it fits into lower-upper bounds on this level
397 // (if not, we need to re-calculate)
398 template <typename T>
399 void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
400                                 /*in/out*/ kmp_uint64 *original_ivs,
401                                 const kmp_iterations_t iterations,
402                                 kmp_index_t ind) {
403 
404   auto iteration = iterations[ind];
405 
406   kmp_uint64 temp =
407       bounds->lb0 +
408       bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
409       iteration * bounds->step;
410 
411   // Now trim original iv according to its type:
412   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
413 }
414 
415 void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
416                              /*in/out*/ kmp_uint64 *original_ivs,
417                              const kmp_iterations_t iterations,
418                              kmp_index_t ind) {
419 
420   switch (bounds->loop_type) {
421   case loop_type_t::loop_type_int32:
422     kmp_calc_one_iv_rectang_XX<kmp_int32>(
423         (bounds_infoXX_template<kmp_int32> *)(bounds),
424         /*in/out*/ original_ivs, iterations, ind);
425     break;
426   case loop_type_t::loop_type_uint32:
427     kmp_calc_one_iv_rectang_XX<kmp_uint32>(
428         (bounds_infoXX_template<kmp_uint32> *)(bounds),
429         /*in/out*/ original_ivs, iterations, ind);
430     break;
431   case loop_type_t::loop_type_int64:
432     kmp_calc_one_iv_rectang_XX<kmp_int64>(
433         (bounds_infoXX_template<kmp_int64> *)(bounds),
434         /*in/out*/ original_ivs, iterations, ind);
435     break;
436   case loop_type_t::loop_type_uint64:
437     kmp_calc_one_iv_rectang_XX<kmp_uint64>(
438         (bounds_infoXX_template<kmp_uint64> *)(bounds),
439         /*in/out*/ original_ivs, iterations, ind);
440     break;
441   default:
442     KMP_ASSERT(false);
443   }
444 }
445 
446 //----------------------------------------------------------------------------
447 // Rectangular loop nest
448 //----------------------------------------------------------------------------
449 
450 //----------Canonicalize loop nest and calculate trip count-------------------
451 
452 // Canonicalize loop nest and calculate overall trip count.
453 // "bounds_nest" has to be allocated per thread.
454 // API will modify original bounds_nest array to bring it to a canonical form
455 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
456 // canonical form there will be no changes to bounds in bounds_nest array
457 // (only trip counts will be calculated).
458 // Returns trip count of overall space.
459 extern "C" kmp_loop_nest_iv_t
460 __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
461                                  /*in/out*/ bounds_info_t *original_bounds_nest,
462                                  kmp_index_t n) {
463 
464   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
465 
466   kmp_loop_nest_iv_t total = 1;
467 
468   for (kmp_index_t ind = 0; ind < n; ++ind) {
469     auto bounds = &(original_bounds_nest[ind]);
470 
471     kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
472     total *= trip_count;
473   }
474 
475   return total;
476 }
477 
478 //----------Calculate old induction variables---------------------------------
479 
480 // Calculate old induction variables corresponding to overall new_iv.
481 // Note: original IV will be returned as if it had kmp_uint64 type,
482 // will have to be converted to original type in user code.
483 // Note: trip counts should be already calculated by
484 // __kmpc_process_loop_nest_rectang.
485 // OMPTODO: special case 2, 3 nested loops: either do different
486 // interface without array or possibly template this over n
487 extern "C" void
488 __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
489                                  const bounds_info_t *original_bounds_nest,
490                                  /*out*/ kmp_uint64 *original_ivs,
491                                  kmp_index_t n) {
492 
493   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
494 
495   // First, calc corresponding iteration in every original loop:
496   for (kmp_index_t ind = n; ind > 0;) {
497     --ind;
498     auto bounds = &(original_bounds_nest[ind]);
499 
500     // should be optimized to OPDIVREM:
501     auto temp = new_iv / bounds->trip_count;
502     auto iteration = new_iv % bounds->trip_count;
503     new_iv = temp;
504 
505     iterations[ind] = iteration;
506   }
507   KMP_ASSERT(new_iv == 0);
508 
509   for (kmp_index_t ind = 0; ind < n; ++ind) {
510     auto bounds = &(original_bounds_nest[ind]);
511 
512     kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
513   }
514 }
515 
516 //----------------------------------------------------------------------------
517 // Non-rectangular loop nest
518 //----------------------------------------------------------------------------
519 
520 //----------Calculate maximum possible span of iv values on one level---------
521 
522 // Calculate span for IV on this loop level for "<=" case.
523 // Note: it's for <= on this loop nest level, so lower bound should be smallest
524 // value, upper bound should be the biggest value. If the loop won't execute,
525 // 'smallest' may be bigger than 'biggest', but we'd better not switch them
526 // around.
527 template <typename T>
528 void kmp_calc_span_lessoreq_XX(
529     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
530     /* in/out*/ bounds_info_internal_t *bounds_nest) {
531 
532   typedef typename traits_t<T>::unsigned_t UT;
533   // typedef typename traits_t<T>::signed_t ST;
534 
535   // typedef typename big_span_t span_t;
536   typedef T span_t;
537 
538   auto &bbounds = bounds->b;
539 
540   if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
541     // This dimention depends on one of previous ones; can't be the outermost
542     // one.
543     bounds_info_internalXX_template<T> *previous =
544         reinterpret_cast<bounds_info_internalXX_template<T> *>(
545             &(bounds_nest[bbounds.outer_iv]));
546 
547     // OMPTODO: assert that T is compatible with loop variable type on
548     // 'previous' loop
549 
550     {
551       span_t bound_candidate1 =
552           bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
553       span_t bound_candidate2 =
554           bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
555       if (bound_candidate1 < bound_candidate2) {
556         bounds->span_smallest = bound_candidate1;
557       } else {
558         bounds->span_smallest = bound_candidate2;
559       }
560     }
561 
562     {
563       // We can't adjust the upper bound with respect to step, because
564       // lower bound might be off after adjustments
565 
566       span_t bound_candidate1 =
567           bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
568       span_t bound_candidate2 =
569           bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
570       if (bound_candidate1 < bound_candidate2) {
571         bounds->span_biggest = bound_candidate2;
572       } else {
573         bounds->span_biggest = bound_candidate1;
574       }
575     }
576   } else {
577     // Rectangular:
578     bounds->span_smallest = bbounds.lb0;
579     bounds->span_biggest = bbounds.ub0;
580   }
581   if (!bounds->loop_bounds_adjusted) {
582     // Here it's safe to reduce the space to the multiply of step.
583     // OMPTODO: check if the formular is correct.
584     // Also check if it would be safe to do this if we didn't adjust left side.
585     bounds->span_biggest -=
586         (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
587   }
588 }
589 
590 // Calculate span for IV on this loop level for ">=" case.
591 template <typename T>
592 void kmp_calc_span_greateroreq_XX(
593     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
594     /* in/out*/ bounds_info_internal_t *bounds_nest) {
595 
596   typedef typename traits_t<T>::unsigned_t UT;
597   // typedef typename traits_t<T>::signed_t ST;
598 
599   // typedef typename big_span_t span_t;
600   typedef T span_t;
601 
602   auto &bbounds = bounds->b;
603 
604   if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
605     // This dimention depends on one of previous ones; can't be the outermost
606     // one.
607     bounds_info_internalXX_template<T> *previous =
608         reinterpret_cast<bounds_info_internalXX_template<T> *>(
609             &(bounds_nest[bbounds.outer_iv]));
610 
611     // OMPTODO: assert that T is compatible with loop variable type on
612     // 'previous' loop
613 
614     {
615       span_t bound_candidate1 =
616           bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
617       span_t bound_candidate2 =
618           bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
619       if (bound_candidate1 >= bound_candidate2) {
620         bounds->span_smallest = bound_candidate1;
621       } else {
622         bounds->span_smallest = bound_candidate2;
623       }
624     }
625 
626     {
627       // We can't adjust the upper bound with respect to step, because
628       // lower bound might be off after adjustments
629 
630       span_t bound_candidate1 =
631           bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
632       span_t bound_candidate2 =
633           bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
634       if (bound_candidate1 >= bound_candidate2) {
635         bounds->span_biggest = bound_candidate2;
636       } else {
637         bounds->span_biggest = bound_candidate1;
638       }
639     }
640 
641   } else {
642     // Rectangular:
643     bounds->span_biggest = bbounds.lb0;
644     bounds->span_smallest = bbounds.ub0;
645   }
646   if (!bounds->loop_bounds_adjusted) {
647     // Here it's safe to reduce the space to the multiply of step.
648     // OMPTODO: check if the formular is correct.
649     // Also check if it would be safe to do this if we didn't adjust left side.
650     bounds->span_biggest -=
651         (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
652   }
653 }
654 
655 // Calculate maximum possible span for IV on this loop level.
656 template <typename T>
657 void kmp_calc_span_XX(
658     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
659     /* in/out*/ bounds_info_internal_t *bounds_nest) {
660 
661   if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
662     kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
663   } else {
664     KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
665     kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
666   }
667 }
668 
669 //----------All initial processing of the loop nest---------------------------
670 
671 // Calculate new bounds for this loop level.
672 // To be able to work with the nest we need to get it to a parallelepiped shape.
673 // We need to stay in the original range of values, so that there will be no
674 // overflow, for that we'll adjust both upper and lower bounds as needed.
675 template <typename T>
676 void kmp_calc_new_bounds_XX(
677     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
678     /* in/out*/ bounds_info_internal_t *bounds_nest) {
679 
680   auto &bbounds = bounds->b;
681 
682   if (bbounds.lb1 == bbounds.ub1) {
683     // Already parallel, no need to adjust:
684     bounds->loop_bounds_adjusted = false;
685   } else {
686     bounds->loop_bounds_adjusted = true;
687 
688     T old_lb1 = bbounds.lb1;
689     T old_ub1 = bbounds.ub1;
690 
691     if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
692       // With this shape we can adjust to a rectangle:
693       bbounds.lb1 = 0;
694       bbounds.ub1 = 0;
695     } else {
696       // get upper and lower bounds to be parallel
697       // with values in the old range.
698       // Note: abs didn't work here.
699       if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
700           ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
701         bbounds.lb1 = old_ub1;
702       } else {
703         bbounds.ub1 = old_lb1;
704       }
705     }
706 
707     // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
708     // The idea here that for this IV we are now getting the same span
709     // irrespective of the previous IV value.
710     bounds_info_internalXX_template<T> *previous =
711         reinterpret_cast<bounds_info_internalXX_template<T> *>(
712             &bounds_nest[bbounds.outer_iv]);
713 
714     if (bbounds.comparison == comparison_t::comp_less_or_eq) {
715       if (old_lb1 < bbounds.lb1) {
716         KMP_ASSERT(old_lb1 < 0);
717         // The length is good on outer_iv biggest number,
718         // can use it to find where to move the lower bound:
719 
720         T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
721         bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
722                             // e.g. it was 0?? (same below)
723       } else if (old_lb1 > bbounds.lb1) {
724         // still need to move lower bound:
725         T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
726         bbounds.lb0 += add;
727       }
728 
729       if (old_ub1 > bbounds.ub1) {
730         KMP_ASSERT(old_ub1 > 0);
731         // The length is good on outer_iv biggest number,
732         // can use it to find where to move upper bound:
733 
734         T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
735         bbounds.ub0 += add;
736       } else if (old_ub1 < bbounds.ub1) {
737         // still need to move upper bound:
738         T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
739         bbounds.ub0 -= sub;
740       }
741     } else {
742       KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
743       if (old_lb1 < bbounds.lb1) {
744         KMP_ASSERT(old_lb1 < 0);
745         T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
746         bbounds.lb0 -= sub;
747       } else if (old_lb1 > bbounds.lb1) {
748         T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
749         bbounds.lb0 += add;
750       }
751 
752       if (old_ub1 > bbounds.ub1) {
753         KMP_ASSERT(old_ub1 > 0);
754         T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
755         bbounds.ub0 += add;
756       } else if (old_ub1 < bbounds.ub1) {
757         T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
758         bbounds.ub0 -= sub;
759       }
760     }
761   }
762 }
763 
764 // Do all processing for one canonicalized loop in the nest
765 // (assuming that outer loops already were processed):
766 template <typename T>
767 kmp_loop_nest_iv_t kmp_process_one_loop_XX(
768     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
769     /*in/out*/ bounds_info_internal_t *bounds_nest) {
770 
771   kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
772   kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
773   return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
774 }
775 
776 // Non-rectangular loop nest, canonicalized to use <= or >=.
777 // Process loop nest to have a parallelepiped shape,
778 // calculate biggest spans for IV's on all levels and calculate overall trip
779 // count. "bounds_nest" has to be allocated per thread.
780 // Returns overall trip count (for adjusted space).
781 kmp_loop_nest_iv_t kmp_process_loop_nest(
782     /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {
783 
784   kmp_loop_nest_iv_t total = 1;
785 
786   for (kmp_index_t ind = 0; ind < n; ++ind) {
787     auto bounds = &(bounds_nest[ind]);
788     kmp_loop_nest_iv_t trip_count = 0;
789 
790     switch (bounds->b.loop_type) {
791     case loop_type_t::loop_type_int32:
792       trip_count = kmp_process_one_loop_XX<kmp_int32>(
793           /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
794           /*in/out*/ bounds_nest);
795       break;
796     case loop_type_t::loop_type_uint32:
797       trip_count = kmp_process_one_loop_XX<kmp_uint32>(
798           /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
799           /*in/out*/ bounds_nest);
800       break;
801     case loop_type_t::loop_type_int64:
802       trip_count = kmp_process_one_loop_XX<kmp_int64>(
803           /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
804           /*in/out*/ bounds_nest);
805       break;
806     case loop_type_t::loop_type_uint64:
807       trip_count = kmp_process_one_loop_XX<kmp_uint64>(
808           /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
809           /*in/out*/ bounds_nest);
810       break;
811     default:
812       KMP_ASSERT(false);
813     }
814     total *= trip_count;
815   }
816 
817   return total;
818 }
819 
820 //----------Calculate iterations (in the original or updated space)-----------
821 
822 // Calculate number of iterations in original or updated space resulting in
823 // original_ivs[ind] (only on this level, non-negative)
824 // (not counting initial iteration)
825 template <typename T>
826 kmp_loop_nest_iv_t
827 kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
828                                  const kmp_point_t original_ivs,
829                                  kmp_index_t ind) {
830 
831   kmp_loop_nest_iv_t iterations = 0;
832 
833   if (bounds->comparison == comparison_t::comp_less_or_eq) {
834     iterations =
835         (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
836          bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
837         __kmp_abs(bounds->step);
838   } else {
839     KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
840     iterations = (bounds->lb0 +
841                   bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
842                   static_cast<T>(original_ivs[ind])) /
843                  __kmp_abs(bounds->step);
844   }
845 
846   return iterations;
847 }
848 
849 // Calculate number of iterations in the original or updated space resulting in
850 // original_ivs[ind] (only on this level, non-negative)
851 kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
852                                                  const kmp_point_t original_ivs,
853                                                  kmp_index_t ind) {
854 
855   switch (bounds->loop_type) {
856   case loop_type_t::loop_type_int32:
857     return kmp_calc_number_of_iterations_XX<kmp_int32>(
858         (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
859     break;
860   case loop_type_t::loop_type_uint32:
861     return kmp_calc_number_of_iterations_XX<kmp_uint32>(
862         (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
863     break;
864   case loop_type_t::loop_type_int64:
865     return kmp_calc_number_of_iterations_XX<kmp_int64>(
866         (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
867     break;
868   case loop_type_t::loop_type_uint64:
869     return kmp_calc_number_of_iterations_XX<kmp_uint64>(
870         (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
871     break;
872   default:
873     KMP_ASSERT(false);
874     return 0;
875   }
876 }
877 
878 //----------Calculate new iv corresponding to original ivs--------------------
879 
880 // We got a point in the original loop nest.
881 // Take updated bounds and calculate what new_iv will correspond to this point.
882 // When we are getting original IVs from new_iv, we have to adjust to fit into
883 // original loops bounds. Getting new_iv for the adjusted original IVs will help
884 // with making more chunks non-empty.
885 kmp_loop_nest_iv_t
886 kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
887                                   const kmp_point_t original_ivs,
888                                   kmp_index_t n) {
889 
890   kmp_loop_nest_iv_t new_iv = 0;
891 
892   for (kmp_index_t ind = 0; ind < n; ++ind) {
893     auto bounds = &(bounds_nest[ind].b);
894 
895     new_iv = new_iv * bounds->trip_count +
896              kmp_calc_number_of_iterations(bounds, original_ivs, ind);
897   }
898 
899   return new_iv;
900 }
901 
902 //----------Calculate original ivs for provided iterations--------------------
903 
904 // Calculate original IVs for provided iterations, assuming iterations are
905 // calculated in the original space.
906 // Loop nest is in canonical form (with <= / >=).
907 bool kmp_calc_original_ivs_from_iterations(
908     const bounds_info_t *original_bounds_nest, kmp_index_t n,
909     /*in/out*/ kmp_point_t original_ivs,
910     /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {
911 
912   kmp_index_t lengthened_ind = n;
913 
914   for (; ind < n;) {
915     auto bounds = &(original_bounds_nest[ind]);
916     bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
917                                 ind, (lengthened_ind < ind), true);
918 
919     if (!good) {
920       // The calculated iv value is too big (or too small for >=):
921       if (ind == 0) {
922         // Space is empty:
923         return false;
924       } else {
925         // Go to next iteration on the outer loop:
926         --ind;
927         ++iterations[ind];
928         lengthened_ind = ind;
929         for (kmp_index_t i = ind + 1; i < n; ++i) {
930           iterations[i] = 0;
931         }
932         continue;
933       }
934     }
935     ++ind;
936   }
937 
938   return true;
939 }
940 
941 //----------Calculate original ivs for the beginning of the loop nest---------
942 
943 // Calculate IVs for the beginning of the loop nest.
944 // Note: lower bounds of all loops may not work -
945 // if on some of the iterations of the outer loops inner loops are empty.
946 // Loop nest is in canonical form (with <= / >=).
947 bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
948                                      kmp_index_t n,
949                                      /*out*/ kmp_point_t original_ivs) {
950 
951   // Iterations in the original space, multiplied by step:
952   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
953   for (kmp_index_t ind = n; ind > 0;) {
954     --ind;
955     iterations[ind] = 0;
956   }
957 
958   // Now calculate the point:
959   bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
960                                                  /*in/out*/ original_ivs,
961                                                  /*in/out*/ iterations, 0);
962   return b;
963 }
964 
965 //----------Calculate next point in the original loop space-------------------
966 
967 // From current set of original IVs calculate next point.
968 // Return false if there is no next point in the loop bounds.
969 bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
970                                 kmp_index_t n, const kmp_point_t original_ivs,
971                                 /*out*/ kmp_point_t next_original_ivs) {
972   // Iterations in the original space, multiplied by step (so can be negative):
973   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
974   // First, calc corresponding iteration in every original loop:
975   for (kmp_index_t ind = 0; ind < n; ++ind) {
976     auto bounds = &(original_bounds_nest[ind]);
977     iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
978   }
979 
980   for (kmp_index_t ind = 0; ind < n; ++ind) {
981     next_original_ivs[ind] = original_ivs[ind];
982   }
983 
984   // Next add one step to the iterations on the inner-most level, and see if we
985   // need to move up the nest:
986   kmp_index_t ind = n - 1;
987   ++iterations[ind];
988 
989   bool b = kmp_calc_original_ivs_from_iterations(
990       original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
991 
992   return b;
993 }
994 
995 //----------Calculate chunk end in the original loop space--------------------
996 
997 // For one level calculate old induction variable corresponding to overall
998 // new_iv for the chunk end.
999 // Return true if it fits into upper bound on this level
1000 // (if not, we need to re-calculate)
1001 template <typename T>
1002 bool kmp_calc_one_iv_for_chunk_end_XX(
1003     const bounds_infoXX_template<T> *bounds,
1004     const bounds_infoXX_template<T> *updated_bounds,
1005     /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
1006     kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
1007     const kmp_point_t original_ivs_start) {
1008 
1009   // typedef  std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
1010   // big_span_t;
1011 
1012   // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
1013   T temp = 0;
1014 
1015   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
1016 
1017   if (start_with_lower_bound) {
1018     // we moved to the next iteration on one of outer loops, may as well use
1019     // the lower bound here:
1020     temp = bounds->lb0 + bounds->lb1 * outer_iv;
1021   } else {
1022     // Start in expanded space, but:
1023     // - we need to hit original space lower bound, so need to account for
1024     // that
1025     // - we have to go into original space, even if that means adding more
1026     // iterations than was planned
1027     // - we have to go past (or equal to) previous point (which is the chunk
1028     // starting point)
1029 
1030     auto iteration = iterations[ind];
1031 
1032     auto step = bounds->step;
1033 
1034     // In case of >= it's negative:
1035     auto accountForStep =
1036         ((bounds->lb0 + bounds->lb1 * outer_iv) -
1037          (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
1038         step;
1039 
1040     temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
1041            accountForStep + iteration * step;
1042 
1043     if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1044          (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
1045         ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1046          (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
1047       // Too small (or too big), didn't reach the original lower bound. Use
1048       // heuristic:
1049       temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
1050     }
1051 
1052     if (compare_with_start) {
1053 
1054       T start = static_cast<T>(original_ivs_start[ind]);
1055 
1056       temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1057 
1058       // On all previous levels start of the chunk is same as the end, need to
1059       // be really careful here:
1060       if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1061            (temp < start)) ||
1062           ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1063            (temp > start))) {
1064         // End of the chunk can't be smaller (for >= bigger) than it's start.
1065         // Use heuristic:
1066         temp = start + iteration / 4 * step;
1067       }
1068     }
1069   }
1070 
1071   original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1072 
1073   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1074        (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
1075       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1076        (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
1077     // Too big (or too small for >=).
1078     return false;
1079   }
1080 
1081   return true;
1082 }
1083 
1084 // For one level calculate old induction variable corresponding to overall
1085 // new_iv for the chunk end.
1086 bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
1087                                    const bounds_info_t *updated_bounds,
1088                                    /*in/out*/ kmp_point_t original_ivs,
1089                                    const kmp_iterations_t iterations,
1090                                    kmp_index_t ind, bool start_with_lower_bound,
1091                                    bool compare_with_start,
1092                                    const kmp_point_t original_ivs_start) {
1093 
1094   switch (bounds->loop_type) {
1095   case loop_type_t::loop_type_int32:
1096     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
1097         (bounds_infoXX_template<kmp_int32> *)(bounds),
1098         (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
1099         /*in/out*/
1100         original_ivs, iterations, ind, start_with_lower_bound,
1101         compare_with_start, original_ivs_start);
1102     break;
1103   case loop_type_t::loop_type_uint32:
1104     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
1105         (bounds_infoXX_template<kmp_uint32> *)(bounds),
1106         (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
1107         /*in/out*/
1108         original_ivs, iterations, ind, start_with_lower_bound,
1109         compare_with_start, original_ivs_start);
1110     break;
1111   case loop_type_t::loop_type_int64:
1112     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
1113         (bounds_infoXX_template<kmp_int64> *)(bounds),
1114         (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
1115         /*in/out*/
1116         original_ivs, iterations, ind, start_with_lower_bound,
1117         compare_with_start, original_ivs_start);
1118     break;
1119   case loop_type_t::loop_type_uint64:
1120     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
1121         (bounds_infoXX_template<kmp_uint64> *)(bounds),
1122         (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
1123         /*in/out*/
1124         original_ivs, iterations, ind, start_with_lower_bound,
1125         compare_with_start, original_ivs_start);
1126     break;
1127   default:
1128     KMP_ASSERT(false);
1129     return false;
1130   }
1131 }
1132 
1133 // Calculate old induction variables corresponding to overall new_iv for the
1134 // chunk end. If due to space extension we are getting old IVs outside of the
1135 // boundaries, bring them into the boundaries. Need to do this in the runtime,
1136 // esp. on the lower bounds side. When getting result need to make sure that the
1137 // new chunk starts at next position to old chunk, not overlaps with it (this is
1138 // done elsewhere), and need to make sure end of the chunk is further than the
1139 // beginning of the chunk. We don't need an exact ending point here, just
1140 // something more-or-less close to the desired chunk length, bigger is fine
1141 // (smaller would be fine, but we risk going into infinite loop, so do smaller
1142 // only at the very end of the space). result: false if could not find the
1143 // ending point in the original loop space. In this case the caller can use
1144 // original upper bounds as the end of the chunk. Chunk won't be empty, because
1145 // it'll have at least the starting point, which is by construction in the
1146 // original space.
1147 bool kmp_calc_original_ivs_for_chunk_end(
1148     const bounds_info_t *original_bounds_nest, kmp_index_t n,
1149     const bounds_info_internal_t *updated_bounds_nest,
1150     const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
1151     /*out*/ kmp_point_t original_ivs) {
1152 
1153   // Iterations in the expanded space:
1154   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
1155   // First, calc corresponding iteration in every modified loop:
1156   for (kmp_index_t ind = n; ind > 0;) {
1157     --ind;
1158     auto &updated_bounds = updated_bounds_nest[ind];
1159 
1160     // should be optimized to OPDIVREM:
1161     auto new_ind = new_iv / updated_bounds.b.trip_count;
1162     auto iteration = new_iv % updated_bounds.b.trip_count;
1163 
1164     new_iv = new_ind;
1165     iterations[ind] = iteration;
1166   }
1167   KMP_DEBUG_ASSERT(new_iv == 0);
1168 
1169   kmp_index_t lengthened_ind = n;
1170   kmp_index_t equal_ind = -1;
1171 
1172   // Next calculate the point, but in original loop nest.
1173   for (kmp_index_t ind = 0; ind < n;) {
1174     auto bounds = &(original_bounds_nest[ind]);
1175     auto updated_bounds = &(updated_bounds_nest[ind].b);
1176 
1177     bool good = kmp_calc_one_iv_for_chunk_end(
1178         bounds, updated_bounds,
1179         /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
1180         (equal_ind >= ind - 1), original_ivs_start);
1181 
1182     if (!good) {
1183       // Too big (or too small for >=).
1184       if (ind == 0) {
1185         // Need to reduce to the end.
1186         return false;
1187       } else {
1188         // Go to next iteration on outer loop:
1189         --ind;
1190         ++(iterations[ind]);
1191         lengthened_ind = ind;
1192         if (equal_ind >= lengthened_ind) {
1193           // We've changed the number of iterations here,
1194           // can't be same anymore:
1195           equal_ind = lengthened_ind - 1;
1196         }
1197         for (kmp_index_t i = ind + 1; i < n; ++i) {
1198           iterations[i] = 0;
1199         }
1200         continue;
1201       }
1202     }
1203 
1204     if ((equal_ind == ind - 1) &&
1205         (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1206                     original_ivs_start[ind]))) {
1207       equal_ind = ind;
1208     } else if ((equal_ind > ind - 1) &&
1209                !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1210                             original_ivs_start[ind]))) {
1211       equal_ind = ind - 1;
1212     }
1213     ++ind;
1214   }
1215 
1216   return true;
1217 }
1218 
1219 //----------Calculate upper bounds for the last chunk-------------------------
1220 
1221 // Calculate one upper bound for the end.
1222 template <typename T>
1223 void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
1224                             /*in/out*/ kmp_point_t original_ivs,
1225                             kmp_index_t ind) {
1226 
1227   T temp = bounds->ub0 +
1228            bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
1229 
1230   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
1231 }
1232 
1233 void kmp_calc_one_iv_end(const bounds_info_t *bounds,
1234                          /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
1235 
1236   switch (bounds->loop_type) {
1237   default:
1238     KMP_ASSERT(false);
1239     break;
1240   case loop_type_t::loop_type_int32:
1241     kmp_calc_one_iv_end_XX<kmp_int32>(
1242         (bounds_infoXX_template<kmp_int32> *)(bounds),
1243         /*in/out*/ original_ivs, ind);
1244     break;
1245   case loop_type_t::loop_type_uint32:
1246     kmp_calc_one_iv_end_XX<kmp_uint32>(
1247         (bounds_infoXX_template<kmp_uint32> *)(bounds),
1248         /*in/out*/ original_ivs, ind);
1249     break;
1250   case loop_type_t::loop_type_int64:
1251     kmp_calc_one_iv_end_XX<kmp_int64>(
1252         (bounds_infoXX_template<kmp_int64> *)(bounds),
1253         /*in/out*/ original_ivs, ind);
1254     break;
1255   case loop_type_t::loop_type_uint64:
1256     kmp_calc_one_iv_end_XX<kmp_uint64>(
1257         (bounds_infoXX_template<kmp_uint64> *)(bounds),
1258         /*in/out*/ original_ivs, ind);
1259     break;
1260   }
1261 }
1262 
1263 // Calculate upper bounds for the last loop iteration. Just use original upper
1264 // bounds (adjusted when canonicalized to use <= / >=). No need to check that
1265 // this point is in the original space (it's likely not)
1266 void kmp_calc_original_ivs_for_end(
1267     const bounds_info_t *const original_bounds_nest, kmp_index_t n,
1268     /*out*/ kmp_point_t original_ivs) {
1269   for (kmp_index_t ind = 0; ind < n; ++ind) {
1270     auto bounds = &(original_bounds_nest[ind]);
1271     kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
1272   }
1273 }
1274 
1275 //----------Init API for non-rectangular loops--------------------------------
1276 
1277 // Init API for collapsed loops (static, no chunks defined).
1278 // "bounds_nest" has to be allocated per thread.
1279 // API will modify original bounds_nest array to bring it to a canonical form
1280 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
1281 // canonical form there will be no changes to bounds in bounds_nest array
1282 // (only trip counts will be calculated). Internally API will expand the space
1283 // to parallelogram/parallelepiped, calculate total, calculate bounds for the
1284 // chunks in terms of the new IV, re-calc them in terms of old IVs (especially
1285 // important on the left side, to hit the lower bounds and not step over), and
1286 // pick the correct chunk for this thread (so it will calculate chunks up to the
1287 // needed one). It could be optimized to calculate just this chunk, potentially
1288 // a bit less well distributed among threads. It is designed to make sure that
1289 // threads will receive predictable chunks, deterministically (so that next nest
1290 // of loops with similar characteristics will get exactly same chunks on same
1291 // threads).
1292 // Current contract: chunk_bounds_nest has only lb0 and ub0,
1293 // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
1294 extern "C" kmp_int32
1295 __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
1296                           /*in/out*/ bounds_info_t *original_bounds_nest,
1297                           /*out*/ bounds_info_t *chunk_bounds_nest,
1298                           kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
1299 
1300   KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
1301   KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
1302 
1303   if (__kmp_env_consistency_check) {
1304     __kmp_push_workshare(gtid, ct_pdo, loc);
1305   }
1306 
1307   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
1308 
1309   CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
1310 
1311   for (kmp_index_t i = 0; i < n; ++i) {
1312     updated_bounds_nest[i].b = original_bounds_nest[i];
1313   }
1314 
1315   kmp_loop_nest_iv_t total =
1316       kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
1317 
1318   if (plastiter != NULL) {
1319     *plastiter = FALSE;
1320   }
1321 
1322   if (total == 0) {
1323     // Loop won't execute:
1324     return FALSE;
1325   }
1326 
1327   // OMPTODO: DISTRIBUTE is not supported yet
1328   __kmp_assert_valid_gtid(gtid);
1329   kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
1330 
1331   kmp_info_t *th = __kmp_threads[gtid];
1332   kmp_team_t *team = th->th.th_team;
1333   kmp_uint32 nth = team->t.t_nproc; // Number of threads
1334 
1335   KMP_DEBUG_ASSERT(tid < nth);
1336 
1337   CollapseAllocator<kmp_uint64> original_ivs_start(n);
1338 
1339   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
1340                                        /*out*/ original_ivs_start)) {
1341     // Loop won't execute:
1342     return FALSE;
1343   }
1344 
1345   // Not doing this optimization for one thread:
1346   // (1) more to test
1347   // (2) without it current contract that chunk_bounds_nest has only lb0 and
1348   // ub0, lb1 and ub1 are set to 0 and can be ignored.
1349   // if (nth == 1) {
1350   //  // One thread:
1351   //  // Copy all info from original_bounds_nest, it'll be good enough.
1352 
1353   //  for (kmp_index_t i = 0; i < n; ++i) {
1354   //    chunk_bounds_nest[i] = original_bounds_nest[i];
1355   //  }
1356 
1357   //  if (plastiter != NULL) {
1358   //    *plastiter = TRUE;
1359   //  }
1360   //  return TRUE;
1361   //}
1362 
1363   kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
1364       updated_bounds_nest, original_ivs_start, n);
1365 
1366   bool last_iter = false;
1367 
1368   for (; nth > 0;) {
1369     // We could calculate chunk size once, but this is to compensate that the
1370     // original space is not parallelepiped and some threads can be left
1371     // without work:
1372     KMP_DEBUG_ASSERT(total >= new_iv);
1373 
1374     kmp_loop_nest_iv_t total_left = total - new_iv;
1375     kmp_loop_nest_iv_t chunk_size = total_left / nth;
1376     kmp_loop_nest_iv_t remainder = total_left % nth;
1377 
1378     kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
1379 
1380     if (remainder > 0) {
1381       ++curr_chunk_size;
1382       --remainder;
1383     }
1384 
1385 #if defined(KMP_DEBUG)
1386     kmp_loop_nest_iv_t new_iv_for_start = new_iv;
1387 #endif
1388 
1389     if (curr_chunk_size > 1) {
1390       new_iv += curr_chunk_size - 1;
1391     }
1392 
1393     CollapseAllocator<kmp_uint64> original_ivs_end(n);
1394     if ((nth == 1) || (new_iv >= total - 1)) {
1395       // Do this one till the end - just in case we miscalculated
1396       // and either too much is left to process or new_iv is a bit too big:
1397       kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1398                                     /*out*/ original_ivs_end);
1399 
1400       last_iter = true;
1401     } else {
1402       // Note: here we make sure it's past (or equal to) the previous point.
1403       if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
1404                                                updated_bounds_nest,
1405                                                original_ivs_start, new_iv,
1406                                                /*out*/ original_ivs_end)) {
1407         // We could not find the ending point, use the original upper bounds:
1408         kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1409                                       /*out*/ original_ivs_end);
1410 
1411         last_iter = true;
1412       }
1413     }
1414 
1415 #if defined(KMP_DEBUG)
1416     auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
1417         updated_bounds_nest, original_ivs_end, n);
1418     KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
1419 #endif
1420 
1421     if (last_iter && (tid != 0)) {
1422       // We are done, this was last chunk, but no chunk for current thread was
1423       // found:
1424       return FALSE;
1425     }
1426 
1427     if (tid == 0) {
1428       // We found the chunk for this thread, now we need to check if it's the
1429       // last chunk or not:
1430 
1431       CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
1432       if (last_iter ||
1433           !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
1434                                       /*out*/ original_ivs_next_start)) {
1435         // no more loop iterations left to process,
1436         // this means that currently found chunk is the last chunk:
1437         if (plastiter != NULL) {
1438           *plastiter = TRUE;
1439         }
1440       }
1441 
1442       // Fill in chunk bounds:
1443       for (kmp_index_t i = 0; i < n; ++i) {
1444         chunk_bounds_nest[i] =
1445             original_bounds_nest[i]; // To fill in types, etc. - optional
1446         chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
1447         chunk_bounds_nest[i].lb1_u64 = 0;
1448 
1449         chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
1450         chunk_bounds_nest[i].ub1_u64 = 0;
1451       }
1452 
1453       return TRUE;
1454     }
1455 
1456     --tid;
1457     --nth;
1458 
1459     bool next_chunk = kmp_calc_next_original_ivs(
1460         original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
1461     if (!next_chunk) {
1462       // no more loop iterations to process,
1463       // the prevoius chunk was the last chunk
1464       break;
1465     }
1466 
1467     // original_ivs_start is next to previous chunk original_ivs_end,
1468     // we need to start new chunk here, so chunks will be one after another
1469     // without any gap or overlap:
1470     new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
1471                                                original_ivs_start, n);
1472   }
1473 
1474   return FALSE;
1475 }
1476