1 /*
2  * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_DISPATCH_HIER_H
14 #define KMP_DISPATCH_HIER_H
15 #include "kmp.h"
16 #include "kmp_dispatch.h"
17 
18 // Layer type for scheduling hierarchy
19 enum kmp_hier_layer_e {
20   LAYER_THREAD = -1,
21   LAYER_L1,
22   LAYER_L2,
23   LAYER_L3,
24   LAYER_NUMA,
25   LAYER_LOOP,
26   LAYER_LAST
27 };
28 
29 // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
__kmp_get_hier_str(kmp_hier_layer_e type)30 static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
31   switch (type) {
32   case kmp_hier_layer_e::LAYER_THREAD:
33     return "THREAD";
34   case kmp_hier_layer_e::LAYER_L1:
35     return "L1";
36   case kmp_hier_layer_e::LAYER_L2:
37     return "L2";
38   case kmp_hier_layer_e::LAYER_L3:
39     return "L3";
40   case kmp_hier_layer_e::LAYER_NUMA:
41     return "NUMA";
42   case kmp_hier_layer_e::LAYER_LOOP:
43     return "WHOLE_LOOP";
44   case kmp_hier_layer_e::LAYER_LAST:
45     return "LAST";
46   }
47   KMP_ASSERT(0);
48   // Appease compilers, should never get here
49   return "ERROR";
50 }
51 
52 // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
53 typedef struct kmp_hier_sched_env_t {
54   int size;
55   int capacity;
56   enum sched_type *scheds;
57   kmp_int32 *small_chunks;
58   kmp_int64 *large_chunks;
59   kmp_hier_layer_e *layers;
60   // Append a level of the hierarchy
appendkmp_hier_sched_env_t61   void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
62     if (capacity == 0) {
63       scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
64                                                  kmp_hier_layer_e::LAYER_LAST);
65       small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
66                                                  kmp_hier_layer_e::LAYER_LAST);
67       large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
68                                                  kmp_hier_layer_e::LAYER_LAST);
69       layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
70                                                   kmp_hier_layer_e::LAYER_LAST);
71       capacity = kmp_hier_layer_e::LAYER_LAST;
72     }
73     int current_size = size;
74     KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
75     scheds[current_size] = sched;
76     layers[current_size] = layer;
77     small_chunks[current_size] = chunk;
78     large_chunks[current_size] = (kmp_int64)chunk;
79     size++;
80   }
81   // Sort the hierarchy using selection sort, size will always be small
82   // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
sortkmp_hier_sched_env_t83   void sort() {
84     if (size <= 1)
85       return;
86     for (int i = 0; i < size; ++i) {
87       int switch_index = i;
88       for (int j = i + 1; j < size; ++j) {
89         if (layers[j] < layers[switch_index])
90           switch_index = j;
91       }
92       if (switch_index != i) {
93         kmp_hier_layer_e temp1 = layers[i];
94         enum sched_type temp2 = scheds[i];
95         kmp_int32 temp3 = small_chunks[i];
96         kmp_int64 temp4 = large_chunks[i];
97         layers[i] = layers[switch_index];
98         scheds[i] = scheds[switch_index];
99         small_chunks[i] = small_chunks[switch_index];
100         large_chunks[i] = large_chunks[switch_index];
101         layers[switch_index] = temp1;
102         scheds[switch_index] = temp2;
103         small_chunks[switch_index] = temp3;
104         large_chunks[switch_index] = temp4;
105       }
106     }
107   }
108   // Free all memory
deallocatekmp_hier_sched_env_t109   void deallocate() {
110     if (capacity > 0) {
111       __kmp_free(scheds);
112       __kmp_free(layers);
113       __kmp_free(small_chunks);
114       __kmp_free(large_chunks);
115       scheds = NULL;
116       layers = NULL;
117       small_chunks = NULL;
118       large_chunks = NULL;
119     }
120     size = 0;
121     capacity = 0;
122   }
123 } kmp_hier_sched_env_t;
124 
125 extern int __kmp_dispatch_hand_threading;
126 extern kmp_hier_sched_env_t __kmp_hier_scheds;
127 
128 // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
129 extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
130 extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
131 
132 extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
133 extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
134 extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
135                                         kmp_hier_layer_e t2);
136 extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
137 
138 template <typename T> struct kmp_hier_shared_bdata_t {
139   typedef typename traits_t<T>::signed_t ST;
140   volatile kmp_uint64 val[2];
141   kmp_int32 status[2];
142   T lb[2];
143   T ub[2];
144   ST st[2];
145   dispatch_shared_info_template<T> sh[2];
zerokmp_hier_shared_bdata_t146   void zero() {
147     val[0] = val[1] = 0;
148     status[0] = status[1] = 0;
149     lb[0] = lb[1] = 0;
150     ub[0] = ub[1] = 0;
151     st[0] = st[1] = 0;
152     sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
153   }
set_next_hand_threadkmp_hier_shared_bdata_t154   void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
155                             kmp_uint64 index) {
156     lb[1 - index] = nlb;
157     ub[1 - index] = nub;
158     st[1 - index] = nst;
159     status[1 - index] = nstatus;
160   }
set_nextkmp_hier_shared_bdata_t161   void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
162     lb[1 - index] = nlb;
163     ub[1 - index] = nub;
164     st[1 - index] = nst;
165     status[1 - index] = nstatus;
166     sh[1 - index].u.s.iteration = 0;
167   }
168 
get_next_statuskmp_hier_shared_bdata_t169   kmp_int32 get_next_status(kmp_uint64 index) const {
170     return status[1 - index];
171   }
get_next_lbkmp_hier_shared_bdata_t172   T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
get_next_ubkmp_hier_shared_bdata_t173   T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
get_next_stkmp_hier_shared_bdata_t174   ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
get_next_shkmp_hier_shared_bdata_t175   dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
176     return &(sh[1 - index]);
177   }
178 
get_curr_statuskmp_hier_shared_bdata_t179   kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
get_curr_lbkmp_hier_shared_bdata_t180   T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
get_curr_ubkmp_hier_shared_bdata_t181   T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
get_curr_stkmp_hier_shared_bdata_t182   ST get_curr_st(kmp_uint64 index) const { return st[index]; }
get_curr_shkmp_hier_shared_bdata_t183   dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
184     return &(sh[index]);
185   }
186 };
187 
188 /*
189  * In the barrier implementations, num_active is the number of threads that are
190  * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
191  * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
192  * structure. tdata is the thread private data that resides on the thread
193  * data structure.
194  *
195  * The reset_shared() method is used to initialize the barrier data on the
196  * kmp_hier_top_unit_t hierarchy structure
197  *
198  * The reset_private() method is used to initialize the barrier data on the
199  * thread's private dispatch buffer structure
200  *
201  * The barrier() method takes an id, which is that thread's id for the
202  * kmp_hier_top_unit_t structure, and implements the barrier.  All threads wait
203  * inside barrier() until all fellow threads who are attached to that
204  * kmp_hier_top_unit_t structure have arrived.
205  */
206 
207 // Core barrier implementation
208 // Can be used in a unit with between 2 to 8 threads
209 template <typename T> class core_barrier_impl {
get_wait_val(int num_active)210   static inline kmp_uint64 get_wait_val(int num_active) {
211     kmp_uint64 wait_val = 0LL;
212     switch (num_active) {
213     case 2:
214       wait_val = 0x0101LL;
215       break;
216     case 3:
217       wait_val = 0x010101LL;
218       break;
219     case 4:
220       wait_val = 0x01010101LL;
221       break;
222     case 5:
223       wait_val = 0x0101010101LL;
224       break;
225     case 6:
226       wait_val = 0x010101010101LL;
227       break;
228     case 7:
229       wait_val = 0x01010101010101LL;
230       break;
231     case 8:
232       wait_val = 0x0101010101010101LL;
233       break;
234     default:
235       // don't use the core_barrier_impl for more than 8 threads
236       KMP_ASSERT(0);
237     }
238     return wait_val;
239   }
240 
241 public:
242   static void reset_private(kmp_int32 num_active,
243                             kmp_hier_private_bdata_t *tdata);
244   static void reset_shared(kmp_int32 num_active,
245                            kmp_hier_shared_bdata_t<T> *bdata);
246   static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
247                       kmp_hier_private_bdata_t *tdata);
248 };
249 
250 template <typename T>
reset_private(kmp_int32 num_active,kmp_hier_private_bdata_t * tdata)251 void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
252                                          kmp_hier_private_bdata_t *tdata) {
253   tdata->num_active = num_active;
254   tdata->index = 0;
255   tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
256 }
257 template <typename T>
reset_shared(kmp_int32 num_active,kmp_hier_shared_bdata_t<T> * bdata)258 void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
259                                         kmp_hier_shared_bdata_t<T> *bdata) {
260   bdata->val[0] = bdata->val[1] = 0LL;
261   bdata->status[0] = bdata->status[1] = 0LL;
262 }
263 template <typename T>
barrier(kmp_int32 id,kmp_hier_shared_bdata_t<T> * bdata,kmp_hier_private_bdata_t * tdata)264 void core_barrier_impl<T>::barrier(kmp_int32 id,
265                                    kmp_hier_shared_bdata_t<T> *bdata,
266                                    kmp_hier_private_bdata_t *tdata) {
267   kmp_uint64 current_index = tdata->index;
268   kmp_uint64 next_index = 1 - current_index;
269   kmp_uint64 current_wait_value = tdata->wait_val[current_index];
270   kmp_uint64 next_wait_value =
271       (current_wait_value ? 0 : get_wait_val(tdata->num_active));
272   KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
273                 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
274                 __kmp_get_gtid(), current_index, next_index, current_wait_value,
275                 next_wait_value));
276   char v = (current_wait_value ? '\1' : '\0');
277   (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
278   __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
279                          __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
280   tdata->wait_val[current_index] = next_wait_value;
281   tdata->index = next_index;
282 }
283 
284 // Counter barrier implementation
285 // Can be used in a unit with arbitrary number of active threads
286 template <typename T> class counter_barrier_impl {
287 public:
288   static void reset_private(kmp_int32 num_active,
289                             kmp_hier_private_bdata_t *tdata);
290   static void reset_shared(kmp_int32 num_active,
291                            kmp_hier_shared_bdata_t<T> *bdata);
292   static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
293                       kmp_hier_private_bdata_t *tdata);
294 };
295 
296 template <typename T>
reset_private(kmp_int32 num_active,kmp_hier_private_bdata_t * tdata)297 void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
298                                             kmp_hier_private_bdata_t *tdata) {
299   tdata->num_active = num_active;
300   tdata->index = 0;
301   tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
302 }
303 template <typename T>
reset_shared(kmp_int32 num_active,kmp_hier_shared_bdata_t<T> * bdata)304 void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
305                                            kmp_hier_shared_bdata_t<T> *bdata) {
306   bdata->val[0] = bdata->val[1] = 0LL;
307   bdata->status[0] = bdata->status[1] = 0LL;
308 }
309 template <typename T>
barrier(kmp_int32 id,kmp_hier_shared_bdata_t<T> * bdata,kmp_hier_private_bdata_t * tdata)310 void counter_barrier_impl<T>::barrier(kmp_int32 id,
311                                       kmp_hier_shared_bdata_t<T> *bdata,
312                                       kmp_hier_private_bdata_t *tdata) {
313   volatile kmp_int64 *val;
314   kmp_uint64 current_index = tdata->index;
315   kmp_uint64 next_index = 1 - current_index;
316   kmp_uint64 current_wait_value = tdata->wait_val[current_index];
317   kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
318 
319   KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
320                 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
321                 __kmp_get_gtid(), current_index, next_index, current_wait_value,
322                 next_wait_value));
323   val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
324   KMP_TEST_THEN_INC64(val);
325   __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
326                          __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
327   tdata->wait_val[current_index] = next_wait_value;
328   tdata->index = next_index;
329 }
330 
331 // Data associated with topology unit within a layer
332 // For example, one kmp_hier_top_unit_t corresponds to one L1 cache
333 template <typename T> struct kmp_hier_top_unit_t {
334   typedef typename traits_t<T>::signed_t ST;
335   typedef typename traits_t<T>::unsigned_t UT;
336   kmp_int32 active; // number of topology units that communicate with this unit
337   // chunk information (lower/upper bound, stride, etc.)
338   dispatch_private_info_template<T> hier_pr;
339   kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
340   kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
341 
get_hier_idkmp_hier_top_unit_t342   kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
reset_shared_barrierkmp_hier_top_unit_t343   void reset_shared_barrier() {
344     KMP_DEBUG_ASSERT(active > 0);
345     if (active == 1)
346       return;
347     hier_barrier.zero();
348     if (active >= 2 && active <= 8) {
349       core_barrier_impl<T>::reset_shared(active, &hier_barrier);
350     } else {
351       counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
352     }
353   }
reset_private_barrierkmp_hier_top_unit_t354   void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
355     KMP_DEBUG_ASSERT(tdata);
356     KMP_DEBUG_ASSERT(active > 0);
357     if (active == 1)
358       return;
359     if (active >= 2 && active <= 8) {
360       core_barrier_impl<T>::reset_private(active, tdata);
361     } else {
362       counter_barrier_impl<T>::reset_private(active, tdata);
363     }
364   }
barrierkmp_hier_top_unit_t365   void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
366     KMP_DEBUG_ASSERT(tdata);
367     KMP_DEBUG_ASSERT(active > 0);
368     KMP_DEBUG_ASSERT(id >= 0 && id < active);
369     if (active == 1) {
370       tdata->index = 1 - tdata->index;
371       return;
372     }
373     if (active >= 2 && active <= 8) {
374       core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
375     } else {
376       counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
377     }
378   }
379 
get_next_statuskmp_hier_top_unit_t380   kmp_int32 get_next_status(kmp_uint64 index) const {
381     return hier_barrier.get_next_status(index);
382   }
get_next_lbkmp_hier_top_unit_t383   T get_next_lb(kmp_uint64 index) const {
384     return hier_barrier.get_next_lb(index);
385   }
get_next_ubkmp_hier_top_unit_t386   T get_next_ub(kmp_uint64 index) const {
387     return hier_barrier.get_next_ub(index);
388   }
get_next_stkmp_hier_top_unit_t389   ST get_next_st(kmp_uint64 index) const {
390     return hier_barrier.get_next_st(index);
391   }
get_next_shkmp_hier_top_unit_t392   dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
393     return hier_barrier.get_next_sh(index);
394   }
395 
get_curr_statuskmp_hier_top_unit_t396   kmp_int32 get_curr_status(kmp_uint64 index) const {
397     return hier_barrier.get_curr_status(index);
398   }
get_curr_lbkmp_hier_top_unit_t399   T get_curr_lb(kmp_uint64 index) const {
400     return hier_barrier.get_curr_lb(index);
401   }
get_curr_ubkmp_hier_top_unit_t402   T get_curr_ub(kmp_uint64 index) const {
403     return hier_barrier.get_curr_ub(index);
404   }
get_curr_stkmp_hier_top_unit_t405   ST get_curr_st(kmp_uint64 index) const {
406     return hier_barrier.get_curr_st(index);
407   }
get_curr_shkmp_hier_top_unit_t408   dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
409     return hier_barrier.get_curr_sh(index);
410   }
411 
set_next_hand_threadkmp_hier_top_unit_t412   void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
413                             kmp_uint64 index) {
414     hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
415   }
set_nextkmp_hier_top_unit_t416   void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
417     hier_barrier.set_next(lb, ub, st, status, index);
418   }
get_my_prkmp_hier_top_unit_t419   dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
get_parentkmp_hier_top_unit_t420   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
get_parent_prkmp_hier_top_unit_t421   dispatch_private_info_template<T> *get_parent_pr() {
422     return &(hier_parent->hier_pr);
423   }
424 
is_activekmp_hier_top_unit_t425   kmp_int32 is_active() const { return active; }
get_num_activekmp_hier_top_unit_t426   kmp_int32 get_num_active() const { return active; }
427 #ifdef KMP_DEBUG
printkmp_hier_top_unit_t428   void print() {
429     KD_TRACE(
430         10,
431         ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
432          active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
433          hier_pr.u.p.tc));
434   }
435 #endif
436 };
437 
438 // Information regarding a single layer within the scheduling hierarchy
439 template <typename T> struct kmp_hier_layer_info_t {
440   int num_active; // number of threads active in this level
441   kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
442   enum sched_type sched; // static, dynamic, guided, etc.
443   typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
444   int length; // length of the kmp_hier_top_unit_t array
445 
446 #ifdef KMP_DEBUG
447   // Print this layer's information
printkmp_hier_layer_info_t448   void print() {
449     const char *t = __kmp_get_hier_str(type);
450     KD_TRACE(
451         10,
452         ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
453          "length:%d\n",
454          num_active, t, sched, chunk, length));
455   }
456 #endif
457 };
458 
459 /*
460  * Structure to implement entire hierarchy
461  *
462  * The hierarchy is kept as an array of arrays to represent the different
463  * layers.  Layer 0 is the lowest layer to layer num_layers - 1 which is the
464  * highest layer.
465  * Example:
466  * [ 2 ] -> [ L3 | L3 ]
467  * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
468  * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
469  * There is also an array of layer_info_t which has information regarding
470  * each layer
471  */
472 template <typename T> struct kmp_hier_t {
473 public:
474   typedef typename traits_t<T>::unsigned_t UT;
475   typedef typename traits_t<T>::signed_t ST;
476 
477 private:
next_recursekmp_hier_t478   int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
479                    kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
480                    kmp_int32 previous_id, int hier_level) {
481     int status;
482     kmp_info_t *th = __kmp_threads[gtid];
483     auto parent = current->get_parent();
484     bool last_layer = (hier_level == get_num_layers() - 1);
485     KMP_DEBUG_ASSERT(th);
486     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
487     KMP_DEBUG_ASSERT(current);
488     KMP_DEBUG_ASSERT(hier_level >= 0);
489     KMP_DEBUG_ASSERT(hier_level < get_num_layers());
490     KMP_DEBUG_ASSERT(tdata);
491     KMP_DEBUG_ASSERT(parent || last_layer);
492 
493     KD_TRACE(
494         1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
495 
496     T hier_id = (T)current->get_hier_id();
497     // Attempt to grab next iteration range for this level
498     if (previous_id == 0) {
499       KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
500                    gtid, hier_level));
501       kmp_int32 contains_last;
502       T my_lb, my_ub;
503       ST my_st;
504       T nproc;
505       dispatch_shared_info_template<T> volatile *my_sh;
506       dispatch_private_info_template<T> *my_pr;
507       if (last_layer) {
508         // last layer below the very top uses the single shared buffer
509         // from the team struct.
510         KD_TRACE(10,
511                  ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
512                   gtid, hier_level));
513         my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
514             th->th.th_dispatch->th_dispatch_sh_current);
515         nproc = (T)get_top_level_nproc();
516       } else {
517         // middle layers use the shared buffer inside the kmp_hier_top_unit_t
518         // structure
519         KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
520                       gtid, hier_level));
521         my_sh =
522             parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
523         nproc = (T)parent->get_num_active();
524       }
525       my_pr = current->get_my_pr();
526       KMP_DEBUG_ASSERT(my_sh);
527       KMP_DEBUG_ASSERT(my_pr);
528       enum sched_type schedule = get_sched(hier_level);
529       ST chunk = (ST)get_chunk(hier_level);
530       status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
531                                                 &contains_last, &my_lb, &my_ub,
532                                                 &my_st, nproc, hier_id);
533       KD_TRACE(
534           10,
535           ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
536            gtid, hier_level, status));
537       // When no iterations are found (status == 0) and this is not the last
538       // layer, attempt to go up the hierarchy for more iterations
539       if (status == 0 && !last_layer) {
540         kmp_int32 hid;
541         __kmp_type_convert(hier_id, &hid);
542         status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
543                               &my_st, hid, hier_level + 1);
544         KD_TRACE(
545             10,
546             ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
547              gtid, hier_level, status));
548         if (status == 1) {
549           kmp_hier_private_bdata_t *upper_tdata =
550               &(th->th.th_hier_bar_data[hier_level + 1]);
551           my_sh = parent->get_curr_sh(upper_tdata->index);
552           KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
553                         gtid, hier_level));
554           __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
555                                         parent->get_curr_lb(upper_tdata->index),
556                                         parent->get_curr_ub(upper_tdata->index),
557                                         parent->get_curr_st(upper_tdata->index),
558 #if USE_ITT_BUILD
559                                         NULL,
560 #endif
561                                         chunk, nproc, hier_id);
562           status = __kmp_dispatch_next_algorithm<T>(
563               gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
564               hier_id);
565           if (!status) {
566             KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
567                           "setting to 2!\n",
568                           gtid, hier_level));
569             status = 2;
570           }
571         }
572       }
573       current->set_next(my_lb, my_ub, my_st, status, tdata->index);
574       // Propagate whether a unit holds the actual global last iteration
575       // The contains_last attribute is sent downwards from the top to the
576       // bottom of the hierarchy via the contains_last flag inside the
577       // private dispatch buffers in the hierarchy's middle layers
578       if (contains_last) {
579         // If the next_algorithm() method returns 1 for p_last and it is the
580         // last layer or our parent contains the last serial chunk, then the
581         // chunk must contain the last serial iteration.
582         if (last_layer || parent->hier_pr.flags.contains_last) {
583           KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
584                         "to contain last.\n",
585                         gtid, hier_level));
586           current->hier_pr.flags.contains_last = contains_last;
587         }
588         if (!current->hier_pr.flags.contains_last)
589           contains_last = FALSE;
590       }
591       if (p_last)
592         *p_last = contains_last;
593     } // if primary thread of this unit
594     if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
595       KD_TRACE(10,
596                ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
597                 gtid, hier_level));
598       current->barrier(previous_id, tdata);
599       KD_TRACE(10,
600                ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
601                 gtid, hier_level, current->get_curr_status(tdata->index)));
602     } else {
603       KMP_DEBUG_ASSERT(previous_id == 0);
604       return status;
605     }
606     return current->get_curr_status(tdata->index);
607   }
608 
609 public:
610   int top_level_nproc;
611   int num_layers;
612   bool valid;
613   int type_size;
614   kmp_hier_layer_info_t<T> *info;
615   kmp_hier_top_unit_t<T> **layers;
616   // Deallocate all memory from this hierarchy
deallocatekmp_hier_t617   void deallocate() {
618     for (int i = 0; i < num_layers; ++i)
619       if (layers[i] != NULL) {
620         __kmp_free(layers[i]);
621       }
622     if (layers != NULL) {
623       __kmp_free(layers);
624       layers = NULL;
625     }
626     if (info != NULL) {
627       __kmp_free(info);
628       info = NULL;
629     }
630     num_layers = 0;
631     valid = false;
632   }
633   // Returns true if reallocation is needed else false
need_to_reallocatekmp_hier_t634   bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
635                           const enum sched_type *new_scheds,
636                           const ST *new_chunks) const {
637     if (!valid || layers == NULL || info == NULL ||
638         traits_t<T>::type_size != type_size || n != num_layers)
639       return true;
640     for (int i = 0; i < n; ++i) {
641       if (info[i].type != new_layers[i])
642         return true;
643       if (info[i].sched != new_scheds[i])
644         return true;
645       if (info[i].chunk != new_chunks[i])
646         return true;
647     }
648     return false;
649   }
650   // A single thread should call this function while the other threads wait
651   // create a new scheduling hierarchy consisting of new_layers, new_scheds
652   // and new_chunks.  These should come pre-sorted according to
653   // kmp_hier_layer_e value.  This function will try to avoid reallocation
654   // if it can
allocate_hierkmp_hier_t655   void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
656                      const enum sched_type *new_scheds, const ST *new_chunks) {
657     top_level_nproc = 0;
658     if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
659       KD_TRACE(
660           10,
661           ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
662       for (int i = 0; i < n; ++i) {
663         info[i].num_active = 0;
664         for (int j = 0; j < get_length(i); ++j)
665           layers[i][j].active = 0;
666       }
667       return;
668     }
669     KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
670     deallocate();
671     type_size = traits_t<T>::type_size;
672     num_layers = n;
673     info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
674         sizeof(kmp_hier_layer_info_t<T>) * n);
675     layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
676         sizeof(kmp_hier_top_unit_t<T> *) * n);
677     for (int i = 0; i < n; ++i) {
678       int max = 0;
679       kmp_hier_layer_e layer = new_layers[i];
680       info[i].num_active = 0;
681       info[i].type = layer;
682       info[i].sched = new_scheds[i];
683       info[i].chunk = new_chunks[i];
684       max = __kmp_hier_max_units[layer + 1];
685       if (max == 0) {
686         valid = false;
687         KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
688         deallocate();
689         return;
690       }
691       info[i].length = max;
692       layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
693           sizeof(kmp_hier_top_unit_t<T>) * max);
694       for (int j = 0; j < max; ++j) {
695         layers[i][j].active = 0;
696         layers[i][j].hier_pr.flags.use_hier = TRUE;
697       }
698     }
699     valid = true;
700   }
701   // loc - source file location
702   // gtid - global thread identifier
703   // pr - this thread's private dispatch buffer (corresponding with gtid)
704   // p_last (return value) - pointer to flag indicating this set of iterations
705   // contains last
706   //          iteration
707   // p_lb (return value) - lower bound for this chunk of iterations
708   // p_ub (return value) - upper bound for this chunk of iterations
709   // p_st (return value) - stride for this chunk of iterations
710   //
711   // Returns 1 if there are more iterations to perform, 0 otherwise
nextkmp_hier_t712   int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
713            kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
714     int status;
715     kmp_int32 contains_last = 0;
716     kmp_info_t *th = __kmp_threads[gtid];
717     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
718     auto parent = pr->get_parent();
719     KMP_DEBUG_ASSERT(parent);
720     KMP_DEBUG_ASSERT(th);
721     KMP_DEBUG_ASSERT(tdata);
722     KMP_DEBUG_ASSERT(parent);
723     T nproc = (T)parent->get_num_active();
724     T unit_id = (T)pr->get_hier_id();
725     KD_TRACE(
726         10,
727         ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
728          gtid, nproc, unit_id));
729     // Handthreading implementation
730     // Each iteration is performed by all threads on last unit (typically
731     // cores/tiles)
732     // e.g., threads 0,1,2,3 all execute iteration 0
733     //       threads 0,1,2,3 all execute iteration 1
734     //       threads 4,5,6,7 all execute iteration 2
735     //       threads 4,5,6,7 all execute iteration 3
736     //       ... etc.
737     if (__kmp_dispatch_hand_threading) {
738       KD_TRACE(10,
739                ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
740                 gtid));
741       if (unit_id == 0) {
742         // For hand threading, the sh buffer on the lowest level is only ever
743         // modified and read by the primary thread on that level.  Because of
744         // this, we can always use the first sh buffer.
745         auto sh = &(parent->hier_barrier.sh[0]);
746         KMP_DEBUG_ASSERT(sh);
747         status = __kmp_dispatch_next_algorithm<T>(
748             gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
749         if (!status) {
750           bool done = false;
751           while (!done) {
752             done = true;
753             kmp_int32 uid;
754             __kmp_type_convert(unit_id, &uid);
755             status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
756                                   p_st, uid, 0);
757             if (status == 1) {
758               __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
759                                             parent->get_next_lb(tdata->index),
760                                             parent->get_next_ub(tdata->index),
761                                             parent->get_next_st(tdata->index),
762 #if USE_ITT_BUILD
763                                             NULL,
764 #endif
765                                             pr->u.p.parm1, nproc, unit_id);
766               sh->u.s.iteration = 0;
767               status = __kmp_dispatch_next_algorithm<T>(
768                   gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
769                   unit_id);
770               if (!status) {
771                 KD_TRACE(10,
772                          ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
773                           "after next_pr_sh()"
774                           "trying again.\n",
775                           gtid));
776                 done = false;
777               }
778             } else if (status == 2) {
779               KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
780                             "trying again.\n",
781                             gtid));
782               done = false;
783             }
784           }
785         }
786         parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
787       } // if primary thread of lowest unit level
788       parent->barrier(pr->get_hier_id(), tdata);
789       if (unit_id != 0) {
790         *p_lb = parent->get_curr_lb(tdata->index);
791         *p_ub = parent->get_curr_ub(tdata->index);
792         *p_st = parent->get_curr_st(tdata->index);
793         status = parent->get_curr_status(tdata->index);
794       }
795     } else {
796       // Normal implementation
797       // Each thread grabs an iteration chunk and executes it (no cooperation)
798       auto sh = parent->get_curr_sh(tdata->index);
799       KMP_DEBUG_ASSERT(sh);
800       status = __kmp_dispatch_next_algorithm<T>(
801           gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
802       KD_TRACE(10,
803                ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
804                 "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
805                 gtid, status, contains_last, *p_lb, *p_ub, *p_st));
806       if (!status) {
807         bool done = false;
808         while (!done) {
809           done = true;
810           kmp_int32 uid;
811           __kmp_type_convert(unit_id, &uid);
812           status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
813                                 p_st, uid, 0);
814           if (status == 1) {
815             sh = parent->get_curr_sh(tdata->index);
816             __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
817                                           parent->get_curr_lb(tdata->index),
818                                           parent->get_curr_ub(tdata->index),
819                                           parent->get_curr_st(tdata->index),
820 #if USE_ITT_BUILD
821                                           NULL,
822 #endif
823                                           pr->u.p.parm1, nproc, unit_id);
824             status = __kmp_dispatch_next_algorithm<T>(
825                 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
826             if (!status) {
827               KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
828                             "after next_pr_sh()"
829                             "trying again.\n",
830                             gtid));
831               done = false;
832             }
833           } else if (status == 2) {
834             KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
835                           "trying again.\n",
836                           gtid));
837             done = false;
838           }
839         }
840       }
841     }
842     if (contains_last && !parent->hier_pr.flags.contains_last) {
843       KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
844                     "contains_last to FALSE\n",
845                     gtid));
846       contains_last = FALSE;
847     }
848     if (p_last)
849       *p_last = contains_last;
850     KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
851                   status));
852     return status;
853   }
854   // These functions probe the layer info structure
855   // Returns the type of topology unit given level
get_typekmp_hier_t856   kmp_hier_layer_e get_type(int level) const {
857     KMP_DEBUG_ASSERT(level >= 0);
858     KMP_DEBUG_ASSERT(level < num_layers);
859     return info[level].type;
860   }
861   // Returns the schedule type at given level
get_schedkmp_hier_t862   enum sched_type get_sched(int level) const {
863     KMP_DEBUG_ASSERT(level >= 0);
864     KMP_DEBUG_ASSERT(level < num_layers);
865     return info[level].sched;
866   }
867   // Returns the chunk size at given level
get_chunkkmp_hier_t868   ST get_chunk(int level) const {
869     KMP_DEBUG_ASSERT(level >= 0);
870     KMP_DEBUG_ASSERT(level < num_layers);
871     return info[level].chunk;
872   }
873   // Returns the number of active threads at given level
get_num_activekmp_hier_t874   int get_num_active(int level) const {
875     KMP_DEBUG_ASSERT(level >= 0);
876     KMP_DEBUG_ASSERT(level < num_layers);
877     return info[level].num_active;
878   }
879   // Returns the length of topology unit array at given level
get_lengthkmp_hier_t880   int get_length(int level) const {
881     KMP_DEBUG_ASSERT(level >= 0);
882     KMP_DEBUG_ASSERT(level < num_layers);
883     return info[level].length;
884   }
885   // Returns the topology unit given the level and index
get_unitkmp_hier_t886   kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
887     KMP_DEBUG_ASSERT(level >= 0);
888     KMP_DEBUG_ASSERT(level < num_layers);
889     KMP_DEBUG_ASSERT(index >= 0);
890     KMP_DEBUG_ASSERT(index < get_length(level));
891     return &(layers[level][index]);
892   }
893   // Returns the number of layers in the hierarchy
get_num_layerskmp_hier_t894   int get_num_layers() const { return num_layers; }
895   // Returns the number of threads in the top layer
896   // This is necessary because we don't store a topology unit as
897   // the very top level and the scheduling algorithms need this information
get_top_level_nprockmp_hier_t898   int get_top_level_nproc() const { return top_level_nproc; }
899   // Return whether this hierarchy is valid or not
is_validkmp_hier_t900   bool is_valid() const { return valid; }
901 #ifdef KMP_DEBUG
902   // Print the hierarchy
printkmp_hier_t903   void print() {
904     KD_TRACE(10, ("kmp_hier_t:\n"));
905     for (int i = num_layers - 1; i >= 0; --i) {
906       KD_TRACE(10, ("Info[%d] = ", i));
907       info[i].print();
908     }
909     for (int i = num_layers - 1; i >= 0; --i) {
910       KD_TRACE(10, ("Layer[%d] =\n", i));
911       for (int j = 0; j < info[i].length; ++j) {
912         layers[i][j].print();
913       }
914     }
915   }
916 #endif
917 };
918 
919 template <typename T>
__kmp_dispatch_init_hierarchy(ident_t * loc,int n,kmp_hier_layer_e * new_layers,enum sched_type * new_scheds,typename traits_t<T>::signed_t * new_chunks,T lb,T ub,typename traits_t<T>::signed_t st)920 void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
921                                    kmp_hier_layer_e *new_layers,
922                                    enum sched_type *new_scheds,
923                                    typename traits_t<T>::signed_t *new_chunks,
924                                    T lb, T ub,
925                                    typename traits_t<T>::signed_t st) {
926   int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
927   unsigned int my_buffer_index;
928   kmp_info_t *th;
929   kmp_team_t *team;
930   dispatch_private_info_template<T> *pr;
931   dispatch_shared_info_template<T> volatile *sh;
932   gtid = __kmp_entry_gtid();
933   tid = __kmp_tid_from_gtid(gtid);
934 #ifdef KMP_DEBUG
935   KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
936                 gtid, n));
937   for (int i = 0; i < n; ++i) {
938     const char *layer = __kmp_get_hier_str(new_layers[i]);
939     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
940                   "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
941                   gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
942   }
943 #endif // KMP_DEBUG
944   KMP_DEBUG_ASSERT(n > 0);
945   KMP_DEBUG_ASSERT(new_layers);
946   KMP_DEBUG_ASSERT(new_scheds);
947   KMP_DEBUG_ASSERT(new_chunks);
948   if (!TCR_4(__kmp_init_parallel))
949     __kmp_parallel_initialize();
950   __kmp_resume_if_soft_paused();
951 
952   th = __kmp_threads[gtid];
953   team = th->th.th_team;
954   active = !team->t.t_serialized;
955   th->th.th_ident = loc;
956   num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
957   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
958                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
959   my_buffer_index = th->th.th_dispatch->th_disp_index;
960   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
961       &th->th.th_dispatch
962            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
963   sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
964       &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
965   if (!active) {
966     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
967                   "Using normal dispatch functions.\n",
968                   gtid));
969     KMP_DEBUG_ASSERT(pr);
970     pr->flags.use_hier = FALSE;
971     pr->flags.contains_last = FALSE;
972     return;
973   }
974   KMP_DEBUG_ASSERT(pr);
975   KMP_DEBUG_ASSERT(sh);
976   pr->flags.use_hier = TRUE;
977   pr->u.p.tc = 0;
978   // Have primary thread allocate the hierarchy
979   if (__kmp_tid_from_gtid(gtid) == 0) {
980     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
981                   "hierarchy\n",
982                   gtid, pr, sh));
983     if (sh->hier == NULL) {
984       sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
985     }
986     sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
987     sh->u.s.iteration = 0;
988   }
989   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
990   // Check to make sure the hierarchy is valid
991   kmp_hier_t<T> *hier = sh->hier;
992   if (!sh->hier->is_valid()) {
993     pr->flags.use_hier = FALSE;
994     return;
995   }
996   // Have threads allocate their thread-private barrier data if it hasn't
997   // already been allocated
998   if (th->th.th_hier_bar_data == NULL) {
999     th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
1000         sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
1001   }
1002   // Have threads "register" themselves by modifying the active count for each
1003   // level they are involved in. The active count will act as nthreads for that
1004   // level regarding the scheduling algorithms
1005   for (int i = 0; i < n; ++i) {
1006     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1007     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1008     // Setup the thread's private dispatch buffer's hierarchy pointers
1009     if (i == 0)
1010       pr->hier_parent = my_unit;
1011     // If this unit is already active, then increment active count and wait
1012     if (my_unit->is_active()) {
1013       KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1014                     "is already active (%d)\n",
1015                     gtid, my_unit, my_unit->active));
1016       KMP_TEST_THEN_INC32(&(my_unit->active));
1017       break;
1018     }
1019     // Flag that this unit is active
1020     if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
1021       // Do not setup parent pointer for top level unit since it has no parent
1022       if (i < n - 1) {
1023         // Setup middle layer pointers to parents
1024         my_unit->get_my_pr()->hier_id =
1025             index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1026                                                  hier->get_type(i + 1));
1027         int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
1028         my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
1029       } else {
1030         // Setup top layer information (no parent pointers are set)
1031         my_unit->get_my_pr()->hier_id =
1032             index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1033                                                  kmp_hier_layer_e::LAYER_LOOP);
1034         KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
1035         my_unit->hier_parent = nullptr;
1036       }
1037       // Set trip count to 0 so that next() operation will initially climb up
1038       // the hierarchy to get more iterations (early exit in next() for tc == 0)
1039       my_unit->get_my_pr()->u.p.tc = 0;
1040       // Increment this layer's number of active units
1041       KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
1042       KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1043                     "incrementing num_active\n",
1044                     gtid, my_unit));
1045     } else {
1046       KMP_TEST_THEN_INC32(&(my_unit->active));
1047       break;
1048     }
1049   }
1050   // Set this thread's id
1051   num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
1052       kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
1053   pr->hier_id = tid % num_threads_per_layer1;
1054   // For oversubscribed threads, increment their index within the lowest unit
1055   // This is done to prevent having two or more threads with id 0, id 1, etc.
1056   if (tid >= num_hw_threads)
1057     pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
1058   KD_TRACE(
1059       10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
1060            gtid, pr->hier_id));
1061 
1062   pr->flags.contains_last = FALSE;
1063   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1064 
1065   // Now that the number of active threads at each level is determined,
1066   // the barrier data for each unit can be initialized and the last layer's
1067   // loop information can be initialized.
1068   int prev_id = pr->get_hier_id();
1069   for (int i = 0; i < n; ++i) {
1070     if (prev_id != 0)
1071       break;
1072     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1073     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1074     // Only primary threads of this unit within the hierarchy do initialization
1075     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
1076                   gtid, i));
1077     my_unit->reset_shared_barrier();
1078     my_unit->hier_pr.flags.contains_last = FALSE;
1079     // Last layer, initialize the private buffers with entire loop information
1080     // Now the next next_algorithm() call will get the first chunk of
1081     // iterations properly
1082     if (i == n - 1) {
1083       __kmp_dispatch_init_algorithm<T>(
1084           loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
1085 #if USE_ITT_BUILD
1086           NULL,
1087 #endif
1088           hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
1089     }
1090     prev_id = my_unit->get_hier_id();
1091   }
1092   // Initialize each layer of the thread's private barrier data
1093   kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
1094   for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
1095     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
1096     unit->reset_private_barrier(tdata);
1097   }
1098   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1099 
1100 #ifdef KMP_DEBUG
1101   if (__kmp_tid_from_gtid(gtid) == 0) {
1102     for (int i = 0; i < n; ++i) {
1103       KD_TRACE(10,
1104                ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
1105                 gtid, i, hier->get_num_active(i)));
1106     }
1107     hier->print();
1108   }
1109   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1110 #endif // KMP_DEBUG
1111 }
1112 #endif
1113