/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
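    // Usage sketch (illustrative, not an exhaustive contract): begin(), end()
    // and next() form a forward-iteration idiom over the set bits, e.g.
    //   for (int i = m->begin(); i != m->end(); i = m->next(i))
    //     visit(i);
    // end() is -1 here because hwloc_bitmap_first()/hwloc_bitmap_next()
    // return -1 once no further bit is set.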
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OSes that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
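    // Worked example (assuming a 64-bit mask_t, so BITS_PER_MASK_T == 64):
    // set(70) touches word 70 / 64 == 1 and bit 70 % 64 == 6, i.e. it ORs
    // (ONE << 6) into mask[1]; clear(70) ANDs the complement back out.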
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
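  // Example of the containment relation: an attr that specifies both a core
  // type and a core efficiency contains an attr that only specifies the same
  // core type, but not the other way around, since contains() requires every
  // field set in 'other' to also be set here with an equal value.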
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;
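  // For illustration: ids[] is indexed by topology level (see
  // kmp_topology_t::types[]), so on a socket/core/thread topology a hardware
  // thread might carry ids = {1, 3, 0} for socket 1, core 3, thread 0, with
  // os_id holding the OS processor number. sub_ids[] holds the per-level
  // indices derived by kmp_topology_t::_set_sub_ids().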

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long. They are allocated to hold up
  // to KMP_HW_LAST objects so that layers can be added without reallocating
  // any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Quick topology ratios: for non-uniform topologies, each entry holds the
  // max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;
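  // For the example above (a uniform machine with 4 packages, 6 cores per
  // package and 2 threads per core), ratio would be {4, 6, 2} and count
  // would be {4, 24, 48}.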

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array. If the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type.
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Return true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

// Functions used after canonicalize() is called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1.  They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
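  // For instance, if the tile layer turns out to be identical to the L2
  // layer, set_equivalent_type(KMP_HW_TILE, KMP_HW_L2) makes queries for
  // KMP_HW_TILE resolve to the L2 level, and any type previously mapped to
  // KMP_HW_TILE is redirected to KMP_HW_L2 as well.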
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
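  // Example: with ratio = {4, 6, 2} (packages, cores/package, threads/core),
  // level 0 = package and level 2 = thread, calculate_ratio(2, 0) returns
  // 2 * 6 = 12, i.e. the maximum number of threads per package.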
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
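  // Sketch of intended use (hypothetical values; the KMP_HW_SUBSET parser
  // elsewhere in the runtime performs the real calls): a subset such as
  // "2s,4c,2t" could be recorded as
  //   subset->push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t());
  //   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t());
  //   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t());
  // Repeated calls for the same type append additional (num, offset, attr)
  // triples to that item, up to MAX_ATTRS of them.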
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("  num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
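  /** skipPerLevel[i] is the number of leaves spanned by a subtree rooted at
      level i (the stride between siblings at that level). For the example
      above, init() derives skipPerLevel = {1, 2, 8, 32, ...} via
      skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1]. */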

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when the static library is re-initialized multiple times
       (e.g., when a non-OpenMP thread repeatedly launches/joins a thread that
       uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H